linux/fs/ext4/mballoc.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
   3 * Written by Alex Tomas <alex@clusterfs.com>
   4 *
   5 * This program is free software; you can redistribute it and/or modify
   6 * it under the terms of the GNU General Public License version 2 as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it will be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public Licens
  15 * along with this program; if not, write to the Free Software
  16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
  17 */
  18
  19
  20/*
  21 * mballoc.c contains the multiblocks allocation routines
  22 */
  23
  24#include "ext4_jbd2.h"
  25#include "mballoc.h"
  26#include <linux/log2.h>
  27#include <linux/module.h>
  28#include <linux/slab.h>
  29#include <linux/nospec.h>
  30#include <trace/events/ext4.h>
  31
  32#ifdef CONFIG_EXT4_DEBUG
  33ushort ext4_mballoc_debug __read_mostly;
  34
  35module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
  36MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
  37#endif
  38
  39/*
  40 * MUSTDO:
  41 *   - test ext4_ext_search_left() and ext4_ext_search_right()
  42 *   - search for metadata in few groups
  43 *
  44 * TODO v4:
  45 *   - normalization should take into account whether file is still open
  46 *   - discard preallocations if no free space left (policy?)
  47 *   - don't normalize tails
  48 *   - quota
  49 *   - reservation for superuser
  50 *
  51 * TODO v3:
  52 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
  53 *   - track min/max extents in each group for better group selection
  54 *   - mb_mark_used() may allocate chunk right after splitting buddy
  55 *   - tree of groups sorted by number of free blocks
  56 *   - error handling
  57 */
  58
  59/*
  60 * The allocation request involve request for multiple number of blocks
  61 * near to the goal(block) value specified.
  62 *
  63 * During initialization phase of the allocator we decide to use the
  64 * group preallocation or inode preallocation depending on the size of
  65 * the file. The size of the file could be the resulting file size we
  66 * would have after allocation, or the current file size, which ever
  67 * is larger. If the size is less than sbi->s_mb_stream_request we
  68 * select to use the group preallocation. The default value of
  69 * s_mb_stream_request is 16 blocks. This can also be tuned via
  70 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
  71 * terms of number of blocks.
  72 *
  73 * The main motivation for having small file use group preallocation is to
  74 * ensure that we have small files closer together on the disk.
  75 *
  76 * First stage the allocator looks at the inode prealloc list,
  77 * ext4_inode_info->i_prealloc_list, which contains list of prealloc
  78 * spaces for this particular inode. The inode prealloc space is
  79 * represented as:
  80 *
  81 * pa_lstart -> the logical start block for this prealloc space
  82 * pa_pstart -> the physical start block for this prealloc space
  83 * pa_len    -> length for this prealloc space (in clusters)
  84 * pa_free   ->  free space available in this prealloc space (in clusters)
  85 *
  86 * The inode preallocation space is used looking at the _logical_ start
  87 * block. If only the logical file block falls within the range of prealloc
  88 * space we will consume the particular prealloc space. This makes sure that
  89 * we have contiguous physical blocks representing the file blocks
  90 *
  91 * The important thing to be noted in case of inode prealloc space is that
  92 * we don't modify the values associated to inode prealloc space except
  93 * pa_free.
  94 *
  95 * If we are not able to find blocks in the inode prealloc space and if we
  96 * have the group allocation flag set then we look at the locality group
  97 * prealloc space. These are per CPU prealloc list represented as
  98 *
  99 * ext4_sb_info.s_locality_groups[smp_processor_id()]
 100 *
 101 * The reason for having a per cpu locality group is to reduce the contention
 102 * between CPUs. It is possible to get scheduled at this point.
 103 *
 104 * The locality group prealloc space is used looking at whether we have
 105 * enough free space (pa_free) within the prealloc space.
 106 *
 107 * If we can't allocate blocks via inode prealloc or/and locality group
 108 * prealloc then we look at the buddy cache. The buddy cache is represented
 109 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
 110 * mapped to the buddy and bitmap information regarding different
 111 * groups. The buddy information is attached to buddy cache inode so that
 112 * we can access them through the page cache. The information regarding
 113 * each group is loaded via ext4_mb_load_buddy.  The information involve
 114 * block bitmap and buddy information. The information are stored in the
 115 * inode as:
 116 *
 117 *  {                        page                        }
 118 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 119 *
 120 *
 121 * one block each for bitmap and buddy information.  So for each group we
 122 * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
 123 * blocksize) blocks.  So it can have information regarding groups_per_page
 124 * which is blocks_per_page/2
 125 *
 126 * The buddy cache inode is not stored on disk. The inode is thrown
 127 * away when the filesystem is unmounted.
 128 *
 129 * We look for count number of blocks in the buddy cache. If we were able
 130 * to locate that many free blocks we return with additional information
 131 * regarding rest of the contiguous physical block available
 132 *
 133 * Before allocating blocks via buddy cache we normalize the request
 134 * blocks. This ensure we ask for more blocks that we needed. The extra
 135 * blocks that we get after allocation is added to the respective prealloc
 136 * list. In case of inode preallocation we follow a list of heuristics
 137 * based on file size. This can be found in ext4_mb_normalize_request. If
 138 * we are doing a group prealloc we try to normalize the request to
 139 * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
 140 * dependent on the cluster size; for non-bigalloc file systems, it is
 141 * 512 blocks. This can be tuned via
 142 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
 143 * terms of number of blocks. If we have mounted the file system with -O
 144 * stripe=<value> option the group prealloc request is normalized to the
 145 * the smallest multiple of the stripe value (sbi->s_stripe) which is
 146 * greater than the default mb_group_prealloc.
 147 *
 148 * The regular allocator (using the buddy cache) supports a few tunables.
 149 *
 150 * /sys/fs/ext4/<partition>/mb_min_to_scan
 151 * /sys/fs/ext4/<partition>/mb_max_to_scan
 152 * /sys/fs/ext4/<partition>/mb_order2_req
 153 *
 154 * The regular allocator uses buddy scan only if the request len is power of
 155 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
 156 * value of s_mb_order2_reqs can be tuned via
 157 * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
 158 * stripe size (sbi->s_stripe), we try to search for contiguous block in
 159 * stripe size. This should result in better allocation on RAID setups. If
 160 * not, we search in the specific group using bitmap for best extents. The
 161 * tunable min_to_scan and max_to_scan control the behaviour here.
 162 * min_to_scan indicate how long the mballoc __must__ look for a best
 163 * extent and max_to_scan indicates how long the mballoc __can__ look for a
 164 * best extent in the found extents. Searching for the blocks starts with
 165 * the group specified as the goal value in allocation context via
 166 * ac_g_ex. Each group is first checked based on the criteria whether it
 167 * can be used for allocation. ext4_mb_good_group explains how the groups are
 168 * checked.
 169 *
 170 * Both the prealloc space are getting populated as above. So for the first
 171 * request we will hit the buddy cache which will result in this prealloc
 172 * space getting filled. The prealloc space is then later used for the
 173 * subsequent request.
 174 */
 175
 176/*
 177 * mballoc operates on the following data:
 178 *  - on-disk bitmap
 179 *  - in-core buddy (actually includes buddy and bitmap)
 180 *  - preallocation descriptors (PAs)
 181 *
 182 * there are two types of preallocations:
 183 *  - inode
 184 *    assiged to specific inode and can be used for this inode only.
 185 *    it describes part of inode's space preallocated to specific
 186 *    physical blocks. any block from that preallocated can be used
 187 *    independent. the descriptor just tracks number of blocks left
 188 *    unused. so, before taking some block from descriptor, one must
 189 *    make sure corresponded logical block isn't allocated yet. this
 190 *    also means that freeing any block within descriptor's range
 191 *    must discard all preallocated blocks.
 192 *  - locality group
 193 *    assigned to specific locality group which does not translate to
 194 *    permanent set of inodes: inode can join and leave group. space
 195 *    from this type of preallocation can be used for any inode. thus
 196 *    it's consumed from the beginning to the end.
 197 *
 198 * relation between them can be expressed as:
 199 *    in-core buddy = on-disk bitmap + preallocation descriptors
 200 *
 201 * this mean blocks mballoc considers used are:
 202 *  - allocated blocks (persistent)
 203 *  - preallocated blocks (non-persistent)
 204 *
 205 * consistency in mballoc world means that at any time a block is either
 206 * free or used in ALL structures. notice: "any time" should not be read
 207 * literally -- time is discrete and delimited by locks.
 208 *
 209 *  to keep it simple, we don't use block numbers, instead we count number of
 210 *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
 211 *
 212 * all operations can be expressed as:
 213 *  - init buddy:                       buddy = on-disk + PAs
 214 *  - new PA:                           buddy += N; PA = N
 215 *  - use inode PA:                     on-disk += N; PA -= N
 216 *  - discard inode PA                  buddy -= on-disk - PA; PA = 0
 217 *  - use locality group PA             on-disk += N; PA -= N
 218 *  - discard locality group PA         buddy -= PA; PA = 0
 219 *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
 220 *        is used in real operation because we can't know actual used
 221 *        bits from PA, only from on-disk bitmap
 222 *
 223 * if we follow this strict logic, then all operations above should be atomic.
 224 * given some of them can block, we'd have to use something like semaphores
 225 * killing performance on high-end SMP hardware. let's try to relax it using
 226 * the following knowledge:
 227 *  1) if buddy is referenced, it's already initialized
 228 *  2) while block is used in buddy and the buddy is referenced,
 229 *     nobody can re-allocate that block
 230 *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
 231 *     bit set and PA claims same block, it's OK. IOW, one can set bit in
 232 *     on-disk bitmap if buddy has same bit set or/and PA covers corresponded
 233 *     block
 234 *
 235 * so, now we're building a concurrency table:
 236 *  - init buddy vs.
 237 *    - new PA
 238 *      blocks for PA are allocated in the buddy, buddy must be referenced
 239 *      until PA is linked to allocation group to avoid concurrent buddy init
 240 *    - use inode PA
 241 *      we need to make sure that either on-disk bitmap or PA has uptodate data
 242 *      given (3) we care that PA-=N operation doesn't interfere with init
 243 *    - discard inode PA
 244 *      the simplest way would be to have buddy initialized by the discard
 245 *    - use locality group PA
 246 *      again PA-=N must be serialized with init
 247 *    - discard locality group PA
 248 *      the simplest way would be to have buddy initialized by the discard
 249 *  - new PA vs.
 250 *    - use inode PA
 251 *      i_data_sem serializes them
 252 *    - discard inode PA
 253 *      discard process must wait until PA isn't used by another process
 254 *    - use locality group PA
 255 *      some mutex should serialize them
 256 *    - discard locality group PA
 257 *      discard process must wait until PA isn't used by another process
 258 *  - use inode PA
 259 *    - use inode PA
 260 *      i_data_sem or another mutex should serializes them
 261 *    - discard inode PA
 262 *      discard process must wait until PA isn't used by another process
 263 *    - use locality group PA
 264 *      nothing wrong here -- they're different PAs covering different blocks
 265 *    - discard locality group PA
 266 *      discard process must wait until PA isn't used by another process
 267 *
 268 * now we're ready to make few consequences:
 269 *  - PA is referenced and while it is no discard is possible
 270 *  - PA is referenced until block isn't marked in on-disk bitmap
 271 *  - PA changes only after on-disk bitmap
 272 *  - discard must not compete with init. either init is done before
 273 *    any discard or they're serialized somehow
 274 *  - buddy init as sum of on-disk bitmap and PAs is done atomically
 275 *
 276 * a special case when we've used PA to emptiness. no need to modify buddy
 277 * in this case, but we should care about concurrent init
 278 *
 279 */
 280
 281 /*
 282 * Logic in few words:
 283 *
 284 *  - allocation:
 285 *    load group
 286 *    find blocks
 287 *    mark bits in on-disk bitmap
 288 *    release group
 289 *
 290 *  - use preallocation:
 291 *    find proper PA (per-inode or group)
 292 *    load group
 293 *    mark bits in on-disk bitmap
 294 *    release group
 295 *    release PA
 296 *
 297 *  - free:
 298 *    load group
 299 *    mark bits in on-disk bitmap
 300 *    release group
 301 *
 302 *  - discard preallocations in group:
 303 *    mark PAs deleted
 304 *    move them onto local list
 305 *    load on-disk bitmap
 306 *    load group
 307 *    remove PA from object (inode or locality group)
 308 *    mark free blocks in-core
 309 *
 310 *  - discard inode's preallocations:
 311 */
 312
 313/*
 314 * Locking rules
 315 *
 316 * Locks:
 317 *  - bitlock on a group        (group)
 318 *  - object (inode/locality)   (object)
 319 *  - per-pa lock               (pa)
 320 *
 321 * Paths:
 322 *  - new pa
 323 *    object
 324 *    group
 325 *
 326 *  - find and use pa:
 327 *    pa
 328 *
 329 *  - release consumed pa:
 330 *    pa
 331 *    group
 332 *    object
 333 *
 334 *  - generate in-core bitmap:
 335 *    group
 336 *        pa
 337 *
 338 *  - discard all for given object (inode, locality group):
 339 *    object
 340 *        pa
 341 *    group
 342 *
 343 *  - discard all for given group:
 344 *    group
 345 *        pa
 346 *    group
 347 *        object
 348 *
 349 */
 350static struct kmem_cache *ext4_pspace_cachep;
 351static struct kmem_cache *ext4_ac_cachep;
 352static struct kmem_cache *ext4_free_data_cachep;
 353
 354/* We create slab caches for groupinfo data structures based on the
 355 * superblock block size.  There will be one per mounted filesystem for
 356 * each unique s_blocksize_bits */
 357#define NR_GRPINFO_CACHES 8
 358static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
 359
 360static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
 361        "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
 362        "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
 363        "ext4_groupinfo_64k", "ext4_groupinfo_128k"
 364};
 365
 366static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 367                                        ext4_group_t group);
 368static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 369                                                ext4_group_t group);
 370static void ext4_free_data_callback(struct super_block *sb,
 371                                struct ext4_journal_cb_entry *jce, int rc);
 372
 373static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 374{
 375#if BITS_PER_LONG == 64
 376        *bit += ((unsigned long) addr & 7UL) << 3;
 377        addr = (void *) ((unsigned long) addr & ~7UL);
 378#elif BITS_PER_LONG == 32
 379        *bit += ((unsigned long) addr & 3UL) << 3;
 380        addr = (void *) ((unsigned long) addr & ~3UL);
 381#else
 382#error "how many bits you are?!"
 383#endif
 384        return addr;
 385}
 386
 387static inline int mb_test_bit(int bit, void *addr)
 388{
 389        /*
 390         * ext4_test_bit on architecture like powerpc
 391         * needs unsigned long aligned address
 392         */
 393        addr = mb_correct_addr_and_bit(&bit, addr);
 394        return ext4_test_bit(bit, addr);
 395}
 396
 397static inline void mb_set_bit(int bit, void *addr)
 398{
 399        addr = mb_correct_addr_and_bit(&bit, addr);
 400        ext4_set_bit(bit, addr);
 401}
 402
 403static inline void mb_clear_bit(int bit, void *addr)
 404{
 405        addr = mb_correct_addr_and_bit(&bit, addr);
 406        ext4_clear_bit(bit, addr);
 407}
 408
 409static inline int mb_test_and_clear_bit(int bit, void *addr)
 410{
 411        addr = mb_correct_addr_and_bit(&bit, addr);
 412        return ext4_test_and_clear_bit(bit, addr);
 413}
 414
 415static inline int mb_find_next_zero_bit(void *addr, int max, int start)
 416{
 417        int fix = 0, ret, tmpmax;
 418        addr = mb_correct_addr_and_bit(&fix, addr);
 419        tmpmax = max + fix;
 420        start += fix;
 421
 422        ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
 423        if (ret > max)
 424                return max;
 425        return ret;
 426}
 427
 428static inline int mb_find_next_bit(void *addr, int max, int start)
 429{
 430        int fix = 0, ret, tmpmax;
 431        addr = mb_correct_addr_and_bit(&fix, addr);
 432        tmpmax = max + fix;
 433        start += fix;
 434
 435        ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
 436        if (ret > max)
 437                return max;
 438        return ret;
 439}
 440
 441static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
 442{
 443        char *bb;
 444
 445        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
 446        BUG_ON(max == NULL);
 447
 448        if (order > e4b->bd_blkbits + 1) {
 449                *max = 0;
 450                return NULL;
 451        }
 452
 453        /* at order 0 we see each particular block */
 454        if (order == 0) {
 455                *max = 1 << (e4b->bd_blkbits + 3);
 456                return e4b->bd_bitmap;
 457        }
 458
 459        bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
 460        *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
 461
 462        return bb;
 463}
 464
 465#ifdef DOUBLE_CHECK
 466static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 467                           int first, int count)
 468{
 469        int i;
 470        struct super_block *sb = e4b->bd_sb;
 471
 472        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 473                return;
 474        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
 475        for (i = 0; i < count; i++) {
 476                if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
 477                        ext4_fsblk_t blocknr;
 478
 479                        blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 480                        blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
 481                        ext4_grp_locked_error(sb, e4b->bd_group,
 482                                              inode ? inode->i_ino : 0,
 483                                              blocknr,
 484                                              "freeing block already freed "
 485                                              "(bit %u)",
 486                                              first + i);
 487                }
 488                mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
 489        }
 490}
 491
 492static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
 493{
 494        int i;
 495
 496        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 497                return;
 498        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
 499        for (i = 0; i < count; i++) {
 500                BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
 501                mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
 502        }
 503}
 504
 505static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 506{
 507        if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
 508                unsigned char *b1, *b2;
 509                int i;
 510                b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
 511                b2 = (unsigned char *) bitmap;
 512                for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
 513                        if (b1[i] != b2[i]) {
 514                                ext4_msg(e4b->bd_sb, KERN_ERR,
 515                                         "corruption in group %u "
 516                                         "at byte %u(%u): %x in copy != %x "
 517                                         "on disk/prealloc",
 518                                         e4b->bd_group, i, i * 8, b1[i], b2[i]);
 519                                BUG();
 520                        }
 521                }
 522        }
 523}
 524
 525#else
 526static inline void mb_free_blocks_double(struct inode *inode,
 527                                struct ext4_buddy *e4b, int first, int count)
 528{
 529        return;
 530}
 531static inline void mb_mark_used_double(struct ext4_buddy *e4b,
 532                                                int first, int count)
 533{
 534        return;
 535}
 536static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 537{
 538        return;
 539}
 540#endif
 541
 542#ifdef AGGRESSIVE_CHECK
 543
 544#define MB_CHECK_ASSERT(assert)                                         \
 545do {                                                                    \
 546        if (!(assert)) {                                                \
 547                printk(KERN_EMERG                                       \
 548                        "Assertion failure in %s() at %s:%d: \"%s\"\n", \
 549                        function, file, line, # assert);                \
 550                BUG();                                                  \
 551        }                                                               \
 552} while (0)
 553
 554static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 555                                const char *function, int line)
 556{
 557        struct super_block *sb = e4b->bd_sb;
 558        int order = e4b->bd_blkbits + 1;
 559        int max;
 560        int max2;
 561        int i;
 562        int j;
 563        int k;
 564        int count;
 565        struct ext4_group_info *grp;
 566        int fragments = 0;
 567        int fstart;
 568        struct list_head *cur;
 569        void *buddy;
 570        void *buddy2;
 571
 572        {
 573                static int mb_check_counter;
 574                if (mb_check_counter++ % 100 != 0)
 575                        return 0;
 576        }
 577
 578        while (order > 1) {
 579                buddy = mb_find_buddy(e4b, order, &max);
 580                MB_CHECK_ASSERT(buddy);
 581                buddy2 = mb_find_buddy(e4b, order - 1, &max2);
 582                MB_CHECK_ASSERT(buddy2);
 583                MB_CHECK_ASSERT(buddy != buddy2);
 584                MB_CHECK_ASSERT(max * 2 == max2);
 585
 586                count = 0;
 587                for (i = 0; i < max; i++) {
 588
 589                        if (mb_test_bit(i, buddy)) {
 590                                /* only single bit in buddy2 may be 1 */
 591                                if (!mb_test_bit(i << 1, buddy2)) {
 592                                        MB_CHECK_ASSERT(
 593                                                mb_test_bit((i<<1)+1, buddy2));
 594                                } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
 595                                        MB_CHECK_ASSERT(
 596                                                mb_test_bit(i << 1, buddy2));
 597                                }
 598                                continue;
 599                        }
 600
 601                        /* both bits in buddy2 must be 1 */
 602                        MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
 603                        MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
 604
 605                        for (j = 0; j < (1 << order); j++) {
 606                                k = (i * (1 << order)) + j;
 607                                MB_CHECK_ASSERT(
 608                                        !mb_test_bit(k, e4b->bd_bitmap));
 609                        }
 610                        count++;
 611                }
 612                MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
 613                order--;
 614        }
 615
 616        fstart = -1;
 617        buddy = mb_find_buddy(e4b, 0, &max);
 618        for (i = 0; i < max; i++) {
 619                if (!mb_test_bit(i, buddy)) {
 620                        MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
 621                        if (fstart == -1) {
 622                                fragments++;
 623                                fstart = i;
 624                        }
 625                        continue;
 626                }
 627                fstart = -1;
 628                /* check used bits only */
 629                for (j = 0; j < e4b->bd_blkbits + 1; j++) {
 630                        buddy2 = mb_find_buddy(e4b, j, &max2);
 631                        k = i >> j;
 632                        MB_CHECK_ASSERT(k < max2);
 633                        MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
 634                }
 635        }
 636        MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
 637        MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
 638
 639        grp = ext4_get_group_info(sb, e4b->bd_group);
 640        list_for_each(cur, &grp->bb_prealloc_list) {
 641                ext4_group_t groupnr;
 642                struct ext4_prealloc_space *pa;
 643                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
 644                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
 645                MB_CHECK_ASSERT(groupnr == e4b->bd_group);
 646                for (i = 0; i < pa->pa_len; i++)
 647                        MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
 648        }
 649        return 0;
 650}
 651#undef MB_CHECK_ASSERT
 652#define mb_check_buddy(e4b) __mb_check_buddy(e4b,       \
 653                                        __FILE__, __func__, __LINE__)
 654#else
 655#define mb_check_buddy(e4b)
 656#endif
 657
 658/*
 659 * Divide blocks started from @first with length @len into
 660 * smaller chunks with power of 2 blocks.
 661 * Clear the bits in bitmap which the blocks of the chunk(s) covered,
 662 * then increase bb_counters[] for corresponded chunk size.
 663 */
 664static void ext4_mb_mark_free_simple(struct super_block *sb,
 665                                void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
 666                                        struct ext4_group_info *grp)
 667{
 668        struct ext4_sb_info *sbi = EXT4_SB(sb);
 669        ext4_grpblk_t min;
 670        ext4_grpblk_t max;
 671        ext4_grpblk_t chunk;
 672        unsigned int border;
 673
 674        BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
 675
 676        border = 2 << sb->s_blocksize_bits;
 677
 678        while (len > 0) {
 679                /* find how many blocks can be covered since this position */
 680                max = ffs(first | border) - 1;
 681
 682                /* find how many blocks of power 2 we need to mark */
 683                min = fls(len) - 1;
 684
 685                if (max < min)
 686                        min = max;
 687                chunk = 1 << min;
 688
 689                /* mark multiblock chunks only */
 690                grp->bb_counters[min]++;
 691                if (min > 0)
 692                        mb_clear_bit(first >> min,
 693                                     buddy + sbi->s_mb_offsets[min]);
 694
 695                len -= chunk;
 696                first += chunk;
 697        }
 698}
 699
 700/*
 701 * Cache the order of the largest free extent we have available in this block
 702 * group.
 703 */
 704static void
 705mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
 706{
 707        int i;
 708        int bits;
 709
 710        grp->bb_largest_free_order = -1; /* uninit */
 711
 712        bits = sb->s_blocksize_bits + 1;
 713        for (i = bits; i >= 0; i--) {
 714                if (grp->bb_counters[i] > 0) {
 715                        grp->bb_largest_free_order = i;
 716                        break;
 717                }
 718        }
 719}
 720
 721static noinline_for_stack
 722void ext4_mb_generate_buddy(struct super_block *sb,
 723                                void *buddy, void *bitmap, ext4_group_t group)
 724{
 725        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 726        struct ext4_sb_info *sbi = EXT4_SB(sb);
 727        ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
 728        ext4_grpblk_t i = 0;
 729        ext4_grpblk_t first;
 730        ext4_grpblk_t len;
 731        unsigned free = 0;
 732        unsigned fragments = 0;
 733        unsigned long long period = get_cycles();
 734
 735        /* initialize buddy from bitmap which is aggregation
 736         * of on-disk bitmap and preallocations */
 737        i = mb_find_next_zero_bit(bitmap, max, 0);
 738        grp->bb_first_free = i;
 739        while (i < max) {
 740                fragments++;
 741                first = i;
 742                i = mb_find_next_bit(bitmap, max, i);
 743                len = i - first;
 744                free += len;
 745                if (len > 1)
 746                        ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
 747                else
 748                        grp->bb_counters[0]++;
 749                if (i < max)
 750                        i = mb_find_next_zero_bit(bitmap, max, i);
 751        }
 752        grp->bb_fragments = fragments;
 753
 754        if (free != grp->bb_free) {
 755                ext4_grp_locked_error(sb, group, 0, 0,
 756                                      "block bitmap and bg descriptor "
 757                                      "inconsistent: %u vs %u free clusters",
 758                                      free, grp->bb_free);
 759                /*
 760                 * If we intend to continue, we consider group descriptor
 761                 * corrupt and update bb_free using bitmap value
 762                 */
 763                grp->bb_free = free;
 764                if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
 765                        percpu_counter_sub(&sbi->s_freeclusters_counter,
 766                                           grp->bb_free);
 767                set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
 768        }
 769        mb_set_largest_free_order(sb, grp);
 770
 771        clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
 772
 773        period = get_cycles() - period;
 774        spin_lock(&EXT4_SB(sb)->s_bal_lock);
 775        EXT4_SB(sb)->s_mb_buddies_generated++;
 776        EXT4_SB(sb)->s_mb_generation_time += period;
 777        spin_unlock(&EXT4_SB(sb)->s_bal_lock);
 778}
 779
 780static void mb_regenerate_buddy(struct ext4_buddy *e4b)
 781{
 782        int count;
 783        int order = 1;
 784        void *buddy;
 785
 786        while ((buddy = mb_find_buddy(e4b, order++, &count))) {
 787                ext4_set_bits(buddy, 0, count);
 788        }
 789        e4b->bd_info->bb_fragments = 0;
 790        memset(e4b->bd_info->bb_counters, 0,
 791                sizeof(*e4b->bd_info->bb_counters) *
 792                (e4b->bd_sb->s_blocksize_bits + 2));
 793
 794        ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
 795                e4b->bd_bitmap, e4b->bd_group);
 796}
 797
 798/* The buddy information is attached the buddy cache inode
 799 * for convenience. The information regarding each group
 800 * is loaded via ext4_mb_load_buddy. The information involve
 801 * block bitmap and buddy information. The information are
 802 * stored in the inode as
 803 *
 804 * {                        page                        }
 805 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 806 *
 807 *
 808 * one block each for bitmap and buddy information.
 809 * So for each group we take up 2 blocks. A page can
 810 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
 811 * So it can have information regarding groups_per_page which
 812 * is blocks_per_page/2
 813 *
 814 * Locking note:  This routine takes the block group lock of all groups
 815 * for this page; do not hold this lock when calling this routine!
 816 */
 817
 818static int ext4_mb_init_cache(struct page *page, char *incore)
 819{
 820        ext4_group_t ngroups;
 821        int blocksize;
 822        int blocks_per_page;
 823        int groups_per_page;
 824        int err = 0;
 825        int i;
 826        ext4_group_t first_group, group;
 827        int first_block;
 828        struct super_block *sb;
 829        struct buffer_head *bhs;
 830        struct buffer_head **bh = NULL;
 831        struct inode *inode;
 832        char *data;
 833        char *bitmap;
 834        struct ext4_group_info *grinfo;
 835
 836        mb_debug(1, "init page %lu\n", page->index);
 837
 838        inode = page->mapping->host;
 839        sb = inode->i_sb;
 840        ngroups = ext4_get_groups_count(sb);
 841        blocksize = 1 << inode->i_blkbits;
 842        blocks_per_page = PAGE_CACHE_SIZE / blocksize;
 843
 844        groups_per_page = blocks_per_page >> 1;
 845        if (groups_per_page == 0)
 846                groups_per_page = 1;
 847
 848        /* allocate buffer_heads to read bitmaps */
 849        if (groups_per_page > 1) {
 850                i = sizeof(struct buffer_head *) * groups_per_page;
 851                bh = kzalloc(i, GFP_NOFS);
 852                if (bh == NULL) {
 853                        err = -ENOMEM;
 854                        goto out;
 855                }
 856        } else
 857                bh = &bhs;
 858
 859        first_group = page->index * blocks_per_page / 2;
 860
 861        /* read all groups the page covers into the cache */
 862        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
 863                if (group >= ngroups)
 864                        break;
 865
 866                grinfo = ext4_get_group_info(sb, group);
 867                /*
 868                 * If page is uptodate then we came here after online resize
 869                 * which added some new uninitialized group info structs, so
 870                 * we must skip all initialized uptodate buddies on the page,
 871                 * which may be currently in use by an allocating task.
 872                 */
 873                if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
 874                        bh[i] = NULL;
 875                        continue;
 876                }
 877                if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
 878                        err = -ENOMEM;
 879                        goto out;
 880                }
 881                mb_debug(1, "read bitmap for group %u\n", group);
 882        }
 883
 884        /* wait for I/O completion */
 885        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
 886                if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i]))
 887                        err = -EIO;
 888        }
 889
 890        first_block = page->index * blocks_per_page;
 891        for (i = 0; i < blocks_per_page; i++) {
 892                group = (first_block + i) >> 1;
 893                if (group >= ngroups)
 894                        break;
 895
 896                if (!bh[group - first_group])
 897                        /* skip initialized uptodate buddy */
 898                        continue;
 899
 900                if (!buffer_verified(bh[group - first_group]))
 901                        /* Skip faulty bitmaps */
 902                        continue;
 903                err = 0;
 904
 905                /*
 906                 * data carry information regarding this
 907                 * particular group in the format specified
 908                 * above
 909                 *
 910                 */
 911                data = page_address(page) + (i * blocksize);
 912                bitmap = bh[group - first_group]->b_data;
 913
 914                /*
 915                 * We place the buddy block and bitmap block
 916                 * close together
 917                 */
 918                if ((first_block + i) & 1) {
 919                        /* this is block of buddy */
 920                        BUG_ON(incore == NULL);
 921                        mb_debug(1, "put buddy for group %u in page %lu/%x\n",
 922                                group, page->index, i * blocksize);
 923                        trace_ext4_mb_buddy_bitmap_load(sb, group);
 924                        grinfo = ext4_get_group_info(sb, group);
 925                        grinfo->bb_fragments = 0;
 926                        memset(grinfo->bb_counters, 0,
 927                               sizeof(*grinfo->bb_counters) *
 928                                (sb->s_blocksize_bits+2));
 929                        /*
 930                         * incore got set to the group block bitmap below
 931                         */
 932                        ext4_lock_group(sb, group);
 933                        /* init the buddy */
 934                        memset(data, 0xff, blocksize);
 935                        ext4_mb_generate_buddy(sb, data, incore, group);
 936                        ext4_unlock_group(sb, group);
 937                        incore = NULL;
 938                } else {
 939                        /* this is block of bitmap */
 940                        BUG_ON(incore != NULL);
 941                        mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
 942                                group, page->index, i * blocksize);
 943                        trace_ext4_mb_bitmap_load(sb, group);
 944
 945                        /* see comments in ext4_mb_put_pa() */
 946                        ext4_lock_group(sb, group);
 947                        memcpy(data, bitmap, blocksize);
 948
 949                        /* mark all preallocated blks used in in-core bitmap */
 950                        ext4_mb_generate_from_pa(sb, data, group);
 951                        ext4_mb_generate_from_freelist(sb, data, group);
 952                        ext4_unlock_group(sb, group);
 953
 954                        /* set incore so that the buddy information can be
 955                         * generated using this
 956                         */
 957                        incore = data;
 958                }
 959        }
 960        SetPageUptodate(page);
 961
 962out:
 963        if (bh) {
 964                for (i = 0; i < groups_per_page; i++)
 965                        brelse(bh[i]);
 966                if (bh != &bhs)
 967                        kfree(bh);
 968        }
 969        return err;
 970}
 971
 972/*
 973 * Lock the buddy and bitmap pages. This make sure other parallel init_group
 974 * on the same buddy page doesn't happen whild holding the buddy page lock.
 975 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
 976 * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
 977 */
 978static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
 979                ext4_group_t group, struct ext4_buddy *e4b)
 980{
 981        struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
 982        int block, pnum, poff;
 983        int blocks_per_page;
 984        struct page *page;
 985
 986        e4b->bd_buddy_page = NULL;
 987        e4b->bd_bitmap_page = NULL;
 988
 989        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 990        /*
 991         * the buddy cache inode stores the block bitmap
 992         * and buddy information in consecutive blocks.
 993         * So for each group we need two blocks.
 994         */
 995        block = group * 2;
 996        pnum = block / blocks_per_page;
 997        poff = block % blocks_per_page;
 998        page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
 999        if (!page)
1000                return -ENOMEM;
1001        BUG_ON(page->mapping != inode->i_mapping);
1002        e4b->bd_bitmap_page = page;
1003        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1004
1005        if (blocks_per_page >= 2) {
1006                /* buddy and bitmap are on the same page */
1007                return 0;
1008        }
1009
1010        block++;
1011        pnum = block / blocks_per_page;
1012        page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1013        if (!page)
1014                return -ENOMEM;
1015        BUG_ON(page->mapping != inode->i_mapping);
1016        e4b->bd_buddy_page = page;
1017        return 0;
1018}
1019
1020static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1021{
1022        if (e4b->bd_bitmap_page) {
1023                unlock_page(e4b->bd_bitmap_page);
1024                page_cache_release(e4b->bd_bitmap_page);
1025        }
1026        if (e4b->bd_buddy_page) {
1027                unlock_page(e4b->bd_buddy_page);
1028                page_cache_release(e4b->bd_buddy_page);
1029        }
1030}
1031
1032/*
1033 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1034 * block group lock of all groups for this page; do not hold the BG lock when
1035 * calling this routine!
1036 */
1037static noinline_for_stack
1038int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1039{
1040
1041        struct ext4_group_info *this_grp;
1042        struct ext4_buddy e4b;
1043        struct page *page;
1044        int ret = 0;
1045
1046        might_sleep();
1047        mb_debug(1, "init group %u\n", group);
1048        this_grp = ext4_get_group_info(sb, group);
1049        /*
1050         * This ensures that we don't reinit the buddy cache
1051         * page which map to the group from which we are already
1052         * allocating. If we are looking at the buddy cache we would
1053         * have taken a reference using ext4_mb_load_buddy and that
1054         * would have pinned buddy page to page cache.
1055         */
1056        ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
1057        if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1058                /*
1059                 * somebody initialized the group
1060                 * return without doing anything
1061                 */
1062                goto err;
1063        }
1064
1065        page = e4b.bd_bitmap_page;
1066        ret = ext4_mb_init_cache(page, NULL);
1067        if (ret)
1068                goto err;
1069        if (!PageUptodate(page)) {
1070                ret = -EIO;
1071                goto err;
1072        }
1073        mark_page_accessed(page);
1074
1075        if (e4b.bd_buddy_page == NULL) {
1076                /*
1077                 * If both the bitmap and buddy are in
1078                 * the same page we don't need to force
1079                 * init the buddy
1080                 */
1081                ret = 0;
1082                goto err;
1083        }
1084        /* init buddy cache */
1085        page = e4b.bd_buddy_page;
1086        ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
1087        if (ret)
1088                goto err;
1089        if (!PageUptodate(page)) {
1090                ret = -EIO;
1091                goto err;
1092        }
1093        mark_page_accessed(page);
1094err:
1095        ext4_mb_put_buddy_page_lock(&e4b);
1096        return ret;
1097}
1098
1099/*
1100 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1101 * block group lock of all groups for this page; do not hold the BG lock when
1102 * calling this routine!
1103 */
1104static noinline_for_stack int
1105ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1106                                        struct ext4_buddy *e4b)
1107{
1108        int blocks_per_page;
1109        int block;
1110        int pnum;
1111        int poff;
1112        struct page *page;
1113        int ret;
1114        struct ext4_group_info *grp;
1115        struct ext4_sb_info *sbi = EXT4_SB(sb);
1116        struct inode *inode = sbi->s_buddy_cache;
1117
1118        might_sleep();
1119        mb_debug(1, "load group %u\n", group);
1120
1121        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1122        grp = ext4_get_group_info(sb, group);
1123
1124        e4b->bd_blkbits = sb->s_blocksize_bits;
1125        e4b->bd_info = grp;
1126        e4b->bd_sb = sb;
1127        e4b->bd_group = group;
1128        e4b->bd_buddy_page = NULL;
1129        e4b->bd_bitmap_page = NULL;
1130
1131        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1132                /*
1133                 * we need full data about the group
1134                 * to make a good selection
1135                 */
1136                ret = ext4_mb_init_group(sb, group);
1137                if (ret)
1138                        return ret;
1139        }
1140
1141        /*
1142         * the buddy cache inode stores the block bitmap
1143         * and buddy information in consecutive blocks.
1144         * So for each group we need two blocks.
1145         */
1146        block = group * 2;
1147        pnum = block / blocks_per_page;
1148        poff = block % blocks_per_page;
1149
1150        /* we could use find_or_create_page(), but it locks page
1151         * what we'd like to avoid in fast path ... */
1152        page = find_get_page(inode->i_mapping, pnum);
1153        if (page == NULL || !PageUptodate(page)) {
1154                if (page)
1155                        /*
1156                         * drop the page reference and try
1157                         * to get the page with lock. If we
1158                         * are not uptodate that implies
1159                         * somebody just created the page but
1160                         * is yet to initialize the same. So
1161                         * wait for it to initialize.
1162                         */
1163                        page_cache_release(page);
1164                page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1165                if (page) {
1166                        BUG_ON(page->mapping != inode->i_mapping);
1167                        if (!PageUptodate(page)) {
1168                                ret = ext4_mb_init_cache(page, NULL);
1169                                if (ret) {
1170                                        unlock_page(page);
1171                                        goto err;
1172                                }
1173                                mb_cmp_bitmaps(e4b, page_address(page) +
1174                                               (poff * sb->s_blocksize));
1175                        }
1176                        unlock_page(page);
1177                }
1178        }
1179        if (page == NULL) {
1180                ret = -ENOMEM;
1181                goto err;
1182        }
1183        if (!PageUptodate(page)) {
1184                ret = -EIO;
1185                goto err;
1186        }
1187        e4b->bd_bitmap_page = page;
1188        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1189        mark_page_accessed(page);
1190
1191        block++;
1192        pnum = block / blocks_per_page;
1193        poff = block % blocks_per_page;
1194
1195        page = find_get_page(inode->i_mapping, pnum);
1196        if (page == NULL || !PageUptodate(page)) {
1197                if (page)
1198                        page_cache_release(page);
1199                page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1200                if (page) {
1201                        BUG_ON(page->mapping != inode->i_mapping);
1202                        if (!PageUptodate(page)) {
1203                                ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
1204                                if (ret) {
1205                                        unlock_page(page);
1206                                        goto err;
1207                                }
1208                        }
1209                        unlock_page(page);
1210                }
1211        }
1212        if (page == NULL) {
1213                ret = -ENOMEM;
1214                goto err;
1215        }
1216        if (!PageUptodate(page)) {
1217                ret = -EIO;
1218                goto err;
1219        }
1220        e4b->bd_buddy_page = page;
1221        e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1222        mark_page_accessed(page);
1223
1224        BUG_ON(e4b->bd_bitmap_page == NULL);
1225        BUG_ON(e4b->bd_buddy_page == NULL);
1226
1227        return 0;
1228
1229err:
1230        if (page)
1231                page_cache_release(page);
1232        if (e4b->bd_bitmap_page)
1233                page_cache_release(e4b->bd_bitmap_page);
1234        if (e4b->bd_buddy_page)
1235                page_cache_release(e4b->bd_buddy_page);
1236        e4b->bd_buddy = NULL;
1237        e4b->bd_bitmap = NULL;
1238        return ret;
1239}
1240
1241static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1242{
1243        if (e4b->bd_bitmap_page)
1244                page_cache_release(e4b->bd_bitmap_page);
1245        if (e4b->bd_buddy_page)
1246                page_cache_release(e4b->bd_buddy_page);
1247}
1248
1249
1250static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1251{
1252        int order = 1;
1253        int bb_incr = 1 << (e4b->bd_blkbits - 1);
1254        void *bb;
1255
1256        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1257        BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1258
1259        bb = e4b->bd_buddy;
1260        while (order <= e4b->bd_blkbits + 1) {
1261                block = block >> 1;
1262                if (!mb_test_bit(block, bb)) {
1263                        /* this block is part of buddy of order 'order' */
1264                        return order;
1265                }
1266                bb += bb_incr;
1267                bb_incr >>= 1;
1268                order++;
1269        }
1270        return 0;
1271}
1272
1273static void mb_clear_bits(void *bm, int cur, int len)
1274{
1275        __u32 *addr;
1276
1277        len = cur + len;
1278        while (cur < len) {
1279                if ((cur & 31) == 0 && (len - cur) >= 32) {
1280                        /* fast path: clear whole word at once */
1281                        addr = bm + (cur >> 3);
1282                        *addr = 0;
1283                        cur += 32;
1284                        continue;
1285                }
1286                mb_clear_bit(cur, bm);
1287                cur++;
1288        }
1289}
1290
1291/* clear bits in given range
1292 * will return first found zero bit if any, -1 otherwise
1293 */
1294static int mb_test_and_clear_bits(void *bm, int cur, int len)
1295{
1296        __u32 *addr;
1297        int zero_bit = -1;
1298
1299        len = cur + len;
1300        while (cur < len) {
1301                if ((cur & 31) == 0 && (len - cur) >= 32) {
1302                        /* fast path: clear whole word at once */
1303                        addr = bm + (cur >> 3);
1304                        if (*addr != (__u32)(-1) && zero_bit == -1)
1305                                zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1306                        *addr = 0;
1307                        cur += 32;
1308                        continue;
1309                }
1310                if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1311                        zero_bit = cur;
1312                cur++;
1313        }
1314
1315        return zero_bit;
1316}
1317
1318void ext4_set_bits(void *bm, int cur, int len)
1319{
1320        __u32 *addr;
1321
1322        len = cur + len;
1323        while (cur < len) {
1324                if ((cur & 31) == 0 && (len - cur) >= 32) {
1325                        /* fast path: set whole word at once */
1326                        addr = bm + (cur >> 3);
1327                        *addr = 0xffffffff;
1328                        cur += 32;
1329                        continue;
1330                }
1331                mb_set_bit(cur, bm);
1332                cur++;
1333        }
1334}
1335
1336/*
1337 * _________________________________________________________________ */
1338
1339static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
1340{
1341        if (mb_test_bit(*bit + side, bitmap)) {
1342                mb_clear_bit(*bit, bitmap);
1343                (*bit) -= side;
1344                return 1;
1345        }
1346        else {
1347                (*bit) += side;
1348                mb_set_bit(*bit, bitmap);
1349                return -1;
1350        }
1351}
1352
1353static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1354{
1355        int max;
1356        int order = 1;
1357        void *buddy = mb_find_buddy(e4b, order, &max);
1358
1359        while (buddy) {
1360                void *buddy2;
1361
1362                /* Bits in range [first; last] are known to be set since
1363                 * corresponding blocks were allocated. Bits in range
1364                 * (first; last) will stay set because they form buddies on
1365                 * upper layer. We just deal with borders if they don't
1366                 * align with upper layer and then go up.
1367                 * Releasing entire group is all about clearing
1368                 * single bit of highest order buddy.
1369                 */
1370
1371                /* Example:
1372                 * ---------------------------------
1373                 * |   1   |   1   |   1   |   1   |
1374                 * ---------------------------------
1375                 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1376                 * ---------------------------------
1377                 *   0   1   2   3   4   5   6   7
1378                 *      \_____________________/
1379                 *
1380                 * Neither [1] nor [6] is aligned to above layer.
1381                 * Left neighbour [0] is free, so mark it busy,
1382                 * decrease bb_counters and extend range to
1383                 * [0; 6]
1384                 * Right neighbour [7] is busy. It can't be coaleasced with [6], so
1385                 * mark [6] free, increase bb_counters and shrink range to
1386                 * [0; 5].
1387                 * Then shift range to [0; 2], go up and do the same.
1388                 */
1389
1390
1391                if (first & 1)
1392                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1393                if (!(last & 1))
1394                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1395                if (first > last)
1396                        break;
1397                order++;
1398
1399                if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
1400                        mb_clear_bits(buddy, first, last - first + 1);
1401                        e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1402                        break;
1403                }
1404                first >>= 1;
1405                last >>= 1;
1406                buddy = buddy2;
1407        }
1408}
1409
1410static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1411                           int first, int count)
1412{
1413        int left_is_free = 0;
1414        int right_is_free = 0;
1415        int block;
1416        int last = first + count - 1;
1417        struct super_block *sb = e4b->bd_sb;
1418
1419        if (WARN_ON(count == 0))
1420                return;
1421        BUG_ON(last >= (sb->s_blocksize << 3));
1422        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1423        /* Don't bother if the block group is corrupt. */
1424        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1425                return;
1426
1427        mb_check_buddy(e4b);
1428        mb_free_blocks_double(inode, e4b, first, count);
1429
1430        e4b->bd_info->bb_free += count;
1431        if (first < e4b->bd_info->bb_first_free)
1432                e4b->bd_info->bb_first_free = first;
1433
1434        /* access memory sequentially: check left neighbour,
1435         * clear range and then check right neighbour
1436         */
1437        if (first != 0)
1438                left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1439        block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1440        if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1441                right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1442
1443        if (unlikely(block != -1)) {
1444                struct ext4_sb_info *sbi = EXT4_SB(sb);
1445                ext4_fsblk_t blocknr;
1446
1447                blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1448                blocknr += EXT4_C2B(EXT4_SB(sb), block);
1449                ext4_grp_locked_error(sb, e4b->bd_group,
1450                                      inode ? inode->i_ino : 0,
1451                                      blocknr,
1452                                      "freeing already freed block "
1453                                      "(bit %u); block bitmap corrupt.",
1454                                      block);
1455                if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
1456                        percpu_counter_sub(&sbi->s_freeclusters_counter,
1457                                           e4b->bd_info->bb_free);
1458                /* Mark the block group as corrupt. */
1459                set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
1460                        &e4b->bd_info->bb_state);
1461                mb_regenerate_buddy(e4b);
1462                goto done;
1463        }
1464
1465        /* let's maintain fragments counter */
1466        if (left_is_free && right_is_free)
1467                e4b->bd_info->bb_fragments--;
1468        else if (!left_is_free && !right_is_free)
1469                e4b->bd_info->bb_fragments++;
1470
1471        /* buddy[0] == bd_bitmap is a special case, so handle
1472         * it right away and let mb_buddy_mark_free stay free of
1473         * zero order checks.
1474         * Check if neighbours are to be coaleasced,
1475         * adjust bitmap bb_counters and borders appropriately.
1476         */
1477        if (first & 1) {
1478                first += !left_is_free;
1479                e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1480        }
1481        if (!(last & 1)) {
1482                last -= !right_is_free;
1483                e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1484        }
1485
1486        if (first <= last)
1487                mb_buddy_mark_free(e4b, first >> 1, last >> 1);
1488
1489done:
1490        mb_set_largest_free_order(sb, e4b->bd_info);
1491        mb_check_buddy(e4b);
1492}
1493
1494static int mb_find_extent(struct ext4_buddy *e4b, int block,
1495                                int needed, struct ext4_free_extent *ex)
1496{
1497        int next = block;
1498        int max, order;
1499        void *buddy;
1500
1501        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1502        BUG_ON(ex == NULL);
1503
1504        buddy = mb_find_buddy(e4b, 0, &max);
1505        BUG_ON(buddy == NULL);
1506        BUG_ON(block >= max);
1507        if (mb_test_bit(block, buddy)) {
1508                ex->fe_len = 0;
1509                ex->fe_start = 0;
1510                ex->fe_group = 0;
1511                return 0;
1512        }
1513
1514        /* find actual order */
1515        order = mb_find_order_for_block(e4b, block);
1516        block = block >> order;
1517
1518        ex->fe_len = 1 << order;
1519        ex->fe_start = block << order;
1520        ex->fe_group = e4b->bd_group;
1521
1522        /* calc difference from given start */
1523        next = next - ex->fe_start;
1524        ex->fe_len -= next;
1525        ex->fe_start += next;
1526
1527        while (needed > ex->fe_len &&
1528               mb_find_buddy(e4b, order, &max)) {
1529
1530                if (block + 1 >= max)
1531                        break;
1532
1533                next = (block + 1) * (1 << order);
1534                if (mb_test_bit(next, e4b->bd_bitmap))
1535                        break;
1536
1537                order = mb_find_order_for_block(e4b, next);
1538
1539                block = next >> order;
1540                ex->fe_len += 1 << order;
1541        }
1542
1543        BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
1544        return ex->fe_len;
1545}
1546
1547static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1548{
1549        int ord;
1550        int mlen = 0;
1551        int max = 0;
1552        int cur;
1553        int start = ex->fe_start;
1554        int len = ex->fe_len;
1555        unsigned ret = 0;
1556        int len0 = len;
1557        void *buddy;
1558
1559        BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1560        BUG_ON(e4b->bd_group != ex->fe_group);
1561        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1562        mb_check_buddy(e4b);
1563        mb_mark_used_double(e4b, start, len);
1564
1565        e4b->bd_info->bb_free -= len;
1566        if (e4b->bd_info->bb_first_free == start)
1567                e4b->bd_info->bb_first_free += len;
1568
1569        /* let's maintain fragments counter */
1570        if (start != 0)
1571                mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
1572        if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1573                max = !mb_test_bit(start + len, e4b->bd_bitmap);
1574        if (mlen && max)
1575                e4b->bd_info->bb_fragments++;
1576        else if (!mlen && !max)
1577                e4b->bd_info->bb_fragments--;
1578
1579        /* let's maintain buddy itself */
1580        while (len) {
1581                ord = mb_find_order_for_block(e4b, start);
1582
1583                if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1584                        /* the whole chunk may be allocated at once! */
1585                        mlen = 1 << ord;
1586                        buddy = mb_find_buddy(e4b, ord, &max);
1587                        BUG_ON((start >> ord) >= max);
1588                        mb_set_bit(start >> ord, buddy);
1589                        e4b->bd_info->bb_counters[ord]--;
1590                        start += mlen;
1591                        len -= mlen;
1592                        BUG_ON(len < 0);
1593                        continue;
1594                }
1595
1596                /* store for history */
1597                if (ret == 0)
1598                        ret = len | (ord << 16);
1599
1600                /* we have to split large buddy */
1601                BUG_ON(ord <= 0);
1602                buddy = mb_find_buddy(e4b, ord, &max);
1603                mb_set_bit(start >> ord, buddy);
1604                e4b->bd_info->bb_counters[ord]--;
1605
1606                ord--;
1607                cur = (start >> ord) & ~1U;
1608                buddy = mb_find_buddy(e4b, ord, &max);
1609                mb_clear_bit(cur, buddy);
1610                mb_clear_bit(cur + 1, buddy);
1611                e4b->bd_info->bb_counters[ord]++;
1612                e4b->bd_info->bb_counters[ord]++;
1613        }
1614        mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1615
1616        ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
1617        mb_check_buddy(e4b);
1618
1619        return ret;
1620}
1621
1622/*
1623 * Must be called under group lock!
1624 */
1625static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1626                                        struct ext4_buddy *e4b)
1627{
1628        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1629        int ret;
1630
1631        BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
1632        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1633
1634        ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
1635        ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
1636        ret = mb_mark_used(e4b, &ac->ac_b_ex);
1637
1638        /* preallocation can change ac_b_ex, thus we store actually
1639         * allocated blocks for history */
1640        ac->ac_f_ex = ac->ac_b_ex;
1641
1642        ac->ac_status = AC_STATUS_FOUND;
1643        ac->ac_tail = ret & 0xffff;
1644        ac->ac_buddy = ret >> 16;
1645
1646        /*
1647         * take the page reference. We want the page to be pinned
1648         * so that we don't get a ext4_mb_init_cache_call for this
1649         * group until we update the bitmap. That would mean we
1650         * double allocate blocks. The reference is dropped
1651         * in ext4_mb_release_context
1652         */
1653        ac->ac_bitmap_page = e4b->bd_bitmap_page;
1654        get_page(ac->ac_bitmap_page);
1655        ac->ac_buddy_page = e4b->bd_buddy_page;
1656        get_page(ac->ac_buddy_page);
1657        /* store last allocated for subsequent stream allocation */
1658        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1659                spin_lock(&sbi->s_md_lock);
1660                sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1661                sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
1662                spin_unlock(&sbi->s_md_lock);
1663        }
1664}
1665
1666/*
1667 * regular allocator, for general purposes allocation
1668 */
1669
1670static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1671                                        struct ext4_buddy *e4b,
1672                                        int finish_group)
1673{
1674        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1675        struct ext4_free_extent *bex = &ac->ac_b_ex;
1676        struct ext4_free_extent *gex = &ac->ac_g_ex;
1677        struct ext4_free_extent ex;
1678        int max;
1679
1680        if (ac->ac_status == AC_STATUS_FOUND)
1681                return;
1682        /*
1683         * We don't want to scan for a whole year
1684         */
1685        if (ac->ac_found > sbi->s_mb_max_to_scan &&
1686                        !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1687                ac->ac_status = AC_STATUS_BREAK;
1688                return;
1689        }
1690
1691        /*
1692         * Haven't found good chunk so far, let's continue
1693         */
1694        if (bex->fe_len < gex->fe_len)
1695                return;
1696
1697        if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
1698                        && bex->fe_group == e4b->bd_group) {
1699                /* recheck chunk's availability - we don't know
1700                 * when it was found (within this lock-unlock
1701                 * period or not) */
1702                max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
1703                if (max >= gex->fe_len) {
1704                        ext4_mb_use_best_found(ac, e4b);
1705                        return;
1706                }
1707        }
1708}
1709
1710/*
1711 * The routine checks whether found extent is good enough. If it is,
1712 * then the extent gets marked used and flag is set to the context
1713 * to stop scanning. Otherwise, the extent is compared with the
1714 * previous found extent and if new one is better, then it's stored
1715 * in the context. Later, the best found extent will be used, if
1716 * mballoc can't find good enough extent.
1717 *
1718 * FIXME: real allocation policy is to be designed yet!
1719 */
1720static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1721                                        struct ext4_free_extent *ex,
1722                                        struct ext4_buddy *e4b)
1723{
1724        struct ext4_free_extent *bex = &ac->ac_b_ex;
1725        struct ext4_free_extent *gex = &ac->ac_g_ex;
1726
1727        BUG_ON(ex->fe_len <= 0);
1728        BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1729        BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1730        BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1731
1732        ac->ac_found++;
1733
1734        /*
1735         * The special case - take what you catch first
1736         */
1737        if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1738                *bex = *ex;
1739                ext4_mb_use_best_found(ac, e4b);
1740                return;
1741        }
1742
1743        /*
1744         * Let's check whether the chuck is good enough
1745         */
1746        if (ex->fe_len == gex->fe_len) {
1747                *bex = *ex;
1748                ext4_mb_use_best_found(ac, e4b);
1749                return;
1750        }
1751
1752        /*
1753         * If this is first found extent, just store it in the context
1754         */
1755        if (bex->fe_len == 0) {
1756                *bex = *ex;
1757                return;
1758        }
1759
1760        /*
1761         * If new found extent is better, store it in the context
1762         */
1763        if (bex->fe_len < gex->fe_len) {
1764                /* if the request isn't satisfied, any found extent
1765                 * larger than previous best one is better */
1766                if (ex->fe_len > bex->fe_len)
1767                        *bex = *ex;
1768        } else if (ex->fe_len > gex->fe_len) {
1769                /* if the request is satisfied, then we try to find
1770                 * an extent that still satisfy the request, but is
1771                 * smaller than previous one */
1772                if (ex->fe_len < bex->fe_len)
1773                        *bex = *ex;
1774        }
1775
1776        ext4_mb_check_limits(ac, e4b, 0);
1777}
1778
1779static noinline_for_stack
1780int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1781                                        struct ext4_buddy *e4b)
1782{
1783        struct ext4_free_extent ex = ac->ac_b_ex;
1784        ext4_group_t group = ex.fe_group;
1785        int max;
1786        int err;
1787
1788        BUG_ON(ex.fe_len <= 0);
1789        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1790        if (err)
1791                return err;
1792
1793        ext4_lock_group(ac->ac_sb, group);
1794        max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
1795
1796        if (max > 0) {
1797                ac->ac_b_ex = ex;
1798                ext4_mb_use_best_found(ac, e4b);
1799        }
1800
1801        ext4_unlock_group(ac->ac_sb, group);
1802        ext4_mb_unload_buddy(e4b);
1803
1804        return 0;
1805}
1806
1807static noinline_for_stack
1808int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1809                                struct ext4_buddy *e4b)
1810{
1811        ext4_group_t group = ac->ac_g_ex.fe_group;
1812        int max;
1813        int err;
1814        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1815        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1816        struct ext4_free_extent ex;
1817
1818        if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1819                return 0;
1820        if (grp->bb_free == 0)
1821                return 0;
1822
1823        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1824        if (err)
1825                return err;
1826
1827        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
1828                ext4_mb_unload_buddy(e4b);
1829                return 0;
1830        }
1831
1832        ext4_lock_group(ac->ac_sb, group);
1833        max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
1834                             ac->ac_g_ex.fe_len, &ex);
1835        ex.fe_logical = 0xDEADFA11; /* debug value */
1836
1837        if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1838                ext4_fsblk_t start;
1839
1840                start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1841                        ex.fe_start;
1842                /* use do_div to get remainder (would be 64-bit modulo) */
1843                if (do_div(start, sbi->s_stripe) == 0) {
1844                        ac->ac_found++;
1845                        ac->ac_b_ex = ex;
1846                        ext4_mb_use_best_found(ac, e4b);
1847                }
1848        } else if (max >= ac->ac_g_ex.fe_len) {
1849                BUG_ON(ex.fe_len <= 0);
1850                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1851                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1852                ac->ac_found++;
1853                ac->ac_b_ex = ex;
1854                ext4_mb_use_best_found(ac, e4b);
1855        } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
1856                /* Sometimes, caller may want to merge even small
1857                 * number of blocks to an existing extent */
1858                BUG_ON(ex.fe_len <= 0);
1859                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1860                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1861                ac->ac_found++;
1862                ac->ac_b_ex = ex;
1863                ext4_mb_use_best_found(ac, e4b);
1864        }
1865        ext4_unlock_group(ac->ac_sb, group);
1866        ext4_mb_unload_buddy(e4b);
1867
1868        return 0;
1869}
1870
1871/*
1872 * The routine scans buddy structures (not bitmap!) from given order
1873 * to max order and tries to find big enough chunk to satisfy the req
1874 */
1875static noinline_for_stack
1876void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1877                                        struct ext4_buddy *e4b)
1878{
1879        struct super_block *sb = ac->ac_sb;
1880        struct ext4_group_info *grp = e4b->bd_info;
1881        void *buddy;
1882        int i;
1883        int k;
1884        int max;
1885
1886        BUG_ON(ac->ac_2order <= 0);
1887        for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
1888                if (grp->bb_counters[i] == 0)
1889                        continue;
1890
1891                buddy = mb_find_buddy(e4b, i, &max);
1892                BUG_ON(buddy == NULL);
1893
1894                k = mb_find_next_zero_bit(buddy, max, 0);
1895                BUG_ON(k >= max);
1896
1897                ac->ac_found++;
1898
1899                ac->ac_b_ex.fe_len = 1 << i;
1900                ac->ac_b_ex.fe_start = k << i;
1901                ac->ac_b_ex.fe_group = e4b->bd_group;
1902
1903                ext4_mb_use_best_found(ac, e4b);
1904
1905                BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1906
1907                if (EXT4_SB(sb)->s_mb_stats)
1908                        atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
1909
1910                break;
1911        }
1912}
1913
1914/*
1915 * The routine scans the group and measures all found extents.
1916 * In order to optimize scanning, caller must pass number of
1917 * free blocks in the group, so the routine can know upper limit.
1918 */
1919static noinline_for_stack
1920void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1921                                        struct ext4_buddy *e4b)
1922{
1923        struct super_block *sb = ac->ac_sb;
1924        void *bitmap = e4b->bd_bitmap;
1925        struct ext4_free_extent ex;
1926        int i;
1927        int free;
1928
1929        free = e4b->bd_info->bb_free;
1930        BUG_ON(free <= 0);
1931
1932        i = e4b->bd_info->bb_first_free;
1933
1934        while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1935                i = mb_find_next_zero_bit(bitmap,
1936                                                EXT4_CLUSTERS_PER_GROUP(sb), i);
1937                if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
1938                        /*
1939                         * IF we have corrupt bitmap, we won't find any
1940                         * free blocks even though group info says we
1941                         * we have free blocks
1942                         */
1943                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1944                                        "%d free clusters as per "
1945                                        "group info. But bitmap says 0",
1946                                        free);
1947                        break;
1948                }
1949
1950                mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
1951                BUG_ON(ex.fe_len <= 0);
1952                if (free < ex.fe_len) {
1953                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1954                                        "%d free clusters as per "
1955                                        "group info. But got %d blocks",
1956                                        free, ex.fe_len);
1957                        /*
1958                         * The number of free blocks differs. This mostly
1959                         * indicate that the bitmap is corrupt. So exit
1960                         * without claiming the space.
1961                         */
1962                        break;
1963                }
1964                ex.fe_logical = 0xDEADC0DE; /* debug value */
1965                ext4_mb_measure_extent(ac, &ex, e4b);
1966
1967                i += ex.fe_len;
1968                free -= ex.fe_len;
1969        }
1970
1971        ext4_mb_check_limits(ac, e4b, 1);
1972}
1973
1974/*
1975 * This is a special case for storages like raid5
1976 * we try to find stripe-aligned chunks for stripe-size-multiple requests
1977 */
1978static noinline_for_stack
1979void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1980                                 struct ext4_buddy *e4b)
1981{
1982        struct super_block *sb = ac->ac_sb;
1983        struct ext4_sb_info *sbi = EXT4_SB(sb);
1984        void *bitmap = e4b->bd_bitmap;
1985        struct ext4_free_extent ex;
1986        ext4_fsblk_t first_group_block;
1987        ext4_fsblk_t a;
1988        ext4_grpblk_t i;
1989        int max;
1990
1991        BUG_ON(sbi->s_stripe == 0);
1992
1993        /* find first stripe-aligned block in group */
1994        first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
1995
1996        a = first_group_block + sbi->s_stripe - 1;
1997        do_div(a, sbi->s_stripe);
1998        i = (a * sbi->s_stripe) - first_group_block;
1999
2000        while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
2001                if (!mb_test_bit(i, bitmap)) {
2002                        max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
2003                        if (max >= sbi->s_stripe) {
2004                                ac->ac_found++;
2005                                ex.fe_logical = 0xDEADF00D; /* debug value */
2006                                ac->ac_b_ex = ex;
2007                                ext4_mb_use_best_found(ac, e4b);
2008                                break;
2009                        }
2010                }
2011                i += sbi->s_stripe;
2012        }
2013}
2014
2015/*
2016 * This is now called BEFORE we load the buddy bitmap.
2017 * Returns either 1 or 0 indicating that the group is either suitable
2018 * for the allocation or not. In addition it can also return negative
2019 * error code when something goes wrong.
2020 */
2021static int ext4_mb_good_group(struct ext4_allocation_context *ac,
2022                                ext4_group_t group, int cr)
2023{
2024        unsigned free, fragments;
2025        int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
2026        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2027
2028        BUG_ON(cr < 0 || cr >= 4);
2029
2030        free = grp->bb_free;
2031        if (free == 0)
2032                return 0;
2033        if (cr <= 2 && free < ac->ac_g_ex.fe_len)
2034                return 0;
2035
2036        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2037                return 0;
2038
2039        /* We only do this if the grp has never been initialized */
2040        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2041                int ret = ext4_mb_init_group(ac->ac_sb, group);
2042                if (ret)
2043                        return ret;
2044        }
2045
2046        fragments = grp->bb_fragments;
2047        if (fragments == 0)
2048                return 0;
2049
2050        switch (cr) {
2051        case 0:
2052                BUG_ON(ac->ac_2order == 0);
2053
2054                /* Avoid using the first bg of a flexgroup for data files */
2055                if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
2056                    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
2057                    ((group % flex_size) == 0))
2058                        return 0;
2059
2060                if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
2061                    (free / fragments) >= ac->ac_g_ex.fe_len)
2062                        return 1;
2063
2064                if (grp->bb_largest_free_order < ac->ac_2order)
2065                        return 0;
2066
2067                return 1;
2068        case 1:
2069                if ((free / fragments) >= ac->ac_g_ex.fe_len)
2070                        return 1;
2071                break;
2072        case 2:
2073                if (free >= ac->ac_g_ex.fe_len)
2074                        return 1;
2075                break;
2076        case 3:
2077                return 1;
2078        default:
2079                BUG();
2080        }
2081
2082        return 0;
2083}
2084
2085static noinline_for_stack int
2086ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2087{
2088        ext4_group_t ngroups, group, i;
2089        int cr;
2090        int err = 0, first_err = 0;
2091        struct ext4_sb_info *sbi;
2092        struct super_block *sb;
2093        struct ext4_buddy e4b;
2094
2095        sb = ac->ac_sb;
2096        sbi = EXT4_SB(sb);
2097        ngroups = ext4_get_groups_count(sb);
2098        /* non-extent files are limited to low blocks/groups */
2099        if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
2100                ngroups = sbi->s_blockfile_groups;
2101
2102        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2103
2104        /* first, try the goal */
2105        err = ext4_mb_find_by_goal(ac, &e4b);
2106        if (err || ac->ac_status == AC_STATUS_FOUND)
2107                goto out;
2108
2109        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2110                goto out;
2111
2112        /*
2113         * ac->ac2_order is set only if the fe_len is a power of 2
2114         * if ac2_order is set we also set criteria to 0 so that we
2115         * try exact allocation using buddy.
2116         */
2117        i = fls(ac->ac_g_ex.fe_len);
2118        ac->ac_2order = 0;
2119        /*
2120         * We search using buddy data only if the order of the request
2121         * is greater than equal to the sbi_s_mb_order2_reqs
2122         * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
2123         */
2124        if (i >= sbi->s_mb_order2_reqs) {
2125                /*
2126                 * This should tell if fe_len is exactly power of 2
2127                 */
2128                if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2129                        ac->ac_2order = array_index_nospec(i - 1,
2130                                                           sb->s_blocksize_bits + 2);
2131        }
2132
2133        /* if stream allocation is enabled, use global goal */
2134        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2135                /* TBD: may be hot point */
2136                spin_lock(&sbi->s_md_lock);
2137                ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2138                ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2139                spin_unlock(&sbi->s_md_lock);
2140        }
2141
2142        /* Let's just scan groups to find more-less suitable blocks */
2143        cr = ac->ac_2order ? 0 : 1;
2144        /*
2145         * cr == 0 try to get exact allocation,
2146         * cr == 3  try to get anything
2147         */
2148repeat:
2149        for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2150                ac->ac_criteria = cr;
2151                /*
2152                 * searching for the right group start
2153                 * from the goal value specified
2154                 */
2155                group = ac->ac_g_ex.fe_group;
2156
2157                for (i = 0; i < ngroups; group++, i++) {
2158                        int ret = 0;
2159                        cond_resched();
2160                        /*
2161                         * Artificially restricted ngroups for non-extent
2162                         * files makes group > ngroups possible on first loop.
2163                         */
2164                        if (group >= ngroups)
2165                                group = 0;
2166
2167                        /* This now checks without needing the buddy page */
2168                        ret = ext4_mb_good_group(ac, group, cr);
2169                        if (ret <= 0) {
2170                                if (!first_err)
2171                                        first_err = ret;
2172                                continue;
2173                        }
2174
2175                        err = ext4_mb_load_buddy(sb, group, &e4b);
2176                        if (err)
2177                                goto out;
2178
2179                        ext4_lock_group(sb, group);
2180
2181                        /*
2182                         * We need to check again after locking the
2183                         * block group
2184                         */
2185                        ret = ext4_mb_good_group(ac, group, cr);
2186                        if (ret <= 0) {
2187                                ext4_unlock_group(sb, group);
2188                                ext4_mb_unload_buddy(&e4b);
2189                                if (!first_err)
2190                                        first_err = ret;
2191                                continue;
2192                        }
2193
2194                        ac->ac_groups_scanned++;
2195                        if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)
2196                                ext4_mb_simple_scan_group(ac, &e4b);
2197                        else if (cr == 1 && sbi->s_stripe &&
2198                                        !(ac->ac_g_ex.fe_len % sbi->s_stripe))
2199                                ext4_mb_scan_aligned(ac, &e4b);
2200                        else
2201                                ext4_mb_complex_scan_group(ac, &e4b);
2202
2203                        ext4_unlock_group(sb, group);
2204                        ext4_mb_unload_buddy(&e4b);
2205
2206                        if (ac->ac_status != AC_STATUS_CONTINUE)
2207                                break;
2208                }
2209        }
2210
2211        if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2212            !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2213                /*
2214                 * We've been searching too long. Let's try to allocate
2215                 * the best chunk we've found so far
2216                 */
2217
2218                ext4_mb_try_best_found(ac, &e4b);
2219                if (ac->ac_status != AC_STATUS_FOUND) {
2220                        /*
2221                         * Someone more lucky has already allocated it.
2222                         * The only thing we can do is just take first
2223                         * found block(s)
2224                        printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
2225                         */
2226                        ac->ac_b_ex.fe_group = 0;
2227                        ac->ac_b_ex.fe_start = 0;
2228                        ac->ac_b_ex.fe_len = 0;
2229                        ac->ac_status = AC_STATUS_CONTINUE;
2230                        ac->ac_flags |= EXT4_MB_HINT_FIRST;
2231                        cr = 3;
2232                        atomic_inc(&sbi->s_mb_lost_chunks);
2233                        goto repeat;
2234                }
2235        }
2236out:
2237        if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
2238                err = first_err;
2239        return err;
2240}
2241
2242static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2243{
2244        struct super_block *sb = seq->private;
2245        ext4_group_t group;
2246
2247        if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2248                return NULL;
2249        group = *pos + 1;
2250        return (void *) ((unsigned long) group);
2251}
2252
2253static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2254{
2255        struct super_block *sb = seq->private;
2256        ext4_group_t group;
2257
2258        ++*pos;
2259        if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2260                return NULL;
2261        group = *pos + 1;
2262        return (void *) ((unsigned long) group);
2263}
2264
2265static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2266{
2267        struct super_block *sb = seq->private;
2268        ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2269        int i;
2270        int err, buddy_loaded = 0;
2271        struct ext4_buddy e4b;
2272        struct ext4_group_info *grinfo;
2273        struct sg {
2274                struct ext4_group_info info;
2275                ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
2276        } sg;
2277
2278        group--;
2279        if (group == 0)
2280                seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
2281                                "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
2282                                  "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
2283                           "group", "free", "frags", "first",
2284                           "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
2285                           "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
2286
2287        i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2288                sizeof(struct ext4_group_info);
2289        grinfo = ext4_get_group_info(sb, group);
2290        /* Load the group info in memory only if not already loaded. */
2291        if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
2292                err = ext4_mb_load_buddy(sb, group, &e4b);
2293                if (err) {
2294                        seq_printf(seq, "#%-5u: I/O error\n", group);
2295                        return 0;
2296                }
2297                buddy_loaded = 1;
2298        }
2299
2300        memcpy(&sg, ext4_get_group_info(sb, group), i);
2301
2302        if (buddy_loaded)
2303                ext4_mb_unload_buddy(&e4b);
2304
2305        seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2306                        sg.info.bb_fragments, sg.info.bb_first_free);
2307        for (i = 0; i <= 13; i++)
2308                seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
2309                                sg.info.bb_counters[i] : 0);
2310        seq_printf(seq, " ]\n");
2311
2312        return 0;
2313}
2314
2315static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2316{
2317}
2318
2319static const struct seq_operations ext4_mb_seq_groups_ops = {
2320        .start  = ext4_mb_seq_groups_start,
2321        .next   = ext4_mb_seq_groups_next,
2322        .stop   = ext4_mb_seq_groups_stop,
2323        .show   = ext4_mb_seq_groups_show,
2324};
2325
2326static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2327{
2328        struct super_block *sb = PDE_DATA(inode);
2329        int rc;
2330
2331        rc = seq_open(file, &ext4_mb_seq_groups_ops);
2332        if (rc == 0) {
2333                struct seq_file *m = file->private_data;
2334                m->private = sb;
2335        }
2336        return rc;
2337
2338}
2339
2340static const struct file_operations ext4_mb_seq_groups_fops = {
2341        .owner          = THIS_MODULE,
2342        .open           = ext4_mb_seq_groups_open,
2343        .read           = seq_read,
2344        .llseek         = seq_lseek,
2345        .release        = seq_release,
2346};
2347
2348static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2349{
2350        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2351        struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2352
2353        BUG_ON(!cachep);
2354        return cachep;
2355}
2356
2357/*
2358 * Allocate the top-level s_group_info array for the specified number
2359 * of groups
2360 */
2361int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
2362{
2363        struct ext4_sb_info *sbi = EXT4_SB(sb);
2364        unsigned size;
2365        struct ext4_group_info ***new_groupinfo;
2366
2367        size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
2368                EXT4_DESC_PER_BLOCK_BITS(sb);
2369        if (size <= sbi->s_group_info_size)
2370                return 0;
2371
2372        size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
2373        new_groupinfo = kvzalloc(size, GFP_KERNEL);
2374        if (!new_groupinfo) {
2375                ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2376                return -ENOMEM;
2377        }
2378        if (sbi->s_group_info) {
2379                memcpy(new_groupinfo, sbi->s_group_info,
2380                       sbi->s_group_info_size * sizeof(*sbi->s_group_info));
2381                ext4_kvfree(sbi->s_group_info);
2382        }
2383        sbi->s_group_info = new_groupinfo;
2384        sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
2385        ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 
2386                   sbi->s_group_info_size);
2387        return 0;
2388}
2389
2390/* Create and initialize ext4_group_info data for the given group. */
2391int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2392                          struct ext4_group_desc *desc)
2393{
2394        int i;
2395        int metalen = 0;
2396        struct ext4_sb_info *sbi = EXT4_SB(sb);
2397        struct ext4_group_info **meta_group_info;
2398        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2399
2400        /*
2401         * First check if this group is the first of a reserved block.
2402         * If it's true, we have to allocate a new table of pointers
2403         * to ext4_group_info structures
2404         */
2405        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2406                metalen = sizeof(*meta_group_info) <<
2407                        EXT4_DESC_PER_BLOCK_BITS(sb);
2408                meta_group_info = kmalloc(metalen, GFP_NOFS);
2409                if (meta_group_info == NULL) {
2410                        ext4_msg(sb, KERN_ERR, "can't allocate mem "
2411                                 "for a buddy group");
2412                        goto exit_meta_group_info;
2413                }
2414                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
2415                        meta_group_info;
2416        }
2417
2418        meta_group_info =
2419                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2420        i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2421
2422        meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
2423        if (meta_group_info[i] == NULL) {
2424                ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2425                goto exit_group_info;
2426        }
2427        set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2428                &(meta_group_info[i]->bb_state));
2429
2430        /*
2431         * initialize bb_free to be able to skip
2432         * empty groups without initialization
2433         */
2434        if (ext4_has_group_desc_csum(sb) &&
2435            (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
2436                meta_group_info[i]->bb_free =
2437                        ext4_free_clusters_after_init(sb, group, desc);
2438        } else {
2439                meta_group_info[i]->bb_free =
2440                        ext4_free_group_clusters(sb, desc);
2441        }
2442
2443        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2444        init_rwsem(&meta_group_info[i]->alloc_sem);
2445        meta_group_info[i]->bb_free_root = RB_ROOT;
2446        meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
2447
2448#ifdef DOUBLE_CHECK
2449        {
2450                struct buffer_head *bh;
2451                meta_group_info[i]->bb_bitmap =
2452                        kmalloc(sb->s_blocksize, GFP_NOFS);
2453                BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2454                bh = ext4_read_block_bitmap(sb, group);
2455                BUG_ON(bh == NULL);
2456                memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2457                        sb->s_blocksize);
2458                put_bh(bh);
2459        }
2460#endif
2461
2462        return 0;
2463
2464exit_group_info:
2465        /* If a meta_group_info table has been allocated, release it now */
2466        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2467                kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
2468                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
2469        }
2470exit_meta_group_info:
2471        return -ENOMEM;
2472} /* ext4_mb_add_groupinfo */
2473
2474static int ext4_mb_init_backend(struct super_block *sb)
2475{
2476        ext4_group_t ngroups = ext4_get_groups_count(sb);
2477        ext4_group_t i;
2478        struct ext4_sb_info *sbi = EXT4_SB(sb);
2479        int err;
2480        struct ext4_group_desc *desc;
2481        struct kmem_cache *cachep;
2482
2483        err = ext4_mb_alloc_groupinfo(sb, ngroups);
2484        if (err)
2485                return err;
2486
2487        sbi->s_buddy_cache = new_inode(sb);
2488        if (sbi->s_buddy_cache == NULL) {
2489                ext4_msg(sb, KERN_ERR, "can't get new inode");
2490                goto err_freesgi;
2491        }
2492        /* To avoid potentially colliding with an valid on-disk inode number,
2493         * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
2494         * not in the inode hash, so it should never be found by iget(), but
2495         * this will avoid confusion if it ever shows up during debugging. */
2496        sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
2497        EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2498        for (i = 0; i < ngroups; i++) {
2499                desc = ext4_get_group_desc(sb, i, NULL);
2500                if (desc == NULL) {
2501                        ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
2502                        goto err_freebuddy;
2503                }
2504                if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
2505                        goto err_freebuddy;
2506        }
2507
2508        return 0;
2509
2510err_freebuddy:
2511        cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2512        while (i-- > 0)
2513                kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2514        i = sbi->s_group_info_size;
2515        while (i-- > 0)
2516                kfree(sbi->s_group_info[i]);
2517        iput(sbi->s_buddy_cache);
2518err_freesgi:
2519        ext4_kvfree(sbi->s_group_info);
2520        return -ENOMEM;
2521}
2522
2523static void ext4_groupinfo_destroy_slabs(void)
2524{
2525        int i;
2526
2527        for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2528                if (ext4_groupinfo_caches[i])
2529                        kmem_cache_destroy(ext4_groupinfo_caches[i]);
2530                ext4_groupinfo_caches[i] = NULL;
2531        }
2532}
2533
2534static int ext4_groupinfo_create_slab(size_t size)
2535{
2536        static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2537        int slab_size;
2538        int blocksize_bits = order_base_2(size);
2539        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2540        struct kmem_cache *cachep;
2541
2542        if (cache_index >= NR_GRPINFO_CACHES)
2543                return -EINVAL;
2544
2545        if (unlikely(cache_index < 0))
2546                cache_index = 0;
2547
2548        mutex_lock(&ext4_grpinfo_slab_create_mutex);
2549        if (ext4_groupinfo_caches[cache_index]) {
2550                mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2551                return 0;       /* Already created */
2552        }
2553
2554        slab_size = offsetof(struct ext4_group_info,
2555                                bb_counters[blocksize_bits + 2]);
2556
2557        cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2558                                        slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2559                                        NULL);
2560
2561        ext4_groupinfo_caches[cache_index] = cachep;
2562
2563        mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2564        if (!cachep) {
2565                printk(KERN_EMERG
2566                       "EXT4-fs: no memory for groupinfo slab cache\n");
2567                return -ENOMEM;
2568        }
2569
2570        return 0;
2571}
2572
2573int ext4_mb_init(struct super_block *sb)
2574{
2575        struct ext4_sb_info *sbi = EXT4_SB(sb);
2576        unsigned i, j;
2577        unsigned offset, offset_incr;
2578        unsigned max;
2579        int ret;
2580
2581        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2582
2583        sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2584        if (sbi->s_mb_offsets == NULL) {
2585                ret = -ENOMEM;
2586                goto out;
2587        }
2588
2589        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2590        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2591        if (sbi->s_mb_maxs == NULL) {
2592                ret = -ENOMEM;
2593                goto out;
2594        }
2595
2596        ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2597        if (ret < 0)
2598                goto out;
2599
2600        /* order 0 is regular bitmap */
2601        sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2602        sbi->s_mb_offsets[0] = 0;
2603
2604        i = 1;
2605        offset = 0;
2606        offset_incr = 1 << (sb->s_blocksize_bits - 1);
2607        max = sb->s_blocksize << 2;
2608        do {
2609                sbi->s_mb_offsets[i] = offset;
2610                sbi->s_mb_maxs[i] = max;
2611                offset += offset_incr;
2612                offset_incr = offset_incr >> 1;
2613                max = max >> 1;
2614                i++;
2615        } while (i <= sb->s_blocksize_bits + 1);
2616
2617        spin_lock_init(&sbi->s_md_lock);
2618        spin_lock_init(&sbi->s_bal_lock);
2619
2620        sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
2621        sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
2622        sbi->s_mb_stats = MB_DEFAULT_STATS;
2623        sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2624        sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2625        /*
2626         * The default group preallocation is 512, which for 4k block
2627         * sizes translates to 2 megabytes.  However for bigalloc file
2628         * systems, this is probably too big (i.e, if the cluster size
2629         * is 1 megabyte, then group preallocation size becomes half a
2630         * gigabyte!).  As a default, we will keep a two megabyte
2631         * group pralloc size for cluster sizes up to 64k, and after
2632         * that, we will force a minimum group preallocation size of
2633         * 32 clusters.  This translates to 8 megs when the cluster
2634         * size is 256k, and 32 megs when the cluster size is 1 meg,
2635         * which seems reasonable as a default.
2636         */
2637        sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
2638                                       sbi->s_cluster_bits, 32);
2639        /*
2640         * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
2641         * to the lowest multiple of s_stripe which is bigger than
2642         * the s_mb_group_prealloc as determined above. We want
2643         * the preallocation size to be an exact multiple of the
2644         * RAID stripe size so that preallocations don't fragment
2645         * the stripes.
2646         */
2647        if (sbi->s_stripe > 1) {
2648                sbi->s_mb_group_prealloc = roundup(
2649                        sbi->s_mb_group_prealloc, sbi->s_stripe);
2650        }
2651
2652        sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2653        if (sbi->s_locality_groups == NULL) {
2654                ret = -ENOMEM;
2655                goto out;
2656        }
2657        for_each_possible_cpu(i) {
2658                struct ext4_locality_group *lg;
2659                lg = per_cpu_ptr(sbi->s_locality_groups, i);
2660                mutex_init(&lg->lg_mutex);
2661                for (j = 0; j < PREALLOC_TB_SIZE; j++)
2662                        INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
2663                spin_lock_init(&lg->lg_prealloc_lock);
2664        }
2665
2666        /* init file for buddy data */
2667        ret = ext4_mb_init_backend(sb);
2668        if (ret != 0)
2669                goto out_free_locality_groups;
2670
2671        if (sbi->s_proc)
2672                proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2673                                 &ext4_mb_seq_groups_fops, sb);
2674
2675        return 0;
2676
2677out_free_locality_groups:
2678        free_percpu(sbi->s_locality_groups);
2679        sbi->s_locality_groups = NULL;
2680out:
2681        kfree(sbi->s_mb_offsets);
2682        sbi->s_mb_offsets = NULL;
2683        kfree(sbi->s_mb_maxs);
2684        sbi->s_mb_maxs = NULL;
2685        return ret;
2686}
2687
2688/* need to called with the ext4 group lock held */
2689static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2690{
2691        struct ext4_prealloc_space *pa;
2692        struct list_head *cur, *tmp;
2693        int count = 0;
2694
2695        list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
2696                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
2697                list_del(&pa->pa_group_list);
2698                count++;
2699                kmem_cache_free(ext4_pspace_cachep, pa);
2700        }
2701        if (count)
2702                mb_debug(1, "mballoc: %u PAs left\n", count);
2703
2704}
2705
2706int ext4_mb_release(struct super_block *sb)
2707{
2708        ext4_group_t ngroups = ext4_get_groups_count(sb);
2709        ext4_group_t i;
2710        int num_meta_group_infos;
2711        struct ext4_group_info *grinfo;
2712        struct ext4_sb_info *sbi = EXT4_SB(sb);
2713        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2714
2715        if (sbi->s_proc)
2716                remove_proc_entry("mb_groups", sbi->s_proc);
2717
2718        if (sbi->s_group_info) {
2719                for (i = 0; i < ngroups; i++) {
2720                        grinfo = ext4_get_group_info(sb, i);
2721#ifdef DOUBLE_CHECK
2722                        kfree(grinfo->bb_bitmap);
2723#endif
2724                        ext4_lock_group(sb, i);
2725                        ext4_mb_cleanup_pa(grinfo);
2726                        ext4_unlock_group(sb, i);
2727                        kmem_cache_free(cachep, grinfo);
2728                }
2729                num_meta_group_infos = (ngroups +
2730                                EXT4_DESC_PER_BLOCK(sb) - 1) >>
2731                        EXT4_DESC_PER_BLOCK_BITS(sb);
2732                for (i = 0; i < num_meta_group_infos; i++)
2733                        kfree(sbi->s_group_info[i]);
2734                ext4_kvfree(sbi->s_group_info);
2735        }
2736        kfree(sbi->s_mb_offsets);
2737        kfree(sbi->s_mb_maxs);
2738        iput(sbi->s_buddy_cache);
2739        if (sbi->s_mb_stats) {
2740                ext4_msg(sb, KERN_INFO,
2741                       "mballoc: %u blocks %u reqs (%u success)",
2742                                atomic_read(&sbi->s_bal_allocated),
2743                                atomic_read(&sbi->s_bal_reqs),
2744                                atomic_read(&sbi->s_bal_success));
2745                ext4_msg(sb, KERN_INFO,
2746                      "mballoc: %u extents scanned, %u goal hits, "
2747                                "%u 2^N hits, %u breaks, %u lost",
2748                                atomic_read(&sbi->s_bal_ex_scanned),
2749                                atomic_read(&sbi->s_bal_goals),
2750                                atomic_read(&sbi->s_bal_2orders),
2751                                atomic_read(&sbi->s_bal_breaks),
2752                                atomic_read(&sbi->s_mb_lost_chunks));
2753                ext4_msg(sb, KERN_INFO,
2754                       "mballoc: %lu generated and it took %Lu",
2755                                sbi->s_mb_buddies_generated,
2756                                sbi->s_mb_generation_time);
2757                ext4_msg(sb, KERN_INFO,
2758                       "mballoc: %u preallocated, %u discarded",
2759                                atomic_read(&sbi->s_mb_preallocated),
2760                                atomic_read(&sbi->s_mb_discarded));
2761        }
2762
2763        free_percpu(sbi->s_locality_groups);
2764
2765        return 0;
2766}
2767
2768static inline int ext4_issue_discard(struct super_block *sb,
2769                ext4_group_t block_group, ext4_grpblk_t cluster, int count)
2770{
2771        ext4_fsblk_t discard_block;
2772
2773        discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
2774                         ext4_group_first_block_no(sb, block_group));
2775        count = EXT4_C2B(EXT4_SB(sb), count);
2776        trace_ext4_discard_blocks(sb,
2777                        (unsigned long long) discard_block, count);
2778        return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2779}
2780
2781/*
2782 * This function is called by the jbd2 layer once the commit has finished,
2783 * so we know we can free the blocks that were released with that commit.
2784 */
2785static void ext4_free_data_callback(struct super_block *sb,
2786                                    struct ext4_journal_cb_entry *jce,
2787                                    int rc)
2788{
2789        struct ext4_free_data *entry = (struct ext4_free_data *)jce;
2790        struct ext4_buddy e4b;
2791        struct ext4_group_info *db;
2792        int err, count = 0, count2 = 0;
2793
2794        mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2795                 entry->efd_count, entry->efd_group, entry);
2796
2797        if (test_opt(sb, DISCARD)) {
2798                err = ext4_issue_discard(sb, entry->efd_group,
2799                                         entry->efd_start_cluster,
2800                                         entry->efd_count);
2801                if (err && err != -EOPNOTSUPP)
2802                        ext4_msg(sb, KERN_WARNING, "discard request in"
2803                                 " group:%d block:%d count:%d failed"
2804                                 " with %d", entry->efd_group,
2805                                 entry->efd_start_cluster,
2806                                 entry->efd_count, err);
2807        }
2808
2809        err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
2810        /* we expect to find existing buddy because it's pinned */
2811        BUG_ON(err != 0);
2812
2813
2814        db = e4b.bd_info;
2815        /* there are blocks to put in buddy to make them really free */
2816        count += entry->efd_count;
2817        count2++;
2818        ext4_lock_group(sb, entry->efd_group);
2819        /* Take it out of per group rb tree */
2820        rb_erase(&entry->efd_node, &(db->bb_free_root));
2821        mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
2822
2823        /*
2824         * Clear the trimmed flag for the group so that the next
2825         * ext4_trim_fs can trim it.
2826         * If the volume is mounted with -o discard, online discard
2827         * is supported and the free blocks will be trimmed online.
2828         */
2829        if (!test_opt(sb, DISCARD))
2830                EXT4_MB_GRP_CLEAR_TRIMMED(db);
2831
2832        if (!db->bb_free_root.rb_node) {
2833                /* No more items in the per group rb tree
2834                 * balance refcounts from ext4_mb_free_metadata()
2835                 */
2836                page_cache_release(e4b.bd_buddy_page);
2837                page_cache_release(e4b.bd_bitmap_page);
2838        }
2839        ext4_unlock_group(sb, entry->efd_group);
2840        kmem_cache_free(ext4_free_data_cachep, entry);
2841        ext4_mb_unload_buddy(&e4b);
2842
2843        mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2844}
2845
2846int __init ext4_init_mballoc(void)
2847{
2848        ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2849                                        SLAB_RECLAIM_ACCOUNT);
2850        if (ext4_pspace_cachep == NULL)
2851                return -ENOMEM;
2852
2853        ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2854                                    SLAB_RECLAIM_ACCOUNT);
2855        if (ext4_ac_cachep == NULL) {
2856                kmem_cache_destroy(ext4_pspace_cachep);
2857                return -ENOMEM;
2858        }
2859
2860        ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
2861                                           SLAB_RECLAIM_ACCOUNT);
2862        if (ext4_free_data_cachep == NULL) {
2863                kmem_cache_destroy(ext4_pspace_cachep);
2864                kmem_cache_destroy(ext4_ac_cachep);
2865                return -ENOMEM;
2866        }
2867        return 0;
2868}
2869
2870void ext4_exit_mballoc(void)
2871{
2872        /*
2873         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2874         * before destroying the slab cache.
2875         */
2876        rcu_barrier();
2877        kmem_cache_destroy(ext4_pspace_cachep);
2878        kmem_cache_destroy(ext4_ac_cachep);
2879        kmem_cache_destroy(ext4_free_data_cachep);
2880        ext4_groupinfo_destroy_slabs();
2881}
2882
2883
2884/*
2885 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
2886 * Returns 0 if success or error code
2887 */
2888static noinline_for_stack int
2889ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2890                                handle_t *handle, unsigned int reserv_clstrs)
2891{
2892        struct buffer_head *bitmap_bh = NULL;
2893        struct ext4_group_desc *gdp;
2894        struct buffer_head *gdp_bh;
2895        struct ext4_sb_info *sbi;
2896        struct super_block *sb;
2897        ext4_fsblk_t block;
2898        int err, len;
2899
2900        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
2901        BUG_ON(ac->ac_b_ex.fe_len <= 0);
2902
2903        sb = ac->ac_sb;
2904        sbi = EXT4_SB(sb);
2905
2906        err = -EIO;
2907        bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
2908        if (!bitmap_bh)
2909                goto out_err;
2910
2911        BUFFER_TRACE(bitmap_bh, "getting write access");
2912        err = ext4_journal_get_write_access(handle, bitmap_bh);
2913        if (err)
2914                goto out_err;
2915
2916        err = -EIO;
2917        gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
2918        if (!gdp)
2919                goto out_err;
2920
2921        ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2922                        ext4_free_group_clusters(sb, gdp));
2923
2924        BUFFER_TRACE(gdp_bh, "get_write_access");
2925        err = ext4_journal_get_write_access(handle, gdp_bh);
2926        if (err)
2927                goto out_err;
2928
2929        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2930
2931        len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
2932        if (!ext4_data_block_valid(sbi, block, len)) {
2933                ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2934                           "fs metadata", block, block+len);
2935                /* File system mounted not to panic on error
2936                 * Fix the bitmap and return EFSCORRUPTED
2937                 * We leak some of the blocks here.
2938                 */
2939                ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2940                ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2941                              ac->ac_b_ex.fe_len);
2942                ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2943                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2944                if (!err)
2945                        err = -EFSCORRUPTED;
2946                goto out_err;
2947        }
2948
2949        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2950#ifdef AGGRESSIVE_CHECK
2951        {
2952                int i;
2953                for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
2954                        BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
2955                                                bitmap_bh->b_data));
2956                }
2957        }
2958#endif
2959        ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2960                      ac->ac_b_ex.fe_len);
2961        if (ext4_has_group_desc_csum(sb) &&
2962            (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
2963                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2964                ext4_free_group_clusters_set(sb, gdp,
2965                                             ext4_free_clusters_after_init(sb,
2966                                                ac->ac_b_ex.fe_group, gdp));
2967        }
2968        len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
2969        ext4_free_group_clusters_set(sb, gdp, len);
2970        ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
2971        ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
2972
2973        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2974        percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
2975        /*
2976         * Now reduce the dirty block count also. Should not go negative
2977         */
2978        if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2979                /* release all the reserved blocks if non delalloc */
2980                percpu_counter_sub(&sbi->s_dirtyclusters_counter,
2981                                   reserv_clstrs);
2982
2983        if (sbi->s_log_groups_per_flex) {
2984                ext4_group_t flex_group = ext4_flex_group(sbi,
2985                                                          ac->ac_b_ex.fe_group);
2986                atomic64_sub(ac->ac_b_ex.fe_len,
2987                             &sbi->s_flex_groups[flex_group].free_clusters);
2988        }
2989
2990        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2991        if (err)
2992                goto out_err;
2993        err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
2994
2995out_err:
2996        brelse(bitmap_bh);
2997        return err;
2998}
2999
3000/*
3001 * here we normalize request for locality group
3002 * Group request are normalized to s_mb_group_prealloc, which goes to
3003 * s_strip if we set the same via mount option.
3004 * s_mb_group_prealloc can be configured via
3005 * /sys/fs/ext4/<partition>/mb_group_prealloc
3006 *
3007 * XXX: should we try to preallocate more than the group has now?
3008 */
3009static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3010{
3011        struct super_block *sb = ac->ac_sb;
3012        struct ext4_locality_group *lg = ac->ac_lg;
3013
3014        BUG_ON(lg == NULL);
3015        ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3016        mb_debug(1, "#%u: goal %u blocks for locality group\n",
3017                current->pid, ac->ac_g_ex.fe_len);
3018}
3019
3020/*
3021 * Normalization means making request better in terms of
3022 * size and alignment
3023 */
3024static noinline_for_stack void
3025ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3026                                struct ext4_allocation_request *ar)
3027{
3028        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3029        int bsbits, max;
3030        ext4_lblk_t end;
3031        loff_t size, start_off;
3032        loff_t orig_size __maybe_unused;
3033        ext4_lblk_t start;
3034        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3035        struct ext4_prealloc_space *pa;
3036
3037        /* do normalize only data requests, metadata requests
3038           do not need preallocation */
3039        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3040                return;
3041
3042        /* sometime caller may want exact blocks */
3043        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3044                return;
3045
3046        /* caller may indicate that preallocation isn't
3047         * required (it's a tail, for example) */
3048        if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
3049                return;
3050
3051        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
3052                ext4_mb_normalize_group_request(ac);
3053                return ;
3054        }
3055
3056        bsbits = ac->ac_sb->s_blocksize_bits;
3057
3058        /* first, let's learn actual file size
3059         * given current request is allocated */
3060        size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
3061        size = size << bsbits;
3062        if (size < i_size_read(ac->ac_inode))
3063                size = i_size_read(ac->ac_inode);
3064        orig_size = size;
3065
3066        /* max size of free chunks */
3067        max = 2 << bsbits;
3068
3069#define NRL_CHECK_SIZE(req, size, max, chunk_size)      \
3070                (req <= (size) || max <= (chunk_size))
3071
3072        /* first, try to predict filesize */
3073        /* XXX: should this table be tunable? */
3074        start_off = 0;
3075        if (size <= 16 * 1024) {
3076                size = 16 * 1024;
3077        } else if (size <= 32 * 1024) {
3078                size = 32 * 1024;
3079        } else if (size <= 64 * 1024) {
3080                size = 64 * 1024;
3081        } else if (size <= 128 * 1024) {
3082                size = 128 * 1024;
3083        } else if (size <= 256 * 1024) {
3084                size = 256 * 1024;
3085        } else if (size <= 512 * 1024) {
3086                size = 512 * 1024;
3087        } else if (size <= 1024 * 1024) {
3088                size = 1024 * 1024;
3089        } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
3090                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3091                                                (21 - bsbits)) << 21;
3092                size = 2 * 1024 * 1024;
3093        } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
3094                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3095                                                        (22 - bsbits)) << 22;
3096                size = 4 * 1024 * 1024;
3097        } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
3098                                        (8<<20)>>bsbits, max, 8 * 1024)) {
3099                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3100                                                        (23 - bsbits)) << 23;
3101                size = 8 * 1024 * 1024;
3102        } else {
3103                start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
3104                size      = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
3105                                              ac->ac_o_ex.fe_len) << bsbits;
3106        }
3107        size = size >> bsbits;
3108        start = start_off >> bsbits;
3109
3110        /* don't cover already allocated blocks in selected range */
3111        if (ar->pleft && start <= ar->lleft) {
3112                size -= ar->lleft + 1 - start;
3113                start = ar->lleft + 1;
3114        }
3115        if (ar->pright && start + size - 1 >= ar->lright)
3116                size -= start + size - ar->lright;
3117
3118        /*
3119         * Trim allocation request for filesystems with artificially small
3120         * groups.
3121         */
3122        if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
3123                size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
3124
3125        end = start + size;
3126
3127        /* check we don't cross already preallocated blocks */
3128        rcu_read_lock();
3129        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3130                ext4_lblk_t pa_end;
3131
3132                if (pa->pa_deleted)
3133                        continue;
3134                spin_lock(&pa->pa_lock);
3135                if (pa->pa_deleted) {
3136                        spin_unlock(&pa->pa_lock);
3137                        continue;
3138                }
3139
3140                pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3141                                                  pa->pa_len);
3142
3143                /* PA must not overlap original request */
3144                BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3145                        ac->ac_o_ex.fe_logical < pa->pa_lstart));
3146
3147                /* skip PAs this normalized request doesn't overlap with */
3148                if (pa->pa_lstart >= end || pa_end <= start) {
3149                        spin_unlock(&pa->pa_lock);
3150                        continue;
3151                }
3152                BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3153
3154                /* adjust start or end to be adjacent to this pa */
3155                if (pa_end <= ac->ac_o_ex.fe_logical) {
3156                        BUG_ON(pa_end < start);
3157                        start = pa_end;
3158                } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3159                        BUG_ON(pa->pa_lstart > end);
3160                        end = pa->pa_lstart;
3161                }
3162                spin_unlock(&pa->pa_lock);
3163        }
3164        rcu_read_unlock();
3165        size = end - start;
3166
3167        /* XXX: extra loop to check we really don't overlap preallocations */
3168        rcu_read_lock();
3169        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3170                ext4_lblk_t pa_end;
3171
3172                spin_lock(&pa->pa_lock);
3173                if (pa->pa_deleted == 0) {
3174                        pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3175                                                          pa->pa_len);
3176                        BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3177                }
3178                spin_unlock(&pa->pa_lock);
3179        }
3180        rcu_read_unlock();
3181
3182        if (start + size <= ac->ac_o_ex.fe_logical &&
3183                        start > ac->ac_o_ex.fe_logical) {
3184                ext4_msg(ac->ac_sb, KERN_ERR,
3185                         "start %lu, size %lu, fe_logical %lu",
3186                         (unsigned long) start, (unsigned long) size,
3187                         (unsigned long) ac->ac_o_ex.fe_logical);
3188                BUG();
3189        }
3190        BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
3191
3192        /* now prepare goal request */
3193
3194        /* XXX: is it better to align blocks WRT to logical
3195         * placement or satisfy big request as is */
3196        ac->ac_g_ex.fe_logical = start;
3197        ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
3198
3199        /* define goal start in order to merge */
3200        if (ar->pright && (ar->lright == (start + size))) {
3201                /* merge to the right */
3202                ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3203                                                &ac->ac_f_ex.fe_group,
3204                                                &ac->ac_f_ex.fe_start);
3205                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3206        }
3207        if (ar->pleft && (ar->lleft + 1 == start)) {
3208                /* merge to the left */
3209                ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3210                                                &ac->ac_f_ex.fe_group,
3211                                                &ac->ac_f_ex.fe_start);
3212                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3213        }
3214
3215        mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3216                (unsigned) orig_size, (unsigned) start);
3217}
3218
3219static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3220{
3221        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3222
3223        if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
3224                atomic_inc(&sbi->s_bal_reqs);
3225                atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
3226                if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
3227                        atomic_inc(&sbi->s_bal_success);
3228                atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3229                if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
3230                                ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
3231                        atomic_inc(&sbi->s_bal_goals);
3232                if (ac->ac_found > sbi->s_mb_max_to_scan)
3233                        atomic_inc(&sbi->s_bal_breaks);
3234        }
3235
3236        if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3237                trace_ext4_mballoc_alloc(ac);
3238        else
3239                trace_ext4_mballoc_prealloc(ac);
3240}
3241
3242/*
3243 * Called on failure; free up any blocks from the inode PA for this
3244 * context.  We don't need this for MB_GROUP_PA because we only change
3245 * pa_free in ext4_mb_release_context(), but on failure, we've already
3246 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
3247 */
3248static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3249{
3250        struct ext4_prealloc_space *pa = ac->ac_pa;
3251        struct ext4_buddy e4b;
3252        int err;
3253
3254        if (pa == NULL) {
3255                if (ac->ac_f_ex.fe_len == 0)
3256                        return;
3257                err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
3258                if (err) {
3259                        /*
3260                         * This should never happen since we pin the
3261                         * pages in the ext4_allocation_context so
3262                         * ext4_mb_load_buddy() should never fail.
3263                         */
3264                        WARN(1, "mb_load_buddy failed (%d)", err);
3265                        return;
3266                }
3267                ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3268                mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
3269                               ac->ac_f_ex.fe_len);
3270                ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3271                ext4_mb_unload_buddy(&e4b);
3272                return;
3273        }
3274        if (pa->pa_type == MB_INODE_PA)
3275                pa->pa_free += ac->ac_b_ex.fe_len;
3276}
3277
3278/*
3279 * use blocks preallocated to inode
3280 */
3281static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3282                                struct ext4_prealloc_space *pa)
3283{
3284        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3285        ext4_fsblk_t start;
3286        ext4_fsblk_t end;
3287        int len;
3288
3289        /* found preallocated blocks, use them */
3290        start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3291        end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
3292                  start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
3293        len = EXT4_NUM_B2C(sbi, end - start);
3294        ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3295                                        &ac->ac_b_ex.fe_start);
3296        ac->ac_b_ex.fe_len = len;
3297        ac->ac_status = AC_STATUS_FOUND;
3298        ac->ac_pa = pa;
3299
3300        BUG_ON(start < pa->pa_pstart);
3301        BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
3302        BUG_ON(pa->pa_free < len);
3303        pa->pa_free -= len;
3304
3305        mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3306}
3307
3308/*
3309 * use blocks preallocated to locality group
3310 */
3311static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3312                                struct ext4_prealloc_space *pa)
3313{
3314        unsigned int len = ac->ac_o_ex.fe_len;
3315
3316        ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
3317                                        &ac->ac_b_ex.fe_group,
3318                                        &ac->ac_b_ex.fe_start);
3319        ac->ac_b_ex.fe_len = len;
3320        ac->ac_status = AC_STATUS_FOUND;
3321        ac->ac_pa = pa;
3322
3323        /* we don't correct pa_pstart or pa_plen here to avoid
3324         * possible race when the group is being loaded concurrently
3325         * instead we correct pa later, after blocks are marked
3326         * in on-disk bitmap -- see ext4_mb_release_context()
3327         * Other CPUs are prevented from allocating from this pa by lg_mutex
3328         */
3329        mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3330}
3331
3332/*
3333 * Return the prealloc space that have minimal distance
3334 * from the goal block. @cpa is the prealloc
3335 * space that is having currently known minimal distance
3336 * from the goal block.
3337 */
3338static struct ext4_prealloc_space *
3339ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3340                        struct ext4_prealloc_space *pa,
3341                        struct ext4_prealloc_space *cpa)
3342{
3343        ext4_fsblk_t cur_distance, new_distance;
3344
3345        if (cpa == NULL) {
3346                atomic_inc(&pa->pa_count);
3347                return pa;
3348        }
3349        cur_distance = abs(goal_block - cpa->pa_pstart);
3350        new_distance = abs(goal_block - pa->pa_pstart);
3351
3352        if (cur_distance <= new_distance)
3353                return cpa;
3354
3355        /* drop the previous reference */
3356        atomic_dec(&cpa->pa_count);
3357        atomic_inc(&pa->pa_count);
3358        return pa;
3359}
3360
3361/*
3362 * search goal blocks in preallocated space
3363 */
3364static noinline_for_stack int
3365ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3366{
3367        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3368        int order, i;
3369        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3370        struct ext4_locality_group *lg;
3371        struct ext4_prealloc_space *pa, *cpa = NULL;
3372        ext4_fsblk_t goal_block;
3373
3374        /* only data can be preallocated */
3375        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3376                return 0;
3377
3378        /* first, try per-file preallocation */
3379        rcu_read_lock();
3380        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3381
3382                /* all fields in this condition don't change,
3383                 * so we can skip locking for them */
3384                if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3385                    ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
3386                                               EXT4_C2B(sbi, pa->pa_len)))
3387                        continue;
3388
3389                /* non-extent files can't have physical blocks past 2^32 */
3390                if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3391                    (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
3392                     EXT4_MAX_BLOCK_FILE_PHYS))
3393                        continue;
3394
3395                /* found preallocated blocks, use them */
3396                spin_lock(&pa->pa_lock);
3397                if (pa->pa_deleted == 0 && pa->pa_free) {
3398                        atomic_inc(&pa->pa_count);
3399                        ext4_mb_use_inode_pa(ac, pa);
3400                        spin_unlock(&pa->pa_lock);
3401                        ac->ac_criteria = 10;
3402                        rcu_read_unlock();
3403                        return 1;
3404                }
3405                spin_unlock(&pa->pa_lock);
3406        }
3407        rcu_read_unlock();
3408
3409        /* can we use group allocation? */
3410        if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3411                return 0;
3412
3413        /* inode may have no locality group for some reason */
3414        lg = ac->ac_lg;
3415        if (lg == NULL)
3416                return 0;
3417        order  = fls(ac->ac_o_ex.fe_len) - 1;
3418        if (order > PREALLOC_TB_SIZE - 1)
3419                /* The max size of hash table is PREALLOC_TB_SIZE */
3420                order = PREALLOC_TB_SIZE - 1;
3421
3422        goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3423        /*
3424         * search for the prealloc space that is having
3425         * minimal distance from the goal block.
3426         */
3427        for (i = order; i < PREALLOC_TB_SIZE; i++) {
3428                rcu_read_lock();
3429                list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
3430                                        pa_inode_list) {
3431                        spin_lock(&pa->pa_lock);
3432                        if (pa->pa_deleted == 0 &&
3433                                        pa->pa_free >= ac->ac_o_ex.fe_len) {
3434
3435                                cpa = ext4_mb_check_group_pa(goal_block,
3436                                                                pa, cpa);
3437                        }
3438                        spin_unlock(&pa->pa_lock);
3439                }
3440                rcu_read_unlock();
3441        }
3442        if (cpa) {
3443                ext4_mb_use_group_pa(ac, cpa);
3444                ac->ac_criteria = 20;
3445                return 1;
3446        }
3447        return 0;
3448}
3449
3450/*
3451 * the function goes through all block freed in the group
3452 * but not yet committed and marks them used in in-core bitmap.
3453 * buddy must be generated from this bitmap
3454 * Need to be called with the ext4 group lock held
3455 */
3456static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3457                                                ext4_group_t group)
3458{
3459        struct rb_node *n;
3460        struct ext4_group_info *grp;
3461        struct ext4_free_data *entry;
3462
3463        grp = ext4_get_group_info(sb, group);
3464        n = rb_first(&(grp->bb_free_root));
3465
3466        while (n) {
3467                entry = rb_entry(n, struct ext4_free_data, efd_node);
3468                ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
3469                n = rb_next(n);
3470        }
3471        return;
3472}
3473
3474/*
3475 * the function goes through all preallocation in this group and marks them
3476 * used in in-core bitmap. buddy must be generated from this bitmap
3477 * Need to be called with ext4 group lock held
3478 */
3479static noinline_for_stack
3480void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3481                                        ext4_group_t group)
3482{
3483        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3484        struct ext4_prealloc_space *pa;
3485        struct list_head *cur;
3486        ext4_group_t groupnr;
3487        ext4_grpblk_t start;
3488        int preallocated = 0;
3489        int len;
3490
3491        /* all form of preallocation discards first load group,
3492         * so the only competing code is preallocation use.
3493         * we don't need any locking here
3494         * notice we do NOT ignore preallocations with pa_deleted
3495         * otherwise we could leave used blocks available for
3496         * allocation in buddy when concurrent ext4_mb_put_pa()
3497         * is dropping preallocation
3498         */
3499        list_for_each(cur, &grp->bb_prealloc_list) {
3500                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3501                spin_lock(&pa->pa_lock);
3502                ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3503                                             &groupnr, &start);
3504                len = pa->pa_len;
3505                spin_unlock(&pa->pa_lock);
3506                if (unlikely(len == 0))
3507                        continue;
3508                BUG_ON(groupnr != group);
3509                ext4_set_bits(bitmap, start, len);
3510                preallocated += len;
3511        }
3512        mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
3513}
3514
3515static void ext4_mb_pa_callback(struct rcu_head *head)
3516{
3517        struct ext4_prealloc_space *pa;
3518        pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
3519
3520        BUG_ON(atomic_read(&pa->pa_count));
3521        BUG_ON(pa->pa_deleted == 0);
3522        kmem_cache_free(ext4_pspace_cachep, pa);
3523}
3524
3525/*
3526 * drops a reference to preallocated space descriptor
3527 * if this was the last reference and the space is consumed
3528 */
3529static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3530                        struct super_block *sb, struct ext4_prealloc_space *pa)
3531{
3532        ext4_group_t grp;
3533        ext4_fsblk_t grp_blk;
3534
3535        /* in this short window concurrent discard can set pa_deleted */
3536        spin_lock(&pa->pa_lock);
3537        if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
3538                spin_unlock(&pa->pa_lock);
3539                return;
3540        }
3541
3542        if (pa->pa_deleted == 1) {
3543                spin_unlock(&pa->pa_lock);
3544                return;
3545        }
3546
3547        pa->pa_deleted = 1;
3548        spin_unlock(&pa->pa_lock);
3549
3550        grp_blk = pa->pa_pstart;
3551        /*
3552         * If doing group-based preallocation, pa_pstart may be in the
3553         * next group when pa is used up
3554         */
3555        if (pa->pa_type == MB_GROUP_PA)
3556                grp_blk--;
3557
3558        grp = ext4_get_group_number(sb, grp_blk);
3559
3560        /*
3561         * possible race:
3562         *
3563         *  P1 (buddy init)                     P2 (regular allocation)
3564         *                                      find block B in PA
3565         *  copy on-disk bitmap to buddy
3566         *                                      mark B in on-disk bitmap
3567         *                                      drop PA from group
3568         *  mark all PAs in buddy
3569         *
3570         * thus, P1 initializes buddy with B available. to prevent this
3571         * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
3572         * against that pair
3573         */
3574        ext4_lock_group(sb, grp);
3575        list_del(&pa->pa_group_list);
3576        ext4_unlock_group(sb, grp);
3577
3578        spin_lock(pa->pa_obj_lock);
3579        list_del_rcu(&pa->pa_inode_list);
3580        spin_unlock(pa->pa_obj_lock);
3581
3582        call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3583}
3584
3585/*
3586 * creates new preallocated space for given inode
3587 */
3588static noinline_for_stack int
3589ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3590{
3591        struct super_block *sb = ac->ac_sb;
3592        struct ext4_sb_info *sbi = EXT4_SB(sb);
3593        struct ext4_prealloc_space *pa;
3594        struct ext4_group_info *grp;
3595        struct ext4_inode_info *ei;
3596
3597        /* preallocate only when found space is larger then requested */
3598        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3599        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3600        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3601
3602        pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3603        if (pa == NULL)
3604                return -ENOMEM;
3605
3606        if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
3607                int winl;
3608                int wins;
3609                int win;
3610                int offs;
3611
3612                /* we can't allocate as much as normalizer wants.
3613                 * so, found space must get proper lstart
3614                 * to cover original request */
3615                BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
3616                BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
3617
3618                /* we're limited by original request in that
3619                 * logical block must be covered any way
3620                 * winl is window we can move our chunk within */
3621                winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3622
3623                /* also, we should cover whole original request */
3624                wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
3625
3626                /* the smallest one defines real window */
3627                win = min(winl, wins);
3628
3629                offs = ac->ac_o_ex.fe_logical %
3630                        EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
3631                if (offs && offs < win)
3632                        win = offs;
3633
3634                ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
3635                        EXT4_NUM_B2C(sbi, win);
3636                BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3637                BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3638        }
3639
3640        /* preallocation can change ac_b_ex, thus we store actually
3641         * allocated blocks for history */
3642        ac->ac_f_ex = ac->ac_b_ex;
3643
3644        pa->pa_lstart = ac->ac_b_ex.fe_logical;
3645        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3646        pa->pa_len = ac->ac_b_ex.fe_len;
3647        pa->pa_free = pa->pa_len;
3648        atomic_set(&pa->pa_count, 1);
3649        spin_lock_init(&pa->pa_lock);
3650        INIT_LIST_HEAD(&pa->pa_inode_list);
3651        INIT_LIST_HEAD(&pa->pa_group_list);
3652        pa->pa_deleted = 0;
3653        pa->pa_type = MB_INODE_PA;
3654
3655        mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3656                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3657        trace_ext4_mb_new_inode_pa(ac, pa);
3658
3659        ext4_mb_use_inode_pa(ac, pa);
3660        atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
3661
3662        ei = EXT4_I(ac->ac_inode);
3663        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3664
3665        pa->pa_obj_lock = &ei->i_prealloc_lock;
3666        pa->pa_inode = ac->ac_inode;
3667
3668        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3669        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3670        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3671
3672        spin_lock(pa->pa_obj_lock);
3673        list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
3674        spin_unlock(pa->pa_obj_lock);
3675
3676        return 0;
3677}
3678
3679/*
3680 * creates new preallocated space for locality group inodes belongs to
3681 */
3682static noinline_for_stack int
3683ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3684{
3685        struct super_block *sb = ac->ac_sb;
3686        struct ext4_locality_group *lg;
3687        struct ext4_prealloc_space *pa;
3688        struct ext4_group_info *grp;
3689
3690        /* preallocate only when found space is larger then requested */
3691        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3692        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3693        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3694
3695        BUG_ON(ext4_pspace_cachep == NULL);
3696        pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3697        if (pa == NULL)
3698                return -ENOMEM;
3699
3700        /* preallocation can change ac_b_ex, thus we store actually
3701         * allocated blocks for history */
3702        ac->ac_f_ex = ac->ac_b_ex;
3703
3704        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3705        pa->pa_lstart = pa->pa_pstart;
3706        pa->pa_len = ac->ac_b_ex.fe_len;
3707        pa->pa_free = pa->pa_len;
3708        atomic_set(&pa->pa_count, 1);
3709        spin_lock_init(&pa->pa_lock);
3710        INIT_LIST_HEAD(&pa->pa_inode_list);
3711        INIT_LIST_HEAD(&pa->pa_group_list);
3712        pa->pa_deleted = 0;
3713        pa->pa_type = MB_GROUP_PA;
3714
3715        mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3716                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3717        trace_ext4_mb_new_group_pa(ac, pa);
3718
3719        ext4_mb_use_group_pa(ac, pa);
3720        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3721
3722        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3723        lg = ac->ac_lg;
3724        BUG_ON(lg == NULL);
3725
3726        pa->pa_obj_lock = &lg->lg_prealloc_lock;
3727        pa->pa_inode = NULL;
3728
3729        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3730        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3731        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3732
3733        /*
3734         * We will later add the new pa to the right bucket
3735         * after updating the pa_free in ext4_mb_release_context
3736         */
3737        return 0;
3738}
3739
3740static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3741{
3742        int err;
3743
3744        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3745                err = ext4_mb_new_group_pa(ac);
3746        else
3747                err = ext4_mb_new_inode_pa(ac);
3748        return err;
3749}
3750
3751/*
3752 * finds all unused blocks in on-disk bitmap, frees them in
3753 * in-core bitmap and buddy.
3754 * @pa must be unlinked from inode and group lists, so that
3755 * nobody else can find/use it.
3756 * the caller MUST hold group/inode locks.
3757 * TODO: optimize the case when there are no in-core structures yet
3758 */
3759static noinline_for_stack int
3760ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3761                        struct ext4_prealloc_space *pa)
3762{
3763        struct super_block *sb = e4b->bd_sb;
3764        struct ext4_sb_info *sbi = EXT4_SB(sb);
3765        unsigned int end;
3766        unsigned int next;
3767        ext4_group_t group;
3768        ext4_grpblk_t bit;
3769        unsigned long long grp_blk_start;
3770        int err = 0;
3771        int free = 0;
3772
3773        BUG_ON(pa->pa_deleted == 0);
3774        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3775        grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
3776        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3777        end = bit + pa->pa_len;
3778
3779        while (bit < end) {
3780                bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3781                if (bit >= end)
3782                        break;
3783                next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3784                mb_debug(1, "    free preallocated %u/%u in group %u\n",
3785                         (unsigned) ext4_group_first_block_no(sb, group) + bit,
3786                         (unsigned) next - bit, (unsigned) group);
3787                free += next - bit;
3788
3789                trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3790                trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
3791                                                    EXT4_C2B(sbi, bit)),
3792                                               next - bit);
3793                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3794                bit = next + 1;
3795        }
3796        if (free != pa->pa_free) {
3797                ext4_msg(e4b->bd_sb, KERN_CRIT,
3798                         "pa %p: logic %lu, phys. %lu, len %lu",
3799                         pa, (unsigned long) pa->pa_lstart,
3800                         (unsigned long) pa->pa_pstart,
3801                         (unsigned long) pa->pa_len);
3802                ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3803                                        free, pa->pa_free);
3804                /*
3805                 * pa is already deleted so we use the value obtained
3806                 * from the bitmap and continue.
3807                 */
3808        }
3809        atomic_add(free, &sbi->s_mb_discarded);
3810
3811        return err;
3812}
3813
3814static noinline_for_stack int
3815ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3816                                struct ext4_prealloc_space *pa)
3817{
3818        struct super_block *sb = e4b->bd_sb;
3819        ext4_group_t group;
3820        ext4_grpblk_t bit;
3821
3822        trace_ext4_mb_release_group_pa(sb, pa);
3823        BUG_ON(pa->pa_deleted == 0);
3824        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3825        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3826        mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3827        atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3828        trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
3829
3830        return 0;
3831}
3832
3833/*
3834 * releases all preallocations in given group
3835 *
3836 * first, we need to decide discard policy:
3837 * - when do we discard
3838 *   1) ENOSPC
3839 * - how many do we discard
3840 *   1) how many requested
3841 */
3842static noinline_for_stack int
3843ext4_mb_discard_group_preallocations(struct super_block *sb,
3844                                        ext4_group_t group, int needed)
3845{
3846        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3847        struct buffer_head *bitmap_bh = NULL;
3848        struct ext4_prealloc_space *pa, *tmp;
3849        struct list_head list;
3850        struct ext4_buddy e4b;
3851        int err;
3852        int busy = 0;
3853        int free = 0;
3854
3855        mb_debug(1, "discard preallocation for group %u\n", group);
3856
3857        if (list_empty(&grp->bb_prealloc_list))
3858                return 0;
3859
3860        bitmap_bh = ext4_read_block_bitmap(sb, group);
3861        if (bitmap_bh == NULL) {
3862                ext4_error(sb, "Error reading block bitmap for %u", group);
3863                return 0;
3864        }
3865
3866        err = ext4_mb_load_buddy(sb, group, &e4b);
3867        if (err) {
3868                ext4_error(sb, "Error loading buddy information for %u", group);
3869                put_bh(bitmap_bh);
3870                return 0;
3871        }
3872
3873        if (needed == 0)
3874                needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
3875
3876        INIT_LIST_HEAD(&list);
3877repeat:
3878        ext4_lock_group(sb, group);
3879        list_for_each_entry_safe(pa, tmp,
3880                                &grp->bb_prealloc_list, pa_group_list) {
3881                spin_lock(&pa->pa_lock);
3882                if (atomic_read(&pa->pa_count)) {
3883                        spin_unlock(&pa->pa_lock);
3884                        busy = 1;
3885                        continue;
3886                }
3887                if (pa->pa_deleted) {
3888                        spin_unlock(&pa->pa_lock);
3889                        continue;
3890                }
3891
3892                /* seems this one can be freed ... */
3893                pa->pa_deleted = 1;
3894
3895                /* we can trust pa_free ... */
3896                free += pa->pa_free;
3897
3898                spin_unlock(&pa->pa_lock);
3899
3900                list_del(&pa->pa_group_list);
3901                list_add(&pa->u.pa_tmp_list, &list);
3902        }
3903
3904        /* if we still need more blocks and some PAs were used, try again */
3905        if (free < needed && busy) {
3906                busy = 0;
3907                ext4_unlock_group(sb, group);
3908                cond_resched();
3909                goto repeat;
3910        }
3911
3912        /* found anything to free? */
3913        if (list_empty(&list)) {
3914                BUG_ON(free != 0);
3915                goto out;
3916        }
3917
3918        /* now free all selected PAs */
3919        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3920
3921                /* remove from object (inode or locality group) */
3922                spin_lock(pa->pa_obj_lock);
3923                list_del_rcu(&pa->pa_inode_list);
3924                spin_unlock(pa->pa_obj_lock);
3925
3926                if (pa->pa_type == MB_GROUP_PA)
3927                        ext4_mb_release_group_pa(&e4b, pa);
3928                else
3929                        ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3930
3931                list_del(&pa->u.pa_tmp_list);
3932                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3933        }
3934
3935out:
3936        ext4_unlock_group(sb, group);
3937        ext4_mb_unload_buddy(&e4b);
3938        put_bh(bitmap_bh);
3939        return free;
3940}
3941
3942/*
3943 * releases all non-used preallocated blocks for given inode
3944 *
3945 * It's important to discard preallocations under i_data_sem
3946 * We don't want another block to be served from the prealloc
3947 * space when we are discarding the inode prealloc space.
3948 *
3949 * FIXME!! Make sure it is valid at all the call sites
3950 */
3951void ext4_discard_preallocations(struct inode *inode)
3952{
3953        struct ext4_inode_info *ei = EXT4_I(inode);
3954        struct super_block *sb = inode->i_sb;
3955        struct buffer_head *bitmap_bh = NULL;
3956        struct ext4_prealloc_space *pa, *tmp;
3957        ext4_group_t group = 0;
3958        struct list_head list;
3959        struct ext4_buddy e4b;
3960        int err;
3961
3962        if (!S_ISREG(inode->i_mode)) {
3963                /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3964                return;
3965        }
3966
3967        mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
3968        trace_ext4_discard_preallocations(inode);
3969
3970        INIT_LIST_HEAD(&list);
3971
3972repeat:
3973        /* first, collect all pa's in the inode */
3974        spin_lock(&ei->i_prealloc_lock);
3975        while (!list_empty(&ei->i_prealloc_list)) {
3976                pa = list_entry(ei->i_prealloc_list.next,
3977                                struct ext4_prealloc_space, pa_inode_list);
3978                BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
3979                spin_lock(&pa->pa_lock);
3980                if (atomic_read(&pa->pa_count)) {
3981                        /* this shouldn't happen often - nobody should
3982                         * use preallocation while we're discarding it */
3983                        spin_unlock(&pa->pa_lock);
3984                        spin_unlock(&ei->i_prealloc_lock);
3985                        ext4_msg(sb, KERN_ERR,
3986                                 "uh-oh! used pa while discarding");
3987                        WARN_ON(1);
3988                        schedule_timeout_uninterruptible(HZ);
3989                        goto repeat;
3990
3991                }
3992                if (pa->pa_deleted == 0) {
3993                        pa->pa_deleted = 1;
3994                        spin_unlock(&pa->pa_lock);
3995                        list_del_rcu(&pa->pa_inode_list);
3996                        list_add(&pa->u.pa_tmp_list, &list);
3997                        continue;
3998                }
3999
4000                /* someone is deleting pa right now */
4001                spin_unlock(&pa->pa_lock);
4002                spin_unlock(&ei->i_prealloc_lock);
4003
4004                /* we have to wait here because pa_deleted
4005                 * doesn't mean pa is already unlinked from
4006                 * the list. as we might be called from
4007                 * ->clear_inode() the inode will get freed
4008                 * and concurrent thread which is unlinking
4009                 * pa from inode's list may access already
4010                 * freed memory, bad-bad-bad */
4011
4012                /* XXX: if this happens too often, we can
4013                 * add a flag to force wait only in case
4014                 * of ->clear_inode(), but not in case of
4015                 * regular truncate */
4016                schedule_timeout_uninterruptible(HZ);
4017                goto repeat;
4018        }
4019        spin_unlock(&ei->i_prealloc_lock);
4020
4021        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
4022                BUG_ON(pa->pa_type != MB_INODE_PA);
4023                group = ext4_get_group_number(sb, pa->pa_pstart);
4024
4025                err = ext4_mb_load_buddy(sb, group, &e4b);
4026                if (err) {
4027                        ext4_error(sb, "Error loading buddy information for %u",
4028                                        group);
4029                        continue;
4030                }
4031
4032                bitmap_bh = ext4_read_block_bitmap(sb, group);
4033                if (bitmap_bh == NULL) {
4034                        ext4_error(sb, "Error reading block bitmap for %u",
4035                                        group);
4036                        ext4_mb_unload_buddy(&e4b);
4037                        continue;
4038                }
4039
4040                ext4_lock_group(sb, group);
4041                list_del(&pa->pa_group_list);
4042                ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
4043                ext4_unlock_group(sb, group);
4044
4045                ext4_mb_unload_buddy(&e4b);
4046                put_bh(bitmap_bh);
4047
4048                list_del(&pa->u.pa_tmp_list);
4049                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4050        }
4051}
4052
4053#ifdef CONFIG_EXT4_DEBUG
4054static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4055{
4056        struct super_block *sb = ac->ac_sb;
4057        ext4_group_t ngroups, i;
4058
4059        if (!ext4_mballoc_debug ||
4060            (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
4061                return;
4062
4063        ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
4064                        " Allocation context details:");
4065        ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
4066                        ac->ac_status, ac->ac_flags);
4067        ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
4068                        "goal %lu/%lu/%lu@%lu, "
4069                        "best %lu/%lu/%lu@%lu cr %d",
4070                        (unsigned long)ac->ac_o_ex.fe_group,
4071                        (unsigned long)ac->ac_o_ex.fe_start,
4072                        (unsigned long)ac->ac_o_ex.fe_len,
4073                        (unsigned long)ac->ac_o_ex.fe_logical,
4074                        (unsigned long)ac->ac_g_ex.fe_group,
4075                        (unsigned long)ac->ac_g_ex.fe_start,
4076                        (unsigned long)ac->ac_g_ex.fe_len,
4077                        (unsigned long)ac->ac_g_ex.fe_logical,
4078                        (unsigned long)ac->ac_b_ex.fe_group,
4079                        (unsigned long)ac->ac_b_ex.fe_start,
4080                        (unsigned long)ac->ac_b_ex.fe_len,
4081                        (unsigned long)ac->ac_b_ex.fe_logical,
4082                        (int)ac->ac_criteria);
4083        ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
4084        ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
4085        ngroups = ext4_get_groups_count(sb);
4086        for (i = 0; i < ngroups; i++) {
4087                struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4088                struct ext4_prealloc_space *pa;
4089                ext4_grpblk_t start;
4090                struct list_head *cur;
4091                ext4_lock_group(sb, i);
4092                list_for_each(cur, &grp->bb_prealloc_list) {
4093                        pa = list_entry(cur, struct ext4_prealloc_space,
4094                                        pa_group_list);
4095                        spin_lock(&pa->pa_lock);
4096                        ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4097                                                     NULL, &start);
4098                        spin_unlock(&pa->pa_lock);
4099                        printk(KERN_ERR "PA:%u:%d:%u \n", i,
4100                               start, pa->pa_len);
4101                }
4102                ext4_unlock_group(sb, i);
4103
4104                if (grp->bb_free == 0)
4105                        continue;
4106                printk(KERN_ERR "%u: %d/%d \n",
4107                       i, grp->bb_free, grp->bb_fragments);
4108        }
4109        printk(KERN_ERR "\n");
4110}
4111#else
4112static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4113{
4114        return;
4115}
4116#endif
4117
4118/*
4119 * We use locality group preallocation for small size file. The size of the
4120 * file is determined by the current size or the resulting size after
4121 * allocation which ever is larger
4122 *
4123 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
4124 */
4125static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4126{
4127        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4128        int bsbits = ac->ac_sb->s_blocksize_bits;
4129        loff_t size, isize;
4130
4131        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4132                return;
4133
4134        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4135                return;
4136
4137        size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
4138        isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4139                >> bsbits;
4140
4141        if ((size == isize) &&
4142            !ext4_fs_is_busy(sbi) &&
4143            (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
4144                ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
4145                return;
4146        }
4147
4148        if (sbi->s_mb_group_prealloc <= 0) {
4149                ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4150                return;
4151        }
4152
4153        /* don't use group allocation for large files */
4154        size = max(size, isize);
4155        if (size > sbi->s_mb_stream_request) {
4156                ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4157                return;
4158        }
4159
4160        BUG_ON(ac->ac_lg != NULL);
4161        /*
4162         * locality group prealloc space are per cpu. The reason for having
4163         * per cpu locality group is to reduce the contention between block
4164         * request from multiple CPUs.
4165         */
4166        ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups);
4167
4168        /* we're going to use group allocation */
4169        ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
4170
4171        /* serialize all allocations in the group */
4172        mutex_lock(&ac->ac_lg->lg_mutex);
4173}
4174
4175static noinline_for_stack int
4176ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4177                                struct ext4_allocation_request *ar)
4178{
4179        struct super_block *sb = ar->inode->i_sb;
4180        struct ext4_sb_info *sbi = EXT4_SB(sb);
4181        struct ext4_super_block *es = sbi->s_es;
4182        ext4_group_t group;
4183        unsigned int len;
4184        ext4_fsblk_t goal;
4185        ext4_grpblk_t block;
4186
4187        /* we can't allocate > group size */
4188        len = ar->len;
4189
4190        /* just a dirty hack to filter too big requests  */
4191        if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
4192                len = EXT4_CLUSTERS_PER_GROUP(sb);
4193
4194        /* start searching from the goal */
4195        goal = ar->goal;
4196        if (goal < le32_to_cpu(es->s_first_data_block) ||
4197                        goal >= ext4_blocks_count(es))
4198                goal = le32_to_cpu(es->s_first_data_block);
4199        ext4_get_group_no_and_offset(sb, goal, &group, &block);
4200
4201        /* set up allocation goals */
4202        ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
4203        ac->ac_status = AC_STATUS_CONTINUE;
4204        ac->ac_sb = sb;
4205        ac->ac_inode = ar->inode;
4206        ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
4207        ac->ac_o_ex.fe_group = group;
4208        ac->ac_o_ex.fe_start = block;
4209        ac->ac_o_ex.fe_len = len;
4210        ac->ac_g_ex = ac->ac_o_ex;
4211        ac->ac_flags = ar->flags;
4212
4213        /* we have to define context: we'll we work with a file or
4214         * locality group. this is a policy, actually */
4215        ext4_mb_group_or_file(ac);
4216
4217        mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4218                        "left: %u/%u, right %u/%u to %swritable\n",
4219                        (unsigned) ar->len, (unsigned) ar->logical,
4220                        (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
4221                        (unsigned) ar->lleft, (unsigned) ar->pleft,
4222                        (unsigned) ar->lright, (unsigned) ar->pright,
4223                        atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4224        return 0;
4225
4226}
4227
4228static noinline_for_stack void
4229ext4_mb_discard_lg_preallocations(struct super_block *sb,
4230                                        struct ext4_locality_group *lg,
4231                                        int order, int total_entries)
4232{
4233        ext4_group_t group = 0;
4234        struct ext4_buddy e4b;
4235        struct list_head discard_list;
4236        struct ext4_prealloc_space *pa, *tmp;
4237
4238        mb_debug(1, "discard locality group preallocation\n");
4239
4240        INIT_LIST_HEAD(&discard_list);
4241
4242        spin_lock(&lg->lg_prealloc_lock);
4243        list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
4244                                                pa_inode_list) {
4245                spin_lock(&pa->pa_lock);
4246                if (atomic_read(&pa->pa_count)) {
4247                        /*
4248                         * This is the pa that we just used
4249                         * for block allocation. So don't
4250                         * free that
4251                         */
4252                        spin_unlock(&pa->pa_lock);
4253                        continue;
4254                }
4255                if (pa->pa_deleted) {
4256                        spin_unlock(&pa->pa_lock);
4257                        continue;
4258                }
4259                /* only lg prealloc space */
4260                BUG_ON(pa->pa_type != MB_GROUP_PA);
4261
4262                /* seems this one can be freed ... */
4263                pa->pa_deleted = 1;
4264                spin_unlock(&pa->pa_lock);
4265
4266                list_del_rcu(&pa->pa_inode_list);
4267                list_add(&pa->u.pa_tmp_list, &discard_list);
4268
4269                total_entries--;
4270                if (total_entries <= 5) {
4271                        /*
4272                         * we want to keep only 5 entries
4273                         * allowing it to grow to 8. This
4274                         * mak sure we don't call discard
4275                         * soon for this list.
4276                         */
4277                        break;
4278                }
4279        }
4280        spin_unlock(&lg->lg_prealloc_lock);
4281
4282        list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
4283
4284                group = ext4_get_group_number(sb, pa->pa_pstart);
4285                if (ext4_mb_load_buddy(sb, group, &e4b)) {
4286                        ext4_error(sb, "Error loading buddy information for %u",
4287                                        group);
4288                        continue;
4289                }
4290                ext4_lock_group(sb, group);
4291                list_del(&pa->pa_group_list);
4292                ext4_mb_release_group_pa(&e4b, pa);
4293                ext4_unlock_group(sb, group);
4294
4295                ext4_mb_unload_buddy(&e4b);
4296                list_del(&pa->u.pa_tmp_list);
4297                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4298        }
4299}
4300
4301/*
4302 * We have incremented pa_count. So it cannot be freed at this
4303 * point. Also we hold lg_mutex. So no parallel allocation is
4304 * possible from this lg. That means pa_free cannot be updated.
4305 *
4306 * A parallel ext4_mb_discard_group_preallocations is possible.
4307 * which can cause the lg_prealloc_list to be updated.
4308 */
4309
4310static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4311{
4312        int order, added = 0, lg_prealloc_count = 1;
4313        struct super_block *sb = ac->ac_sb;
4314        struct ext4_locality_group *lg = ac->ac_lg;
4315        struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
4316
4317        order = fls(pa->pa_free) - 1;
4318        if (order > PREALLOC_TB_SIZE - 1)
4319                /* The max size of hash table is PREALLOC_TB_SIZE */
4320                order = PREALLOC_TB_SIZE - 1;
4321        /* Add the prealloc space to lg */
4322        spin_lock(&lg->lg_prealloc_lock);
4323        list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
4324                                                pa_inode_list) {
4325                spin_lock(&tmp_pa->pa_lock);
4326                if (tmp_pa->pa_deleted) {
4327                        spin_unlock(&tmp_pa->pa_lock);
4328                        continue;
4329                }
4330                if (!added && pa->pa_free < tmp_pa->pa_free) {
4331                        /* Add to the tail of the previous entry */
4332                        list_add_tail_rcu(&pa->pa_inode_list,
4333                                                &tmp_pa->pa_inode_list);
4334                        added = 1;
4335                        /*
4336                         * we want to count the total
4337                         * number of entries in the list
4338                         */
4339                }
4340                spin_unlock(&tmp_pa->pa_lock);
4341                lg_prealloc_count++;
4342        }
4343        if (!added)
4344                list_add_tail_rcu(&pa->pa_inode_list,
4345                                        &lg->lg_prealloc_list[order]);
4346        spin_unlock(&lg->lg_prealloc_lock);
4347
4348        /* Now trim the list to be not more than 8 elements */
4349        if (lg_prealloc_count > 8) {
4350                ext4_mb_discard_lg_preallocations(sb, lg,
4351                                                  order, lg_prealloc_count);
4352                return;
4353        }
4354        return ;
4355}
4356
4357/*
4358 * release all resource we used in allocation
4359 */
4360static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4361{
4362        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4363        struct ext4_prealloc_space *pa = ac->ac_pa;
4364        if (pa) {
4365                if (pa->pa_type == MB_GROUP_PA) {
4366                        /* see comment in ext4_mb_use_group_pa() */
4367                        spin_lock(&pa->pa_lock);
4368                        pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4369                        pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4370                        pa->pa_free -= ac->ac_b_ex.fe_len;
4371                        pa->pa_len -= ac->ac_b_ex.fe_len;
4372                        spin_unlock(&pa->pa_lock);
4373                }
4374        }
4375        if (pa) {
4376                /*
4377                 * We want to add the pa to the right bucket.
4378                 * Remove it from the list and while adding
4379                 * make sure the list to which we are adding
4380                 * doesn't grow big.
4381                 */
4382                if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4383                        spin_lock(pa->pa_obj_lock);
4384                        list_del_rcu(&pa->pa_inode_list);
4385                        spin_unlock(pa->pa_obj_lock);
4386                        ext4_mb_add_n_trim(ac);
4387                }
4388                ext4_mb_put_pa(ac, ac->ac_sb, pa);
4389        }
4390        if (ac->ac_bitmap_page)
4391                page_cache_release(ac->ac_bitmap_page);
4392        if (ac->ac_buddy_page)
4393                page_cache_release(ac->ac_buddy_page);
4394        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4395                mutex_unlock(&ac->ac_lg->lg_mutex);
4396        ext4_mb_collect_stats(ac);
4397        return 0;
4398}
4399
4400static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4401{
4402        ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4403        int ret;
4404        int freed = 0;
4405
4406        trace_ext4_mb_discard_preallocations(sb, needed);
4407        for (i = 0; i < ngroups && needed > 0; i++) {
4408                ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4409                freed += ret;
4410                needed -= ret;
4411        }
4412
4413        return freed;
4414}
4415
4416/*
4417 * Main entry point into mballoc to allocate blocks
4418 * it tries to use preallocation first, then falls back
4419 * to usual allocation
4420 */
4421ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4422                                struct ext4_allocation_request *ar, int *errp)
4423{
4424        int freed;
4425        struct ext4_allocation_context *ac = NULL;
4426        struct ext4_sb_info *sbi;
4427        struct super_block *sb;
4428        ext4_fsblk_t block = 0;
4429        unsigned int inquota = 0;
4430        unsigned int reserv_clstrs = 0;
4431
4432        might_sleep();
4433        sb = ar->inode->i_sb;
4434        sbi = EXT4_SB(sb);
4435
4436        trace_ext4_request_blocks(ar);
4437
4438        /* Allow to use superuser reservation for quota file */
4439        if (IS_NOQUOTA(ar->inode))
4440                ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
4441
4442        if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
4443                /* Without delayed allocation we need to verify
4444                 * there is enough free blocks to do block allocation
4445                 * and verify allocation doesn't exceed the quota limits.
4446                 */
4447                while (ar->len &&
4448                        ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
4449
4450                        /* let others to free the space */
4451                        cond_resched();
4452                        ar->len = ar->len >> 1;
4453                }
4454                if (!ar->len) {
4455                        *errp = -ENOSPC;
4456                        return 0;
4457                }
4458                reserv_clstrs = ar->len;
4459                if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4460                        dquot_alloc_block_nofail(ar->inode,
4461                                                 EXT4_C2B(sbi, ar->len));
4462                } else {
4463                        while (ar->len &&
4464                                dquot_alloc_block(ar->inode,
4465                                                  EXT4_C2B(sbi, ar->len))) {
4466
4467                                ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4468                                ar->len--;
4469                        }
4470                }
4471                inquota = ar->len;
4472                if (ar->len == 0) {
4473                        *errp = -EDQUOT;
4474                        goto out;
4475                }
4476        }
4477
4478        ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
4479        if (!ac) {
4480                ar->len = 0;
4481                *errp = -ENOMEM;
4482                goto out;
4483        }
4484
4485        *errp = ext4_mb_initialize_context(ac, ar);
4486        if (*errp) {
4487                ar->len = 0;
4488                goto out;
4489        }
4490
4491        ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
4492        if (!ext4_mb_use_preallocated(ac)) {
4493                ac->ac_op = EXT4_MB_HISTORY_ALLOC;
4494                ext4_mb_normalize_request(ac, ar);
4495repeat:
4496                /* allocate space in core */
4497                *errp = ext4_mb_regular_allocator(ac);
4498                if (*errp)
4499                        goto discard_and_exit;
4500
4501                /* as we've just preallocated more space than
4502                 * user requested originally, we store allocated
4503                 * space in a special descriptor */
4504                if (ac->ac_status == AC_STATUS_FOUND &&
4505                    ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4506                        *errp = ext4_mb_new_preallocation(ac);
4507                if (*errp) {
4508                discard_and_exit:
4509                        ext4_discard_allocated_blocks(ac);
4510                        goto errout;
4511                }
4512        }
4513        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4514                *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
4515                if (*errp) {
4516                        ext4_discard_allocated_blocks(ac);
4517                        goto errout;
4518                } else {
4519                        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4520                        ar->len = ac->ac_b_ex.fe_len;
4521                }
4522        } else {
4523                freed  = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
4524                if (freed)
4525                        goto repeat;
4526                *errp = -ENOSPC;
4527        }
4528
4529errout:
4530        if (*errp) {
4531                ac->ac_b_ex.fe_len = 0;
4532                ar->len = 0;
4533                ext4_mb_show_ac(ac);
4534        }
4535        ext4_mb_release_context(ac);
4536out:
4537        if (ac)
4538                kmem_cache_free(ext4_ac_cachep, ac);
4539        if (inquota && ar->len < inquota)
4540                dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
4541        if (!ar->len) {
4542                if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
4543                        /* release all the reserved blocks if non delalloc */
4544                        percpu_counter_sub(&sbi->s_dirtyclusters_counter,
4545                                                reserv_clstrs);
4546        }
4547
4548        trace_ext4_allocate_blocks(ar, (unsigned long long)block);
4549
4550        return block;
4551}
4552
4553/*
4554 * We can merge two free data extents only if the physical blocks
4555 * are contiguous, AND the extents were freed by the same transaction,
4556 * AND the blocks are associated with the same group.
4557 */
4558static int can_merge(struct ext4_free_data *entry1,
4559                        struct ext4_free_data *entry2)
4560{
4561        if ((entry1->efd_tid == entry2->efd_tid) &&
4562            (entry1->efd_group == entry2->efd_group) &&
4563            ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
4564                return 1;
4565        return 0;
4566}
4567
4568static noinline_for_stack int
4569ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4570                      struct ext4_free_data *new_entry)
4571{
4572        ext4_group_t group = e4b->bd_group;
4573        ext4_grpblk_t cluster;
4574        struct ext4_free_data *entry;
4575        struct ext4_group_info *db = e4b->bd_info;
4576        struct super_block *sb = e4b->bd_sb;
4577        struct ext4_sb_info *sbi = EXT4_SB(sb);
4578        struct rb_node **n = &db->bb_free_root.rb_node, *node;
4579        struct rb_node *parent = NULL, *new_node;
4580
4581        BUG_ON(!ext4_handle_valid(handle));
4582        BUG_ON(e4b->bd_bitmap_page == NULL);
4583        BUG_ON(e4b->bd_buddy_page == NULL);
4584
4585        new_node = &new_entry->efd_node;
4586        cluster = new_entry->efd_start_cluster;
4587
4588        if (!*n) {
4589                /* first free block exent. We need to
4590                   protect buddy cache from being freed,
4591                 * otherwise we'll refresh it from
4592                 * on-disk bitmap and lose not-yet-available
4593                 * blocks */
4594                page_cache_get(e4b->bd_buddy_page);
4595                page_cache_get(e4b->bd_bitmap_page);
4596        }
4597        while (*n) {
4598                parent = *n;
4599                entry = rb_entry(parent, struct ext4_free_data, efd_node);
4600                if (cluster < entry->efd_start_cluster)
4601                        n = &(*n)->rb_left;
4602                else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
4603                        n = &(*n)->rb_right;
4604                else {
4605                        ext4_grp_locked_error(sb, group, 0,
4606                                ext4_group_first_block_no(sb, group) +
4607                                EXT4_C2B(sbi, cluster),
4608                                "Block already on to-be-freed list");
4609                        return 0;
4610                }
4611        }
4612
4613        rb_link_node(new_node, parent, n);
4614        rb_insert_color(new_node, &db->bb_free_root);
4615
4616        /* Now try to see the extent can be merged to left and right */
4617        node = rb_prev(new_node);
4618        if (node) {
4619                entry = rb_entry(node, struct ext4_free_data, efd_node);
4620                if (can_merge(entry, new_entry) &&
4621                    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4622                        new_entry->efd_start_cluster = entry->efd_start_cluster;
4623                        new_entry->efd_count += entry->efd_count;
4624                        rb_erase(node, &(db->bb_free_root));
4625                        kmem_cache_free(ext4_free_data_cachep, entry);
4626                }
4627        }
4628
4629        node = rb_next(new_node);
4630        if (node) {
4631                entry = rb_entry(node, struct ext4_free_data, efd_node);
4632                if (can_merge(new_entry, entry) &&
4633                    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4634                        new_entry->efd_count += entry->efd_count;
4635                        rb_erase(node, &(db->bb_free_root));
4636                        kmem_cache_free(ext4_free_data_cachep, entry);
4637                }
4638        }
4639        /* Add the extent to transaction's private list */
4640        ext4_journal_callback_add(handle, ext4_free_data_callback,
4641                                  &new_entry->efd_jce);
4642        return 0;
4643}
4644
4645/**
4646 * ext4_free_blocks() -- Free given blocks and update quota
4647 * @handle:             handle for this transaction
4648 * @inode:              inode
4649 * @block:              start physical block to free
4650 * @count:              number of blocks to count
4651 * @flags:              flags used by ext4_free_blocks
4652 */
4653void ext4_free_blocks(handle_t *handle, struct inode *inode,
4654                      struct buffer_head *bh, ext4_fsblk_t block,
4655                      unsigned long count, int flags)
4656{
4657        struct buffer_head *bitmap_bh = NULL;
4658        struct super_block *sb = inode->i_sb;
4659        struct ext4_group_desc *gdp;
4660        unsigned int overflow;
4661        ext4_grpblk_t bit;
4662        struct buffer_head *gd_bh;
4663        ext4_group_t block_group;
4664        struct ext4_sb_info *sbi;
4665        struct ext4_buddy e4b;
4666        unsigned int count_clusters;
4667        int err = 0;
4668        int ret;
4669
4670        might_sleep();
4671        if (bh) {
4672                if (block)
4673                        BUG_ON(block != bh->b_blocknr);
4674                else
4675                        block = bh->b_blocknr;
4676        }
4677
4678        sbi = EXT4_SB(sb);
4679        if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4680            !ext4_data_block_valid(sbi, block, count)) {
4681                ext4_error(sb, "Freeing blocks not in datazone - "
4682                           "block = %llu, count = %lu", block, count);
4683                goto error_return;
4684        }
4685
4686        ext4_debug("freeing block %llu\n", block);
4687        trace_ext4_free_blocks(inode, block, count, flags);
4688
4689        if (flags & EXT4_FREE_BLOCKS_FORGET) {
4690                struct buffer_head *tbh = bh;
4691                int i;
4692
4693                BUG_ON(bh && (count > 1));
4694
4695                for (i = 0; i < count; i++) {
4696                        cond_resched();
4697                        if (!bh)
4698                                tbh = sb_find_get_block(inode->i_sb,
4699                                                        block + i);
4700                        if (!tbh)
4701                                continue;
4702                        ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4703                                    inode, tbh, block + i);
4704                }
4705        }
4706
4707        /*
4708         * We need to make sure we don't reuse the freed block until
4709         * after the transaction is committed, which we can do by
4710         * treating the block as metadata, below.  We make an
4711         * exception if the inode is to be written in writeback mode
4712         * since writeback mode has weak data consistency guarantees.
4713         */
4714        if (!ext4_should_writeback_data(inode))
4715                flags |= EXT4_FREE_BLOCKS_METADATA;
4716
4717        /*
4718         * If the extent to be freed does not begin on a cluster
4719         * boundary, we need to deal with partial clusters at the
4720         * beginning and end of the extent.  Normally we will free
4721         * blocks at the beginning or the end unless we are explicitly
4722         * requested to avoid doing so.
4723         */
4724        overflow = EXT4_PBLK_COFF(sbi, block);
4725        if (overflow) {
4726                if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
4727                        overflow = sbi->s_cluster_ratio - overflow;
4728                        block += overflow;
4729                        if (count > overflow)
4730                                count -= overflow;
4731                        else
4732                                return;
4733                } else {
4734                        block -= overflow;
4735                        count += overflow;
4736                }
4737        }
4738        overflow = EXT4_LBLK_COFF(sbi, count);
4739        if (overflow) {
4740                if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
4741                        if (count > overflow)
4742                                count -= overflow;
4743                        else
4744                                return;
4745                } else
4746                        count += sbi->s_cluster_ratio - overflow;
4747        }
4748
4749do_more:
4750        overflow = 0;
4751        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4752
4753        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
4754                        ext4_get_group_info(sb, block_group))))
4755                return;
4756
4757        /*
4758         * Check to see if we are freeing blocks across a group
4759         * boundary.
4760         */
4761        if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4762                overflow = EXT4_C2B(sbi, bit) + count -
4763                        EXT4_BLOCKS_PER_GROUP(sb);
4764                count -= overflow;
4765        }
4766        count_clusters = EXT4_NUM_B2C(sbi, count);
4767        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4768        if (!bitmap_bh) {
4769                err = -EIO;
4770                goto error_return;
4771        }
4772        gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
4773        if (!gdp) {
4774                err = -EIO;
4775                goto error_return;
4776        }
4777
4778        if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4779            in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4780            in_range(block, ext4_inode_table(sb, gdp),
4781                     EXT4_SB(sb)->s_itb_per_group) ||
4782            in_range(block + count - 1, ext4_inode_table(sb, gdp),
4783                     EXT4_SB(sb)->s_itb_per_group)) {
4784
4785                ext4_error(sb, "Freeing blocks in system zone - "
4786                           "Block = %llu, count = %lu", block, count);
4787                /* err = 0. ext4_std_error should be a no op */
4788                goto error_return;
4789        }
4790
4791        BUFFER_TRACE(bitmap_bh, "getting write access");
4792        err = ext4_journal_get_write_access(handle, bitmap_bh);
4793        if (err)
4794                goto error_return;
4795
4796        /*
4797         * We are about to modify some metadata.  Call the journal APIs
4798         * to unshare ->b_data if a currently-committing transaction is
4799         * using it
4800         */
4801        BUFFER_TRACE(gd_bh, "get_write_access");
4802        err = ext4_journal_get_write_access(handle, gd_bh);
4803        if (err)
4804                goto error_return;
4805#ifdef AGGRESSIVE_CHECK
4806        {
4807                int i;
4808                for (i = 0; i < count_clusters; i++)
4809                        BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4810        }
4811#endif
4812        trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
4813
4814        err = ext4_mb_load_buddy(sb, block_group, &e4b);
4815        if (err)
4816                goto error_return;
4817
4818        if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
4819                struct ext4_free_data *new_entry;
4820                /*
4821                 * blocks being freed are metadata. these blocks shouldn't
4822                 * be used until this transaction is committed
4823                 *
4824                 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
4825                 * to fail.
4826                 */
4827                new_entry = kmem_cache_alloc(ext4_free_data_cachep,
4828                                GFP_NOFS|__GFP_NOFAIL);
4829                new_entry->efd_start_cluster = bit;
4830                new_entry->efd_group = block_group;
4831                new_entry->efd_count = count_clusters;
4832                new_entry->efd_tid = handle->h_transaction->t_tid;
4833
4834                ext4_lock_group(sb, block_group);
4835                mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4836                ext4_mb_free_metadata(handle, &e4b, new_entry);
4837        } else {
4838                /* need to update group_info->bb_free and bitmap
4839                 * with group lock held. generate_buddy look at
4840                 * them with group lock_held
4841                 */
4842                if (test_opt(sb, DISCARD)) {
4843                        err = ext4_issue_discard(sb, block_group, bit, count);
4844                        if (err && err != -EOPNOTSUPP)
4845                                ext4_msg(sb, KERN_WARNING, "discard request in"
4846                                         " group:%d block:%d count:%lu failed"
4847                                         " with %d", block_group, bit, count,
4848                                         err);
4849                } else
4850                        EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
4851
4852                ext4_lock_group(sb, block_group);
4853                mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4854                mb_free_blocks(inode, &e4b, bit, count_clusters);
4855        }
4856
4857        ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
4858        ext4_free_group_clusters_set(sb, gdp, ret);
4859        ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
4860        ext4_group_desc_csum_set(sb, block_group, gdp);
4861        ext4_unlock_group(sb, block_group);
4862
4863        if (sbi->s_log_groups_per_flex) {
4864                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4865                atomic64_add(count_clusters,
4866                             &sbi->s_flex_groups[flex_group].free_clusters);
4867        }
4868
4869        if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4870                dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4871        percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4872
4873        ext4_mb_unload_buddy(&e4b);
4874
4875        /* We dirtied the bitmap block */
4876        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4877        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4878
4879        /* And the group descriptor block */
4880        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4881        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4882        if (!err)
4883                err = ret;
4884
4885        if (overflow && !err) {
4886                block += count;
4887                count = overflow;
4888                put_bh(bitmap_bh);
4889                goto do_more;
4890        }
4891error_return:
4892        brelse(bitmap_bh);
4893        ext4_std_error(sb, err);
4894        return;
4895}
4896
4897/**
4898 * ext4_group_add_blocks() -- Add given blocks to an existing group
4899 * @handle:                     handle to this transaction
4900 * @sb:                         super block
4901 * @block:                      start physical block to add to the block group
4902 * @count:                      number of blocks to free
4903 *
4904 * This marks the blocks as free in the bitmap and buddy.
4905 */
4906int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4907                         ext4_fsblk_t block, unsigned long count)
4908{
4909        struct buffer_head *bitmap_bh = NULL;
4910        struct buffer_head *gd_bh;
4911        ext4_group_t block_group;
4912        ext4_grpblk_t bit;
4913        unsigned int i;
4914        struct ext4_group_desc *desc;
4915        struct ext4_sb_info *sbi = EXT4_SB(sb);
4916        struct ext4_buddy e4b;
4917        int err = 0, ret, blk_free_count;
4918        ext4_grpblk_t blocks_freed;
4919
4920        ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4921
4922        if (count == 0)
4923                return 0;
4924
4925        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4926        /*
4927         * Check to see if we are freeing blocks across a group
4928         * boundary.
4929         */
4930        if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4931                ext4_warning(sb, "too much blocks added to group %u\n",
4932                             block_group);
4933                err = -EINVAL;
4934                goto error_return;
4935        }
4936
4937        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4938        if (!bitmap_bh) {
4939                err = -EIO;
4940                goto error_return;
4941        }
4942
4943        desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4944        if (!desc) {
4945                err = -EIO;
4946                goto error_return;
4947        }
4948
4949        if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4950            in_range(ext4_inode_bitmap(sb, desc), block, count) ||
4951            in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
4952            in_range(block + count - 1, ext4_inode_table(sb, desc),
4953                     sbi->s_itb_per_group)) {
4954                ext4_error(sb, "Adding blocks in system zones - "
4955                           "Block = %llu, count = %lu",
4956                           block, count);
4957                err = -EINVAL;
4958                goto error_return;
4959        }
4960
4961        BUFFER_TRACE(bitmap_bh, "getting write access");
4962        err = ext4_journal_get_write_access(handle, bitmap_bh);
4963        if (err)
4964                goto error_return;
4965
4966        /*
4967         * We are about to modify some metadata.  Call the journal APIs
4968         * to unshare ->b_data if a currently-committing transaction is
4969         * using it
4970         */
4971        BUFFER_TRACE(gd_bh, "get_write_access");
4972        err = ext4_journal_get_write_access(handle, gd_bh);
4973        if (err)
4974                goto error_return;
4975
4976        for (i = 0, blocks_freed = 0; i < count; i++) {
4977                BUFFER_TRACE(bitmap_bh, "clear bit");
4978                if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
4979                        ext4_error(sb, "bit already cleared for block %llu",
4980                                   (ext4_fsblk_t)(block + i));
4981                        BUFFER_TRACE(bitmap_bh, "bit already cleared");
4982                } else {
4983                        blocks_freed++;
4984                }
4985        }
4986
4987        err = ext4_mb_load_buddy(sb, block_group, &e4b);
4988        if (err)
4989                goto error_return;
4990
4991        /*
4992         * need to update group_info->bb_free and bitmap
4993         * with group lock held. generate_buddy look at
4994         * them with group lock_held
4995         */
4996        ext4_lock_group(sb, block_group);
4997        mb_clear_bits(bitmap_bh->b_data, bit, count);
4998        mb_free_blocks(NULL, &e4b, bit, count);
4999        blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
5000        ext4_free_group_clusters_set(sb, desc, blk_free_count);
5001        ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
5002        ext4_group_desc_csum_set(sb, block_group, desc);
5003        ext4_unlock_group(sb, block_group);
5004        percpu_counter_add(&sbi->s_freeclusters_counter,
5005                           EXT4_NUM_B2C(sbi, blocks_freed));
5006
5007        if (sbi->s_log_groups_per_flex) {
5008                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
5009                atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
5010                             &sbi->s_flex_groups[flex_group].free_clusters);
5011        }
5012
5013        ext4_mb_unload_buddy(&e4b);
5014
5015        /* We dirtied the bitmap block */
5016        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
5017        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
5018
5019        /* And the group descriptor block */
5020        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
5021        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
5022        if (!err)
5023                err = ret;
5024
5025error_return:
5026        brelse(bitmap_bh);
5027        ext4_std_error(sb, err);
5028        return err;
5029}
5030
5031/**
5032 * ext4_trim_extent -- function to TRIM one single free extent in the group
5033 * @sb:         super block for the file system
5034 * @start:      starting block of the free extent in the alloc. group
5035 * @count:      number of blocks to TRIM
5036 * @group:      alloc. group we are working with
5037 * @e4b:        ext4 buddy for the group
5038 *
5039 * Trim "count" blocks starting at "start" in the "group". To assure that no
5040 * one will allocate those blocks, mark it as used in buddy bitmap. This must
5041 * be called with under the group lock.
5042 */
5043static int ext4_trim_extent(struct super_block *sb, int start, int count,
5044                             ext4_group_t group, struct ext4_buddy *e4b)
5045__releases(bitlock)
5046__acquires(bitlock)
5047{
5048        struct ext4_free_extent ex;
5049        int ret = 0;
5050
5051        trace_ext4_trim_extent(sb, group, start, count);
5052
5053        assert_spin_locked(ext4_group_lock_ptr(sb, group));
5054
5055        ex.fe_start = start;
5056        ex.fe_group = group;
5057        ex.fe_len = count;
5058
5059        /*
5060         * Mark blocks used, so no one can reuse them while
5061         * being trimmed.
5062         */
5063        mb_mark_used(e4b, &ex);
5064        ext4_unlock_group(sb, group);
5065        ret = ext4_issue_discard(sb, group, start, count);
5066        ext4_lock_group(sb, group);
5067        mb_free_blocks(NULL, e4b, start, ex.fe_len);
5068        return ret;
5069}
5070
5071/**
5072 * ext4_trim_all_free -- function to trim all free space in alloc. group
5073 * @sb:                 super block for file system
5074 * @group:              group to be trimmed
5075 * @start:              first group block to examine
5076 * @max:                last group block to examine
5077 * @minblocks:          minimum extent block count
5078 *
5079 * ext4_trim_all_free walks through group's buddy bitmap searching for free
5080 * extents. When the free block is found, ext4_trim_extent is called to TRIM
5081 * the extent.
5082 *
5083 *
5084 * ext4_trim_all_free walks through group's block bitmap searching for free
5085 * extents. When the free extent is found, mark it as used in group buddy
5086 * bitmap. Then issue a TRIM command on this extent and free the extent in
5087 * the group buddy bitmap. This is done until whole group is scanned.
5088 */
5089static ext4_grpblk_t
5090ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
5091                   ext4_grpblk_t start, ext4_grpblk_t max,
5092                   ext4_grpblk_t minblocks)
5093{
5094        void *bitmap;
5095        ext4_grpblk_t next, count = 0, free_count = 0;
5096        struct ext4_buddy e4b;
5097        int ret = 0;
5098
5099        trace_ext4_trim_all_free(sb, group, start, max);
5100
5101        ret = ext4_mb_load_buddy(sb, group, &e4b);
5102        if (ret) {
5103                ext4_error(sb, "Error in loading buddy "
5104                                "information for %u", group);
5105                return ret;
5106        }
5107        bitmap = e4b.bd_bitmap;
5108
5109        ext4_lock_group(sb, group);
5110        if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
5111            minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
5112                goto out;
5113
5114        start = (e4b.bd_info->bb_first_free > start) ?
5115                e4b.bd_info->bb_first_free : start;
5116
5117        while (start <= max) {
5118                start = mb_find_next_zero_bit(bitmap, max + 1, start);
5119                if (start > max)
5120                        break;
5121                next = mb_find_next_bit(bitmap, max + 1, start);
5122
5123                if ((next - start) >= minblocks) {
5124                        ret = ext4_trim_extent(sb, start,
5125                                               next - start, group, &e4b);
5126                        if (ret && ret != -EOPNOTSUPP)
5127                                break;
5128                        ret = 0;
5129                        count += next - start;
5130                }
5131                free_count += next - start;
5132                start = next + 1;
5133
5134                if (fatal_signal_pending(current)) {
5135                        count = -ERESTARTSYS;
5136                        break;
5137                }
5138
5139                if (need_resched()) {
5140                        ext4_unlock_group(sb, group);
5141                        cond_resched();
5142                        ext4_lock_group(sb, group);
5143                }
5144
5145                if ((e4b.bd_info->bb_free - free_count) < minblocks)
5146                        break;
5147        }
5148
5149        if (!ret) {
5150                ret = count;
5151                EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
5152        }
5153out:
5154        ext4_unlock_group(sb, group);
5155        ext4_mb_unload_buddy(&e4b);
5156
5157        ext4_debug("trimmed %d blocks in the group %d\n",
5158                count, group);
5159
5160        return ret;
5161}
5162
5163/**
5164 * ext4_trim_fs() -- trim ioctl handle function
5165 * @sb:                 superblock for filesystem
5166 * @range:              fstrim_range structure
5167 *
5168 * start:       First Byte to trim
5169 * len:         number of Bytes to trim from start
5170 * minlen:      minimum extent length in Bytes
5171 * ext4_trim_fs goes through all allocation groups containing Bytes from
5172 * start to start+len. For each such a group ext4_trim_all_free function
5173 * is invoked to trim all free space.
5174 */
5175int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5176{
5177        struct ext4_group_info *grp;
5178        ext4_group_t group, first_group, last_group;
5179        ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
5180        uint64_t start, end, minlen, trimmed = 0;
5181        ext4_fsblk_t first_data_blk =
5182                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
5183        ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
5184        int ret = 0;
5185
5186        start = range->start >> sb->s_blocksize_bits;
5187        end = start + (range->len >> sb->s_blocksize_bits) - 1;
5188        minlen = EXT4_NUM_B2C(EXT4_SB(sb),
5189                              range->minlen >> sb->s_blocksize_bits);
5190
5191        if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
5192            start >= max_blks ||
5193            range->len < sb->s_blocksize)
5194                return -EINVAL;
5195        if (end >= max_blks)
5196                end = max_blks - 1;
5197        if (end <= first_data_blk)
5198                goto out;
5199        if (start < first_data_blk)
5200                start = first_data_blk;
5201
5202        /* Determine first and last group to examine based on start and end */
5203        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
5204                                     &first_group, &first_cluster);
5205        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
5206                                     &last_group, &last_cluster);
5207
5208        /* end now represents the last cluster to discard in this group */
5209        end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
5210
5211        for (group = first_group; group <= last_group; group++) {
5212                grp = ext4_get_group_info(sb, group);
5213                /* We only do this if the grp has never been initialized */
5214                if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
5215                        ret = ext4_mb_init_group(sb, group);
5216                        if (ret)
5217                                break;
5218                }
5219
5220                /*
5221                 * For all the groups except the last one, last cluster will
5222                 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
5223                 * change it for the last group, note that last_cluster is
5224                 * already computed earlier by ext4_get_group_no_and_offset()
5225                 */
5226                if (group == last_group)
5227                        end = last_cluster;
5228
5229                if (grp->bb_free >= minlen) {
5230                        cnt = ext4_trim_all_free(sb, group, first_cluster,
5231                                                end, minlen);
5232                        if (cnt < 0) {
5233                                ret = cnt;
5234                                break;
5235                        }
5236                        trimmed += cnt;
5237                }
5238
5239                /*
5240                 * For every group except the first one, we are sure
5241                 * that the first cluster to discard will be cluster #0.
5242                 */
5243                first_cluster = 0;
5244        }
5245
5246        if (!ret)
5247                atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5248
5249out:
5250        range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
5251        return ret;
5252}
5253