linux/fs/ext4/mballoc.c
   1/*
   2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
   3 * Written by Alex Tomas <alex@clusterfs.com>
   4 *
   5 * This program is free software; you can redistribute it and/or modify
   6 * it under the terms of the GNU General Public License version 2 as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it will be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
   14 * You should have received a copy of the GNU General Public License
   15 * along with this program; if not, write to the Free Software
   16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA.
  17 */
  18
  19
  20/*
   21 * mballoc.c contains the multiblock allocation routines
  22 */
  23
  24#include "mballoc.h"
  25#include <linux/debugfs.h>
  26#include <linux/slab.h>
  27#include <trace/events/ext4.h>
  28
  29/*
  30 * MUSTDO:
  31 *   - test ext4_ext_search_left() and ext4_ext_search_right()
  32 *   - search for metadata in few groups
  33 *
  34 * TODO v4:
  35 *   - normalization should take into account whether file is still open
  36 *   - discard preallocations if no free space left (policy?)
  37 *   - don't normalize tails
  38 *   - quota
  39 *   - reservation for superuser
  40 *
  41 * TODO v3:
  42 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
  43 *   - track min/max extents in each group for better group selection
  44 *   - mb_mark_used() may allocate chunk right after splitting buddy
  45 *   - tree of groups sorted by number of free blocks
  46 *   - error handling
  47 */
  48
  49/*
   50 * An allocation request involves a request for multiple blocks near
   51 * to the specified goal block.
   52 *
   53 * During the initialization phase of the allocator we decide to use
   54 * group preallocation or inode preallocation depending on the size of
   55 * the file. The size of the file could be the resulting file size we
   56 * would have after allocation, or the current file size, whichever
   57 * is larger. If the size is less than sbi->s_mb_stream_request we
   58 * select group preallocation. The default value of
   59 * s_mb_stream_request is 16 blocks. This can also be tuned via
   60 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
   61 * terms of number of blocks.
  62 *
  63 * The main motivation for having small file use group preallocation is to
  64 * ensure that we have small files closer together on the disk.
  65 *
  66 * First stage the allocator looks at the inode prealloc list,
  67 * ext4_inode_info->i_prealloc_list, which contains list of prealloc
  68 * spaces for this particular inode. The inode prealloc space is
  69 * represented as:
  70 *
  71 * pa_lstart -> the logical start block for this prealloc space
  72 * pa_pstart -> the physical start block for this prealloc space
  73 * pa_len    -> length for this prealloc space
  74 * pa_free   ->  free space available in this prealloc space
  75 *
   76 * The inode preallocation space is used based on the _logical_ start
   77 * block. Only if the logical file block falls within the range of the
   78 * prealloc space do we consume that particular prealloc space. This
   79 * makes sure that we have contiguous physical blocks representing the
   80 * file blocks.
   81 *
   82 * The important thing to note for inode prealloc space is that we
   83 * don't modify the values associated with it except pa_free.
   84 *
   85 * If we are not able to find blocks in the inode prealloc space and if we
   86 * have the group allocation flag set then we look at the locality group
   87 * prealloc space. This is a per-CPU prealloc list represented as
  88 *
  89 * ext4_sb_info.s_locality_groups[smp_processor_id()]
  90 *
  91 * The reason for having a per cpu locality group is to reduce the contention
  92 * between CPUs. It is possible to get scheduled at this point.
  93 *
   94 * The locality group prealloc space is used based on whether we have
   95 * enough free space (pa_free) within the prealloc space.
   96 *
   97 * If we can't allocate blocks via inode prealloc and/or locality group
   98 * prealloc then we look at the buddy cache. The buddy cache is represented
   99 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
  100 * mapped to the buddy and bitmap information regarding different
  101 * groups. The buddy information is attached to the buddy cache inode so
  102 * that we can access it through the page cache. The information regarding
  103 * each group is loaded via ext4_mb_load_buddy.  The information
  104 * involves the block bitmap and buddy information, and is stored in
  105 * the inode as:
 106 *
 107 *  {                        page                        }
 108 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 109 *
 110 *
 111 * one block each for bitmap and buddy information.  So for each group we
 112 * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
  113 * blocksize) blocks.  So a page can hold information for groups_per_page
  114 * groups, which is blocks_per_page/2.
 115 *
 116 * The buddy cache inode is not stored on disk. The inode is thrown
 117 * away when the filesystem is unmounted.
 118 *
 119 * We look for count number of blocks in the buddy cache. If we were able
 120 * to locate that many free blocks we return with additional information
  121 * regarding the rest of the contiguous physical blocks available.
 122 *
  123 * Before allocating blocks via the buddy cache we normalize the request
  124 * blocks. This ensures we ask for more blocks than we need. The extra
  125 * blocks that we get after allocation are added to the respective prealloc
  126 * list. In case of inode preallocation we follow a list of heuristics
  127 * based on file size. This can be found in ext4_mb_normalize_request. If
  128 * we are doing a group prealloc we try to normalize the request to
  129 * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is
  130 * 512 blocks. This can be tuned via
  131 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
  132 * terms of number of blocks. If we have mounted the file system with the
  133 * -o stripe=<value> option, the group prealloc request is normalized to
  134 * the stripe value (sbi->s_stripe).
 135 *
  136 * The regular allocator (using the buddy cache) supports a few tunables.
 137 *
 138 * /sys/fs/ext4/<partition>/mb_min_to_scan
 139 * /sys/fs/ext4/<partition>/mb_max_to_scan
 140 * /sys/fs/ext4/<partition>/mb_order2_req
 141 *
  142 * The regular allocator uses buddy scan only if the request len is a power
  143 * of 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
  144 * value of s_mb_order2_reqs can be tuned via
  145 * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
  146 * the stripe size (sbi->s_stripe), we try to search for contiguous blocks
  147 * in stripe-size units. This should result in better allocation on RAID
  148 * setups. If not, we search in the specific group using the bitmap for
  149 * best extents. The tunables min_to_scan and max_to_scan control the
  150 * behaviour here. min_to_scan indicates how long mballoc __must__ look for
  151 * a best extent and max_to_scan indicates how long mballoc __can__ look
  152 * for a best extent among the found extents. Searching for the blocks
  153 * starts with the group specified as the goal value in the allocation
  154 * context via ac_g_ex. Each group is first checked against the criteria
  155 * for whether it can be used for allocation. ext4_mb_good_group explains
  156 * how the groups are checked.
 157 *
  158 * Both prealloc spaces are populated as described above. So for the first
  159 * request we will hit the buddy cache, which will result in this prealloc
  160 * space getting filled. The prealloc space is then later used for
  161 * subsequent requests.
 162 */
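/*
 * Illustrative sketch (not part of the original file): the small-file
 * heuristic described above.  A request is steered towards the per-CPU
 * locality group preallocation when the larger of the current file size
 * and the size after allocation is below sbi->s_mb_stream_request
 * (16 blocks by default, tunable via
 * /sys/fs/ext4/<partition>/mb_stream_req); larger files use per-inode
 * preallocation.  The helper below is hypothetical.
 */
static inline int example_use_group_prealloc(loff_t isize_blocks,
					     loff_t alloc_end_blocks,
					     unsigned int mb_stream_request)
{
	loff_t size = isize_blocks > alloc_end_blocks ?
			isize_blocks : alloc_end_blocks;

	/* small files go to the locality group, large files to inode PAs */
	return size < mb_stream_request;
}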
 163
 164/*
 165 * mballoc operates on the following data:
 166 *  - on-disk bitmap
 167 *  - in-core buddy (actually includes buddy and bitmap)
 168 *  - preallocation descriptors (PAs)
 169 *
 170 * there are two types of preallocations:
 171 *  - inode
  172 *    assigned to a specific inode and can be used for this inode only.
  173 *    it describes a part of the inode's space preallocated to specific
  174 *    physical blocks. any block from that preallocation can be used
  175 *    independently. the descriptor just tracks the number of blocks left
  176 *    unused. so, before taking some block from the descriptor, one must
  177 *    make sure the corresponding logical block isn't allocated yet. this
  178 *    also means that freeing any block within the descriptor's range
  179 *    must discard all preallocated blocks.
 180 *  - locality group
  181 *    assigned to a specific locality group, which does not translate to a
  182 *    permanent set of inodes: an inode can join and leave a group. space
  183 *    from this type of preallocation can be used for any inode. thus
  184 *    it's consumed from the beginning to the end.
 185 *
 186 * relation between them can be expressed as:
 187 *    in-core buddy = on-disk bitmap + preallocation descriptors
 188 *
  189 * this means the blocks mballoc considers used are:
 190 *  - allocated blocks (persistent)
 191 *  - preallocated blocks (non-persistent)
 192 *
 193 * consistency in mballoc world means that at any time a block is either
 194 * free or used in ALL structures. notice: "any time" should not be read
 195 * literally -- time is discrete and delimited by locks.
 196 *
 197 *  to keep it simple, we don't use block numbers, instead we count number of
 198 *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
 199 *
 200 * all operations can be expressed as:
 201 *  - init buddy:                       buddy = on-disk + PAs
 202 *  - new PA:                           buddy += N; PA = N
 203 *  - use inode PA:                     on-disk += N; PA -= N
 204 *  - discard inode PA                  buddy -= on-disk - PA; PA = 0
 205 *  - use locality group PA             on-disk += N; PA -= N
 206 *  - discard locality group PA         buddy -= PA; PA = 0
 207 *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
 208 *        is used in real operation because we can't know actual used
 209 *        bits from PA, only from on-disk bitmap
 210 *
 211 * if we follow this strict logic, then all operations above should be atomic.
 212 * given some of them can block, we'd have to use something like semaphores
 213 * killing performance on high-end SMP hardware. let's try to relax it using
 214 * the following knowledge:
 215 *  1) if buddy is referenced, it's already initialized
 216 *  2) while block is used in buddy and the buddy is referenced,
 217 *     nobody can re-allocate that block
  218 *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
  219 *     a bit set and a PA claims the same block, it's OK. IOW, one can set a
  220 *     bit in the on-disk bitmap if the buddy has the same bit set and/or a
  221 *     PA covers the corresponding block
 222 *
 223 * so, now we're building a concurrency table:
 224 *  - init buddy vs.
 225 *    - new PA
 226 *      blocks for PA are allocated in the buddy, buddy must be referenced
 227 *      until PA is linked to allocation group to avoid concurrent buddy init
 228 *    - use inode PA
 229 *      we need to make sure that either on-disk bitmap or PA has uptodate data
 230 *      given (3) we care that PA-=N operation doesn't interfere with init
 231 *    - discard inode PA
 232 *      the simplest way would be to have buddy initialized by the discard
 233 *    - use locality group PA
 234 *      again PA-=N must be serialized with init
 235 *    - discard locality group PA
 236 *      the simplest way would be to have buddy initialized by the discard
 237 *  - new PA vs.
 238 *    - use inode PA
 239 *      i_data_sem serializes them
 240 *    - discard inode PA
 241 *      discard process must wait until PA isn't used by another process
 242 *    - use locality group PA
 243 *      some mutex should serialize them
 244 *    - discard locality group PA
 245 *      discard process must wait until PA isn't used by another process
 246 *  - use inode PA
 247 *    - use inode PA
  248 *      i_data_sem or another mutex should serialize them
 249 *    - discard inode PA
 250 *      discard process must wait until PA isn't used by another process
 251 *    - use locality group PA
 252 *      nothing wrong here -- they're different PAs covering different blocks
 253 *    - discard locality group PA
 254 *      discard process must wait until PA isn't used by another process
 255 *
  256 * now we're ready to draw a few conclusions:
  257 *  - while a PA is referenced, no discard is possible
  258 *  - a PA stays referenced until its blocks are marked in the on-disk bitmap
  259 *  - the PA changes only after the on-disk bitmap does
 260 *  - discard must not compete with init. either init is done before
 261 *    any discard or they're serialized somehow
 262 *  - buddy init as sum of on-disk bitmap and PAs is done atomically
 263 *
  264 * a special case is when we've used a PA down to empty. no need to modify
  265 * the buddy in this case, but we should care about concurrent init
 266 *
 267 */
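/*
 * Worked example of the accounting above (an illustration, counting used
 * blocks rather than block numbers): a fresh inode PA of 8 blocks marks
 * 8 blocks used in the buddy (buddy += 8, PA = 8).  Allocating 3 blocks
 * from it sets 3 bits in the on-disk bitmap and leaves PA = 5; the buddy
 * is untouched because those blocks were already accounted for when the
 * PA was created.  Discarding the PA then frees, in the buddy, the 5
 * blocks of the PA that were never set in the on-disk bitmap, and the PA
 * drops to 0, so buddy, on-disk bitmap and PAs agree again.
 */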
 268
  269/*
  270 * Logic in a few words:
 271 *
 272 *  - allocation:
 273 *    load group
 274 *    find blocks
 275 *    mark bits in on-disk bitmap
 276 *    release group
 277 *
 278 *  - use preallocation:
 279 *    find proper PA (per-inode or group)
 280 *    load group
 281 *    mark bits in on-disk bitmap
 282 *    release group
 283 *    release PA
 284 *
 285 *  - free:
 286 *    load group
 287 *    mark bits in on-disk bitmap
 288 *    release group
 289 *
 290 *  - discard preallocations in group:
 291 *    mark PAs deleted
 292 *    move them onto local list
 293 *    load on-disk bitmap
 294 *    load group
 295 *    remove PA from object (inode or locality group)
 296 *    mark free blocks in-core
 297 *
 298 *  - discard inode's preallocations:
 299 */
 300
 301/*
 302 * Locking rules
 303 *
 304 * Locks:
 305 *  - bitlock on a group        (group)
 306 *  - object (inode/locality)   (object)
 307 *  - per-pa lock               (pa)
 308 *
 309 * Paths:
 310 *  - new pa
 311 *    object
 312 *    group
 313 *
 314 *  - find and use pa:
 315 *    pa
 316 *
 317 *  - release consumed pa:
 318 *    pa
 319 *    group
 320 *    object
 321 *
 322 *  - generate in-core bitmap:
 323 *    group
 324 *        pa
 325 *
 326 *  - discard all for given object (inode, locality group):
 327 *    object
 328 *        pa
 329 *    group
 330 *
 331 *  - discard all for given group:
 332 *    group
 333 *        pa
 334 *    group
 335 *        object
 336 *
 337 */
 338static struct kmem_cache *ext4_pspace_cachep;
 339static struct kmem_cache *ext4_ac_cachep;
 340static struct kmem_cache *ext4_free_ext_cachep;
 341
 342/* We create slab caches for groupinfo data structures based on the
 343 * superblock block size.  There will be one per mounted filesystem for
 344 * each unique s_blocksize_bits */
 345#define NR_GRPINFO_CACHES 8
 346static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
 347
 348static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
 349        "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
 350        "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
 351        "ext4_groupinfo_64k", "ext4_groupinfo_128k"
 352};
 353
 354static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 355                                        ext4_group_t group);
 356static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 357                                                ext4_group_t group);
 358static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
 359
 360static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 361{
 362#if BITS_PER_LONG == 64
 363        *bit += ((unsigned long) addr & 7UL) << 3;
 364        addr = (void *) ((unsigned long) addr & ~7UL);
 365#elif BITS_PER_LONG == 32
 366        *bit += ((unsigned long) addr & 3UL) << 3;
 367        addr = (void *) ((unsigned long) addr & ~3UL);
 368#else
 369#error "how many bits you are?!"
 370#endif
 371        return addr;
 372}
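/*
 * Example (illustration only): on a 64-bit machine a call with
 * addr = base + 5 (base being 8-byte aligned) and *bit = 2 is rewritten
 * to addr = base and *bit = 2 + 5 * 8 = 42, so the bit operations below
 * always see a long-aligned address, as required e.g. on powerpc.
 */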
 373
 374static inline int mb_test_bit(int bit, void *addr)
 375{
 376        /*
 377         * ext4_test_bit on architecture like powerpc
 378         * needs unsigned long aligned address
 379         */
 380        addr = mb_correct_addr_and_bit(&bit, addr);
 381        return ext4_test_bit(bit, addr);
 382}
 383
 384static inline void mb_set_bit(int bit, void *addr)
 385{
 386        addr = mb_correct_addr_and_bit(&bit, addr);
 387        ext4_set_bit(bit, addr);
 388}
 389
 390static inline void mb_clear_bit(int bit, void *addr)
 391{
 392        addr = mb_correct_addr_and_bit(&bit, addr);
 393        ext4_clear_bit(bit, addr);
 394}
 395
 396static inline int mb_find_next_zero_bit(void *addr, int max, int start)
 397{
 398        int fix = 0, ret, tmpmax;
 399        addr = mb_correct_addr_and_bit(&fix, addr);
 400        tmpmax = max + fix;
 401        start += fix;
 402
 403        ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
 404        if (ret > max)
 405                return max;
 406        return ret;
 407}
 408
 409static inline int mb_find_next_bit(void *addr, int max, int start)
 410{
 411        int fix = 0, ret, tmpmax;
 412        addr = mb_correct_addr_and_bit(&fix, addr);
 413        tmpmax = max + fix;
 414        start += fix;
 415
 416        ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
 417        if (ret > max)
 418                return max;
 419        return ret;
 420}
 421
 422static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
 423{
 424        char *bb;
 425
 426        BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
 427        BUG_ON(max == NULL);
 428
 429        if (order > e4b->bd_blkbits + 1) {
 430                *max = 0;
 431                return NULL;
 432        }
 433
 434        /* at order 0 we see each particular block */
 435        if (order == 0) {
 436                *max = 1 << (e4b->bd_blkbits + 3);
 437                return EXT4_MB_BITMAP(e4b);
 438        }
 439
 440        bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
 441        *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
 442
 443        return bb;
 444}
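/*
 * Buddy layout illustration (assuming a 4KiB block size): order 0 is the
 * block bitmap itself, 1 << (bd_blkbits + 3) = 32768 bits.  Orders >= 1
 * live inside the buddy block, starting at byte offset s_mb_offsets[order]
 * and spanning s_mb_maxs[order] bits; each bit there describes a
 * 2^order-block chunk, so every order needs half as many bits as the one
 * below it.
 */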
 445
 446#ifdef DOUBLE_CHECK
 447static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 448                           int first, int count)
 449{
 450        int i;
 451        struct super_block *sb = e4b->bd_sb;
 452
 453        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 454                return;
 455        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
 456        for (i = 0; i < count; i++) {
 457                if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
 458                        ext4_fsblk_t blocknr;
 459
 460                        blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 461                        blocknr += first + i;
 462                        ext4_grp_locked_error(sb, e4b->bd_group,
 463                                              inode ? inode->i_ino : 0,
 464                                              blocknr,
 465                                              "freeing block already freed "
 466                                              "(bit %u)",
 467                                              first + i);
 468                }
 469                mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
 470        }
 471}
 472
 473static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
 474{
 475        int i;
 476
 477        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 478                return;
 479        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
 480        for (i = 0; i < count; i++) {
 481                BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
 482                mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
 483        }
 484}
 485
 486static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 487{
 488        if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
 489                unsigned char *b1, *b2;
 490                int i;
 491                b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
 492                b2 = (unsigned char *) bitmap;
 493                for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
 494                        if (b1[i] != b2[i]) {
 495                                printk(KERN_ERR "corruption in group %u "
 496                                       "at byte %u(%u): %x in copy != %x "
 497                                       "on disk/prealloc\n",
 498                                       e4b->bd_group, i, i * 8, b1[i], b2[i]);
 499                                BUG();
 500                        }
 501                }
 502        }
 503}
 504
 505#else
 506static inline void mb_free_blocks_double(struct inode *inode,
 507                                struct ext4_buddy *e4b, int first, int count)
 508{
 509        return;
 510}
 511static inline void mb_mark_used_double(struct ext4_buddy *e4b,
 512                                                int first, int count)
 513{
 514        return;
 515}
 516static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 517{
 518        return;
 519}
 520#endif
 521
 522#ifdef AGGRESSIVE_CHECK
 523
 524#define MB_CHECK_ASSERT(assert)                                         \
 525do {                                                                    \
 526        if (!(assert)) {                                                \
 527                printk(KERN_EMERG                                       \
 528                        "Assertion failure in %s() at %s:%d: \"%s\"\n", \
 529                        function, file, line, # assert);                \
 530                BUG();                                                  \
 531        }                                                               \
 532} while (0)
 533
 534static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 535                                const char *function, int line)
 536{
 537        struct super_block *sb = e4b->bd_sb;
 538        int order = e4b->bd_blkbits + 1;
 539        int max;
 540        int max2;
 541        int i;
 542        int j;
 543        int k;
 544        int count;
 545        struct ext4_group_info *grp;
 546        int fragments = 0;
 547        int fstart;
 548        struct list_head *cur;
 549        void *buddy;
 550        void *buddy2;
 551
 552        {
 553                static int mb_check_counter;
 554                if (mb_check_counter++ % 100 != 0)
 555                        return 0;
 556        }
 557
 558        while (order > 1) {
 559                buddy = mb_find_buddy(e4b, order, &max);
 560                MB_CHECK_ASSERT(buddy);
 561                buddy2 = mb_find_buddy(e4b, order - 1, &max2);
 562                MB_CHECK_ASSERT(buddy2);
 563                MB_CHECK_ASSERT(buddy != buddy2);
 564                MB_CHECK_ASSERT(max * 2 == max2);
 565
 566                count = 0;
 567                for (i = 0; i < max; i++) {
 568
 569                        if (mb_test_bit(i, buddy)) {
 570                                /* only single bit in buddy2 may be 1 */
 571                                if (!mb_test_bit(i << 1, buddy2)) {
 572                                        MB_CHECK_ASSERT(
 573                                                mb_test_bit((i<<1)+1, buddy2));
 574                                } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
 575                                        MB_CHECK_ASSERT(
 576                                                mb_test_bit(i << 1, buddy2));
 577                                }
 578                                continue;
 579                        }
 580
 581                        /* both bits in buddy2 must be 0 */
 582                        MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
 583                        MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
 584
 585                        for (j = 0; j < (1 << order); j++) {
 586                                k = (i * (1 << order)) + j;
 587                                MB_CHECK_ASSERT(
 588                                        !mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
 589                        }
 590                        count++;
 591                }
 592                MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
 593                order--;
 594        }
 595
 596        fstart = -1;
 597        buddy = mb_find_buddy(e4b, 0, &max);
 598        for (i = 0; i < max; i++) {
 599                if (!mb_test_bit(i, buddy)) {
 600                        MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
 601                        if (fstart == -1) {
 602                                fragments++;
 603                                fstart = i;
 604                        }
 605                        continue;
 606                }
 607                fstart = -1;
 608                /* check used bits only */
 609                for (j = 0; j < e4b->bd_blkbits + 1; j++) {
 610                        buddy2 = mb_find_buddy(e4b, j, &max2);
 611                        k = i >> j;
 612                        MB_CHECK_ASSERT(k < max2);
 613                        MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
 614                }
 615        }
 616        MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
 617        MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
 618
 619        grp = ext4_get_group_info(sb, e4b->bd_group);
 620        list_for_each(cur, &grp->bb_prealloc_list) {
 621                ext4_group_t groupnr;
 622                struct ext4_prealloc_space *pa;
 623                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
 624                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
 625                MB_CHECK_ASSERT(groupnr == e4b->bd_group);
 626                for (i = 0; i < pa->pa_len; i++)
 627                        MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
 628        }
 629        return 0;
 630}
 631#undef MB_CHECK_ASSERT
 632#define mb_check_buddy(e4b) __mb_check_buddy(e4b,       \
 633                                        __FILE__, __func__, __LINE__)
 634#else
 635#define mb_check_buddy(e4b)
 636#endif
 637
  638/*
  639 * Divide the blocks starting at @first with length @len into
  640 * smaller chunks whose sizes are powers of 2 blocks.
  641 * Clear the bits in the buddy bitmap that the blocks of the chunk(s) cover,
  642 * then increase bb_counters[] for the corresponding chunk size.
  643 */
 644static void ext4_mb_mark_free_simple(struct super_block *sb,
 645                                void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
 646                                        struct ext4_group_info *grp)
 647{
 648        struct ext4_sb_info *sbi = EXT4_SB(sb);
 649        ext4_grpblk_t min;
 650        ext4_grpblk_t max;
 651        ext4_grpblk_t chunk;
 652        unsigned short border;
 653
 654        BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb));
 655
 656        border = 2 << sb->s_blocksize_bits;
 657
 658        while (len > 0) {
  659                /* find how many blocks can be covered from this position */
 660                max = ffs(first | border) - 1;
 661
 662                /* find how many blocks of power 2 we need to mark */
 663                min = fls(len) - 1;
 664
 665                if (max < min)
 666                        min = max;
 667                chunk = 1 << min;
 668
 669                /* mark multiblock chunks only */
 670                grp->bb_counters[min]++;
 671                if (min > 0)
 672                        mb_clear_bit(first >> min,
 673                                     buddy + sbi->s_mb_offsets[min]);
 674
 675                len -= chunk;
 676                first += chunk;
 677        }
 678}
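/*
 * Worked example (illustration only): a free range with first = 5 and
 * len = 13 is split on power-of-two boundaries into chunks of 1 (at 5),
 * 2 (at 6), 8 (at 8) and 2 (at 16).  bb_counters[0], [1], [3] and [1]
 * are incremented, and for each multi-block chunk the corresponding bit
 * in the buddy of that order is cleared, i.e. marked free.
 */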
 679
 680/*
 681 * Cache the order of the largest free extent we have available in this block
 682 * group.
 683 */
 684static void
 685mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
 686{
 687        int i;
 688        int bits;
 689
 690        grp->bb_largest_free_order = -1; /* uninit */
 691
 692        bits = sb->s_blocksize_bits + 1;
 693        for (i = bits; i >= 0; i--) {
 694                if (grp->bb_counters[i] > 0) {
 695                        grp->bb_largest_free_order = i;
 696                        break;
 697                }
 698        }
 699}
 700
 701static noinline_for_stack
 702void ext4_mb_generate_buddy(struct super_block *sb,
 703                                void *buddy, void *bitmap, ext4_group_t group)
 704{
 705        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 706        ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
 707        ext4_grpblk_t i = 0;
 708        ext4_grpblk_t first;
 709        ext4_grpblk_t len;
 710        unsigned free = 0;
 711        unsigned fragments = 0;
 712        unsigned long long period = get_cycles();
 713
  714        /* initialize buddy from the bitmap which is an aggregation
  715         * of the on-disk bitmap and preallocations */
 716        i = mb_find_next_zero_bit(bitmap, max, 0);
 717        grp->bb_first_free = i;
 718        while (i < max) {
 719                fragments++;
 720                first = i;
 721                i = mb_find_next_bit(bitmap, max, i);
 722                len = i - first;
 723                free += len;
 724                if (len > 1)
 725                        ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
 726                else
 727                        grp->bb_counters[0]++;
 728                if (i < max)
 729                        i = mb_find_next_zero_bit(bitmap, max, i);
 730        }
 731        grp->bb_fragments = fragments;
 732
 733        if (free != grp->bb_free) {
 734                ext4_grp_locked_error(sb, group, 0, 0,
 735                                      "%u blocks in bitmap, %u in gd",
 736                                      free, grp->bb_free);
  737                /*
  738                 * If we intend to continue, we consider the group descriptor
  739                 * corrupt and update bb_free using the bitmap value
  740                 */
 741                grp->bb_free = free;
 742        }
 743        mb_set_largest_free_order(sb, grp);
 744
 745        clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
 746
 747        period = get_cycles() - period;
 748        spin_lock(&EXT4_SB(sb)->s_bal_lock);
 749        EXT4_SB(sb)->s_mb_buddies_generated++;
 750        EXT4_SB(sb)->s_mb_generation_time += period;
 751        spin_unlock(&EXT4_SB(sb)->s_bal_lock);
 752}
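/*
 * Worked example (illustration only): if the first ten bits of the
 * aggregated bitmap are 1100001110 (0 = free, remaining bits all set),
 * the scan above finds two free fragments, blocks 2-5 and block 9, so
 * bb_first_free = 2, bb_fragments = 2 and bb_free = 5.  The 4-block run
 * is handed to ext4_mb_mark_free_simple() and counted as two order-1
 * chunks, while the single block only bumps bb_counters[0].
 */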
 753
  754/* The buddy information is attached to the buddy cache inode
  755 * for convenience. The information regarding each group
  756 * is loaded via ext4_mb_load_buddy. The information involves
  757 * the block bitmap and buddy information, and is
  758 * stored in the inode as
 759 *
 760 * {                        page                        }
 761 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 762 *
 763 *
 764 * one block each for bitmap and buddy information.
 765 * So for each group we take up 2 blocks. A page can
 766 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
  767 * So a page can hold information for groups_per_page groups, which
  768 * is blocks_per_page/2.
 769 *
 770 * Locking note:  This routine takes the block group lock of all groups
 771 * for this page; do not hold this lock when calling this routine!
 772 */
 773
 774static int ext4_mb_init_cache(struct page *page, char *incore)
 775{
 776        ext4_group_t ngroups;
 777        int blocksize;
 778        int blocks_per_page;
 779        int groups_per_page;
 780        int err = 0;
 781        int i;
 782        ext4_group_t first_group;
 783        int first_block;
 784        struct super_block *sb;
 785        struct buffer_head *bhs;
 786        struct buffer_head **bh;
 787        struct inode *inode;
 788        char *data;
 789        char *bitmap;
 790        struct ext4_group_info *grinfo;
 791
 792        mb_debug(1, "init page %lu\n", page->index);
 793
 794        inode = page->mapping->host;
 795        sb = inode->i_sb;
 796        ngroups = ext4_get_groups_count(sb);
 797        blocksize = 1 << inode->i_blkbits;
 798        blocks_per_page = PAGE_CACHE_SIZE / blocksize;
 799
 800        groups_per_page = blocks_per_page >> 1;
 801        if (groups_per_page == 0)
 802                groups_per_page = 1;
 803
 804        /* allocate buffer_heads to read bitmaps */
 805        if (groups_per_page > 1) {
 806                err = -ENOMEM;
 807                i = sizeof(struct buffer_head *) * groups_per_page;
 808                bh = kzalloc(i, GFP_NOFS);
 809                if (bh == NULL)
 810                        goto out;
 811        } else
 812                bh = &bhs;
 813
 814        first_group = page->index * blocks_per_page / 2;
 815
 816        /* read all groups the page covers into the cache */
 817        for (i = 0; i < groups_per_page; i++) {
 818                struct ext4_group_desc *desc;
 819
 820                if (first_group + i >= ngroups)
 821                        break;
 822
 823                grinfo = ext4_get_group_info(sb, first_group + i);
 824                /*
 825                 * If page is uptodate then we came here after online resize
 826                 * which added some new uninitialized group info structs, so
 827                 * we must skip all initialized uptodate buddies on the page,
 828                 * which may be currently in use by an allocating task.
 829                 */
 830                if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
 831                        bh[i] = NULL;
 832                        continue;
 833                }
 834
 835                err = -EIO;
 836                desc = ext4_get_group_desc(sb, first_group + i, NULL);
 837                if (desc == NULL)
 838                        goto out;
 839
 840                err = -ENOMEM;
 841                bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
 842                if (bh[i] == NULL)
 843                        goto out;
 844
 845                if (bitmap_uptodate(bh[i]))
 846                        continue;
 847
 848                lock_buffer(bh[i]);
 849                if (bitmap_uptodate(bh[i])) {
 850                        unlock_buffer(bh[i]);
 851                        continue;
 852                }
 853                ext4_lock_group(sb, first_group + i);
 854                if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 855                        ext4_init_block_bitmap(sb, bh[i],
 856                                                first_group + i, desc);
 857                        set_bitmap_uptodate(bh[i]);
 858                        set_buffer_uptodate(bh[i]);
 859                        ext4_unlock_group(sb, first_group + i);
 860                        unlock_buffer(bh[i]);
 861                        continue;
 862                }
 863                ext4_unlock_group(sb, first_group + i);
 864                if (buffer_uptodate(bh[i])) {
  865                        /*
  866                         * if the group is not uninit and bh is uptodate,
  867                         * the bitmap is also uptodate
  868                         */
 869                        set_bitmap_uptodate(bh[i]);
 870                        unlock_buffer(bh[i]);
 871                        continue;
 872                }
 873                get_bh(bh[i]);
 874                /*
 875                 * submit the buffer_head for read. We can
 876                 * safely mark the bitmap as uptodate now.
 877                 * We do it here so the bitmap uptodate bit
  878                 * gets set with the buffer lock held.
 879                 */
 880                set_bitmap_uptodate(bh[i]);
 881                bh[i]->b_end_io = end_buffer_read_sync;
 882                submit_bh(READ, bh[i]);
 883                mb_debug(1, "read bitmap for group %u\n", first_group + i);
 884        }
 885
 886        /* wait for I/O completion */
 887        for (i = 0; i < groups_per_page; i++)
 888                if (bh[i])
 889                        wait_on_buffer(bh[i]);
 890
 891        err = -EIO;
 892        for (i = 0; i < groups_per_page; i++)
 893                if (bh[i] && !buffer_uptodate(bh[i]))
 894                        goto out;
 895
 896        err = 0;
 897        first_block = page->index * blocks_per_page;
 898        for (i = 0; i < blocks_per_page; i++) {
 899                int group;
 900
 901                group = (first_block + i) >> 1;
 902                if (group >= ngroups)
 903                        break;
 904
 905                if (!bh[group - first_group])
 906                        /* skip initialized uptodate buddy */
 907                        continue;
 908
  909                /*
  910                 * data carries information regarding this
  911                 * particular group in the format specified
  912                 * above
  913                 */
 915                data = page_address(page) + (i * blocksize);
 916                bitmap = bh[group - first_group]->b_data;
 917
 918                /*
 919                 * We place the buddy block and bitmap block
 920                 * close together
 921                 */
 922                if ((first_block + i) & 1) {
 923                        /* this is block of buddy */
 924                        BUG_ON(incore == NULL);
 925                        mb_debug(1, "put buddy for group %u in page %lu/%x\n",
 926                                group, page->index, i * blocksize);
 927                        trace_ext4_mb_buddy_bitmap_load(sb, group);
 928                        grinfo = ext4_get_group_info(sb, group);
 929                        grinfo->bb_fragments = 0;
 930                        memset(grinfo->bb_counters, 0,
 931                               sizeof(*grinfo->bb_counters) *
 932                                (sb->s_blocksize_bits+2));
 933                        /*
 934                         * incore got set to the group block bitmap below
 935                         */
 936                        ext4_lock_group(sb, group);
 937                        /* init the buddy */
 938                        memset(data, 0xff, blocksize);
 939                        ext4_mb_generate_buddy(sb, data, incore, group);
 940                        ext4_unlock_group(sb, group);
 941                        incore = NULL;
 942                } else {
 943                        /* this is block of bitmap */
 944                        BUG_ON(incore != NULL);
 945                        mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
 946                                group, page->index, i * blocksize);
 947                        trace_ext4_mb_bitmap_load(sb, group);
 948
 949                        /* see comments in ext4_mb_put_pa() */
 950                        ext4_lock_group(sb, group);
 951                        memcpy(data, bitmap, blocksize);
 952
 953                        /* mark all preallocated blks used in in-core bitmap */
 954                        ext4_mb_generate_from_pa(sb, data, group);
 955                        ext4_mb_generate_from_freelist(sb, data, group);
 956                        ext4_unlock_group(sb, group);
 957
 958                        /* set incore so that the buddy information can be
 959                         * generated using this
 960                         */
 961                        incore = data;
 962                }
 963        }
 964        SetPageUptodate(page);
 965
 966out:
 967        if (bh) {
 968                for (i = 0; i < groups_per_page; i++)
 969                        brelse(bh[i]);
 970                if (bh != &bhs)
 971                        kfree(bh);
 972        }
 973        return err;
 974}
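/*
 * Illustrative sketch (not part of the original file): where a group's
 * blocks land in the buddy cache inode, following the layout above.
 * Each group owns two consecutive blocks (bitmap first, buddy second)
 * and a page holds blocks_per_page = PAGE_CACHE_SIZE / blocksize blocks.
 * The helper name is hypothetical; the same arithmetic is used by
 * ext4_mb_get_buddy_page_lock() and ext4_mb_load_buddy() below.
 */
static inline void example_buddy_cache_pos(ext4_group_t group, int buddy,
					   int blocks_per_page,
					   pgoff_t *pnum, int *poff)
{
	int block = group * 2 + (buddy ? 1 : 0);  /* bitmap or buddy block */

	*pnum = block / blocks_per_page;  /* page index in s_buddy_cache */
	*poff = block % blocks_per_page;  /* block offset within that page */
}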
 975
  976/*
  977 * Lock the buddy and bitmap pages. This makes sure other parallel init_group
  978 * calls on the same buddy page don't happen while we hold the buddy page lock.
  979 * Return the locked buddy and bitmap pages in the e4b struct. If buddy and
  980 * bitmap are on the same page, e4b->bd_buddy_page is NULL and the return value is 0.
  981 */
 982static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
 983                ext4_group_t group, struct ext4_buddy *e4b)
 984{
 985        struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
 986        int block, pnum, poff;
 987        int blocks_per_page;
 988        struct page *page;
 989
 990        e4b->bd_buddy_page = NULL;
 991        e4b->bd_bitmap_page = NULL;
 992
 993        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 994        /*
 995         * the buddy cache inode stores the block bitmap
 996         * and buddy information in consecutive blocks.
 997         * So for each group we need two blocks.
 998         */
 999        block = group * 2;
1000        pnum = block / blocks_per_page;
1001        poff = block % blocks_per_page;
1002        page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1003        if (!page)
1004                return -EIO;
1005        BUG_ON(page->mapping != inode->i_mapping);
1006        e4b->bd_bitmap_page = page;
1007        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1008
1009        if (blocks_per_page >= 2) {
1010                /* buddy and bitmap are on the same page */
1011                return 0;
1012        }
1013
1014        block++;
1015        pnum = block / blocks_per_page;
1016        poff = block % blocks_per_page;
1017        page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1018        if (!page)
1019                return -EIO;
1020        BUG_ON(page->mapping != inode->i_mapping);
1021        e4b->bd_buddy_page = page;
1022        return 0;
1023}
1024
1025static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1026{
1027        if (e4b->bd_bitmap_page) {
1028                unlock_page(e4b->bd_bitmap_page);
1029                page_cache_release(e4b->bd_bitmap_page);
1030        }
1031        if (e4b->bd_buddy_page) {
1032                unlock_page(e4b->bd_buddy_page);
1033                page_cache_release(e4b->bd_buddy_page);
1034        }
1035}
1036
1037/*
1038 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1039 * block group lock of all groups for this page; do not hold the BG lock when
1040 * calling this routine!
1041 */
1042static noinline_for_stack
1043int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1044{
1045
1046        struct ext4_group_info *this_grp;
1047        struct ext4_buddy e4b;
1048        struct page *page;
1049        int ret = 0;
1050
1051        mb_debug(1, "init group %u\n", group);
1052        this_grp = ext4_get_group_info(sb, group);
 1053        /*
 1054         * This ensures that we don't reinit the buddy cache
 1055         * pages which map to the group from which we are already
 1056         * allocating. If we are looking at the buddy cache we would
 1057         * have taken a reference using ext4_mb_load_buddy and that
 1058         * would have pinned the buddy page to the page cache.
 1059         */
1060        ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
1061        if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1062                /*
1063                 * somebody initialized the group
1064                 * return without doing anything
1065                 */
1066                goto err;
1067        }
1068
1069        page = e4b.bd_bitmap_page;
1070        ret = ext4_mb_init_cache(page, NULL);
1071        if (ret)
1072                goto err;
1073        if (!PageUptodate(page)) {
1074                ret = -EIO;
1075                goto err;
1076        }
1077        mark_page_accessed(page);
1078
1079        if (e4b.bd_buddy_page == NULL) {
1080                /*
1081                 * If both the bitmap and buddy are in
1082                 * the same page we don't need to force
1083                 * init the buddy
1084                 */
1085                ret = 0;
1086                goto err;
1087        }
1088        /* init buddy cache */
1089        page = e4b.bd_buddy_page;
1090        ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
1091        if (ret)
1092                goto err;
1093        if (!PageUptodate(page)) {
1094                ret = -EIO;
1095                goto err;
1096        }
1097        mark_page_accessed(page);
1098err:
1099        ext4_mb_put_buddy_page_lock(&e4b);
1100        return ret;
1101}
1102
1103/*
1104 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1105 * block group lock of all groups for this page; do not hold the BG lock when
1106 * calling this routine!
1107 */
1108static noinline_for_stack int
1109ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1110                                        struct ext4_buddy *e4b)
1111{
1112        int blocks_per_page;
1113        int block;
1114        int pnum;
1115        int poff;
1116        struct page *page;
1117        int ret;
1118        struct ext4_group_info *grp;
1119        struct ext4_sb_info *sbi = EXT4_SB(sb);
1120        struct inode *inode = sbi->s_buddy_cache;
1121
1122        mb_debug(1, "load group %u\n", group);
1123
1124        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1125        grp = ext4_get_group_info(sb, group);
1126
1127        e4b->bd_blkbits = sb->s_blocksize_bits;
1128        e4b->bd_info = ext4_get_group_info(sb, group);
1129        e4b->bd_sb = sb;
1130        e4b->bd_group = group;
1131        e4b->bd_buddy_page = NULL;
1132        e4b->bd_bitmap_page = NULL;
1133
1134        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1135                /*
1136                 * we need full data about the group
1137                 * to make a good selection
1138                 */
1139                ret = ext4_mb_init_group(sb, group);
1140                if (ret)
1141                        return ret;
1142        }
1143
1144        /*
1145         * the buddy cache inode stores the block bitmap
1146         * and buddy information in consecutive blocks.
1147         * So for each group we need two blocks.
1148         */
1149        block = group * 2;
1150        pnum = block / blocks_per_page;
1151        poff = block % blocks_per_page;
1152
 1153        /* we could use find_or_create_page(), but it locks the page,
 1154         * which we'd like to avoid in the fast path ... */
1155        page = find_get_page(inode->i_mapping, pnum);
1156        if (page == NULL || !PageUptodate(page)) {
1157                if (page)
 1158                        /*
 1159                         * drop the page reference and try
 1160                         * to get the page with the lock held. If it
 1161                         * is not uptodate that implies
 1162                         * somebody just created the page but
 1163                         * has yet to initialize it. So
 1164                         * wait for it to be initialized.
 1165                         */
1166                        page_cache_release(page);
1167                page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1168                if (page) {
1169                        BUG_ON(page->mapping != inode->i_mapping);
1170                        if (!PageUptodate(page)) {
1171                                ret = ext4_mb_init_cache(page, NULL);
1172                                if (ret) {
1173                                        unlock_page(page);
1174                                        goto err;
1175                                }
1176                                mb_cmp_bitmaps(e4b, page_address(page) +
1177                                               (poff * sb->s_blocksize));
1178                        }
1179                        unlock_page(page);
1180                }
1181        }
1182        if (page == NULL || !PageUptodate(page)) {
1183                ret = -EIO;
1184                goto err;
1185        }
1186        e4b->bd_bitmap_page = page;
1187        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1188        mark_page_accessed(page);
1189
1190        block++;
1191        pnum = block / blocks_per_page;
1192        poff = block % blocks_per_page;
1193
1194        page = find_get_page(inode->i_mapping, pnum);
1195        if (page == NULL || !PageUptodate(page)) {
1196                if (page)
1197                        page_cache_release(page);
1198                page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1199                if (page) {
1200                        BUG_ON(page->mapping != inode->i_mapping);
1201                        if (!PageUptodate(page)) {
1202                                ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
1203                                if (ret) {
1204                                        unlock_page(page);
1205                                        goto err;
1206                                }
1207                        }
1208                        unlock_page(page);
1209                }
1210        }
1211        if (page == NULL || !PageUptodate(page)) {
1212                ret = -EIO;
1213                goto err;
1214        }
1215        e4b->bd_buddy_page = page;
1216        e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1217        mark_page_accessed(page);
1218
1219        BUG_ON(e4b->bd_bitmap_page == NULL);
1220        BUG_ON(e4b->bd_buddy_page == NULL);
1221
1222        return 0;
1223
1224err:
1225        if (page)
1226                page_cache_release(page);
1227        if (e4b->bd_bitmap_page)
1228                page_cache_release(e4b->bd_bitmap_page);
1229        if (e4b->bd_buddy_page)
1230                page_cache_release(e4b->bd_buddy_page);
1231        e4b->bd_buddy = NULL;
1232        e4b->bd_bitmap = NULL;
1233        return ret;
1234}
1235
1236static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1237{
1238        if (e4b->bd_bitmap_page)
1239                page_cache_release(e4b->bd_bitmap_page);
1240        if (e4b->bd_buddy_page)
1241                page_cache_release(e4b->bd_buddy_page);
1242}
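/*
 * Typical usage sketch (illustration only, error handling elided): a
 * caller loads the buddy for a group, works on it under the group lock
 * and drops the page references again via ext4_mb_unload_buddy().
 */
static inline void example_with_buddy(struct super_block *sb,
				      ext4_group_t group)
{
	struct ext4_buddy e4b;

	if (ext4_mb_load_buddy(sb, group, &e4b))
		return;
	ext4_lock_group(sb, group);
	/* ... inspect or modify e4b.bd_info, bd_bitmap, bd_buddy ... */
	ext4_unlock_group(sb, group);
	ext4_mb_unload_buddy(&e4b);
}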
1243
1244
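/*
 * Walk the buddy bitmaps from order 1 upwards and return the first order
 * at which the chunk containing @block is marked free; for a block that
 * is free in the order-0 bitmap this is the order of the free buddy it
 * currently belongs to.  A return value of 0 means no higher-order buddy
 * is free, i.e. the block is free only as an individual block.
 */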
1245static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1246{
1247        int order = 1;
1248        void *bb;
1249
1250        BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
1251        BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1252
1253        bb = EXT4_MB_BUDDY(e4b);
1254        while (order <= e4b->bd_blkbits + 1) {
1255                block = block >> 1;
1256                if (!mb_test_bit(block, bb)) {
1257                        /* this block is part of buddy of order 'order' */
1258                        return order;
1259                }
1260                bb += 1 << (e4b->bd_blkbits - order);
1261                order++;
1262        }
1263        return 0;
1264}
1265
1266static void mb_clear_bits(void *bm, int cur, int len)
1267{
1268        __u32 *addr;
1269
1270        len = cur + len;
1271        while (cur < len) {
1272                if ((cur & 31) == 0 && (len - cur) >= 32) {
1273                        /* fast path: clear whole word at once */
1274                        addr = bm + (cur >> 3);
1275                        *addr = 0;
1276                        cur += 32;
1277                        continue;
1278                }
1279                mb_clear_bit(cur, bm);
1280                cur++;
1281        }
1282}
1283
1284static void mb_set_bits(void *bm, int cur, int len)
1285{
1286        __u32 *addr;
1287
1288        len = cur + len;
1289        while (cur < len) {
1290                if ((cur & 31) == 0 && (len - cur) >= 32) {
1291                        /* fast path: set whole word at once */
1292                        addr = bm + (cur >> 3);
1293                        *addr = 0xffffffff;
1294                        cur += 32;
1295                        continue;
1296                }
1297                mb_set_bit(cur, bm);
1298                cur++;
1299        }
1300}
1301
1302static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1303                          int first, int count)
1304{
1305        int block = 0;
1306        int max = 0;
1307        int order;
1308        void *buddy;
1309        void *buddy2;
1310        struct super_block *sb = e4b->bd_sb;
1311
1312        BUG_ON(first + count > (sb->s_blocksize << 3));
1313        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1314        mb_check_buddy(e4b);
1315        mb_free_blocks_double(inode, e4b, first, count);
1316
1317        e4b->bd_info->bb_free += count;
1318        if (first < e4b->bd_info->bb_first_free)
1319                e4b->bd_info->bb_first_free = first;
1320
1321        /* let's maintain fragments counter */
1322        if (first != 0)
1323                block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
1324        if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
1325                max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
1326        if (block && max)
1327                e4b->bd_info->bb_fragments--;
1328        else if (!block && !max)
1329                e4b->bd_info->bb_fragments++;
1330
1331        /* let's maintain buddy itself */
1332        while (count-- > 0) {
1333                block = first++;
1334                order = 0;
1335
1336                if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
1337                        ext4_fsblk_t blocknr;
1338
1339                        blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1340                        blocknr += block;
1341                        ext4_grp_locked_error(sb, e4b->bd_group,
1342                                              inode ? inode->i_ino : 0,
1343                                              blocknr,
1344                                              "freeing already freed block "
1345                                              "(bit %u)", block);
1346                }
1347                mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
1348                e4b->bd_info->bb_counters[order]++;
1349
1350                /* start of the buddy */
1351                buddy = mb_find_buddy(e4b, order, &max);
1352
1353                do {
1354                        block &= ~1UL;
1355                        if (mb_test_bit(block, buddy) ||
1356                                        mb_test_bit(block + 1, buddy))
1357                                break;
1358
1359                        /* both the buddies are free, try to coalesce them */
1360                        buddy2 = mb_find_buddy(e4b, order + 1, &max);
1361
1362                        if (!buddy2)
1363                                break;
1364
1365                        if (order > 0) {
1366                                /* for special purposes, we don't set
1367                                 * free bits in bitmap */
1368                                mb_set_bit(block, buddy);
1369                                mb_set_bit(block + 1, buddy);
1370                        }
1371                        e4b->bd_info->bb_counters[order]--;
1372                        e4b->bd_info->bb_counters[order]--;
1373
1374                        block = block >> 1;
1375                        order++;
1376                        e4b->bd_info->bb_counters[order]++;
1377
1378                        mb_clear_bit(block, buddy2);
1379                        buddy = buddy2;
1380                } while (1);
1381        }
1382        mb_set_largest_free_order(sb, e4b->bd_info);
1383        mb_check_buddy(e4b);
1384}
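/*
 * Coalescing example (illustration only): freeing block 7 while block 6
 * is already free leaves both halves of the order-0 pair free, so the
 * pair is merged upwards: bb_counters[0] drops by two, bb_counters[1]
 * grows by one and bit 3 of the order-1 buddy is cleared (marked free).
 * The loop then retries at order 1 and keeps merging for as long as the
 * sibling chunk at the current order is free as well.
 */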
1385
1386static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
1387                                int needed, struct ext4_free_extent *ex)
1388{
1389        int next = block;
1390        int max;
1391        int ord;
1392        void *buddy;
1393
1394        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1395        BUG_ON(ex == NULL);
1396
1397        buddy = mb_find_buddy(e4b, order, &max);
1398        BUG_ON(buddy == NULL);
1399        BUG_ON(block >= max);
1400        if (mb_test_bit(block, buddy)) {
1401                ex->fe_len = 0;
1402                ex->fe_start = 0;
1403                ex->fe_group = 0;
1404                return 0;
1405        }
1406
1407        /* FIXME: drop order completely? */
1408        if (likely(order == 0)) {
1409                /* find actual order */
1410                order = mb_find_order_for_block(e4b, block);
1411                block = block >> order;
1412        }
1413
1414        ex->fe_len = 1 << order;
1415        ex->fe_start = block << order;
1416        ex->fe_group = e4b->bd_group;
1417
1418        /* calc difference from given start */
1419        next = next - ex->fe_start;
1420        ex->fe_len -= next;
1421        ex->fe_start += next;
1422
1423        while (needed > ex->fe_len &&
1424               (buddy = mb_find_buddy(e4b, order, &max))) {
1425
1426                if (block + 1 >= max)
1427                        break;
1428
1429                next = (block + 1) * (1 << order);
1430                if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
1431                        break;
1432
1433                ord = mb_find_order_for_block(e4b, next);
1434
1435                order = ord;
1436                block = next >> order;
1437                ex->fe_len += 1 << order;
1438        }
1439
1440        BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
1441        return ex->fe_len;
1442}
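
/*
 * Editorial note (illustrative walk-through, not in the original source):
 * mb_find_extent() converts a buddy order back into a bitmap extent.
 * Suppose bit 42 is requested and it lies inside a free order-3 buddy
 * covering bits 40..47.  mb_find_order_for_block() returns 3, so
 * fe_start = (42 >> 3) << 3 = 40 and fe_len = 8; the "calc difference"
 * step then trims the extent to the requested start: next = 42 - 40 = 2,
 * giving fe_start = 42 and fe_len = 6.  The while loop keeps appending
 * the neighbouring free buddies (bit 48 onwards) until "needed" blocks
 * are covered or a used block is hit.
 */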
1443
1444static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1445{
1446        int ord;
1447        int mlen = 0;
1448        int max = 0;
1449        int cur;
1450        int start = ex->fe_start;
1451        int len = ex->fe_len;
1452        unsigned ret = 0;
1453        int len0 = len;
1454        void *buddy;
1455
1456        BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1457        BUG_ON(e4b->bd_group != ex->fe_group);
1458        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1459        mb_check_buddy(e4b);
1460        mb_mark_used_double(e4b, start, len);
1461
1462        e4b->bd_info->bb_free -= len;
1463        if (e4b->bd_info->bb_first_free == start)
1464                e4b->bd_info->bb_first_free += len;
1465
1466        /* let's maintain fragments counter */
1467        if (start != 0)
1468                mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
1469        if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1470                max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
1471        if (mlen && max)
1472                e4b->bd_info->bb_fragments++;
1473        else if (!mlen && !max)
1474                e4b->bd_info->bb_fragments--;
1475
1476        /* let's maintain buddy itself */
1477        while (len) {
1478                ord = mb_find_order_for_block(e4b, start);
1479
1480                if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1481                        /* the whole chunk may be allocated at once! */
1482                        mlen = 1 << ord;
1483                        buddy = mb_find_buddy(e4b, ord, &max);
1484                        BUG_ON((start >> ord) >= max);
1485                        mb_set_bit(start >> ord, buddy);
1486                        e4b->bd_info->bb_counters[ord]--;
1487                        start += mlen;
1488                        len -= mlen;
1489                        BUG_ON(len < 0);
1490                        continue;
1491                }
1492
1493                /* store for history */
1494                if (ret == 0)
1495                        ret = len | (ord << 16);
1496
1497                /* we have to split large buddy */
1498                BUG_ON(ord <= 0);
1499                buddy = mb_find_buddy(e4b, ord, &max);
1500                mb_set_bit(start >> ord, buddy);
1501                e4b->bd_info->bb_counters[ord]--;
1502
1503                ord--;
1504                cur = (start >> ord) & ~1U;
1505                buddy = mb_find_buddy(e4b, ord, &max);
1506                mb_clear_bit(cur, buddy);
1507                mb_clear_bit(cur + 1, buddy);
1508                e4b->bd_info->bb_counters[ord]++;
1509                e4b->bd_info->bb_counters[ord]++;
1510        }
1511        mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1512
1513        mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
1514        mb_check_buddy(e4b);
1515
1516        return ret;
1517}
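
/*
 * Editorial note (illustrative example, not in the original source):
 * the return value packs two facts about the first buddy split into one
 * word: ret = len | (ord << 16), i.e. the low 16 bits hold the length
 * that was still unallocated when the first split happened and the high
 * 16 bits hold the order of the buddy that had to be split.
 * ext4_mb_use_best_found() below unpacks them as ac_tail = ret & 0xffff
 * and ac_buddy = ret >> 16 for later bookkeeping.  If the extent fits
 * whole buddies exactly, no split is needed and ret stays 0.
 */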
1518
1519/*
1520 * Must be called under group lock!
1521 */
1522static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1523                                        struct ext4_buddy *e4b)
1524{
1525        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1526        int ret;
1527
1528        BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
1529        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1530
1531        ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
1532        ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
1533        ret = mb_mark_used(e4b, &ac->ac_b_ex);
1534
1535        /* preallocation can change ac_b_ex, thus we store the actually
1536         * allocated blocks for history */
1537        ac->ac_f_ex = ac->ac_b_ex;
1538
1539        ac->ac_status = AC_STATUS_FOUND;
1540        ac->ac_tail = ret & 0xffff;
1541        ac->ac_buddy = ret >> 16;
1542
1543        /*
1544         * take the page reference. We want the page to be pinned
1545         * so that we don't get an ext4_mb_init_cache() call for this
1546         * group until we update the bitmap. That would mean we could
1547         * double-allocate blocks. The reference is dropped in
1548         * ext4_mb_release_context()
1549         */
1550        ac->ac_bitmap_page = e4b->bd_bitmap_page;
1551        get_page(ac->ac_bitmap_page);
1552        ac->ac_buddy_page = e4b->bd_buddy_page;
1553        get_page(ac->ac_buddy_page);
1554        /* store last allocated for subsequent stream allocation */
1555        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1556                spin_lock(&sbi->s_md_lock);
1557                sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1558                sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
1559                spin_unlock(&sbi->s_md_lock);
1560        }
1561}
1562
1563/*
1564 * regular allocator, for general purposes allocation
1565 */
1566
1567static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1568                                        struct ext4_buddy *e4b,
1569                                        int finish_group)
1570{
1571        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1572        struct ext4_free_extent *bex = &ac->ac_b_ex;
1573        struct ext4_free_extent *gex = &ac->ac_g_ex;
1574        struct ext4_free_extent ex;
1575        int max;
1576
1577        if (ac->ac_status == AC_STATUS_FOUND)
1578                return;
1579        /*
1580         * We don't want to scan for a whole year
1581         */
1582        if (ac->ac_found > sbi->s_mb_max_to_scan &&
1583                        !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1584                ac->ac_status = AC_STATUS_BREAK;
1585                return;
1586        }
1587
1588        /*
1589         * Haven't found a good chunk so far, let's continue
1590         */
1591        if (bex->fe_len < gex->fe_len)
1592                return;
1593
1594        if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
1595                        && bex->fe_group == e4b->bd_group) {
1596                /* recheck chunk's availability - we don't know
1597                 * when it was found (within this lock-unlock
1598                 * period or not) */
1599                max = mb_find_extent(e4b, 0, bex->fe_start, gex->fe_len, &ex);
1600                if (max >= gex->fe_len) {
1601                        ext4_mb_use_best_found(ac, e4b);
1602                        return;
1603                }
1604        }
1605}
1606
1607/*
1608 * The routine checks whether the found extent is good enough. If it is,
1609 * the extent gets marked used and a flag is set in the context to stop
1610 * scanning. Otherwise, the extent is compared with the previously found
1611 * extent and, if the new one is better, it is stored in the context.
1612 * Later, the best found extent will be used, if mballoc can't find a
1613 * good enough extent.
1614 *
1615 * FIXME: real allocation policy is to be designed yet!
1616 */
1617static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1618                                        struct ext4_free_extent *ex,
1619                                        struct ext4_buddy *e4b)
1620{
1621        struct ext4_free_extent *bex = &ac->ac_b_ex;
1622        struct ext4_free_extent *gex = &ac->ac_g_ex;
1623
1624        BUG_ON(ex->fe_len <= 0);
1625        BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1626        BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
1627        BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1628
1629        ac->ac_found++;
1630
1631        /*
1632         * The special case - take what you catch first
1633         */
1634        if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1635                *bex = *ex;
1636                ext4_mb_use_best_found(ac, e4b);
1637                return;
1638        }
1639
1640        /*
1641         * Let's check whether the chunk is good enough
1642         */
1643        if (ex->fe_len == gex->fe_len) {
1644                *bex = *ex;
1645                ext4_mb_use_best_found(ac, e4b);
1646                return;
1647        }
1648
1649        /*
1650         * If this is the first found extent, just store it in the context
1651         */
1652        if (bex->fe_len == 0) {
1653                *bex = *ex;
1654                return;
1655        }
1656
1657        /*
1658         * If the newly found extent is better, store it in the context
1659         */
1660        if (bex->fe_len < gex->fe_len) {
1661                /* if the request isn't satisfied, any found extent
1662                 * larger than the previous best one is better */
1663                if (ex->fe_len > bex->fe_len)
1664                        *bex = *ex;
1665        } else if (ex->fe_len > gex->fe_len) {
1666                /* if the request is satisfied, then we try to find
1667                 * an extent that still satisfies the request, but is
1668                 * smaller than the previous one */
1669                if (ex->fe_len < bex->fe_len)
1670                        *bex = *ex;
1671        }
1672
1673        ext4_mb_check_limits(ac, e4b, 0);
1674}
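
/*
 * Editorial note (worked example, not in the original source): with a
 * goal of gex->fe_len = 8 blocks, a 6-block extent found first is stored
 * as the best so far; a later 10-block extent replaces it (the goal is
 * not yet satisfied, so bigger is better); a later 9-block extent then
 * replaces the 10-block one (both satisfy the goal, so the smaller one
 * wastes less); an exact 8-block extent would stop the scan immediately
 * via ext4_mb_use_best_found().
 */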
1675
1676static noinline_for_stack
1677int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1678                                        struct ext4_buddy *e4b)
1679{
1680        struct ext4_free_extent ex = ac->ac_b_ex;
1681        ext4_group_t group = ex.fe_group;
1682        int max;
1683        int err;
1684
1685        BUG_ON(ex.fe_len <= 0);
1686        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1687        if (err)
1688                return err;
1689
1690        ext4_lock_group(ac->ac_sb, group);
1691        max = mb_find_extent(e4b, 0, ex.fe_start, ex.fe_len, &ex);
1692
1693        if (max > 0) {
1694                ac->ac_b_ex = ex;
1695                ext4_mb_use_best_found(ac, e4b);
1696        }
1697
1698        ext4_unlock_group(ac->ac_sb, group);
1699        ext4_mb_unload_buddy(e4b);
1700
1701        return 0;
1702}
1703
1704static noinline_for_stack
1705int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1706                                struct ext4_buddy *e4b)
1707{
1708        ext4_group_t group = ac->ac_g_ex.fe_group;
1709        int max;
1710        int err;
1711        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1712        struct ext4_free_extent ex;
1713
1714        if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1715                return 0;
1716
1717        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1718        if (err)
1719                return err;
1720
1721        ext4_lock_group(ac->ac_sb, group);
1722        max = mb_find_extent(e4b, 0, ac->ac_g_ex.fe_start,
1723                             ac->ac_g_ex.fe_len, &ex);
1724
1725        if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1726                ext4_fsblk_t start;
1727
1728                start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1729                        ex.fe_start;
1730                /* use do_div to get remainder (would be 64-bit modulo) */
1731                if (do_div(start, sbi->s_stripe) == 0) {
1732                        ac->ac_found++;
1733                        ac->ac_b_ex = ex;
1734                        ext4_mb_use_best_found(ac, e4b);
1735                }
1736        } else if (max >= ac->ac_g_ex.fe_len) {
1737                BUG_ON(ex.fe_len <= 0);
1738                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1739                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1740                ac->ac_found++;
1741                ac->ac_b_ex = ex;
1742                ext4_mb_use_best_found(ac, e4b);
1743        } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
1744                /* Sometimes, the caller may want to merge even a small
1745                 * number of blocks into an existing extent */
1746                BUG_ON(ex.fe_len <= 0);
1747                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1748                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1749                ac->ac_found++;
1750                ac->ac_b_ex = ex;
1751                ext4_mb_use_best_found(ac, e4b);
1752        }
1753        ext4_unlock_group(ac->ac_sb, group);
1754        ext4_mb_unload_buddy(e4b);
1755
1756        return 0;
1757}
1758
1759/*
1760 * The routine scans buddy structures (not the bitmap!) from the given order
1761 * up to the max order and tries to find a big enough chunk to satisfy the request
1762 */
1763static noinline_for_stack
1764void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1765                                        struct ext4_buddy *e4b)
1766{
1767        struct super_block *sb = ac->ac_sb;
1768        struct ext4_group_info *grp = e4b->bd_info;
1769        void *buddy;
1770        int i;
1771        int k;
1772        int max;
1773
1774        BUG_ON(ac->ac_2order <= 0);
1775        for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
1776                if (grp->bb_counters[i] == 0)
1777                        continue;
1778
1779                buddy = mb_find_buddy(e4b, i, &max);
1780                BUG_ON(buddy == NULL);
1781
1782                k = mb_find_next_zero_bit(buddy, max, 0);
1783                BUG_ON(k >= max);
1784
1785                ac->ac_found++;
1786
1787                ac->ac_b_ex.fe_len = 1 << i;
1788                ac->ac_b_ex.fe_start = k << i;
1789                ac->ac_b_ex.fe_group = e4b->bd_group;
1790
1791                ext4_mb_use_best_found(ac, e4b);
1792
1793                BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1794
1795                if (EXT4_SB(sb)->s_mb_stats)
1796                        atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
1797
1798                break;
1799        }
1800}
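
/*
 * Editorial note (illustrative example, not in the original source): for
 * a 16-block power-of-two request ac_2order is 4.  If the group has
 * bb_counters[4] == 0 but bb_counters[5] > 0, order 4 is skipped and the
 * first free order-5 buddy is taken, yielding a 32-block extent;
 * ext4_mb_use_best_found() then clamps fe_len to the 16-block goal, so
 * only part of that buddy is actually marked used by mb_mark_used().
 */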
1801
1802/*
1803 * The routine scans the group and measures all found extents.
1804 * In order to optimize scanning, the caller must pass the number of
1805 * free blocks in the group, so the routine knows the upper limit.
1806 */
1807static noinline_for_stack
1808void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1809                                        struct ext4_buddy *e4b)
1810{
1811        struct super_block *sb = ac->ac_sb;
1812        void *bitmap = EXT4_MB_BITMAP(e4b);
1813        struct ext4_free_extent ex;
1814        int i;
1815        int free;
1816
1817        free = e4b->bd_info->bb_free;
1818        BUG_ON(free <= 0);
1819
1820        i = e4b->bd_info->bb_first_free;
1821
1822        while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1823                i = mb_find_next_zero_bit(bitmap,
1824                                                EXT4_BLOCKS_PER_GROUP(sb), i);
1825                if (i >= EXT4_BLOCKS_PER_GROUP(sb)) {
1826                        /*
1827                         * If we have a corrupt bitmap, we won't find any
1828                         * free blocks even though the group info says we
1829                         * have free blocks
1830                         */
1831                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1832                                        "%d free blocks as per "
1833                                        "group info. But bitmap says 0",
1834                                        free);
1835                        break;
1836                }
1837
1838                mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex);
1839                BUG_ON(ex.fe_len <= 0);
1840                if (free < ex.fe_len) {
1841                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1842                                        "%d free blocks as per "
1843                                        "group info. But got %d blocks",
1844                                        free, ex.fe_len);
1845                        /*
1846                         * The number of free blocks differs. This mostly
1847                         * indicates that the bitmap is corrupt. So exit
1848                         * without claiming the space.
1849                         */
1850                        break;
1851                }
1852
1853                ext4_mb_measure_extent(ac, &ex, e4b);
1854
1855                i += ex.fe_len;
1856                free -= ex.fe_len;
1857        }
1858
1859        ext4_mb_check_limits(ac, e4b, 1);
1860}
1861
1862/*
1863 * This is a special case for storage like RAID5:
1864 * we try to find stripe-aligned chunks for stripe-size-multiple requests
1865 */
1866static noinline_for_stack
1867void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1868                                 struct ext4_buddy *e4b)
1869{
1870        struct super_block *sb = ac->ac_sb;
1871        struct ext4_sb_info *sbi = EXT4_SB(sb);
1872        void *bitmap = EXT4_MB_BITMAP(e4b);
1873        struct ext4_free_extent ex;
1874        ext4_fsblk_t first_group_block;
1875        ext4_fsblk_t a;
1876        ext4_grpblk_t i;
1877        int max;
1878
1879        BUG_ON(sbi->s_stripe == 0);
1880
1881        /* find first stripe-aligned block in group */
1882        first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
1883
1884        a = first_group_block + sbi->s_stripe - 1;
1885        do_div(a, sbi->s_stripe);
1886        i = (a * sbi->s_stripe) - first_group_block;
1887
1888        while (i < EXT4_BLOCKS_PER_GROUP(sb)) {
1889                if (!mb_test_bit(i, bitmap)) {
1890                        max = mb_find_extent(e4b, 0, i, sbi->s_stripe, &ex);
1891                        if (max >= sbi->s_stripe) {
1892                                ac->ac_found++;
1893                                ac->ac_b_ex = ex;
1894                                ext4_mb_use_best_found(ac, e4b);
1895                                break;
1896                        }
1897                }
1898                i += sbi->s_stripe;
1899        }
1900}
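
/*
 * Editorial note (worked example, not in the original source): with
 * s_stripe = 16 and first_group_block = 32770, a = 32770 + 15 = 32785;
 * do_div(a, 16) leaves a = 2049, so i = 2049 * 16 - 32770 = 14 is the
 * first group-relative block that is stripe aligned on disk.  The scan
 * then probes the bitmap every s_stripe blocks from there.
 */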
1901
1902/* This is now called BEFORE we load the buddy bitmap. */
1903static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1904                                ext4_group_t group, int cr)
1905{
1906        unsigned free, fragments;
1907        int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1908        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1909
1910        BUG_ON(cr < 0 || cr >= 4);
1911
1912        /* We only do this if the grp has never been initialized */
1913        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1914                int ret = ext4_mb_init_group(ac->ac_sb, group);
1915                if (ret)
1916                        return 0;
1917        }
1918
1919        free = grp->bb_free;
1920        fragments = grp->bb_fragments;
1921        if (free == 0)
1922                return 0;
1923        if (fragments == 0)
1924                return 0;
1925
1926        switch (cr) {
1927        case 0:
1928                BUG_ON(ac->ac_2order == 0);
1929
1930                if (grp->bb_largest_free_order < ac->ac_2order)
1931                        return 0;
1932
1933                /* Avoid using the first bg of a flexgroup for data files */
1934                if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
1935                    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
1936                    ((group % flex_size) == 0))
1937                        return 0;
1938
1939                return 1;
1940        case 1:
1941                if ((free / fragments) >= ac->ac_g_ex.fe_len)
1942                        return 1;
1943                break;
1944        case 2:
1945                if (free >= ac->ac_g_ex.fe_len)
1946                        return 1;
1947                break;
1948        case 3:
1949                return 1;
1950        default:
1951                BUG();
1952        }
1953
1954        return 0;
1955}
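
/*
 * Editorial note (illustrative summary, not in the original source):
 * the cr levels above become progressively cheaper to satisfy.  cr == 0
 * wants a free buddy of order >= ac_2order, cr == 1 wants the average
 * fragment size (free / fragments) to cover the goal, cr == 2 only wants
 * enough free blocks in total, and cr == 3 takes any group with free
 * space.  E.g. a group with free = 100, fragments = 5 fails cr == 1 for
 * a 32-block goal (100 / 5 = 20 < 32) but passes cr == 2 (100 >= 32).
 */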
1956
1957static noinline_for_stack int
1958ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
1959{
1960        ext4_group_t ngroups, group, i;
1961        int cr;
1962        int err = 0;
1963        struct ext4_sb_info *sbi;
1964        struct super_block *sb;
1965        struct ext4_buddy e4b;
1966
1967        sb = ac->ac_sb;
1968        sbi = EXT4_SB(sb);
1969        ngroups = ext4_get_groups_count(sb);
1970        /* non-extent files are limited to low blocks/groups */
1971        if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
1972                ngroups = sbi->s_blockfile_groups;
1973
1974        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1975
1976        /* first, try the goal */
1977        err = ext4_mb_find_by_goal(ac, &e4b);
1978        if (err || ac->ac_status == AC_STATUS_FOUND)
1979                goto out;
1980
1981        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
1982                goto out;
1983
1984        /*
1985         * ac->ac_2order is set only if the fe_len is a power of 2;
1986         * if ac_2order is set we also set the criteria to 0 so that we
1987         * try exact allocation using the buddy.
1988         */
1989        i = fls(ac->ac_g_ex.fe_len);
1990        ac->ac_2order = 0;
1991        /*
1992         * We search using buddy data only if the order of the request
1993         * is greater than or equal to sbi->s_mb_order2_reqs.
1994         * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
1995         */
1996        if (i >= sbi->s_mb_order2_reqs) {
1997                /*
1998                 * This should tell if fe_len is exactly a power of 2
1999                 */
2000                if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2001                        ac->ac_2order = i - 1;
2002        }
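
        /*
         * Editorial note (worked example, not in the original source): for
         * fe_len = 16, fls(16) = 5 and 16 & ~(1 << 4) == 0, so ac_2order
         * becomes 4 and the scan starts at cr == 0 (exact buddy search).
         * For fe_len = 24, fls(24) is also 5 but 24 & ~(1 << 4) = 8 != 0,
         * so ac_2order stays 0 and the scan starts at cr == 1.
         */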
2003
2004        /* if stream allocation is enabled, use global goal */
2005        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2006                /* TBD: may be hot point */
2007                spin_lock(&sbi->s_md_lock);
2008                ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2009                ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2010                spin_unlock(&sbi->s_md_lock);
2011        }
2012
2013        /* Let's just scan groups to find more or less suitable blocks */
2014        cr = ac->ac_2order ? 0 : 1;
2015        /*
2016         * cr == 0 try to get exact allocation,
2017         * cr == 3  try to get anything
2018         */
2019repeat:
2020        for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2021                ac->ac_criteria = cr;
2022                /*
2023                 * searching for the right group start
2024                 * from the goal value specified
2025                 */
2026                group = ac->ac_g_ex.fe_group;
2027
2028                for (i = 0; i < ngroups; group++, i++) {
2029                        if (group == ngroups)
2030                                group = 0;
2031
2032                        /* This now checks without needing the buddy page */
2033                        if (!ext4_mb_good_group(ac, group, cr))
2034                                continue;
2035
2036                        err = ext4_mb_load_buddy(sb, group, &e4b);
2037                        if (err)
2038                                goto out;
2039
2040                        ext4_lock_group(sb, group);
2041
2042                        /*
2043                         * We need to check again after locking the
2044                         * block group
2045                         */
2046                        if (!ext4_mb_good_group(ac, group, cr)) {
2047                                ext4_unlock_group(sb, group);
2048                                ext4_mb_unload_buddy(&e4b);
2049                                continue;
2050                        }
2051
2052                        ac->ac_groups_scanned++;
2053                        if (cr == 0)
2054                                ext4_mb_simple_scan_group(ac, &e4b);
2055                        else if (cr == 1 && sbi->s_stripe &&
2056                                        !(ac->ac_g_ex.fe_len % sbi->s_stripe))
2057                                ext4_mb_scan_aligned(ac, &e4b);
2058                        else
2059                                ext4_mb_complex_scan_group(ac, &e4b);
2060
2061                        ext4_unlock_group(sb, group);
2062                        ext4_mb_unload_buddy(&e4b);
2063
2064                        if (ac->ac_status != AC_STATUS_CONTINUE)
2065                                break;
2066                }
2067        }
2068
2069        if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2070            !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2071                /*
2072                 * We've been searching too long. Let's try to allocate
2073                 * the best chunk we've found so far
2074                 */
2075
2076                ext4_mb_try_best_found(ac, &e4b);
2077                if (ac->ac_status != AC_STATUS_FOUND) {
2078                        /*
2079                         * Someone luckier has already allocated it.
2080                         * The only thing we can do is just take the first
2081                         * found block(s)
2082                        printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
2083                         */
2084                        ac->ac_b_ex.fe_group = 0;
2085                        ac->ac_b_ex.fe_start = 0;
2086                        ac->ac_b_ex.fe_len = 0;
2087                        ac->ac_status = AC_STATUS_CONTINUE;
2088                        ac->ac_flags |= EXT4_MB_HINT_FIRST;
2089                        cr = 3;
2090                        atomic_inc(&sbi->s_mb_lost_chunks);
2091                        goto repeat;
2092                }
2093        }
2094out:
2095        return err;
2096}
2097
2098static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2099{
2100        struct super_block *sb = seq->private;
2101        ext4_group_t group;
2102
2103        if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2104                return NULL;
2105        group = *pos + 1;
2106        return (void *) ((unsigned long) group);
2107}
2108
2109static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2110{
2111        struct super_block *sb = seq->private;
2112        ext4_group_t group;
2113
2114        ++*pos;
2115        if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2116                return NULL;
2117        group = *pos + 1;
2118        return (void *) ((unsigned long) group);
2119}
2120
2121static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2122{
2123        struct super_block *sb = seq->private;
2124        ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2125        int i;
2126        int err;
2127        struct ext4_buddy e4b;
2128        struct sg {
2129                struct ext4_group_info info;
2130                ext4_grpblk_t counters[16];
2131        } sg;
2132
2133        group--;
2134        if (group == 0)
2135                seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
2136                                "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
2137                                  "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
2138                           "group", "free", "frags", "first",
2139                           "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
2140                           "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
2141
2142        i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2143                sizeof(struct ext4_group_info);
2144        err = ext4_mb_load_buddy(sb, group, &e4b);
2145        if (err) {
2146                seq_printf(seq, "#%-5u: I/O error\n", group);
2147                return 0;
2148        }
2149        ext4_lock_group(sb, group);
2150        memcpy(&sg, ext4_get_group_info(sb, group), i);
2151        ext4_unlock_group(sb, group);
2152        ext4_mb_unload_buddy(&e4b);
2153
2154        seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2155                        sg.info.bb_fragments, sg.info.bb_first_free);
2156        for (i = 0; i <= 13; i++)
2157                seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
2158                                sg.info.bb_counters[i] : 0);
2159        seq_printf(seq, " ]\n");
2160
2161        return 0;
2162}
2163
2164static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2165{
2166}
2167
2168static const struct seq_operations ext4_mb_seq_groups_ops = {
2169        .start  = ext4_mb_seq_groups_start,
2170        .next   = ext4_mb_seq_groups_next,
2171        .stop   = ext4_mb_seq_groups_stop,
2172        .show   = ext4_mb_seq_groups_show,
2173};
2174
2175static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2176{
2177        struct super_block *sb = PDE(inode)->data;
2178        int rc;
2179
2180        rc = seq_open(file, &ext4_mb_seq_groups_ops);
2181        if (rc == 0) {
2182                struct seq_file *m = file->private_data;
2183                m->private = sb;
2184        }
2185        return rc;
2186
2187}
2188
2189static const struct file_operations ext4_mb_seq_groups_fops = {
2190        .owner          = THIS_MODULE,
2191        .open           = ext4_mb_seq_groups_open,
2192        .read           = seq_read,
2193        .llseek         = seq_lseek,
2194        .release        = seq_release,
2195};
2196
2197static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2198{
2199        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2200        struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2201
2202        BUG_ON(!cachep);
2203        return cachep;
2204}
2205
2206/* Create and initialize ext4_group_info data for the given group. */
2207int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2208                          struct ext4_group_desc *desc)
2209{
2210        int i;
2211        int metalen = 0;
2212        struct ext4_sb_info *sbi = EXT4_SB(sb);
2213        struct ext4_group_info **meta_group_info;
2214        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2215
2216        /*
2217         * First check if this group is the first group of a descriptor block.
2218         * If so, we have to allocate a new table of pointers
2219         * to ext4_group_info structures
2220         */
2221        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2222                metalen = sizeof(*meta_group_info) <<
2223                        EXT4_DESC_PER_BLOCK_BITS(sb);
2224                meta_group_info = kmalloc(metalen, GFP_KERNEL);
2225                if (meta_group_info == NULL) {
2226                        printk(KERN_ERR "EXT4-fs: can't allocate mem for a "
2227                               "buddy group\n");
2228                        goto exit_meta_group_info;
2229                }
2230                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
2231                        meta_group_info;
2232        }
2233
2234        meta_group_info =
2235                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2236        i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2237
2238        meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
2239        if (meta_group_info[i] == NULL) {
2240                printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n");
2241                goto exit_group_info;
2242        }
2243        memset(meta_group_info[i], 0, kmem_cache_size(cachep));
2244        set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2245                &(meta_group_info[i]->bb_state));
2246
2247        /*
2248         * initialize bb_free to be able to skip
2249         * empty groups without initialization
2250         */
2251        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2252                meta_group_info[i]->bb_free =
2253                        ext4_free_blocks_after_init(sb, group, desc);
2254        } else {
2255                meta_group_info[i]->bb_free =
2256                        ext4_free_blks_count(sb, desc);
2257        }
2258
2259        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2260        init_rwsem(&meta_group_info[i]->alloc_sem);
2261        meta_group_info[i]->bb_free_root = RB_ROOT;
2262        meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
2263
2264#ifdef DOUBLE_CHECK
2265        {
2266                struct buffer_head *bh;
2267                meta_group_info[i]->bb_bitmap =
2268                        kmalloc(sb->s_blocksize, GFP_KERNEL);
2269                BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2270                bh = ext4_read_block_bitmap(sb, group);
2271                BUG_ON(bh == NULL);
2272                memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2273                        sb->s_blocksize);
2274                put_bh(bh);
2275        }
2276#endif
2277
2278        return 0;
2279
2280exit_group_info:
2281        /* If a meta_group_info table has been allocated, release it now */
2282        if (group % EXT4_DESC_PER_BLOCK(sb) == 0)
2283                kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
2284exit_meta_group_info:
2285        return -ENOMEM;
2286} /* ext4_mb_add_groupinfo */
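
/*
 * Editorial note (illustrative example, not in the original source):
 * s_group_info is a two-level table indexed per descriptor block.
 * Assuming 4KB blocks and 32-byte group descriptors, EXT4_DESC_PER_BLOCK
 * is 128, so group 300 lands in first-level slot 300 >> 7 = 2 and
 * second-level slot i = 300 & 127 = 44; the first-level table for a slot
 * is only allocated when its first group (group % 128 == 0) is added.
 */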
2287
2288static int ext4_mb_init_backend(struct super_block *sb)
2289{
2290        ext4_group_t ngroups = ext4_get_groups_count(sb);
2291        ext4_group_t i;
2292        struct ext4_sb_info *sbi = EXT4_SB(sb);
2293        struct ext4_super_block *es = sbi->s_es;
2294        int num_meta_group_infos;
2295        int num_meta_group_infos_max;
2296        int array_size;
2297        struct ext4_group_desc *desc;
2298        struct kmem_cache *cachep;
2299
2300        /* This is the number of blocks used by GDT */
2301        num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
2302                                1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
2303
2304        /*
2305         * This is the total number of blocks used by GDT including
2306         * the number of reserved blocks for GDT.
2307         * The s_group_info array is allocated with this value
2308         * to allow a clean online resize without a complex
2309         * manipulation of pointers.
2310         * The drawback is the unused memory when no resize
2311         * occurs, but it's very low in terms of pages
2312         * (see comments below).
2313         * This needs to be handled properly when META_BG resizing is allowed
2314         */
2315        num_meta_group_infos_max = num_meta_group_infos +
2316                                le16_to_cpu(es->s_reserved_gdt_blocks);
2317
2318        /*
2319         * array_size is the size of the s_group_info array. We round it
2320         * up to the next power of two because kmalloc rounds allocations
2321         * that way internally, so we get some more memory for free here
2322         * (e.g. it may be used for a META_BG resize).
2323         */
2324        array_size = 1;
2325        while (array_size < sizeof(*sbi->s_group_info) *
2326               num_meta_group_infos_max)
2327                array_size = array_size << 1;
2328        /* An 8TB filesystem with 64-bit pointers requires a 4096 byte
2329         * kmalloc. A 128kb malloc should suffice for a 256TB filesystem.
2330         * So a two level scheme suffices for now. */
2331        sbi->s_group_info = kzalloc(array_size, GFP_KERNEL);
2332        if (sbi->s_group_info == NULL) {
2333                printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n");
2334                return -ENOMEM;
2335        }
2336        sbi->s_buddy_cache = new_inode(sb);
2337        if (sbi->s_buddy_cache == NULL) {
2338                printk(KERN_ERR "EXT4-fs: can't get new inode\n");
2339                goto err_freesgi;
2340        }
2341        sbi->s_buddy_cache->i_ino = get_next_ino();
2342        EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2343        for (i = 0; i < ngroups; i++) {
2344                desc = ext4_get_group_desc(sb, i, NULL);
2345                if (desc == NULL) {
2346                        printk(KERN_ERR
2347                                "EXT4-fs: can't read descriptor %u\n", i);
2348                        goto err_freebuddy;
2349                }
2350                if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
2351                        goto err_freebuddy;
2352        }
2353
2354        return 0;
2355
2356err_freebuddy:
2357        cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2358        while (i-- > 0)
2359                kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2360        i = num_meta_group_infos;
2361        while (i-- > 0)
2362                kfree(sbi->s_group_info[i]);
2363        iput(sbi->s_buddy_cache);
2364err_freesgi:
2365        kfree(sbi->s_group_info);
2366        return -ENOMEM;
2367}
2368
2369static void ext4_groupinfo_destroy_slabs(void)
2370{
2371        int i;
2372
2373        for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2374                if (ext4_groupinfo_caches[i])
2375                        kmem_cache_destroy(ext4_groupinfo_caches[i]);
2376                ext4_groupinfo_caches[i] = NULL;
2377        }
2378}
2379
2380static int ext4_groupinfo_create_slab(size_t size)
2381{
2382        static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2383        int slab_size;
2384        int blocksize_bits = order_base_2(size);
2385        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2386        struct kmem_cache *cachep;
2387
2388        if (cache_index >= NR_GRPINFO_CACHES)
2389                return -EINVAL;
2390
2391        if (unlikely(cache_index < 0))
2392                cache_index = 0;
2393
2394        mutex_lock(&ext4_grpinfo_slab_create_mutex);
2395        if (ext4_groupinfo_caches[cache_index]) {
2396                mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2397                return 0;       /* Already created */
2398        }
2399
2400        slab_size = offsetof(struct ext4_group_info,
2401                                bb_counters[blocksize_bits + 2]);
2402
2403        cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2404                                        slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2405                                        NULL);
2406
2407        mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2408        if (!cachep) {
2409                printk(KERN_EMERG "EXT4: no memory for groupinfo slab cache\n");
2410                return -ENOMEM;
2411        }
2412
2413        ext4_groupinfo_caches[cache_index] = cachep;
2414
2415        return 0;
2416}
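
/*
 * Editorial note (not in the original source): the slab size is just the
 * struct up to and including the per-order counters.  For a 4KB block
 * size (blocksize_bits = 12) that is
 * offsetof(struct ext4_group_info, bb_counters[14]), i.e. room for the
 * counters of orders 0..13 that ext4_mb_init() below fills in, with one
 * cache per supported block size.
 */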
2417
2418int ext4_mb_init(struct super_block *sb, int needs_recovery)
2419{
2420        struct ext4_sb_info *sbi = EXT4_SB(sb);
2421        unsigned i, j;
2422        unsigned offset;
2423        unsigned max;
2424        int ret;
2425
2426        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2427
2428        sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2429        if (sbi->s_mb_offsets == NULL) {
2430                ret = -ENOMEM;
2431                goto out;
2432        }
2433
2434        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2435        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2436        if (sbi->s_mb_maxs == NULL) {
2437                ret = -ENOMEM;
2438                goto out;
2439        }
2440
2441        ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2442        if (ret < 0)
2443                goto out;
2444
2445        /* order 0 is regular bitmap */
2446        sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2447        sbi->s_mb_offsets[0] = 0;
2448
2449        i = 1;
2450        offset = 0;
2451        max = sb->s_blocksize << 2;
2452        do {
2453                sbi->s_mb_offsets[i] = offset;
2454                sbi->s_mb_maxs[i] = max;
2455                offset += 1 << (sb->s_blocksize_bits - i);
2456                max = max >> 1;
2457                i++;
2458        } while (i <= sb->s_blocksize_bits + 1);
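
        /*
         * Editorial note (worked example, not in the original source): for a
         * 4KB block size this gives s_mb_maxs[0] = 32768 (bits in the on-disk
         * bitmap), then for the buddy block s_mb_offsets[1] = 0 with
         * s_mb_maxs[1] = 16384 bits, s_mb_offsets[2] = 2048 with
         * s_mb_maxs[2] = 8192, s_mb_offsets[3] = 3072 with s_mb_maxs[3] = 4096,
         * and so on up to order 13: each order's bitmap is half the size of
         * the previous one, packed back to back inside one block.
         */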
2459
2460        /* init file for buddy data */
2461        ret = ext4_mb_init_backend(sb);
2462        if (ret != 0) {
2463                goto out;
2464        }
2465
2466        spin_lock_init(&sbi->s_md_lock);
2467        spin_lock_init(&sbi->s_bal_lock);
2468
2469        sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
2470        sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
2471        sbi->s_mb_stats = MB_DEFAULT_STATS;
2472        sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2473        sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2474        sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC;
2475
2476        sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2477        if (sbi->s_locality_groups == NULL) {
2478                ret = -ENOMEM;
2479                goto out;
2480        }
2481        for_each_possible_cpu(i) {
2482                struct ext4_locality_group *lg;
2483                lg = per_cpu_ptr(sbi->s_locality_groups, i);
2484                mutex_init(&lg->lg_mutex);
2485                for (j = 0; j < PREALLOC_TB_SIZE; j++)
2486                        INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
2487                spin_lock_init(&lg->lg_prealloc_lock);
2488        }
2489
2490        if (sbi->s_proc)
2491                proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2492                                 &ext4_mb_seq_groups_fops, sb);
2493
2494        if (sbi->s_journal)
2495                sbi->s_journal->j_commit_callback = release_blocks_on_commit;
2496out:
2497        if (ret) {
2498                kfree(sbi->s_mb_offsets);
2499                kfree(sbi->s_mb_maxs);
2500        }
2501        return ret;
2502}
2503
2504/* needs to be called with the ext4 group lock held */
2505static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2506{
2507        struct ext4_prealloc_space *pa;
2508        struct list_head *cur, *tmp;
2509        int count = 0;
2510
2511        list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
2512                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
2513                list_del(&pa->pa_group_list);
2514                count++;
2515                kmem_cache_free(ext4_pspace_cachep, pa);
2516        }
2517        if (count)
2518                mb_debug(1, "mballoc: %u PAs left\n", count);
2519
2520}
2521
2522int ext4_mb_release(struct super_block *sb)
2523{
2524        ext4_group_t ngroups = ext4_get_groups_count(sb);
2525        ext4_group_t i;
2526        int num_meta_group_infos;
2527        struct ext4_group_info *grinfo;
2528        struct ext4_sb_info *sbi = EXT4_SB(sb);
2529        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2530
2531        if (sbi->s_group_info) {
2532                for (i = 0; i < ngroups; i++) {
2533                        grinfo = ext4_get_group_info(sb, i);
2534#ifdef DOUBLE_CHECK
2535                        kfree(grinfo->bb_bitmap);
2536#endif
2537                        ext4_lock_group(sb, i);
2538                        ext4_mb_cleanup_pa(grinfo);
2539                        ext4_unlock_group(sb, i);
2540                        kmem_cache_free(cachep, grinfo);
2541                }
2542                num_meta_group_infos = (ngroups +
2543                                EXT4_DESC_PER_BLOCK(sb) - 1) >>
2544                        EXT4_DESC_PER_BLOCK_BITS(sb);
2545                for (i = 0; i < num_meta_group_infos; i++)
2546                        kfree(sbi->s_group_info[i]);
2547                kfree(sbi->s_group_info);
2548        }
2549        kfree(sbi->s_mb_offsets);
2550        kfree(sbi->s_mb_maxs);
2551        if (sbi->s_buddy_cache)
2552                iput(sbi->s_buddy_cache);
2553        if (sbi->s_mb_stats) {
2554                printk(KERN_INFO
2555                       "EXT4-fs: mballoc: %u blocks %u reqs (%u success)\n",
2556                                atomic_read(&sbi->s_bal_allocated),
2557                                atomic_read(&sbi->s_bal_reqs),
2558                                atomic_read(&sbi->s_bal_success));
2559                printk(KERN_INFO
2560                      "EXT4-fs: mballoc: %u extents scanned, %u goal hits, "
2561                                "%u 2^N hits, %u breaks, %u lost\n",
2562                                atomic_read(&sbi->s_bal_ex_scanned),
2563                                atomic_read(&sbi->s_bal_goals),
2564                                atomic_read(&sbi->s_bal_2orders),
2565                                atomic_read(&sbi->s_bal_breaks),
2566                                atomic_read(&sbi->s_mb_lost_chunks));
2567                printk(KERN_INFO
2568                       "EXT4-fs: mballoc: %lu generated and it took %Lu\n",
2569                                sbi->s_mb_buddies_generated++,
2570                                sbi->s_mb_generation_time);
2571                printk(KERN_INFO
2572                       "EXT4-fs: mballoc: %u preallocated, %u discarded\n",
2573                                atomic_read(&sbi->s_mb_preallocated),
2574                                atomic_read(&sbi->s_mb_discarded));
2575        }
2576
2577        free_percpu(sbi->s_locality_groups);
2578        if (sbi->s_proc)
2579                remove_proc_entry("mb_groups", sbi->s_proc);
2580
2581        return 0;
2582}
2583
2584static inline int ext4_issue_discard(struct super_block *sb,
2585                ext4_group_t block_group, ext4_grpblk_t block, int count)
2586{
2587        ext4_fsblk_t discard_block;
2588
2589        discard_block = block + ext4_group_first_block_no(sb, block_group);
2590        trace_ext4_discard_blocks(sb,
2591                        (unsigned long long) discard_block, count);
2592        return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2593}
2594
2595/*
2596 * This function is called by the jbd2 layer once the commit has finished,
2597 * so we know we can free the blocks that were released with that commit.
2598 */
2599static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2600{
2601        struct super_block *sb = journal->j_private;
2602        struct ext4_buddy e4b;
2603        struct ext4_group_info *db;
2604        int err, count = 0, count2 = 0;
2605        struct ext4_free_data *entry;
2606        struct list_head *l, *ltmp;
2607
2608        list_for_each_safe(l, ltmp, &txn->t_private_list) {
2609                entry = list_entry(l, struct ext4_free_data, list);
2610
2611                mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2612                         entry->count, entry->group, entry);
2613
2614                if (test_opt(sb, DISCARD))
2615                        ext4_issue_discard(sb, entry->group,
2616                                           entry->start_blk, entry->count);
2617
2618                err = ext4_mb_load_buddy(sb, entry->group, &e4b);
2619                /* we expect to find existing buddy because it's pinned */
2620                BUG_ON(err != 0);
2621
2622                db = e4b.bd_info;
2623                /* there are blocks to put in buddy to make them really free */
2624                count += entry->count;
2625                count2++;
2626                ext4_lock_group(sb, entry->group);
2627                /* Take it out of per group rb tree */
2628                rb_erase(&entry->node, &(db->bb_free_root));
2629                mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
2630
2631                if (!db->bb_free_root.rb_node) {
2632                        /* No more items in the per-group rb tree;
2633                         * balance refcounts from ext4_mb_free_metadata()
2634                         */
2635                        page_cache_release(e4b.bd_buddy_page);
2636                        page_cache_release(e4b.bd_bitmap_page);
2637                }
2638                ext4_unlock_group(sb, entry->group);
2639                kmem_cache_free(ext4_free_ext_cachep, entry);
2640                ext4_mb_unload_buddy(&e4b);
2641        }
2642
2643        mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2644}
2645
2646#ifdef CONFIG_EXT4_DEBUG
2647u8 mb_enable_debug __read_mostly;
2648
2649static struct dentry *debugfs_dir;
2650static struct dentry *debugfs_debug;
2651
2652static void __init ext4_create_debugfs_entry(void)
2653{
2654        debugfs_dir = debugfs_create_dir("ext4", NULL);
2655        if (debugfs_dir)
2656                debugfs_debug = debugfs_create_u8("mballoc-debug",
2657                                                  S_IRUGO | S_IWUSR,
2658                                                  debugfs_dir,
2659                                                  &mb_enable_debug);
2660}
2661
2662static void ext4_remove_debugfs_entry(void)
2663{
2664        debugfs_remove(debugfs_debug);
2665        debugfs_remove(debugfs_dir);
2666}
2667
2668#else
2669
2670static void __init ext4_create_debugfs_entry(void)
2671{
2672}
2673
2674static void ext4_remove_debugfs_entry(void)
2675{
2676}
2677
2678#endif
2679
2680int __init ext4_init_mballoc(void)
2681{
2682        ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2683                                        SLAB_RECLAIM_ACCOUNT);
2684        if (ext4_pspace_cachep == NULL)
2685                return -ENOMEM;
2686
2687        ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2688                                    SLAB_RECLAIM_ACCOUNT);
2689        if (ext4_ac_cachep == NULL) {
2690                kmem_cache_destroy(ext4_pspace_cachep);
2691                return -ENOMEM;
2692        }
2693
2694        ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
2695                                          SLAB_RECLAIM_ACCOUNT);
2696        if (ext4_free_ext_cachep == NULL) {
2697                kmem_cache_destroy(ext4_pspace_cachep);
2698                kmem_cache_destroy(ext4_ac_cachep);
2699                return -ENOMEM;
2700        }
2701        ext4_create_debugfs_entry();
2702        return 0;
2703}
2704
2705void ext4_exit_mballoc(void)
2706{
2707        /*
2708         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2709         * before destroying the slab cache.
2710         */
2711        rcu_barrier();
2712        kmem_cache_destroy(ext4_pspace_cachep);
2713        kmem_cache_destroy(ext4_ac_cachep);
2714        kmem_cache_destroy(ext4_free_ext_cachep);
2715        ext4_groupinfo_destroy_slabs();
2716        ext4_remove_debugfs_entry();
2717}
2718
2719
2720/*
2721 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
2722 * Returns 0 on success or an error code
2723 */
2724static noinline_for_stack int
2725ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2726                                handle_t *handle, unsigned int reserv_blks)
2727{
2728        struct buffer_head *bitmap_bh = NULL;
2729        struct ext4_group_desc *gdp;
2730        struct buffer_head *gdp_bh;
2731        struct ext4_sb_info *sbi;
2732        struct super_block *sb;
2733        ext4_fsblk_t block;
2734        int err, len;
2735
2736        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
2737        BUG_ON(ac->ac_b_ex.fe_len <= 0);
2738
2739        sb = ac->ac_sb;
2740        sbi = EXT4_SB(sb);
2741
2742        err = -EIO;
2743        bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
2744        if (!bitmap_bh)
2745                goto out_err;
2746
2747        err = ext4_journal_get_write_access(handle, bitmap_bh);
2748        if (err)
2749                goto out_err;
2750
2751        err = -EIO;
2752        gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
2753        if (!gdp)
2754                goto out_err;
2755
2756        ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2757                        ext4_free_blks_count(sb, gdp));
2758
2759        err = ext4_journal_get_write_access(handle, gdp_bh);
2760        if (err)
2761                goto out_err;
2762
2763        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2764
2765        len = ac->ac_b_ex.fe_len;
2766        if (!ext4_data_block_valid(sbi, block, len)) {
2767                ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2768                           "fs metadata\n", block, block+len);
2769                /* The file system is mounted not to panic on error.
2770                 * Fix the bitmap and repeat the block allocation.
2771                 * We leak some of the blocks here.
2772                 */
2773                ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2774                mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2775                            ac->ac_b_ex.fe_len);
2776                ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2777                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2778                if (!err)
2779                        err = -EAGAIN;
2780                goto out_err;
2781        }
2782
2783        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2784#ifdef AGGRESSIVE_CHECK
2785        {
2786                int i;
2787                for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
2788                        BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
2789                                                bitmap_bh->b_data));
2790                }
2791        }
2792#endif
2793        mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
2794        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2795                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2796                ext4_free_blks_set(sb, gdp,
2797                                        ext4_free_blocks_after_init(sb,
2798                                        ac->ac_b_ex.fe_group, gdp));
2799        }
2800        len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
2801        ext4_free_blks_set(sb, gdp, len);
2802        gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
2803
2804        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2805        percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
2806        /*
2807         * Now reduce the dirty block count also. Should not go negative
2808         */
2809        if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2810                /* release all the reserved blocks if non delalloc */
2811                percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks);
2812
2813        if (sbi->s_log_groups_per_flex) {
2814                ext4_group_t flex_group = ext4_flex_group(sbi,
2815                                                          ac->ac_b_ex.fe_group);
2816                atomic_sub(ac->ac_b_ex.fe_len,
2817                           &sbi->s_flex_groups[flex_group].free_blocks);
2818        }
2819
2820        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2821        if (err)
2822                goto out_err;
2823        err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
2824
2825out_err:
2826        ext4_mark_super_dirty(sb);
2827        brelse(bitmap_bh);
2828        return err;
2829}
2830
2831/*
2832 * here we normalize the request for the locality group.
2833 * Group requests are normalized to s_stripe size if that is set via the mount
2834 * option. If not, we set it to s_mb_group_prealloc, which can be configured via
2835 * /sys/fs/ext4/<partition>/mb_group_prealloc
2836 *
2837 * XXX: should we try to preallocate more than the group has now?
2838 */
2839static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
2840{
2841        struct super_block *sb = ac->ac_sb;
2842        struct ext4_locality_group *lg = ac->ac_lg;
2843
2844        BUG_ON(lg == NULL);
2845        if (EXT4_SB(sb)->s_stripe)
2846                ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe;
2847        else
2848                ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
2849        mb_debug(1, "#%u: goal %u blocks for locality group\n",
2850                current->pid, ac->ac_g_ex.fe_len);
2851}
2852
2853/*
2854 * Normalization means making request better in terms of
2855 * size and alignment
2856 */
2857static noinline_for_stack void
2858ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2859                                struct ext4_allocation_request *ar)
2860{
2861        int bsbits, max;
2862        ext4_lblk_t end;
2863        loff_t size, orig_size, start_off;
2864        ext4_lblk_t start;
2865        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
2866        struct ext4_prealloc_space *pa;
2867
2868        /* only normalize data requests; metadata requests
2869           do not need preallocation */
2870        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
2871                return;
2872
2873        /* sometimes the caller may want exactly the requested blocks */
2874        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2875                return;
2876
2877        /* caller may indicate that preallocation isn't
2878         * required (it's a tail, for example) */
2879        if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
2880                return;
2881
2882        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
2883                ext4_mb_normalize_group_request(ac);
2884                return;
2885        }
2886
2887        bsbits = ac->ac_sb->s_blocksize_bits;
2888
2889        /* first, let's learn the actual file size
2890         * we would have if the current request were allocated */
2891        size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
2892        size = size << bsbits;
2893        if (size < i_size_read(ac->ac_inode))
2894                size = i_size_read(ac->ac_inode);
2895        orig_size = size;
2896
2897        /* max size of free chunks */
2898        max = 2 << bsbits;
2899
2900#define NRL_CHECK_SIZE(req, size, max, chunk_size)      \
2901                (req <= (size) || max <= (chunk_size))
2902
2903        /* first, try to predict filesize */
2904        /* XXX: should this table be tunable? */
2905        start_off = 0;
2906        if (size <= 16 * 1024) {
2907                size = 16 * 1024;
2908        } else if (size <= 32 * 1024) {
2909                size = 32 * 1024;
2910        } else if (size <= 64 * 1024) {
2911                size = 64 * 1024;
2912        } else if (size <= 128 * 1024) {
2913                size = 128 * 1024;
2914        } else if (size <= 256 * 1024) {
2915                size = 256 * 1024;
2916        } else if (size <= 512 * 1024) {
2917                size = 512 * 1024;
2918        } else if (size <= 1024 * 1024) {
2919                size = 1024 * 1024;
2920        } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
2921                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
2922                                                (21 - bsbits)) << 21;
2923                size = 2 * 1024 * 1024;
2924        } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
2925                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
2926                                                        (22 - bsbits)) << 22;
2927                size = 4 * 1024 * 1024;
2928        } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
2929                                        (8<<20)>>bsbits, max, 8 * 1024)) {
2930                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
2931                                                        (23 - bsbits)) << 23;
2932                size = 8 * 1024 * 1024;
2933        } else {
2934                start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
2935                size      = ac->ac_o_ex.fe_len << bsbits;
2936        }
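            /*
             * e.g. a request that would leave the file at roughly 100 KB is
             * rounded up to the 128 KB bucket above (32 blocks with 4 KB
             * blocks), while requests past the last bucket are left as is.
             */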
2937        size = size >> bsbits;
2938        start = start_off >> bsbits;
2939
2940        /* don't cover already allocated blocks in selected range */
2941        if (ar->pleft && start <= ar->lleft) {
2942                size -= ar->lleft + 1 - start;
2943                start = ar->lleft + 1;
2944        }
2945        if (ar->pright && start + size - 1 >= ar->lright)
2946                size -= start + size - ar->lright;
2947
2948        end = start + size;
2949
2950        /* check we don't cross already preallocated blocks */
2951        rcu_read_lock();
2952        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
2953                ext4_lblk_t pa_end;
2954
2955                if (pa->pa_deleted)
2956                        continue;
2957                spin_lock(&pa->pa_lock);
2958                if (pa->pa_deleted) {
2959                        spin_unlock(&pa->pa_lock);
2960                        continue;
2961                }
2962
2963                pa_end = pa->pa_lstart + pa->pa_len;
2964
2965                /* PA must not overlap original request */
2966                BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
2967                        ac->ac_o_ex.fe_logical < pa->pa_lstart));
2968
2969                /* skip PAs this normalized request doesn't overlap with */
2970                if (pa->pa_lstart >= end || pa_end <= start) {
2971                        spin_unlock(&pa->pa_lock);
2972                        continue;
2973                }
2974                BUG_ON(pa->pa_lstart <= start && pa_end >= end);
2975
2976                /* adjust start or end to be adjacent to this pa */
2977                if (pa_end <= ac->ac_o_ex.fe_logical) {
2978                        BUG_ON(pa_end < start);
2979                        start = pa_end;
2980                } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
2981                        BUG_ON(pa->pa_lstart > end);
2982                        end = pa->pa_lstart;
2983                }
2984                spin_unlock(&pa->pa_lock);
2985        }
2986        rcu_read_unlock();
2987        size = end - start;
2988
2989        /* XXX: extra loop to check we really don't overlap preallocations */
2990        rcu_read_lock();
2991        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
2992                ext4_lblk_t pa_end;
2993                spin_lock(&pa->pa_lock);
2994                if (pa->pa_deleted == 0) {
2995                        pa_end = pa->pa_lstart + pa->pa_len;
2996                        BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
2997                }
2998                spin_unlock(&pa->pa_lock);
2999        }
3000        rcu_read_unlock();
3001
3002        if (start + size <= ac->ac_o_ex.fe_logical &&
3003                        start > ac->ac_o_ex.fe_logical) {
3004                printk(KERN_ERR "start %lu, size %lu, fe_logical %lu\n",
3005                        (unsigned long) start, (unsigned long) size,
3006                        (unsigned long) ac->ac_o_ex.fe_logical);
3007        }
3008        BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3009                        start > ac->ac_o_ex.fe_logical);
3010        BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
3011
3012        /* now prepare goal request */
3013
3014        /* XXX: is it better to align blocks WRT logical
3015         * placement or to satisfy the big request as is? */
3016        ac->ac_g_ex.fe_logical = start;
3017        ac->ac_g_ex.fe_len = size;
3018
3019        /* define goal start in order to merge */
3020        if (ar->pright && (ar->lright == (start + size))) {
3021                /* merge to the right */
3022                ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3023                                                &ac->ac_f_ex.fe_group,
3024                                                &ac->ac_f_ex.fe_start);
3025                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3026        }
3027        if (ar->pleft && (ar->lleft + 1 == start)) {
3028                /* merge to the left */
3029                ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3030                                                &ac->ac_f_ex.fe_group,
3031                                                &ac->ac_f_ex.fe_start);
3032                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3033        }
3034
3035        mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3036                (unsigned) orig_size, (unsigned) start);
3037}
3038
3039static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3040{
3041        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3042
3043        if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
3044                atomic_inc(&sbi->s_bal_reqs);
3045                atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
3046                if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
3047                        atomic_inc(&sbi->s_bal_success);
3048                atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3049                if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
3050                                ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
3051                        atomic_inc(&sbi->s_bal_goals);
3052                if (ac->ac_found > sbi->s_mb_max_to_scan)
3053                        atomic_inc(&sbi->s_bal_breaks);
3054        }
3055
3056        if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3057                trace_ext4_mballoc_alloc(ac);
3058        else
3059                trace_ext4_mballoc_prealloc(ac);
3060}
3061
3062/*
3063 * Called on failure; free up any blocks from the inode PA for this
3064 * context.  We don't need this for MB_GROUP_PA because we only change
3065 * pa_free in ext4_mb_release_context(), but on failure, we've already
3066 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
3067 */
3068static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3069{
3070        struct ext4_prealloc_space *pa = ac->ac_pa;
3071        int len;
3072
3073        if (pa && pa->pa_type == MB_INODE_PA) {
3074                len = ac->ac_b_ex.fe_len;
3075                pa->pa_free += len;
3076        }
3077
3078}
3079
3080/*
3081 * use blocks preallocated to inode
3082 */
3083static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3084                                struct ext4_prealloc_space *pa)
3085{
3086        ext4_fsblk_t start;
3087        ext4_fsblk_t end;
3088        int len;
3089
3090        /* found preallocated blocks, use them */
3091        start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3092        end = min(pa->pa_pstart + pa->pa_len, start + ac->ac_o_ex.fe_len);
3093        len = end - start;
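            /*
             * e.g. (illustrative numbers) with pa_lstart 100, pa_pstart 5000
             * and pa_len 16, a request for 8 blocks at logical block 104 maps
             * to physical blocks 5004..5011.
             */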
3094        ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3095                                        &ac->ac_b_ex.fe_start);
3096        ac->ac_b_ex.fe_len = len;
3097        ac->ac_status = AC_STATUS_FOUND;
3098        ac->ac_pa = pa;
3099
3100        BUG_ON(start < pa->pa_pstart);
3101        BUG_ON(start + len > pa->pa_pstart + pa->pa_len);
3102        BUG_ON(pa->pa_free < len);
3103        pa->pa_free -= len;
3104
3105        mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3106}
3107
3108/*
3109 * use blocks preallocated to locality group
3110 */
3111static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3112                                struct ext4_prealloc_space *pa)
3113{
3114        unsigned int len = ac->ac_o_ex.fe_len;
3115
3116        ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
3117                                        &ac->ac_b_ex.fe_group,
3118                                        &ac->ac_b_ex.fe_start);
3119        ac->ac_b_ex.fe_len = len;
3120        ac->ac_status = AC_STATUS_FOUND;
3121        ac->ac_pa = pa;
3122
3123        /* we don't correct pa_pstart or pa_len here to avoid a
3124         * possible race when the group is being loaded concurrently;
3125         * instead we correct the pa later, after blocks are marked
3126         * in the on-disk bitmap -- see ext4_mb_release_context().
3127         * Other CPUs are prevented from allocating from this pa by lg_mutex
3128         */
3129        mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3130}
3131
3132/*
3133 * Return the prealloc space that has the minimal distance
3134 * from the goal block. @cpa is the prealloc
3135 * space with the currently known minimal distance
3136 * from the goal block.
3137 */
3138static struct ext4_prealloc_space *
3139ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3140                        struct ext4_prealloc_space *pa,
3141                        struct ext4_prealloc_space *cpa)
3142{
3143        ext4_fsblk_t cur_distance, new_distance;
3144
3145        if (cpa == NULL) {
3146                atomic_inc(&pa->pa_count);
3147                return pa;
3148        }
3149        cur_distance = abs(goal_block - cpa->pa_pstart);
3150        new_distance = abs(goal_block - pa->pa_pstart);
3151
3152        if (cur_distance <= new_distance)
3153                return cpa;
3154
3155        /* drop the previous reference */
3156        atomic_dec(&cpa->pa_count);
3157        atomic_inc(&pa->pa_count);
3158        return pa;
3159}
3160
3161/*
3162 * search goal blocks in preallocated space
3163 */
3164static noinline_for_stack int
3165ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3166{
3167        int order, i;
3168        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3169        struct ext4_locality_group *lg;
3170        struct ext4_prealloc_space *pa, *cpa = NULL;
3171        ext4_fsblk_t goal_block;
3172
3173        /* only data can be preallocated */
3174        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3175                return 0;
3176
3177        /* first, try per-file preallocation */
3178        rcu_read_lock();
3179        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3180
3181                /* all fields in this condition don't change,
3182                 * so we can skip locking for them */
3183                if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3184                        ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len)
3185                        continue;
3186
3187                /* non-extent files can't have physical blocks past 2^32 */
3188                if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3189                        pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
3190                        continue;
3191
3192                /* found preallocated blocks, use them */
3193                spin_lock(&pa->pa_lock);
3194                if (pa->pa_deleted == 0 && pa->pa_free) {
3195                        atomic_inc(&pa->pa_count);
3196                        ext4_mb_use_inode_pa(ac, pa);
3197                        spin_unlock(&pa->pa_lock);
3198                        ac->ac_criteria = 10;
3199                        rcu_read_unlock();
3200                        return 1;
3201                }
3202                spin_unlock(&pa->pa_lock);
3203        }
3204        rcu_read_unlock();
3205
3206        /* can we use group allocation? */
3207        if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3208                return 0;
3209
3210        /* inode may have no locality group for some reason */
3211        lg = ac->ac_lg;
3212        if (lg == NULL)
3213                return 0;
3214        order  = fls(ac->ac_o_ex.fe_len) - 1;
3215        if (order > PREALLOC_TB_SIZE - 1)
3216                /* The max size of hash table is PREALLOC_TB_SIZE */
3217                order = PREALLOC_TB_SIZE - 1;
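            /*
             * e.g. a 13-block request gives order fls(13) - 1 = 3, so the
             * scan below starts at the order-3 locality group list.
             */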
3218
3219        goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3220        /*
3221         * search for the prealloc space with the minimal
3222         * distance from the goal block.
3223         */
3224        for (i = order; i < PREALLOC_TB_SIZE; i++) {
3225                rcu_read_lock();
3226                list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
3227                                        pa_inode_list) {
3228                        spin_lock(&pa->pa_lock);
3229                        if (pa->pa_deleted == 0 &&
3230                                        pa->pa_free >= ac->ac_o_ex.fe_len) {
3231
3232                                cpa = ext4_mb_check_group_pa(goal_block,
3233                                                                pa, cpa);
3234                        }
3235                        spin_unlock(&pa->pa_lock);
3236                }
3237                rcu_read_unlock();
3238        }
3239        if (cpa) {
3240                ext4_mb_use_group_pa(ac, cpa);
3241                ac->ac_criteria = 20;
3242                return 1;
3243        }
3244        return 0;
3245}
3246
3247/*
3248 * the function goes through all blocks freed in the group
3249 * but not yet committed and marks them used in the in-core bitmap.
3250 * The buddy must be generated from this bitmap.
3251 * Needs to be called with the ext4 group lock held
3252 */
3253static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3254                                                ext4_group_t group)
3255{
3256        struct rb_node *n;
3257        struct ext4_group_info *grp;
3258        struct ext4_free_data *entry;
3259
3260        grp = ext4_get_group_info(sb, group);
3261        n = rb_first(&(grp->bb_free_root));
3262
3263        while (n) {
3264                entry = rb_entry(n, struct ext4_free_data, node);
3265                mb_set_bits(bitmap, entry->start_blk, entry->count);
3266                n = rb_next(n);
3267        }
3268        return;
3269}
3270
3271/*
3272 * the function goes through all preallocations in this group and marks them
3273 * used in the in-core bitmap. The buddy must be generated from this bitmap.
3274 * Needs to be called with the ext4 group lock held
3275 */
3276static noinline_for_stack
3277void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3278                                        ext4_group_t group)
3279{
3280        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3281        struct ext4_prealloc_space *pa;
3282        struct list_head *cur;
3283        ext4_group_t groupnr;
3284        ext4_grpblk_t start;
3285        int preallocated = 0;
3286        int count = 0;
3287        int len;
3288
3289        /* all forms of preallocation discarding first load the group,
3290         * so the only competing code is preallocation use.
3291         * we don't need any locking here.
3292         * notice we do NOT ignore preallocations with pa_deleted;
3293         * otherwise we could leave used blocks available for
3294         * allocation in the buddy when a concurrent ext4_mb_put_pa()
3295         * is dropping the preallocation
3296         */
3297        list_for_each(cur, &grp->bb_prealloc_list) {
3298                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3299                spin_lock(&pa->pa_lock);
3300                ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3301                                             &groupnr, &start);
3302                len = pa->pa_len;
3303                spin_unlock(&pa->pa_lock);
3304                if (unlikely(len == 0))
3305                        continue;
3306                BUG_ON(groupnr != group);
3307                mb_set_bits(bitmap, start, len);
3308                preallocated += len;
3309                count++;
3310        }
3311        mb_debug(1, "preallocated %u for group %u\n", preallocated, group);
3312}
3313
3314static void ext4_mb_pa_callback(struct rcu_head *head)
3315{
3316        struct ext4_prealloc_space *pa;
3317        pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
3318        kmem_cache_free(ext4_pspace_cachep, pa);
3319}
3320
3321/*
3322 * drops a reference to preallocated space descriptor
3323 * if this was the last reference and the space is consumed
3324 */
3325static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3326                        struct super_block *sb, struct ext4_prealloc_space *pa)
3327{
3328        ext4_group_t grp;
3329        ext4_fsblk_t grp_blk;
3330
3331        if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
3332                return;
3333
3334        /* in this short window concurrent discard can set pa_deleted */
3335        spin_lock(&pa->pa_lock);
3336        if (pa->pa_deleted == 1) {
3337                spin_unlock(&pa->pa_lock);
3338                return;
3339        }
3340
3341        pa->pa_deleted = 1;
3342        spin_unlock(&pa->pa_lock);
3343
3344        grp_blk = pa->pa_pstart;
3345        /*
3346         * If doing group-based preallocation, pa_pstart may be in the
3347         * next group when pa is used up
3348         */
3349        if (pa->pa_type == MB_GROUP_PA)
3350                grp_blk--;
3351
3352        ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
3353
3354        /*
3355         * possible race:
3356         *
3357         *  P1 (buddy init)                     P2 (regular allocation)
3358         *                                      find block B in PA
3359         *  copy on-disk bitmap to buddy
3360         *                                      mark B in on-disk bitmap
3361         *                                      drop PA from group
3362         *  mark all PAs in buddy
3363         *
3364         * thus, P1 initializes buddy with B available. to prevent this
3365         * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
3366         * against that pair
3367         */
3368        ext4_lock_group(sb, grp);
3369        list_del(&pa->pa_group_list);
3370        ext4_unlock_group(sb, grp);
3371
3372        spin_lock(pa->pa_obj_lock);
3373        list_del_rcu(&pa->pa_inode_list);
3374        spin_unlock(pa->pa_obj_lock);
3375
3376        call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3377}
3378
3379/*
3380 * creates new preallocated space for given inode
3381 */
3382static noinline_for_stack int
3383ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3384{
3385        struct super_block *sb = ac->ac_sb;
3386        struct ext4_prealloc_space *pa;
3387        struct ext4_group_info *grp;
3388        struct ext4_inode_info *ei;
3389
3390        /* preallocate only when the found space is larger than requested */
3391        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3392        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3393        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3394
3395        pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3396        if (pa == NULL)
3397                return -ENOMEM;
3398
3399        if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
3400                int winl;
3401                int wins;
3402                int win;
3403                int offs;
3404
3405                /* we can't allocate as much as the normalizer wants,
3406                 * so the found space must get a proper lstart
3407                 * to cover the original request */
3408                BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
3409                BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
3410
3411                /* we're limited by the original request in that
3412                 * its logical block must be covered anyway;
3413                 * winl is the window we can move our chunk within */
3414                winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3415
3416                /* also, we should cover whole original request */
3417                wins = ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len;
3418
3419                /* the smallest one defines real window */
3420                win = min(winl, wins);
3421
3422                offs = ac->ac_o_ex.fe_logical % ac->ac_b_ex.fe_len;
3423                if (offs && offs < win)
3424                        win = offs;
3425
3426                ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical - win;
3427                BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3428                BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3429        }
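            /*
             * e.g. (illustrative numbers) original request of 4 blocks at
             * logical 100, goal of 16 blocks at logical 96, but only 12
             * blocks found: winl = 4, wins = 8, win = min(4, 8) = 4 and
             * offs = 100 % 12 = 4 (not < win), so fe_logical becomes
             * 100 - 4 = 96, which still covers the original request.
             */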
3430
3431        /* preallocation can change ac_b_ex, thus we store the actually
3432         * allocated blocks for history */
3433        ac->ac_f_ex = ac->ac_b_ex;
3434
3435        pa->pa_lstart = ac->ac_b_ex.fe_logical;
3436        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3437        pa->pa_len = ac->ac_b_ex.fe_len;
3438        pa->pa_free = pa->pa_len;
3439        atomic_set(&pa->pa_count, 1);
3440        spin_lock_init(&pa->pa_lock);
3441        INIT_LIST_HEAD(&pa->pa_inode_list);
3442        INIT_LIST_HEAD(&pa->pa_group_list);
3443        pa->pa_deleted = 0;
3444        pa->pa_type = MB_INODE_PA;
3445
3446        mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3447                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3448        trace_ext4_mb_new_inode_pa(ac, pa);
3449
3450        ext4_mb_use_inode_pa(ac, pa);
3451        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3452
3453        ei = EXT4_I(ac->ac_inode);
3454        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3455
3456        pa->pa_obj_lock = &ei->i_prealloc_lock;
3457        pa->pa_inode = ac->ac_inode;
3458
3459        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3460        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3461        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3462
3463        spin_lock(pa->pa_obj_lock);
3464        list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
3465        spin_unlock(pa->pa_obj_lock);
3466
3467        return 0;
3468}
3469
3470/*
3471 * creates new preallocated space for the locality group the inode belongs to
3472 */
3473static noinline_for_stack int
3474ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3475{
3476        struct super_block *sb = ac->ac_sb;
3477        struct ext4_locality_group *lg;
3478        struct ext4_prealloc_space *pa;
3479        struct ext4_group_info *grp;
3480
3481        /* preallocate only when the found space is larger than requested */
3482        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3483        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3484        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3485
3486        BUG_ON(ext4_pspace_cachep == NULL);
3487        pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3488        if (pa == NULL)
3489                return -ENOMEM;
3490
3491        /* preallocation can change ac_b_ex, thus we store the actually
3492         * allocated blocks for history */
3493        ac->ac_f_ex = ac->ac_b_ex;
3494
3495        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3496        pa->pa_lstart = pa->pa_pstart;
3497        pa->pa_len = ac->ac_b_ex.fe_len;
3498        pa->pa_free = pa->pa_len;
3499        atomic_set(&pa->pa_count, 1);
3500        spin_lock_init(&pa->pa_lock);
3501        INIT_LIST_HEAD(&pa->pa_inode_list);
3502        INIT_LIST_HEAD(&pa->pa_group_list);
3503        pa->pa_deleted = 0;
3504        pa->pa_type = MB_GROUP_PA;
3505
3506        mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3507                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3508        trace_ext4_mb_new_group_pa(ac, pa);
3509
3510        ext4_mb_use_group_pa(ac, pa);
3511        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3512
3513        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3514        lg = ac->ac_lg;
3515        BUG_ON(lg == NULL);
3516
3517        pa->pa_obj_lock = &lg->lg_prealloc_lock;
3518        pa->pa_inode = NULL;
3519
3520        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3521        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3522        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3523
3524        /*
3525         * We will later add the new pa to the right bucket
3526         * after updating the pa_free in ext4_mb_release_context
3527         */
3528        return 0;
3529}
3530
3531static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3532{
3533        int err;
3534
3535        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3536                err = ext4_mb_new_group_pa(ac);
3537        else
3538                err = ext4_mb_new_inode_pa(ac);
3539        return err;
3540}
3541
3542/*
3543 * finds all unused blocks in on-disk bitmap, frees them in
3544 * in-core bitmap and buddy.
3545 * @pa must be unlinked from inode and group lists, so that
3546 * nobody else can find/use it.
3547 * the caller MUST hold group/inode locks.
3548 * TODO: optimize the case when there are no in-core structures yet
3549 */
3550static noinline_for_stack int
3551ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3552                        struct ext4_prealloc_space *pa)
3553{
3554        struct super_block *sb = e4b->bd_sb;
3555        struct ext4_sb_info *sbi = EXT4_SB(sb);
3556        unsigned int end;
3557        unsigned int next;
3558        ext4_group_t group;
3559        ext4_grpblk_t bit;
3560        unsigned long long grp_blk_start;
3561        int err = 0;
3562        int free = 0;
3563
3564        BUG_ON(pa->pa_deleted == 0);
3565        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3566        grp_blk_start = pa->pa_pstart - bit;
3567        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3568        end = bit + pa->pa_len;
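            /*
             * e.g. (illustrative numbers) if the pa covers bits 100..115 and
             * only bits 104..107 are still clear in the on-disk bitmap, just
             * that 4-block run is returned to the buddy below.
             */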
3569
3570        while (bit < end) {
3571                bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3572                if (bit >= end)
3573                        break;
3574                next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3575                mb_debug(1, "    free preallocated %u/%u in group %u\n",
3576                         (unsigned) ext4_group_first_block_no(sb, group) + bit,
3577                         (unsigned) next - bit, (unsigned) group);
3578                free += next - bit;
3579
3580                trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3581                trace_ext4_mb_release_inode_pa(pa, grp_blk_start + bit,
3582                                               next - bit);
3583                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3584                bit = next + 1;
3585        }
3586        if (free != pa->pa_free) {
3587                printk(KERN_CRIT "pa %p: logic %lu, phys. %lu, len %lu\n",
3588                        pa, (unsigned long) pa->pa_lstart,
3589                        (unsigned long) pa->pa_pstart,
3590                        (unsigned long) pa->pa_len);
3591                ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3592                                        free, pa->pa_free);
3593                /*
3594                 * pa is already deleted so we use the value obtained
3595                 * from the bitmap and continue.
3596                 */
3597        }
3598        atomic_add(free, &sbi->s_mb_discarded);
3599
3600        return err;
3601}
3602
3603static noinline_for_stack int
3604ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3605                                struct ext4_prealloc_space *pa)
3606{
3607        struct super_block *sb = e4b->bd_sb;
3608        ext4_group_t group;
3609        ext4_grpblk_t bit;
3610
3611        trace_ext4_mb_release_group_pa(pa);
3612        BUG_ON(pa->pa_deleted == 0);
3613        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3614        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3615        mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3616        atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3617        trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
3618
3619        return 0;
3620}
3621
3622/*
3623 * releases all preallocations in given group
3624 *
3625 * first, we need to decide discard policy:
3626 * - when do we discard
3627 *   1) ENOSPC
3628 * - how many do we discard
3629 *   1) how many requested
3630 */
3631static noinline_for_stack int
3632ext4_mb_discard_group_preallocations(struct super_block *sb,
3633                                        ext4_group_t group, int needed)
3634{
3635        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3636        struct buffer_head *bitmap_bh = NULL;
3637        struct ext4_prealloc_space *pa, *tmp;
3638        struct list_head list;
3639        struct ext4_buddy e4b;
3640        int err;
3641        int busy = 0;
3642        int free = 0;
3643
3644        mb_debug(1, "discard preallocation for group %u\n", group);
3645
3646        if (list_empty(&grp->bb_prealloc_list))
3647                return 0;
3648
3649        bitmap_bh = ext4_read_block_bitmap(sb, group);
3650        if (bitmap_bh == NULL) {
3651                ext4_error(sb, "Error reading block bitmap for %u", group);
3652                return 0;
3653        }
3654
3655        err = ext4_mb_load_buddy(sb, group, &e4b);
3656        if (err) {
3657                ext4_error(sb, "Error loading buddy information for %u", group);
3658                put_bh(bitmap_bh);
3659                return 0;
3660        }
3661
3662        if (needed == 0)
3663                needed = EXT4_BLOCKS_PER_GROUP(sb) + 1;
3664
3665        INIT_LIST_HEAD(&list);
3666repeat:
3667        ext4_lock_group(sb, group);
3668        list_for_each_entry_safe(pa, tmp,
3669                                &grp->bb_prealloc_list, pa_group_list) {
3670                spin_lock(&pa->pa_lock);
3671                if (atomic_read(&pa->pa_count)) {
3672                        spin_unlock(&pa->pa_lock);
3673                        busy = 1;
3674                        continue;
3675                }
3676                if (pa->pa_deleted) {
3677                        spin_unlock(&pa->pa_lock);
3678                        continue;
3679                }
3680
3681                /* seems this one can be freed ... */
3682                pa->pa_deleted = 1;
3683
3684                /* we can trust pa_free ... */
3685                free += pa->pa_free;
3686
3687                spin_unlock(&pa->pa_lock);
3688
3689                list_del(&pa->pa_group_list);
3690                list_add(&pa->u.pa_tmp_list, &list);
3691        }
3692
3693        /* if we still need more blocks and some PAs were used, try again */
3694        if (free < needed && busy) {
3695                busy = 0;
3696                ext4_unlock_group(sb, group);
3697                /*
3698                 * Yield the CPU here so that we don't get a soft lockup
3699                 * in the non-preempt case.
3700                 */
3701                yield();
3702                goto repeat;
3703        }
3704
3705        /* found anything to free? */
3706        if (list_empty(&list)) {
3707                BUG_ON(free != 0);
3708                goto out;
3709        }
3710
3711        /* now free all selected PAs */
3712        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3713
3714                /* remove from object (inode or locality group) */
3715                spin_lock(pa->pa_obj_lock);
3716                list_del_rcu(&pa->pa_inode_list);
3717                spin_unlock(pa->pa_obj_lock);
3718
3719                if (pa->pa_type == MB_GROUP_PA)
3720                        ext4_mb_release_group_pa(&e4b, pa);
3721                else
3722                        ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3723
3724                list_del(&pa->u.pa_tmp_list);
3725                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3726        }
3727
3728out:
3729        ext4_unlock_group(sb, group);
3730        ext4_mb_unload_buddy(&e4b);
3731        put_bh(bitmap_bh);
3732        return free;
3733}
3734
3735/*
3736 * releases all unused preallocated blocks for the given inode
3737 *
3738 * It's important to discard preallocations under i_data_sem.
3739 * We don't want another block to be served from the prealloc
3740 * space while we are discarding the inode prealloc space.
3741 *
3742 * FIXME!! Make sure it is valid at all the call sites
3743 */
3744void ext4_discard_preallocations(struct inode *inode)
3745{
3746        struct ext4_inode_info *ei = EXT4_I(inode);
3747        struct super_block *sb = inode->i_sb;
3748        struct buffer_head *bitmap_bh = NULL;
3749        struct ext4_prealloc_space *pa, *tmp;
3750        ext4_group_t group = 0;
3751        struct list_head list;
3752        struct ext4_buddy e4b;
3753        int err;
3754
3755        if (!S_ISREG(inode->i_mode)) {
3756                /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3757                return;
3758        }
3759
3760        mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
3761        trace_ext4_discard_preallocations(inode);
3762
3763        INIT_LIST_HEAD(&list);
3764
3765repeat:
3766        /* first, collect all pa's in the inode */
3767        spin_lock(&ei->i_prealloc_lock);
3768        while (!list_empty(&ei->i_prealloc_list)) {
3769                pa = list_entry(ei->i_prealloc_list.next,
3770                                struct ext4_prealloc_space, pa_inode_list);
3771                BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
3772                spin_lock(&pa->pa_lock);
3773                if (atomic_read(&pa->pa_count)) {
3774                        /* this shouldn't happen often - nobody should
3775                         * use preallocation while we're discarding it */
3776                        spin_unlock(&pa->pa_lock);
3777                        spin_unlock(&ei->i_prealloc_lock);
3778                        printk(KERN_ERR "uh-oh! used pa while discarding\n");
3779                        WARN_ON(1);
3780                        schedule_timeout_uninterruptible(HZ);
3781                        goto repeat;
3782
3783                }
3784                if (pa->pa_deleted == 0) {
3785                        pa->pa_deleted = 1;
3786                        spin_unlock(&pa->pa_lock);
3787                        list_del_rcu(&pa->pa_inode_list);
3788                        list_add(&pa->u.pa_tmp_list, &list);
3789                        continue;
3790                }
3791
3792                /* someone is deleting pa right now */
3793                spin_unlock(&pa->pa_lock);
3794                spin_unlock(&ei->i_prealloc_lock);
3795
3796                /* we have to wait here because pa_deleted
3797                 * doesn't mean the pa is already unlinked from
3798                 * the list. As we might be called from
3799                 * ->clear_inode(), the inode will get freed
3800                 * and a concurrent thread which is unlinking the
3801                 * pa from the inode's list may access already
3802                 * freed memory, bad-bad-bad */
3803
3804                /* XXX: if this happens too often, we can
3805                 * add a flag to force wait only in case
3806                 * of ->clear_inode(), but not in case of
3807                 * regular truncate */
3808                schedule_timeout_uninterruptible(HZ);
3809                goto repeat;
3810        }
3811        spin_unlock(&ei->i_prealloc_lock);
3812
3813        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3814                BUG_ON(pa->pa_type != MB_INODE_PA);
3815                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
3816
3817                err = ext4_mb_load_buddy(sb, group, &e4b);
3818                if (err) {
3819                        ext4_error(sb, "Error loading buddy information for %u",
3820                                        group);
3821                        continue;
3822                }
3823
3824                bitmap_bh = ext4_read_block_bitmap(sb, group);
3825                if (bitmap_bh == NULL) {
3826                        ext4_error(sb, "Error reading block bitmap for %u",
3827                                        group);
3828                        ext4_mb_unload_buddy(&e4b);
3829                        continue;
3830                }
3831
3832                ext4_lock_group(sb, group);
3833                list_del(&pa->pa_group_list);
3834                ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3835                ext4_unlock_group(sb, group);
3836
3837                ext4_mb_unload_buddy(&e4b);
3838                put_bh(bitmap_bh);
3839
3840                list_del(&pa->u.pa_tmp_list);
3841                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3842        }
3843}
3844
3845#ifdef CONFIG_EXT4_DEBUG
3846static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3847{
3848        struct super_block *sb = ac->ac_sb;
3849        ext4_group_t ngroups, i;
3850
3851        if (!mb_enable_debug ||
3852            (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3853                return;
3854
3855        printk(KERN_ERR "EXT4-fs: Can't allocate:"
3856                        " Allocation context details:\n");
3857        printk(KERN_ERR "EXT4-fs: status %d flags %d\n",
3858                        ac->ac_status, ac->ac_flags);
3859        printk(KERN_ERR "EXT4-fs: orig %lu/%lu/%lu@%lu, goal %lu/%lu/%lu@%lu, "
3860                        "best %lu/%lu/%lu@%lu cr %d\n",
3861                        (unsigned long)ac->ac_o_ex.fe_group,
3862                        (unsigned long)ac->ac_o_ex.fe_start,
3863                        (unsigned long)ac->ac_o_ex.fe_len,
3864                        (unsigned long)ac->ac_o_ex.fe_logical,
3865                        (unsigned long)ac->ac_g_ex.fe_group,
3866                        (unsigned long)ac->ac_g_ex.fe_start,
3867                        (unsigned long)ac->ac_g_ex.fe_len,
3868                        (unsigned long)ac->ac_g_ex.fe_logical,
3869                        (unsigned long)ac->ac_b_ex.fe_group,
3870                        (unsigned long)ac->ac_b_ex.fe_start,
3871                        (unsigned long)ac->ac_b_ex.fe_len,
3872                        (unsigned long)ac->ac_b_ex.fe_logical,
3873                        (int)ac->ac_criteria);
3874        printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
3875                ac->ac_found);
3876        printk(KERN_ERR "EXT4-fs: groups: \n");
3877        ngroups = ext4_get_groups_count(sb);
3878        for (i = 0; i < ngroups; i++) {
3879                struct ext4_group_info *grp = ext4_get_group_info(sb, i);
3880                struct ext4_prealloc_space *pa;
3881                ext4_grpblk_t start;
3882                struct list_head *cur;
3883                ext4_lock_group(sb, i);
3884                list_for_each(cur, &grp->bb_prealloc_list) {
3885                        pa = list_entry(cur, struct ext4_prealloc_space,
3886                                        pa_group_list);
3887                        spin_lock(&pa->pa_lock);
3888                        ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3889                                                     NULL, &start);
3890                        spin_unlock(&pa->pa_lock);
3891                        printk(KERN_ERR "PA:%u:%d:%u \n", i,
3892                               start, pa->pa_len);
3893                }
3894                ext4_unlock_group(sb, i);
3895
3896                if (grp->bb_free == 0)
3897                        continue;
3898                printk(KERN_ERR "%u: %d/%d \n",
3899                       i, grp->bb_free, grp->bb_fragments);
3900        }
3901        printk(KERN_ERR "\n");
3902}
3903#else
3904static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3905{
3906        return;
3907}
3908#endif
3909
3910/*
3911 * We use locality group preallocation for small files. The size of the
3912 * file is taken to be the current size or the resulting size after
3913 * allocation, whichever is larger.
3914 *
3915 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
3916 */
3917static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
3918{
3919        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3920        int bsbits = ac->ac_sb->s_blocksize_bits;
3921        loff_t size, isize;
3922
3923        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3924                return;
3925
3926        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3927                return;
3928
3929        size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
3930        isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
3931                >> bsbits;
3932
3933        if ((size == isize) &&
3934            !ext4_fs_is_busy(sbi) &&
3935            (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
3936                ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
3937                return;
3938        }
3939
3940        /* don't use group allocation for large files */
3941        size = max(size, isize);
3942        if (size > sbi->s_mb_stream_request) {
3943                ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
3944                return;
3945        }
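            /*
             * At this point the file size (current or resulting, whichever is
             * larger) is at most mb_stream_req blocks, so the request falls
             * through to per-CPU locality group preallocation below.
             */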
3946
3947        BUG_ON(ac->ac_lg != NULL);
3948        /*
3949         * locality group prealloc space is per cpu. The reason for having
3950         * per-cpu locality groups is to reduce contention between block
3951         * requests from multiple CPUs.
3952         */
3953        ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups);
3954
3955        /* we're going to use group allocation */
3956        ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
3957
3958        /* serialize all allocations in the group */
3959        mutex_lock(&ac->ac_lg->lg_mutex);
3960}
3961
3962static noinline_for_stack int
3963ext4_mb_initialize_context(struct ext4_allocation_context *ac,
3964                                struct ext4_allocation_request *ar)
3965{
3966        struct super_block *sb = ar->inode->i_sb;
3967        struct ext4_sb_info *sbi = EXT4_SB(sb);
3968        struct ext4_super_block *es = sbi->s_es;
3969        ext4_group_t group;
3970        unsigned int len;
3971        ext4_fsblk_t goal;
3972        ext4_grpblk_t block;
3973
3974        /* we can't allocate > group size */
3975        len = ar->len;
3976
3977        /* just a dirty hack to filter too big requests  */
3978        if (len >= EXT4_BLOCKS_PER_GROUP(sb) - 10)
3979                len = EXT4_BLOCKS_PER_GROUP(sb) - 10;
3980
3981        /* start searching from the goal */
3982        goal = ar->goal;
3983        if (goal < le32_to_cpu(es->s_first_data_block) ||
3984                        goal >= ext4_blocks_count(es))
3985                goal = le32_to_cpu(es->s_first_data_block);
3986        ext4_get_group_no_and_offset(sb, goal, &group, &block);
3987
3988        /* set up allocation goals */
3989        memset(ac, 0, sizeof(struct ext4_allocation_context));
3990        ac->ac_b_ex.fe_logical = ar->logical;
3991        ac->ac_status = AC_STATUS_CONTINUE;
3992        ac->ac_sb = sb;
3993        ac->ac_inode = ar->inode;
3994        ac->ac_o_ex.fe_logical = ar->logical;
3995        ac->ac_o_ex.fe_group = group;
3996        ac->ac_o_ex.fe_start = block;
3997        ac->ac_o_ex.fe_len = len;
3998        ac->ac_g_ex.fe_logical = ar->logical;
3999        ac->ac_g_ex.fe_group = group;
4000        ac->ac_g_ex.fe_start = block;
4001        ac->ac_g_ex.fe_len = len;
4002        ac->ac_flags = ar->flags;
4003
4004        /* we have to define the context: will we work with a file or a
4005         * locality group? this is a policy, actually */
4006        ext4_mb_group_or_file(ac);
4007
4008        mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4009                        "left: %u/%u, right %u/%u to %swritable\n",
4010                        (unsigned) ar->len, (unsigned) ar->logical,
4011                        (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
4012                        (unsigned) ar->lleft, (unsigned) ar->pleft,
4013                        (unsigned) ar->lright, (unsigned) ar->pright,
4014                        atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4015        return 0;
4016
4017}
4018
4019static noinline_for_stack void
4020ext4_mb_discard_lg_preallocations(struct super_block *sb,
4021                                        struct ext4_locality_group *lg,
4022                                        int order, int total_entries)
4023{
4024        ext4_group_t group = 0;
4025        struct ext4_buddy e4b;
4026        struct list_head discard_list;
4027        struct ext4_prealloc_space *pa, *tmp;
4028
4029        mb_debug(1, "discard locality group preallocation\n");
4030
4031        INIT_LIST_HEAD(&discard_list);
4032
4033        spin_lock(&lg->lg_prealloc_lock);
4034        list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
4035                                                pa_inode_list) {
4036                spin_lock(&pa->pa_lock);
4037                if (atomic_read(&pa->pa_count)) {
4038                        /*
4039                         * This is the pa that we just used
4040                         * for block allocation. So don't
4041                         * free that
4042                         */
4043                        spin_unlock(&pa->pa_lock);
4044                        continue;
4045                }
4046                if (pa->pa_deleted) {
4047                        spin_unlock(&pa->pa_lock);
4048                        continue;
4049                }
4050                /* only lg prealloc space */
4051                BUG_ON(pa->pa_type != MB_GROUP_PA);
4052
4053                /* seems this one can be freed ... */
4054                pa->pa_deleted = 1;
4055                spin_unlock(&pa->pa_lock);
4056
4057                list_del_rcu(&pa->pa_inode_list);
4058                list_add(&pa->u.pa_tmp_list, &discard_list);
4059
4060                total_entries--;
4061                if (total_entries <= 5) {
4062                        /*
4063                         * we want to keep only 5 entries
4064                         * allowing it to grow to 8. This
4065                         * makes sure we don't call discard
4066                         * soon for this list.
4067                         */
4068                        break;
4069                }
4070        }
4071        spin_unlock(&lg->lg_prealloc_lock);
4072
4073        list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
4074
4075                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
4076                if (ext4_mb_load_buddy(sb, group, &e4b)) {
4077                        ext4_error(sb, "Error loading buddy information for %u",
4078                                        group);
4079                        continue;
4080                }
4081                ext4_lock_group(sb, group);
4082                list_del(&pa->pa_group_list);
4083                ext4_mb_release_group_pa(&e4b, pa);
4084                ext4_unlock_group(sb, group);
4085
4086                ext4_mb_unload_buddy(&e4b);
4087                list_del(&pa->u.pa_tmp_list);
4088                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4089        }
4090}
4091
4092/*
4093 * We have incremented pa_count. So it cannot be freed at this
4094 * point. Also we hold lg_mutex. So no parallel allocation is
4095 * possible from this lg. That means pa_free cannot be updated.
4096 *
4097 * A parallel ext4_mb_discard_group_preallocations is possible, however,
4098 * which can cause the lg_prealloc_list to be updated.
4099 */
4100
4101static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4102{
4103        int order, added = 0, lg_prealloc_count = 1;
4104        struct super_block *sb = ac->ac_sb;
4105        struct ext4_locality_group *lg = ac->ac_lg;
4106        struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
4107
4108        order = fls(pa->pa_free) - 1;
4109        if (order > PREALLOC_TB_SIZE - 1)
4110                /* The max size of hash table is PREALLOC_TB_SIZE */
4111                order = PREALLOC_TB_SIZE - 1;
4112        /* Add the prealloc space to lg */
4113        rcu_read_lock();
4114        list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
4115                                                pa_inode_list) {
4116                spin_lock(&tmp_pa->pa_lock);
4117                if (tmp_pa->pa_deleted) {
4118                        spin_unlock(&tmp_pa->pa_lock);
4119                        continue;
4120                }
4121                if (!added && pa->pa_free < tmp_pa->pa_free) {
4122                        /* Add to the tail of the previous entry */
4123                        list_add_tail_rcu(&pa->pa_inode_list,
4124                                                &tmp_pa->pa_inode_list);
4125                        added = 1;
4126                        /*
4127                         * we want to count the total
4128                         * number of entries in the list
4129                         */
4130                }
4131                spin_unlock(&tmp_pa->pa_lock);
4132                lg_prealloc_count++;
4133        }
4134        if (!added)
4135                list_add_tail_rcu(&pa->pa_inode_list,
4136                                        &lg->lg_prealloc_list[order]);
4137        rcu_read_unlock();
4138
4139        /* Now trim the list to be not more than 8 elements */
4140        if (lg_prealloc_count > 8) {
4141                ext4_mb_discard_lg_preallocations(sb, lg,
4142                                                order, lg_prealloc_count);
4143                return;
4144        }
4145        return ;
4146}
4147
4148/*
4149 * release all resource we used in allocation
4150 */
4151static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4152{
4153        struct ext4_prealloc_space *pa = ac->ac_pa;
4154        if (pa) {
4155                if (pa->pa_type == MB_GROUP_PA) {
4156                        /* see comment in ext4_mb_use_group_pa() */
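                            /* e.g. if 8 blocks were just taken from this group
                             * pa, its window advances by 8 and shrinks by 8, so
                             * the next user starts right after this allocation */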
4157                        spin_lock(&pa->pa_lock);
4158                        pa->pa_pstart += ac->ac_b_ex.fe_len;
4159                        pa->pa_lstart += ac->ac_b_ex.fe_len;
4160                        pa->pa_free -= ac->ac_b_ex.fe_len;
4161                        pa->pa_len -= ac->ac_b_ex.fe_len;
4162                        spin_unlock(&pa->pa_lock);
4163                }
4164        }
4165        if (pa) {
4166                /*
4167                 * We want to add the pa to the right bucket.
4168                 * Remove it from the list and while adding
4169                 * make sure the list to which we are adding
4170                 * doesn't grow big.
4171                 */
4172                if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4173                        spin_lock(pa->pa_obj_lock);
4174                        list_del_rcu(&pa->pa_inode_list);
4175                        spin_unlock(pa->pa_obj_lock);
4176                        ext4_mb_add_n_trim(ac);
4177                }
4178                ext4_mb_put_pa(ac, ac->ac_sb, pa);
4179        }
4180        if (ac->ac_bitmap_page)
4181                page_cache_release(ac->ac_bitmap_page);
4182        if (ac->ac_buddy_page)
4183                page_cache_release(ac->ac_buddy_page);
4184        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4185                mutex_unlock(&ac->ac_lg->lg_mutex);
4186        ext4_mb_collect_stats(ac);
4187        return 0;
4188}
4189
4190static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4191{
4192        ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4193        int ret;
4194        int freed = 0;
4195
4196        trace_ext4_mb_discard_preallocations(sb, needed);
4197        for (i = 0; i < ngroups && needed > 0; i++) {
4198                ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4199                freed += ret;
4200                needed -= ret;
4201        }
4202
4203        return freed;
4204}
4205
4206/*
4207 * Main entry point into mballoc to allocate blocks
4208 * it tries to use preallocation first, then falls back
4209 * to usual allocation
4210 */
4211ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4212                                struct ext4_allocation_request *ar, int *errp)
4213{
4214        int freed;
4215        struct ext4_allocation_context *ac = NULL;
4216        struct ext4_sb_info *sbi;
4217        struct super_block *sb;
4218        ext4_fsblk_t block = 0;
4219        unsigned int inquota = 0;
4220        unsigned int reserv_blks = 0;
4221
4222        sb = ar->inode->i_sb;
4223        sbi = EXT4_SB(sb);
4224
4225        trace_ext4_request_blocks(ar);
4226
4227        /*
4228         * For delayed allocation, we could skip the ENOSPC and
4229         * EDQUOT checks, as blocks and quota have already been
4230         * reserved when the data was copied into the pagecache.
4231         */
4232        if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
4233                ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4234        else {
4235                /* Without delayed allocation we need to verify
4236                 * that there are enough free blocks to do the block
4237                 * allocation and that it doesn't exceed the quota limits.
4238                 */
4239                while (ar->len &&
4240                        ext4_claim_free_blocks(sbi, ar->len, ar->flags)) {
4241
4242                        /* let others free the space */
4243                        yield();
4244                        ar->len = ar->len >> 1;
4245                }
4246                if (!ar->len) {
4247                        *errp = -ENOSPC;
4248                        return 0;
4249                }
4250                reserv_blks = ar->len;
4251                if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4252                        dquot_alloc_block_nofail(ar->inode, ar->len);
4253                } else {
4254                        while (ar->len &&
4255                                dquot_alloc_block(ar->inode, ar->len)) {
4256
4257                                ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4258                                ar->len--;
4259                        }
4260                }
4261                inquota = ar->len;
4262                if (ar->len == 0) {
4263                        *errp = -EDQUOT;
4264                        goto out;
4265                }
4266        }
4267
4268        ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4269        if (!ac) {
4270                ar->len = 0;
4271                *errp = -ENOMEM;
4272                goto out;
4273        }
4274
4275        *errp = ext4_mb_initialize_context(ac, ar);
4276        if (*errp) {
4277                ar->len = 0;
4278                goto out;
4279        }
4280
4281        ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
4282        if (!ext4_mb_use_preallocated(ac)) {
4283                ac->ac_op = EXT4_MB_HISTORY_ALLOC;
4284                ext4_mb_normalize_request(ac, ar);
4285repeat:
4286                /* allocate space in core */
4287                *errp = ext4_mb_regular_allocator(ac);
4288                if (*errp)
4289                        goto errout;
4290
4291                /* as we've just preallocated more space than the
4292                 * user originally requested, we store the allocated
4293                 * space in a special descriptor */
4294                if (ac->ac_status == AC_STATUS_FOUND &&
4295                                ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4296                        ext4_mb_new_preallocation(ac);
4297        }
4298        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4299                *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks);
4300                if (*errp == -EAGAIN) {
4301                        /*
4302                         * drop the reference that we took
4303                         * in ext4_mb_use_best_found
4304                         */
4305                        ext4_mb_release_context(ac);
4306                        ac->ac_b_ex.fe_group = 0;
4307                        ac->ac_b_ex.fe_start = 0;
4308                        ac->ac_b_ex.fe_len = 0;
4309                        ac->ac_status = AC_STATUS_CONTINUE;
4310                        goto repeat;
4311                } else if (*errp)
4312                errout:
4313                        ext4_discard_allocated_blocks(ac);
4314                else {
4315                        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4316                        ar->len = ac->ac_b_ex.fe_len;
4317                }
4318        } else {
4319                freed  = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
4320                if (freed)
4321                        goto repeat;
4322                *errp = -ENOSPC;
4323        }
4324
4325        if (*errp) {
4326                ac->ac_b_ex.fe_len = 0;
4327                ar->len = 0;
4328                ext4_mb_show_ac(ac);
4329        }
4330        ext4_mb_release_context(ac);
4331out:
4332        if (ac)
4333                kmem_cache_free(ext4_ac_cachep, ac);
4334        if (inquota && ar->len < inquota)
4335                dquot_free_block(ar->inode, inquota - ar->len);
4336        if (!ar->len) {
4337                if (!ext4_test_inode_state(ar->inode,
4338                                           EXT4_STATE_DELALLOC_RESERVED))
4339                        /* release all the reserved blocks if non delalloc */
4340                        percpu_counter_sub(&sbi->s_dirtyblocks_counter,
4341                                                reserv_blks);
4342        }
4343
4344        trace_ext4_allocate_blocks(ar, (unsigned long long)block);
4345
4346        return block;
4347}
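
/*
 * Illustrative sketch (not part of mballoc): a hypothetical helper showing
 * the calling convention of ext4_mb_new_blocks() for a single data block
 * near @goal.  Real callers (e.g. the extent mapping code) also fill in the
 * logical block and the left/right neighbour hints used for goal selection.
 * Kept under #if 0 so it is never compiled.
 */
#if 0
static ext4_fsblk_t example_alloc_data_block(handle_t *handle,
					     struct inode *inode,
					     ext4_fsblk_t goal, int *errp)
{
	struct ext4_allocation_request ar;

	memset(&ar, 0, sizeof(ar));
	ar.inode = inode;		/* allocate on behalf of this inode */
	ar.goal = goal;			/* preferred physical block */
	ar.len = 1;			/* number of blocks requested */
	ar.flags = EXT4_MB_HINT_DATA;	/* file data, not metadata */

	/* returns the first allocated block; 0 with *errp set on failure */
	return ext4_mb_new_blocks(handle, &ar, errp);
}
#endif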
4348
4349/*
4350 * We can merge two free data extents only if the physical blocks
4351 * are contiguous, AND the extents were freed by the same transaction,
4352 * AND the blocks are associated with the same group.
4353 */
4354static int can_merge(struct ext4_free_data *entry1,
4355                        struct ext4_free_data *entry2)
4356{
4357        if ((entry1->t_tid == entry2->t_tid) &&
4358            (entry1->group == entry2->group) &&
4359            ((entry1->start_blk + entry1->count) == entry2->start_blk))
4360                return 1;
4361        return 0;
4362}
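
/*
 * Example with illustrative values: two pending-free extents created by the
 * same transaction in the same group, {start_blk = 100, count = 8} and
 * {start_blk = 108, count = 4}, satisfy can_merge() because 100 + 8 == 108.
 * ext4_mb_free_metadata() below coalesces them into a single
 * {start_blk = 100, count = 12} entry in the group's bb_free_root tree.
 */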
4363
4364static noinline_for_stack int
4365ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4366                      struct ext4_free_data *new_entry)
4367{
4368        ext4_group_t group = e4b->bd_group;
4369        ext4_grpblk_t block;
4370        struct ext4_free_data *entry;
4371        struct ext4_group_info *db = e4b->bd_info;
4372        struct super_block *sb = e4b->bd_sb;
4373        struct ext4_sb_info *sbi = EXT4_SB(sb);
4374        struct rb_node **n = &db->bb_free_root.rb_node, *node;
4375        struct rb_node *parent = NULL, *new_node;
4376
4377        BUG_ON(!ext4_handle_valid(handle));
4378        BUG_ON(e4b->bd_bitmap_page == NULL);
4379        BUG_ON(e4b->bd_buddy_page == NULL);
4380
4381        new_node = &new_entry->node;
4382        block = new_entry->start_blk;
4383
4384        if (!*n) {
4385                /* first free block extent. We need to
4386                 * protect the buddy cache from being freed,
4387                 * otherwise we'll refresh it from the
4388                 * on-disk bitmap and lose not-yet-available
4389                 * blocks */
4390                page_cache_get(e4b->bd_buddy_page);
4391                page_cache_get(e4b->bd_bitmap_page);
4392        }
4393        while (*n) {
4394                parent = *n;
4395                entry = rb_entry(parent, struct ext4_free_data, node);
4396                if (block < entry->start_blk)
4397                        n = &(*n)->rb_left;
4398                else if (block >= (entry->start_blk + entry->count))
4399                        n = &(*n)->rb_right;
4400                else {
4401                        ext4_grp_locked_error(sb, group, 0,
4402                                ext4_group_first_block_no(sb, group) + block,
4403                                "Block already on to-be-freed list");
4404                        return 0;
4405                }
4406        }
4407
4408        rb_link_node(new_node, parent, n);
4409        rb_insert_color(new_node, &db->bb_free_root);
4410
4411        /* Now see whether the extent can be merged to the left and right */
4412        node = rb_prev(new_node);
4413        if (node) {
4414                entry = rb_entry(node, struct ext4_free_data, node);
4415                if (can_merge(entry, new_entry)) {
4416                        new_entry->start_blk = entry->start_blk;
4417                        new_entry->count += entry->count;
4418                        rb_erase(node, &(db->bb_free_root));
4419                        spin_lock(&sbi->s_md_lock);
4420                        list_del(&entry->list);
4421                        spin_unlock(&sbi->s_md_lock);
4422                        kmem_cache_free(ext4_free_ext_cachep, entry);
4423                }
4424        }
4425
4426        node = rb_next(new_node);
4427        if (node) {
4428                entry = rb_entry(node, struct ext4_free_data, node);
4429                if (can_merge(new_entry, entry)) {
4430                        new_entry->count += entry->count;
4431                        rb_erase(node, &(db->bb_free_root));
4432                        spin_lock(&sbi->s_md_lock);
4433                        list_del(&entry->list);
4434                        spin_unlock(&sbi->s_md_lock);
4435                        kmem_cache_free(ext4_free_ext_cachep, entry);
4436                }
4437        }
4438        /* Add the extent to the transaction's private list */
4439        spin_lock(&sbi->s_md_lock);
4440        list_add(&new_entry->list, &handle->h_transaction->t_private_list);
4441        spin_unlock(&sbi->s_md_lock);
4442        return 0;
4443}
4444
4445/**
4446 * ext4_free_blocks() -- Free given blocks and update quota
4447 * @handle:             handle for this transaction
4448 * @inode:              inode
     * @bh:                 optional buffer head of the block being freed
4449 * @block:              start physical block to free
4450 * @count:              number of blocks to free
4451 * @flags:              flags used by ext4_free_blocks
4452 */
4453void ext4_free_blocks(handle_t *handle, struct inode *inode,
4454                      struct buffer_head *bh, ext4_fsblk_t block,
4455                      unsigned long count, int flags)
4456{
4457        struct buffer_head *bitmap_bh = NULL;
4458        struct super_block *sb = inode->i_sb;
4459        struct ext4_group_desc *gdp;
4460        unsigned long freed = 0;
4461        unsigned int overflow;
4462        ext4_grpblk_t bit;
4463        struct buffer_head *gd_bh;
4464        ext4_group_t block_group;
4465        struct ext4_sb_info *sbi;
4466        struct ext4_buddy e4b;
4467        int err = 0;
4468        int ret;
4469
4470        if (bh) {
4471                if (block)
4472                        BUG_ON(block != bh->b_blocknr);
4473                else
4474                        block = bh->b_blocknr;
4475        }
4476
4477        sbi = EXT4_SB(sb);
4478        if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4479            !ext4_data_block_valid(sbi, block, count)) {
4480                ext4_error(sb, "Freeing blocks not in datazone - "
4481                           "block = %llu, count = %lu", block, count);
4482                goto error_return;
4483        }
4484
4485        ext4_debug("freeing block %llu\n", block);
4486        trace_ext4_free_blocks(inode, block, count, flags);
4487
4488        if (flags & EXT4_FREE_BLOCKS_FORGET) {
4489                struct buffer_head *tbh = bh;
4490                int i;
4491
4492                BUG_ON(bh && (count > 1));
4493
4494                for (i = 0; i < count; i++) {
4495                        if (!bh)
4496                                tbh = sb_find_get_block(inode->i_sb,
4497                                                        block + i);
4498                        if (unlikely(!tbh))
4499                                continue;
4500                        ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4501                                    inode, tbh, block + i);
4502                }
4503        }
4504
4505        /*
4506         * We need to make sure we don't reuse the freed block until
4507         * after the transaction is committed, which we can do by
4508         * treating the block as metadata, below.  We make an
4509         * exception if the inode is to be written in writeback mode
4510         * since writeback mode has weak data consistency guarantees.
4511         */
4512        if (!ext4_should_writeback_data(inode))
4513                flags |= EXT4_FREE_BLOCKS_METADATA;
4514
4515do_more:
4516        overflow = 0;
4517        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4518
4519        /*
4520         * Check to see if we are freeing blocks across a group
4521         * boundary.
4522         */
4523        if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4524                overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb);
4525                count -= overflow;
4526        }
4527        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4528        if (!bitmap_bh) {
4529                err = -EIO;
4530                goto error_return;
4531        }
4532        gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
4533        if (!gdp) {
4534                err = -EIO;
4535                goto error_return;
4536        }
4537
4538        if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4539            in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4540            in_range(block, ext4_inode_table(sb, gdp),
4541                      EXT4_SB(sb)->s_itb_per_group) ||
4542            in_range(block + count - 1, ext4_inode_table(sb, gdp),
4543                      EXT4_SB(sb)->s_itb_per_group)) {
4544
4545                ext4_error(sb, "Freeing blocks in system zone - "
4546                           "Block = %llu, count = %lu", block, count);
4547                /* err = 0. ext4_std_error should be a no op */
4548                goto error_return;
4549        }
4550
4551        BUFFER_TRACE(bitmap_bh, "getting write access");
4552        err = ext4_journal_get_write_access(handle, bitmap_bh);
4553        if (err)
4554                goto error_return;
4555
4556        /*
4557         * We are about to modify some metadata.  Call the journal APIs
4558         * to unshare ->b_data if a currently-committing transaction is
4559         * using it
4560         */
4561        BUFFER_TRACE(gd_bh, "get_write_access");
4562        err = ext4_journal_get_write_access(handle, gd_bh);
4563        if (err)
4564                goto error_return;
4565#ifdef AGGRESSIVE_CHECK
4566        {
4567                int i;
4568                for (i = 0; i < count; i++)
4569                        BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4570        }
4571#endif
4572        trace_ext4_mballoc_free(sb, inode, block_group, bit, count);
4573
4574        err = ext4_mb_load_buddy(sb, block_group, &e4b);
4575        if (err)
4576                goto error_return;
4577
4578        if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
4579                struct ext4_free_data *new_entry;
4580                /*
4581                 * The blocks being freed are metadata. They shouldn't
4582                 * be reused until this transaction is committed.
4583                 */
4584                new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
4585                if (!new_entry) {
4586                        err = -ENOMEM;
4587                        goto error_return;
4588                }
4589                new_entry->start_blk = bit;
4590                new_entry->group  = block_group;
4591                new_entry->count = count;
4592                new_entry->t_tid = handle->h_transaction->t_tid;
4593
4594                ext4_lock_group(sb, block_group);
4595                mb_clear_bits(bitmap_bh->b_data, bit, count);
4596                ext4_mb_free_metadata(handle, &e4b, new_entry);
4597        } else {
4598                /* We need to update group_info->bb_free and the bitmap
4599                 * with the group lock held; ext4_mb_generate_buddy()
4600                 * looks at them with the group lock held.
4601                 */
4602                ext4_lock_group(sb, block_group);
4603                mb_clear_bits(bitmap_bh->b_data, bit, count);
4604                mb_free_blocks(inode, &e4b, bit, count);
4605        }
4606
4607        ret = ext4_free_blks_count(sb, gdp) + count;
4608        ext4_free_blks_set(sb, gdp, ret);
4609        gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
4610        ext4_unlock_group(sb, block_group);
4611        percpu_counter_add(&sbi->s_freeblocks_counter, count);
4612
4613        if (sbi->s_log_groups_per_flex) {
4614                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4615                atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
4616        }
4617
4618        ext4_mb_unload_buddy(&e4b);
4619
4620        freed += count;
4621
4622        /* We dirtied the bitmap block */
4623        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4624        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4625
4626        /* And the group descriptor block */
4627        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4628        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4629        if (!err)
4630                err = ret;
4631
4632        if (overflow && !err) {
4633                block += count;
4634                count = overflow;
4635                put_bh(bitmap_bh);
4636                goto do_more;
4637        }
4638        ext4_mark_super_dirty(sb);
4639error_return:
4640        if (freed)
4641                dquot_free_block(inode, freed);
4642        brelse(bitmap_bh);
4643        ext4_std_error(sb, err);
4644        return;
4645}
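
/*
 * Illustrative sketch (not part of mballoc): a hypothetical caller freeing a
 * single metadata block whose buffer_head is at hand.  Passing the bh lets
 * ext4_free_blocks() take the block number from bh->b_blocknr and forget the
 * buffer; METADATA keeps the block from being reused before the transaction
 * commits.  Kept under #if 0 so it is never compiled.
 */
#if 0
static void example_release_meta_block(handle_t *handle, struct inode *inode,
				       struct buffer_head *bh)
{
	/* block = 0: the block number is derived from bh->b_blocknr */
	ext4_free_blocks(handle, inode, bh, 0, 1,
			 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
}
#endif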
4646
4647/**
4648 * ext4_add_groupblocks() -- Add given blocks to an existing group
4649 * @handle:                     handle to this transaction
4650 * @sb:                         super block
4651 * @block:                      start physical block to add to the block group
4652 * @count:                      number of blocks to add
4653 *
4654 * This marks the blocks as free in the bitmap and buddy.
4655 */
4656void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
4657                         ext4_fsblk_t block, unsigned long count)
4658{
4659        struct buffer_head *bitmap_bh = NULL;
4660        struct buffer_head *gd_bh;
4661        ext4_group_t block_group;
4662        ext4_grpblk_t bit;
4663        unsigned int i;
4664        struct ext4_group_desc *desc;
4665        struct ext4_sb_info *sbi = EXT4_SB(sb);
4666        struct ext4_buddy e4b;
4667        int err = 0, ret, blk_free_count;
4668        ext4_grpblk_t blocks_freed;
4669        struct ext4_group_info *grp;
4670
4671        ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4672
4673        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4674        grp = ext4_get_group_info(sb, block_group);
4675        /*
4676         * Check to see if we are adding blocks across a group
4677         * boundary.
4678         */
4679        if (bit + count > EXT4_BLOCKS_PER_GROUP(sb))
4680                goto error_return;
4681
4682        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4683        if (!bitmap_bh)
4684                goto error_return;
4685        desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4686        if (!desc)
4687                goto error_return;
4688
4689        if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4690            in_range(ext4_inode_bitmap(sb, desc), block, count) ||
4691            in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
4692            in_range(block + count - 1, ext4_inode_table(sb, desc),
4693                     sbi->s_itb_per_group)) {
4694                ext4_error(sb, "Adding blocks in system zones - "
4695                           "Block = %llu, count = %lu",
4696                           block, count);
4697                goto error_return;
4698        }
4699
4700        BUFFER_TRACE(bitmap_bh, "getting write access");
4701        err = ext4_journal_get_write_access(handle, bitmap_bh);
4702        if (err)
4703                goto error_return;
4704
4705        /*
4706         * We are about to modify some metadata.  Call the journal APIs
4707         * to unshare ->b_data if a currently-committing transaction is
4708         * using it
4709         */
4710        BUFFER_TRACE(gd_bh, "get_write_access");
4711        err = ext4_journal_get_write_access(handle, gd_bh);
4712        if (err)
4713                goto error_return;
4714
4715        for (i = 0, blocks_freed = 0; i < count; i++) {
4716                BUFFER_TRACE(bitmap_bh, "clear bit");
4717                if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
4718                        ext4_error(sb, "bit already cleared for block %llu",
4719                                   (ext4_fsblk_t)(block + i));
4720                        BUFFER_TRACE(bitmap_bh, "bit already cleared");
4721                } else {
4722                        blocks_freed++;
4723                }
4724        }
4725
4726        err = ext4_mb_load_buddy(sb, block_group, &e4b);
4727        if (err)
4728                goto error_return;
4729
4730        /*
4731         * We need to update group_info->bb_free and the bitmap
4732         * with the group lock held; ext4_mb_generate_buddy()
4733         * looks at them with the group lock held.
4734         */
4735        ext4_lock_group(sb, block_group);
4736        mb_clear_bits(bitmap_bh->b_data, bit, count);
4737        mb_free_blocks(NULL, &e4b, bit, count);
4738        blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
4739        ext4_free_blks_set(sb, desc, blk_free_count);
4740        desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
4741        ext4_unlock_group(sb, block_group);
4742        percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
4743
4744        if (sbi->s_log_groups_per_flex) {
4745                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4746                atomic_add(blocks_freed,
4747                           &sbi->s_flex_groups[flex_group].free_blocks);
4748        }
4749
4750        ext4_mb_unload_buddy(&e4b);
4751
4752        /* We dirtied the bitmap block */
4753        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4754        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4755
4756        /* And the group descriptor block */
4757        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4758        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4759        if (!err)
4760                err = ret;
4761
4762error_return:
4763        brelse(bitmap_bh);
4764        ext4_std_error(sb, err);
4765        return;
4766}
4767
4768/**
4769 * ext4_trim_extent -- function to TRIM one single free extent in the group
4770 * @sb:         super block for the file system
4771 * @start:      starting block of the free extent in the alloc. group
4772 * @count:      number of blocks to TRIM
4773 * @group:      alloc. group we are working with
4774 * @e4b:        ext4 buddy for the group
4775 *
4776 * Trim "count" blocks starting at "start" in group "group". To ensure that
4777 * no one allocates those blocks while they are being trimmed, mark them as
4778 * used in the buddy bitmap. This must be called under the group lock.
4779 */
4780static void ext4_trim_extent(struct super_block *sb, int start, int count,
4781                             ext4_group_t group, struct ext4_buddy *e4b)
4782{
4783        struct ext4_free_extent ex;
4784
4785        assert_spin_locked(ext4_group_lock_ptr(sb, group));
4786
4787        ex.fe_start = start;
4788        ex.fe_group = group;
4789        ex.fe_len = count;
4790
4791        /*
4792         * Mark blocks used, so no one can reuse them while
4793         * being trimmed.
4794         */
4795        mb_mark_used(e4b, &ex);
4796        ext4_unlock_group(sb, group);
4797        ext4_issue_discard(sb, group, start, count);
4798        ext4_lock_group(sb, group);
4799        mb_free_blocks(NULL, e4b, start, ex.fe_len);
4800}
4801
4802/**
4803 * ext4_trim_all_free -- function to trim all free space in alloc. group
4804 * @sb:                 super block for file system
4805 * @group:              allocation group to trim
4806 * @start:              first group block to examine
4807 * @max:                last group block to examine
4808 * @minblocks:          minimum extent block count
4809 *
4810 * ext4_trim_all_free walks through the group's buddy bitmap searching for
4811 * free extents. When a free extent of at least @minblocks blocks is found,
4812 * ext4_trim_extent() is called to mark it as used in the buddy bitmap,
4813 * issue a TRIM (discard) command for it, and then free it again in the
4814 * buddy bitmap. This is repeated until the whole group has been scanned.
4819 */
4820static ext4_grpblk_t
4821ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
4822                   ext4_grpblk_t start, ext4_grpblk_t max,
4823                   ext4_grpblk_t minblocks)
4824{
4825        void *bitmap;
4826        ext4_grpblk_t next, count = 0;
4827        struct ext4_buddy e4b;
4828        int ret;
4829
4830        ret = ext4_mb_load_buddy(sb, group, &e4b);
4831        if (ret) {
4832                ext4_error(sb, "Error in loading buddy "
4833                                "information for %u", group);
4834                return ret;
4835        }
4836        bitmap = e4b.bd_bitmap;
4837
4838        ext4_lock_group(sb, group);
4839        start = (e4b.bd_info->bb_first_free > start) ?
4840                e4b.bd_info->bb_first_free : start;
4841
4842        while (start < max) {
4843                start = mb_find_next_zero_bit(bitmap, max, start);
4844                if (start >= max)
4845                        break;
4846                next = mb_find_next_bit(bitmap, max, start);
4847
4848                if ((next - start) >= minblocks) {
4849                        ext4_trim_extent(sb, start,
4850                                         next - start, group, &e4b);
4851                        count += next - start;
4852                }
4853                start = next + 1;
4854
4855                if (fatal_signal_pending(current)) {
4856                        count = -ERESTARTSYS;
4857                        break;
4858                }
4859
4860                if (need_resched()) {
4861                        ext4_unlock_group(sb, group);
4862                        cond_resched();
4863                        ext4_lock_group(sb, group);
4864                }
4865
4866                if ((e4b.bd_info->bb_free - count) < minblocks)
4867                        break;
4868        }
4869        ext4_unlock_group(sb, group);
4870        ext4_mb_unload_buddy(&e4b);
4871
4872        ext4_debug("trimmed %d blocks in the group %d\n",
4873                count, group);
4874
4875        return count;
4876}
4877
4878/**
4879 * ext4_trim_fs() -- handler for the FITRIM ioctl
4880 * @sb:                 superblock for filesystem
4881 * @range:              fstrim_range structure describing the region to trim:
4882 *
4883 * start:       first byte to trim
4884 * len:         number of bytes to trim, starting at start
4885 * minlen:      minimum free extent length to trim, in bytes
4886 * ext4_trim_fs goes through all the allocation groups containing bytes from
4887 * start to start+len. For each such group, ext4_trim_all_free() is invoked
4888 * to trim all the free space.
4889 */
4890int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
4891{
4892        struct ext4_group_info *grp;
4893        ext4_group_t first_group, last_group;
4894        ext4_group_t group, ngroups = ext4_get_groups_count(sb);
4895        ext4_grpblk_t cnt = 0, first_block, last_block;
4896        uint64_t start, len, minlen, trimmed = 0;
4897        ext4_fsblk_t first_data_blk =
4898                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
4899        int ret = 0;
4900
4901        start = range->start >> sb->s_blocksize_bits;
4902        len = range->len >> sb->s_blocksize_bits;
4903        minlen = range->minlen >> sb->s_blocksize_bits;
4904
4905        if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
4906                return -EINVAL;
4907        if (start < first_data_blk) {
4908                len -= first_data_blk - start;
4909                start = first_data_blk;
4910        }
4911
4912        /* Determine first and last group to examine based on start and len */
4913        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
4914                                     &first_group, &first_block);
4915        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
4916                                     &last_group, &last_block);
4917        last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
4918        last_block = EXT4_BLOCKS_PER_GROUP(sb);
4919
4920        if (first_group > last_group)
4921                return -EINVAL;
4922
4923        for (group = first_group; group <= last_group; group++) {
4924                grp = ext4_get_group_info(sb, group);
4925                /* We only do this if the grp has never been initialized */
4926                if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
4927                        ret = ext4_mb_init_group(sb, group);
4928                        if (ret)
4929                                break;
4930                }
4931
4932                /*
4933                 * For all the groups except the last one, last block will
4934                 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to
4935                 * change it for the last group, in which case first_block +
4936                 * len < EXT4_BLOCKS_PER_GROUP(sb).
4937                 */
4938                if (first_block + len < EXT4_BLOCKS_PER_GROUP(sb))
4939                        last_block = first_block + len;
4940                len -= last_block - first_block;
4941
4942                if (grp->bb_free >= minlen) {
4943                        cnt = ext4_trim_all_free(sb, group, first_block,
4944                                                last_block, minlen);
4945                        if (cnt < 0) {
4946                                ret = cnt;
4947                                break;
4948                        }
4949                }
4950                trimmed += cnt;
4951                first_block = 0;
4952        }
4953        range->len = trimmed * sb->s_blocksize;
4954
4955        return ret;
4956}
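
/*
 * Illustrative sketch (userspace, not kernel code): ext4_trim_fs() is reached
 * through the FITRIM ioctl, roughly as follows.  FITRIM and struct
 * fstrim_range come from <linux/fs.h>; the usual <fcntl.h>, <sys/ioctl.h>
 * and <limits.h> are also needed, and error handling is omitted.
 *
 *	struct fstrim_range range = {
 *		.start  = 0,		// trim from the start of the fs
 *		.len    = ULLONG_MAX,	// ... through the whole filesystem
 *		.minlen = 4096,		// skip free extents smaller than 4 KiB
 *	};
 *	int fd = open("/mnt", O_RDONLY);
 *	ioctl(fd, FITRIM, &range);	// on return, range.len = bytes trimmed
 *	close(fd);
 */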
4957