linux/fs/ext4/mballoc.c
   1/*
   2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
   3 * Written by Alex Tomas <alex@clusterfs.com>
   4 *
   5 * This program is free software; you can redistribute it and/or modify
   6 * it under the terms of the GNU General Public License version 2 as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it will be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
   14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write to the Free Software
   16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307, USA.
  17 */
  18
  19
  20/*
  21 * mballoc.c contains the multiblocks allocation routines
  22 */
  23
  24#include "ext4_jbd2.h"
  25#include "mballoc.h"
  26#include <linux/log2.h>
  27#include <linux/module.h>
  28#include <linux/slab.h>
  29#include <trace/events/ext4.h>
  30
  31#ifdef CONFIG_EXT4_DEBUG
  32ushort ext4_mballoc_debug __read_mostly;
  33
  34module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
  35MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
  36#endif
  37
  38/*
  39 * MUSTDO:
  40 *   - test ext4_ext_search_left() and ext4_ext_search_right()
  41 *   - search for metadata in few groups
  42 *
  43 * TODO v4:
  44 *   - normalization should take into account whether file is still open
  45 *   - discard preallocations if no free space left (policy?)
  46 *   - don't normalize tails
  47 *   - quota
  48 *   - reservation for superuser
  49 *
  50 * TODO v3:
  51 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
  52 *   - track min/max extents in each group for better group selection
  53 *   - mb_mark_used() may allocate chunk right after splitting buddy
  54 *   - tree of groups sorted by number of free blocks
  55 *   - error handling
  56 */
  57
  58/*
   59 * The allocation request involves a request for multiple blocks near
   60 * the specified goal (block) value.
  61 *
   62 * During the initialization phase of the allocator we decide to use the
   63 * group preallocation or inode preallocation depending on the size of
   64 * the file. The size of the file could be the resulting file size we
   65 * would have after allocation, or the current file size, whichever
   66 * is larger. If the size is less than sbi->s_mb_stream_request we
   67 * use the group preallocation. The default value of
  68 * s_mb_stream_request is 16 blocks. This can also be tuned via
  69 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
  70 * terms of number of blocks.
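 *
 * For example (illustrative numbers): with the default s_mb_stream_request
 * of 16 blocks, a write that leaves a file at 12 blocks counts as a small
 * file and uses the per-CPU group preallocation, while a write that grows
 * the file to 64 blocks uses inode preallocation instead.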
  71 *
   72 * The main motivation for having small files use group preallocation is to
  73 * ensure that we have small files closer together on the disk.
  74 *
   75 * In the first stage the allocator looks at the inode prealloc list,
   76 * ext4_inode_info->i_prealloc_list, which contains the list of prealloc
  77 * spaces for this particular inode. The inode prealloc space is
  78 * represented as:
  79 *
  80 * pa_lstart -> the logical start block for this prealloc space
  81 * pa_pstart -> the physical start block for this prealloc space
  82 * pa_len    -> length for this prealloc space (in clusters)
  83 * pa_free   ->  free space available in this prealloc space (in clusters)
  84 *
   85 * The inode preallocation space is used based on the _logical_ start
   86 * block. Only if the logical file block falls within the range of the
   87 * prealloc space do we consume that particular prealloc space. This makes
   88 * sure that we have contiguous physical blocks representing the file blocks.
  89 *
   90 * The important thing to note about the inode prealloc space is that
   91 * we don't modify any of the values associated with it except
   92 * pa_free.
  93 *
  94 * If we are not able to find blocks in the inode prealloc space and if we
  95 * have the group allocation flag set then we look at the locality group
   96 * prealloc space. This is a per-CPU prealloc list, represented as
  97 *
  98 * ext4_sb_info.s_locality_groups[smp_processor_id()]
  99 *
 100 * The reason for having a per cpu locality group is to reduce the contention
 101 * between CPUs. It is possible to get scheduled at this point.
 102 *
  103 * The locality group prealloc space is used based on whether we have
  104 * enough free space (pa_free) within the prealloc space.
 105 *
  106 * If we can't allocate blocks via inode prealloc and/or locality group
  107 * prealloc then we look at the buddy cache. The buddy cache is represented
  108 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offsets get
  109 * mapped to the buddy and bitmap information regarding the different
  110 * groups. The buddy information is attached to the buddy cache inode so
  111 * that we can access it through the page cache. The information regarding
  112 * each group is loaded via ext4_mb_load_buddy.  It comprises the block
  113 * bitmap and the buddy information, and is stored in the
  114 * inode as:
 115 *
 116 *  {                        page                        }
 117 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 118 *
 119 *
 120 * one block each for bitmap and buddy information.  So for each group we
 121 * take up 2 blocks. A page can contain blocks_per_page (PAGE_CACHE_SIZE /
  122 * blocksize) blocks.  So it can hold information for groups_per_page
  123 * groups, which is blocks_per_page/2.
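 *
 * For example, assuming 4 KiB pages: with a 4 KiB block size a page holds a
 * single block, so a group's bitmap block and buddy block live on two
 * consecutive pages; with a 1 KiB block size blocks_per_page is 4, so one
 * page holds the bitmap and buddy blocks of two groups
 * ([g0 bitmap][g0 buddy][g1 bitmap][g1 buddy]).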
 124 *
 125 * The buddy cache inode is not stored on disk. The inode is thrown
 126 * away when the filesystem is unmounted.
 127 *
 128 * We look for count number of blocks in the buddy cache. If we were able
 129 * to locate that many free blocks we return with additional information
  130 * regarding the rest of the contiguous physical blocks available.
 131 *
 132 * Before allocating blocks via buddy cache we normalize the request
  133 * blocks. This ensures we ask for more blocks than we need. The extra
  134 * blocks that we get after allocation are added to the respective prealloc
 135 * list. In case of inode preallocation we follow a list of heuristics
 136 * based on file size. This can be found in ext4_mb_normalize_request. If
 137 * we are doing a group prealloc we try to normalize the request to
 138 * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
 139 * dependent on the cluster size; for non-bigalloc file systems, it is
 140 * 512 blocks. This can be tuned via
 141 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
  142 * terms of number of blocks. If we have mounted the file system with -o
  143 * stripe=<value> option the group prealloc request is normalized to
  144 * the smallest multiple of the stripe value (sbi->s_stripe) which is
 145 * greater than the default mb_group_prealloc.
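 *
 * For example (illustrative values): with sbi->s_stripe = 384 and the
 * default mb_group_prealloc of 512 blocks, group prealloc requests are
 * normalized to 768 blocks, the smallest multiple of 384 greater than 512.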
 146 *
 147 * The regular allocator (using the buddy cache) supports a few tunables.
 148 *
 149 * /sys/fs/ext4/<partition>/mb_min_to_scan
 150 * /sys/fs/ext4/<partition>/mb_max_to_scan
 151 * /sys/fs/ext4/<partition>/mb_order2_req
 152 *
  153 * The regular allocator uses the buddy scan only if the request len is a
  154 * power of 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs.
  155 * The value of s_mb_order2_reqs can be tuned via
  156 * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
  157 * the stripe size (sbi->s_stripe), we try to search for contiguous blocks in
  158 * stripe-size units. This should result in better allocation on RAID setups.
  159 * If not, we search in the specific group using the bitmap for the best
  160 * extents. The tunables min_to_scan and max_to_scan control the behaviour here.
  161 * min_to_scan indicates how long mballoc __must__ look for a best
  162 * extent and max_to_scan indicates how long mballoc __can__ look for a
  163 * best extent among the found extents. Searching for the blocks starts with
 164 * the group specified as the goal value in allocation context via
 165 * ac_g_ex. Each group is first checked based on the criteria whether it
 166 * can be used for allocation. ext4_mb_good_group explains how the groups are
 167 * checked.
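 *
 * As an illustration: a request for 64 blocks (order 6) is a power of two,
 * so it qualifies for the buddy (order-2) scan as long as 6 >=
 * sbi->s_mb_order2_reqs; a request for 100 blocks is not a power of two and
 * is instead satisfied by scanning the bitmaps of the candidate groups,
 * bounded by mb_min_to_scan and mb_max_to_scan.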
 168 *
  169 * Both prealloc spaces are populated as described above. So for the first
  170 * request we will hit the buddy cache, which will result in this prealloc
  171 * space getting filled. The prealloc space is then later used for
  172 * subsequent requests.
 173 */
 174
 175/*
 176 * mballoc operates on the following data:
 177 *  - on-disk bitmap
 178 *  - in-core buddy (actually includes buddy and bitmap)
 179 *  - preallocation descriptors (PAs)
 180 *
 181 * there are two types of preallocations:
 182 *  - inode
  183 *    assigned to a specific inode and can be used for this inode only.
  184 *    it describes part of the inode's space preallocated to specific
  185 *    physical blocks. any block from that preallocation can be used
  186 *    independently. the descriptor just tracks the number of blocks left
  187 *    unused. so, before taking some block from the descriptor, one must
  188 *    make sure the corresponding logical block isn't allocated yet. this
  189 *    also means that freeing any block within the descriptor's range
  190 *    must discard all preallocated blocks.
 191 *  - locality group
  192 *    assigned to a specific locality group which does not translate to a
  193 *    permanent set of inodes: an inode can join and leave the group. space
 194 *    from this type of preallocation can be used for any inode. thus
 195 *    it's consumed from the beginning to the end.
 196 *
 197 * relation between them can be expressed as:
 198 *    in-core buddy = on-disk bitmap + preallocation descriptors
 199 *
  200 * this means the blocks mballoc considers used are:
 201 *  - allocated blocks (persistent)
 202 *  - preallocated blocks (non-persistent)
 203 *
 204 * consistency in mballoc world means that at any time a block is either
 205 * free or used in ALL structures. notice: "any time" should not be read
 206 * literally -- time is discrete and delimited by locks.
 207 *
  208 *  to keep it simple, we don't use block numbers; instead we count blocks:
  209 *  how many blocks are marked used/free in the on-disk bitmap, buddy and PA.
 210 *
 211 * all operations can be expressed as:
 212 *  - init buddy:                       buddy = on-disk + PAs
 213 *  - new PA:                           buddy += N; PA = N
 214 *  - use inode PA:                     on-disk += N; PA -= N
 215 *  - discard inode PA                  buddy -= on-disk - PA; PA = 0
 216 *  - use locality group PA             on-disk += N; PA -= N
 217 *  - discard locality group PA         buddy -= PA; PA = 0
 218 *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
 219 *        is used in real operation because we can't know actual used
 220 *        bits from PA, only from on-disk bitmap
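 *
 * worked example: creating a new PA of 8 clusters marks 8 clusters used in
 * the buddy (buddy += 8, PA = 8); allocating 3 of them sets 3 bits in the
 * on-disk bitmap (on-disk += 3, PA = 5); discarding the PA then returns the
 * 5 still-unused clusters to the buddy, consulting the on-disk bitmap to
 * tell which bits are really in use.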
 221 *
 222 * if we follow this strict logic, then all operations above should be atomic.
 223 * given some of them can block, we'd have to use something like semaphores
 224 * killing performance on high-end SMP hardware. let's try to relax it using
 225 * the following knowledge:
 226 *  1) if buddy is referenced, it's already initialized
 227 *  2) while block is used in buddy and the buddy is referenced,
 228 *     nobody can re-allocate that block
 229 *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
  230 *     a bit set and a PA claims the same block, it's OK. IOW, one can set a
  231 *     bit in the on-disk bitmap if the buddy has the same bit set and/or a PA
  232 *     covers the corresponding block
 233 *
 234 * so, now we're building a concurrency table:
 235 *  - init buddy vs.
 236 *    - new PA
 237 *      blocks for PA are allocated in the buddy, buddy must be referenced
 238 *      until PA is linked to allocation group to avoid concurrent buddy init
 239 *    - use inode PA
 240 *      we need to make sure that either on-disk bitmap or PA has uptodate data
 241 *      given (3) we care that PA-=N operation doesn't interfere with init
 242 *    - discard inode PA
 243 *      the simplest way would be to have buddy initialized by the discard
 244 *    - use locality group PA
 245 *      again PA-=N must be serialized with init
 246 *    - discard locality group PA
 247 *      the simplest way would be to have buddy initialized by the discard
 248 *  - new PA vs.
 249 *    - use inode PA
 250 *      i_data_sem serializes them
 251 *    - discard inode PA
 252 *      discard process must wait until PA isn't used by another process
 253 *    - use locality group PA
 254 *      some mutex should serialize them
 255 *    - discard locality group PA
 256 *      discard process must wait until PA isn't used by another process
 257 *  - use inode PA
 258 *    - use inode PA
  259 *      i_data_sem or another mutex should serialize them
 260 *    - discard inode PA
 261 *      discard process must wait until PA isn't used by another process
 262 *    - use locality group PA
 263 *      nothing wrong here -- they're different PAs covering different blocks
 264 *    - discard locality group PA
 265 *      discard process must wait until PA isn't used by another process
 266 *
  267 * now we're ready to draw a few conclusions:
  268 *  - while a PA is referenced, no discard is possible
  269 *  - a PA is referenced until its blocks are marked in the on-disk bitmap
  270 *  - a PA changes only after the on-disk bitmap is updated
 271 *  - discard must not compete with init. either init is done before
 272 *    any discard or they're serialized somehow
 273 *  - buddy init as sum of on-disk bitmap and PAs is done atomically
 274 *
  275 * a special case is when we've used a PA to emptiness. no need to modify the
  276 * buddy in this case, but we should take care of concurrent init
 277 *
 278 */
 279
  280/*
  281 * Logic in a few words:
 282 *
 283 *  - allocation:
 284 *    load group
 285 *    find blocks
 286 *    mark bits in on-disk bitmap
 287 *    release group
 288 *
 289 *  - use preallocation:
 290 *    find proper PA (per-inode or group)
 291 *    load group
 292 *    mark bits in on-disk bitmap
 293 *    release group
 294 *    release PA
 295 *
 296 *  - free:
 297 *    load group
 298 *    mark bits in on-disk bitmap
 299 *    release group
 300 *
 301 *  - discard preallocations in group:
 302 *    mark PAs deleted
 303 *    move them onto local list
 304 *    load on-disk bitmap
 305 *    load group
 306 *    remove PA from object (inode or locality group)
 307 *    mark free blocks in-core
 308 *
 309 *  - discard inode's preallocations:
 310 */
 311
 312/*
 313 * Locking rules
 314 *
 315 * Locks:
 316 *  - bitlock on a group        (group)
 317 *  - object (inode/locality)   (object)
 318 *  - per-pa lock               (pa)
 319 *
 320 * Paths:
 321 *  - new pa
 322 *    object
 323 *    group
 324 *
 325 *  - find and use pa:
 326 *    pa
 327 *
 328 *  - release consumed pa:
 329 *    pa
 330 *    group
 331 *    object
 332 *
 333 *  - generate in-core bitmap:
 334 *    group
 335 *        pa
 336 *
 337 *  - discard all for given object (inode, locality group):
 338 *    object
 339 *        pa
 340 *    group
 341 *
 342 *  - discard all for given group:
 343 *    group
 344 *        pa
 345 *    group
 346 *        object
 347 *
 348 */
 349static struct kmem_cache *ext4_pspace_cachep;
 350static struct kmem_cache *ext4_ac_cachep;
 351static struct kmem_cache *ext4_free_data_cachep;
 352
 353/* We create slab caches for groupinfo data structures based on the
 354 * superblock block size.  There will be one per mounted filesystem for
 355 * each unique s_blocksize_bits */
 356#define NR_GRPINFO_CACHES 8
 357static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
 358
 359static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
 360        "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
 361        "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
 362        "ext4_groupinfo_64k", "ext4_groupinfo_128k"
 363};
 364
 365static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 366                                        ext4_group_t group);
 367static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 368                                                ext4_group_t group);
 369static void ext4_free_data_callback(struct super_block *sb,
 370                                struct ext4_journal_cb_entry *jce, int rc);
 371
 372static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 373{
 374#if BITS_PER_LONG == 64
 375        *bit += ((unsigned long) addr & 7UL) << 3;
 376        addr = (void *) ((unsigned long) addr & ~7UL);
 377#elif BITS_PER_LONG == 32
 378        *bit += ((unsigned long) addr & 3UL) << 3;
 379        addr = (void *) ((unsigned long) addr & ~3UL);
 380#else
 381#error "how many bits you are?!"
 382#endif
 383        return addr;
 384}
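
/*
 * Worked example for the helper above (64-bit case): with addr = base + 5
 * (base being 8-byte aligned) and bit = 3, the low address bits are folded
 * into the bit index: bit += 5 << 3 gives bit = 43 and addr becomes base.
 * Testing bit 43 at base touches the same bit of memory as bit 3 at
 * base + 5, but through an aligned access.
 */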
 385
 386static inline int mb_test_bit(int bit, void *addr)
 387{
 388        /*
  389         * ext4_test_bit on architectures like powerpc
  390         * needs an unsigned long aligned address
 391         */
 392        addr = mb_correct_addr_and_bit(&bit, addr);
 393        return ext4_test_bit(bit, addr);
 394}
 395
 396static inline void mb_set_bit(int bit, void *addr)
 397{
 398        addr = mb_correct_addr_and_bit(&bit, addr);
 399        ext4_set_bit(bit, addr);
 400}
 401
 402static inline void mb_clear_bit(int bit, void *addr)
 403{
 404        addr = mb_correct_addr_and_bit(&bit, addr);
 405        ext4_clear_bit(bit, addr);
 406}
 407
 408static inline int mb_test_and_clear_bit(int bit, void *addr)
 409{
 410        addr = mb_correct_addr_and_bit(&bit, addr);
 411        return ext4_test_and_clear_bit(bit, addr);
 412}
 413
 414static inline int mb_find_next_zero_bit(void *addr, int max, int start)
 415{
 416        int fix = 0, ret, tmpmax;
 417        addr = mb_correct_addr_and_bit(&fix, addr);
 418        tmpmax = max + fix;
 419        start += fix;
 420
 421        ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
 422        if (ret > max)
 423                return max;
 424        return ret;
 425}
 426
 427static inline int mb_find_next_bit(void *addr, int max, int start)
 428{
 429        int fix = 0, ret, tmpmax;
 430        addr = mb_correct_addr_and_bit(&fix, addr);
 431        tmpmax = max + fix;
 432        start += fix;
 433
 434        ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
 435        if (ret > max)
 436                return max;
 437        return ret;
 438}
 439
 440static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
 441{
 442        char *bb;
 443
 444        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
 445        BUG_ON(max == NULL);
 446
 447        if (order > e4b->bd_blkbits + 1) {
 448                *max = 0;
 449                return NULL;
 450        }
 451
 452        /* at order 0 we see each particular block */
 453        if (order == 0) {
 454                *max = 1 << (e4b->bd_blkbits + 3);
 455                return e4b->bd_bitmap;
 456        }
 457
 458        bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
 459        *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
 460
 461        return bb;
 462}
 463
 464#ifdef DOUBLE_CHECK
 465static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 466                           int first, int count)
 467{
 468        int i;
 469        struct super_block *sb = e4b->bd_sb;
 470
 471        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 472                return;
 473        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
 474        for (i = 0; i < count; i++) {
 475                if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
 476                        ext4_fsblk_t blocknr;
 477
 478                        blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 479                        blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
 480                        ext4_grp_locked_error(sb, e4b->bd_group,
 481                                              inode ? inode->i_ino : 0,
 482                                              blocknr,
 483                                              "freeing block already freed "
 484                                              "(bit %u)",
 485                                              first + i);
 486                }
 487                mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
 488        }
 489}
 490
 491static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
 492{
 493        int i;
 494
 495        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 496                return;
 497        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
 498        for (i = 0; i < count; i++) {
 499                BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
 500                mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
 501        }
 502}
 503
 504static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 505{
 506        if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
 507                unsigned char *b1, *b2;
 508                int i;
 509                b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
 510                b2 = (unsigned char *) bitmap;
 511                for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
 512                        if (b1[i] != b2[i]) {
 513                                ext4_msg(e4b->bd_sb, KERN_ERR,
 514                                         "corruption in group %u "
 515                                         "at byte %u(%u): %x in copy != %x "
 516                                         "on disk/prealloc",
 517                                         e4b->bd_group, i, i * 8, b1[i], b2[i]);
 518                                BUG();
 519                        }
 520                }
 521        }
 522}
 523
 524#else
 525static inline void mb_free_blocks_double(struct inode *inode,
 526                                struct ext4_buddy *e4b, int first, int count)
 527{
 528        return;
 529}
 530static inline void mb_mark_used_double(struct ext4_buddy *e4b,
 531                                                int first, int count)
 532{
 533        return;
 534}
 535static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 536{
 537        return;
 538}
 539#endif
 540
 541#ifdef AGGRESSIVE_CHECK
 542
 543#define MB_CHECK_ASSERT(assert)                                         \
 544do {                                                                    \
 545        if (!(assert)) {                                                \
 546                printk(KERN_EMERG                                       \
 547                        "Assertion failure in %s() at %s:%d: \"%s\"\n", \
 548                        function, file, line, # assert);                \
 549                BUG();                                                  \
 550        }                                                               \
 551} while (0)
 552
 553static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 554                                const char *function, int line)
 555{
 556        struct super_block *sb = e4b->bd_sb;
 557        int order = e4b->bd_blkbits + 1;
 558        int max;
 559        int max2;
 560        int i;
 561        int j;
 562        int k;
 563        int count;
 564        struct ext4_group_info *grp;
 565        int fragments = 0;
 566        int fstart;
 567        struct list_head *cur;
 568        void *buddy;
 569        void *buddy2;
 570
 571        {
 572                static int mb_check_counter;
 573                if (mb_check_counter++ % 100 != 0)
 574                        return 0;
 575        }
 576
 577        while (order > 1) {
 578                buddy = mb_find_buddy(e4b, order, &max);
 579                MB_CHECK_ASSERT(buddy);
 580                buddy2 = mb_find_buddy(e4b, order - 1, &max2);
 581                MB_CHECK_ASSERT(buddy2);
 582                MB_CHECK_ASSERT(buddy != buddy2);
 583                MB_CHECK_ASSERT(max * 2 == max2);
 584
 585                count = 0;
 586                for (i = 0; i < max; i++) {
 587
 588                        if (mb_test_bit(i, buddy)) {
 589                                /* only single bit in buddy2 may be 1 */
 590                                if (!mb_test_bit(i << 1, buddy2)) {
 591                                        MB_CHECK_ASSERT(
 592                                                mb_test_bit((i<<1)+1, buddy2));
 593                                } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
 594                                        MB_CHECK_ASSERT(
 595                                                mb_test_bit(i << 1, buddy2));
 596                                }
 597                                continue;
 598                        }
 599
 600                        /* both bits in buddy2 must be 1 */
 601                        MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
 602                        MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
 603
 604                        for (j = 0; j < (1 << order); j++) {
 605                                k = (i * (1 << order)) + j;
 606                                MB_CHECK_ASSERT(
 607                                        !mb_test_bit(k, e4b->bd_bitmap));
 608                        }
 609                        count++;
 610                }
 611                MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
 612                order--;
 613        }
 614
 615        fstart = -1;
 616        buddy = mb_find_buddy(e4b, 0, &max);
 617        for (i = 0; i < max; i++) {
 618                if (!mb_test_bit(i, buddy)) {
 619                        MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
 620                        if (fstart == -1) {
 621                                fragments++;
 622                                fstart = i;
 623                        }
 624                        continue;
 625                }
 626                fstart = -1;
 627                /* check used bits only */
 628                for (j = 0; j < e4b->bd_blkbits + 1; j++) {
 629                        buddy2 = mb_find_buddy(e4b, j, &max2);
 630                        k = i >> j;
 631                        MB_CHECK_ASSERT(k < max2);
 632                        MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
 633                }
 634        }
 635        MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
 636        MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
 637
 638        grp = ext4_get_group_info(sb, e4b->bd_group);
 639        list_for_each(cur, &grp->bb_prealloc_list) {
 640                ext4_group_t groupnr;
 641                struct ext4_prealloc_space *pa;
 642                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
 643                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
 644                MB_CHECK_ASSERT(groupnr == e4b->bd_group);
 645                for (i = 0; i < pa->pa_len; i++)
 646                        MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
 647        }
 648        return 0;
 649}
 650#undef MB_CHECK_ASSERT
 651#define mb_check_buddy(e4b) __mb_check_buddy(e4b,       \
 652                                        __FILE__, __func__, __LINE__)
 653#else
 654#define mb_check_buddy(e4b)
 655#endif
 656
 657/*
  658 * Divide the blocks starting at @first with length @len into
  659 * smaller chunks whose sizes are powers of 2 blocks.
  660 * Clear the bits in the bitmap which the blocks of the chunk(s) cover,
  661 * then increase bb_counters[] for the corresponding chunk size.
 662 */
 663static void ext4_mb_mark_free_simple(struct super_block *sb,
 664                                void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
 665                                        struct ext4_group_info *grp)
 666{
 667        struct ext4_sb_info *sbi = EXT4_SB(sb);
 668        ext4_grpblk_t min;
 669        ext4_grpblk_t max;
 670        ext4_grpblk_t chunk;
 671        unsigned short border;
 672
 673        BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
 674
 675        border = 2 << sb->s_blocksize_bits;
 676
 677        while (len > 0) {
  678                /* find how many blocks can be covered from this position */
 679                max = ffs(first | border) - 1;
 680
 681                /* find how many blocks of power 2 we need to mark */
 682                min = fls(len) - 1;
 683
 684                if (max < min)
 685                        min = max;
 686                chunk = 1 << min;
 687
 688                /* mark multiblock chunks only */
 689                grp->bb_counters[min]++;
 690                if (min > 0)
 691                        mb_clear_bit(first >> min,
 692                                     buddy + sbi->s_mb_offsets[min]);
 693
 694                len -= chunk;
 695                first += chunk;
 696        }
 697}
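
/*
 * Worked example for the function above: marking the range first = 5,
 * len = 13 (clusters 5..17) free splits it into the power-of-2 chunks
 * [5], [6-7], [8-15] and [16-17], bumping bb_counters[0], bb_counters[1],
 * bb_counters[3] and bb_counters[1] respectively and clearing the matching
 * bits in the order-1 and order-3 buddy bitmaps.
 */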
 698
 699/*
 700 * Cache the order of the largest free extent we have available in this block
 701 * group.
 702 */
 703static void
 704mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
 705{
 706        int i;
 707        int bits;
 708
 709        grp->bb_largest_free_order = -1; /* uninit */
 710
 711        bits = sb->s_blocksize_bits + 1;
 712        for (i = bits; i >= 0; i--) {
 713                if (grp->bb_counters[i] > 0) {
 714                        grp->bb_largest_free_order = i;
 715                        break;
 716                }
 717        }
 718}
 719
 720static noinline_for_stack
 721void ext4_mb_generate_buddy(struct super_block *sb,
 722                                void *buddy, void *bitmap, ext4_group_t group)
 723{
 724        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 725        ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
 726        ext4_grpblk_t i = 0;
 727        ext4_grpblk_t first;
 728        ext4_grpblk_t len;
 729        unsigned free = 0;
 730        unsigned fragments = 0;
 731        unsigned long long period = get_cycles();
 732
  733        /* initialize buddy from bitmap which is an aggregation
  734         * of the on-disk bitmap and preallocations */
 735        i = mb_find_next_zero_bit(bitmap, max, 0);
 736        grp->bb_first_free = i;
 737        while (i < max) {
 738                fragments++;
 739                first = i;
 740                i = mb_find_next_bit(bitmap, max, i);
 741                len = i - first;
 742                free += len;
 743                if (len > 1)
 744                        ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
 745                else
 746                        grp->bb_counters[0]++;
 747                if (i < max)
 748                        i = mb_find_next_zero_bit(bitmap, max, i);
 749        }
 750        grp->bb_fragments = fragments;
 751
 752        if (free != grp->bb_free) {
 753                ext4_grp_locked_error(sb, group, 0, 0,
 754                                      "%u clusters in bitmap, %u in gd",
 755                                      free, grp->bb_free);
 756                /*
  757                 * If we intend to continue, we consider the group descriptor
  758                 * corrupt and update bb_free using the bitmap value
 759                 */
 760                grp->bb_free = free;
 761        }
 762        mb_set_largest_free_order(sb, grp);
 763
 764        clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
 765
 766        period = get_cycles() - period;
 767        spin_lock(&EXT4_SB(sb)->s_bal_lock);
 768        EXT4_SB(sb)->s_mb_buddies_generated++;
 769        EXT4_SB(sb)->s_mb_generation_time += period;
 770        spin_unlock(&EXT4_SB(sb)->s_bal_lock);
 771}
 772
 773static void mb_regenerate_buddy(struct ext4_buddy *e4b)
 774{
 775        int count;
 776        int order = 1;
 777        void *buddy;
 778
 779        while ((buddy = mb_find_buddy(e4b, order++, &count))) {
 780                ext4_set_bits(buddy, 0, count);
 781        }
 782        e4b->bd_info->bb_fragments = 0;
 783        memset(e4b->bd_info->bb_counters, 0,
 784                sizeof(*e4b->bd_info->bb_counters) *
 785                (e4b->bd_sb->s_blocksize_bits + 2));
 786
 787        ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
 788                e4b->bd_bitmap, e4b->bd_group);
 789}
 790
  791/* The buddy information is attached to the buddy cache inode
  792 * for convenience. The information regarding each group
  793 * is loaded via ext4_mb_load_buddy. It comprises the
  794 * block bitmap and the buddy information, and is
  795 * stored in the inode as
 796 *
 797 * {                        page                        }
 798 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 799 *
 800 *
 801 * one block each for bitmap and buddy information.
 802 * So for each group we take up 2 blocks. A page can
 803 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
  804 * So it can hold information for groups_per_page groups, which
  805 * is blocks_per_page/2.
 806 *
 807 * Locking note:  This routine takes the block group lock of all groups
 808 * for this page; do not hold this lock when calling this routine!
 809 */
 810
 811static int ext4_mb_init_cache(struct page *page, char *incore)
 812{
 813        ext4_group_t ngroups;
 814        int blocksize;
 815        int blocks_per_page;
 816        int groups_per_page;
 817        int err = 0;
 818        int i;
 819        ext4_group_t first_group, group;
 820        int first_block;
 821        struct super_block *sb;
 822        struct buffer_head *bhs;
 823        struct buffer_head **bh = NULL;
 824        struct inode *inode;
 825        char *data;
 826        char *bitmap;
 827        struct ext4_group_info *grinfo;
 828
 829        mb_debug(1, "init page %lu\n", page->index);
 830
 831        inode = page->mapping->host;
 832        sb = inode->i_sb;
 833        ngroups = ext4_get_groups_count(sb);
 834        blocksize = 1 << inode->i_blkbits;
 835        blocks_per_page = PAGE_CACHE_SIZE / blocksize;
 836
 837        groups_per_page = blocks_per_page >> 1;
 838        if (groups_per_page == 0)
 839                groups_per_page = 1;
 840
 841        /* allocate buffer_heads to read bitmaps */
 842        if (groups_per_page > 1) {
 843                i = sizeof(struct buffer_head *) * groups_per_page;
 844                bh = kzalloc(i, GFP_NOFS);
 845                if (bh == NULL) {
 846                        err = -ENOMEM;
 847                        goto out;
 848                }
 849        } else
 850                bh = &bhs;
 851
 852        first_group = page->index * blocks_per_page / 2;
 853
 854        /* read all groups the page covers into the cache */
 855        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
 856                if (group >= ngroups)
 857                        break;
 858
 859                grinfo = ext4_get_group_info(sb, group);
 860                /*
 861                 * If page is uptodate then we came here after online resize
 862                 * which added some new uninitialized group info structs, so
 863                 * we must skip all initialized uptodate buddies on the page,
 864                 * which may be currently in use by an allocating task.
 865                 */
 866                if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
 867                        bh[i] = NULL;
 868                        continue;
 869                }
 870                if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
 871                        err = -ENOMEM;
 872                        goto out;
 873                }
 874                mb_debug(1, "read bitmap for group %u\n", group);
 875        }
 876
 877        /* wait for I/O completion */
 878        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
 879                if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
 880                        err = -EIO;
 881                        goto out;
 882                }
 883        }
 884
 885        first_block = page->index * blocks_per_page;
 886        for (i = 0; i < blocks_per_page; i++) {
 887                group = (first_block + i) >> 1;
 888                if (group >= ngroups)
 889                        break;
 890
 891                if (!bh[group - first_group])
 892                        /* skip initialized uptodate buddy */
 893                        continue;
 894
 895                /*
  896                 * data carries information regarding this
 897                 * particular group in the format specified
 898                 * above
 899                 *
 900                 */
 901                data = page_address(page) + (i * blocksize);
 902                bitmap = bh[group - first_group]->b_data;
 903
 904                /*
 905                 * We place the buddy block and bitmap block
 906                 * close together
 907                 */
 908                if ((first_block + i) & 1) {
 909                        /* this is block of buddy */
 910                        BUG_ON(incore == NULL);
 911                        mb_debug(1, "put buddy for group %u in page %lu/%x\n",
 912                                group, page->index, i * blocksize);
 913                        trace_ext4_mb_buddy_bitmap_load(sb, group);
 914                        grinfo = ext4_get_group_info(sb, group);
 915                        grinfo->bb_fragments = 0;
 916                        memset(grinfo->bb_counters, 0,
 917                               sizeof(*grinfo->bb_counters) *
 918                                (sb->s_blocksize_bits+2));
 919                        /*
 920                         * incore got set to the group block bitmap below
 921                         */
 922                        ext4_lock_group(sb, group);
 923                        /* init the buddy */
 924                        memset(data, 0xff, blocksize);
 925                        ext4_mb_generate_buddy(sb, data, incore, group);
 926                        ext4_unlock_group(sb, group);
 927                        incore = NULL;
 928                } else {
 929                        /* this is block of bitmap */
 930                        BUG_ON(incore != NULL);
 931                        mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
 932                                group, page->index, i * blocksize);
 933                        trace_ext4_mb_bitmap_load(sb, group);
 934
 935                        /* see comments in ext4_mb_put_pa() */
 936                        ext4_lock_group(sb, group);
 937                        memcpy(data, bitmap, blocksize);
 938
 939                        /* mark all preallocated blks used in in-core bitmap */
 940                        ext4_mb_generate_from_pa(sb, data, group);
 941                        ext4_mb_generate_from_freelist(sb, data, group);
 942                        ext4_unlock_group(sb, group);
 943
 944                        /* set incore so that the buddy information can be
 945                         * generated using this
 946                         */
 947                        incore = data;
 948                }
 949        }
 950        SetPageUptodate(page);
 951
 952out:
 953        if (bh) {
 954                for (i = 0; i < groups_per_page; i++)
 955                        brelse(bh[i]);
 956                if (bh != &bhs)
 957                        kfree(bh);
 958        }
 959        return err;
 960}
 961
 962/*
  963 * Lock the buddy and bitmap pages. This makes sure other parallel init_group
  964 * on the same buddy page doesn't happen while holding the buddy page lock.
  965 * Return the locked buddy and bitmap pages in the e4b struct. If buddy and
  966 * bitmap are on the same page, e4b->bd_buddy_page is NULL and the return value is 0.
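 *
 * For example, assuming 4 KiB pages and a 1 KiB block size
 * (blocks_per_page = 4): for group 5 the bitmap lives in buddy-cache block
 * 10, i.e. page 2 at offset 2; the buddy block 11 sits on the same page, so
 * only bd_bitmap_page is taken and bd_buddy_page stays NULL.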
 967 */
 968static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
 969                ext4_group_t group, struct ext4_buddy *e4b)
 970{
 971        struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
 972        int block, pnum, poff;
 973        int blocks_per_page;
 974        struct page *page;
 975
 976        e4b->bd_buddy_page = NULL;
 977        e4b->bd_bitmap_page = NULL;
 978
 979        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
 980        /*
 981         * the buddy cache inode stores the block bitmap
 982         * and buddy information in consecutive blocks.
 983         * So for each group we need two blocks.
 984         */
 985        block = group * 2;
 986        pnum = block / blocks_per_page;
 987        poff = block % blocks_per_page;
 988        page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
 989        if (!page)
 990                return -EIO;
 991        BUG_ON(page->mapping != inode->i_mapping);
 992        e4b->bd_bitmap_page = page;
 993        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
 994
 995        if (blocks_per_page >= 2) {
 996                /* buddy and bitmap are on the same page */
 997                return 0;
 998        }
 999
1000        block++;
1001        pnum = block / blocks_per_page;
1002        page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1003        if (!page)
1004                return -EIO;
1005        BUG_ON(page->mapping != inode->i_mapping);
1006        e4b->bd_buddy_page = page;
1007        return 0;
1008}
1009
1010static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1011{
1012        if (e4b->bd_bitmap_page) {
1013                unlock_page(e4b->bd_bitmap_page);
1014                page_cache_release(e4b->bd_bitmap_page);
1015        }
1016        if (e4b->bd_buddy_page) {
1017                unlock_page(e4b->bd_buddy_page);
1018                page_cache_release(e4b->bd_buddy_page);
1019        }
1020}
1021
1022/*
1023 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1024 * block group lock of all groups for this page; do not hold the BG lock when
1025 * calling this routine!
1026 */
1027static noinline_for_stack
1028int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
1029{
1030
1031        struct ext4_group_info *this_grp;
1032        struct ext4_buddy e4b;
1033        struct page *page;
1034        int ret = 0;
1035
1036        might_sleep();
1037        mb_debug(1, "init group %u\n", group);
1038        this_grp = ext4_get_group_info(sb, group);
1039        /*
1040         * This ensures that we don't reinit the buddy cache
 1041         * page which maps to the group from which we are already
 1042         * allocating. If we are looking at the buddy cache we would
 1043         * have taken a reference using ext4_mb_load_buddy and that
 1044         * would have pinned the buddy page to the page cache.
1045         */
1046        ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b);
1047        if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1048                /*
1049                 * somebody initialized the group
1050                 * return without doing anything
1051                 */
1052                goto err;
1053        }
1054
1055        page = e4b.bd_bitmap_page;
1056        ret = ext4_mb_init_cache(page, NULL);
1057        if (ret)
1058                goto err;
1059        if (!PageUptodate(page)) {
1060                ret = -EIO;
1061                goto err;
1062        }
1063        mark_page_accessed(page);
1064
1065        if (e4b.bd_buddy_page == NULL) {
1066                /*
1067                 * If both the bitmap and buddy are in
1068                 * the same page we don't need to force
1069                 * init the buddy
1070                 */
1071                ret = 0;
1072                goto err;
1073        }
1074        /* init buddy cache */
1075        page = e4b.bd_buddy_page;
1076        ret = ext4_mb_init_cache(page, e4b.bd_bitmap);
1077        if (ret)
1078                goto err;
1079        if (!PageUptodate(page)) {
1080                ret = -EIO;
1081                goto err;
1082        }
1083        mark_page_accessed(page);
1084err:
1085        ext4_mb_put_buddy_page_lock(&e4b);
1086        return ret;
1087}
1088
1089/*
1090 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1091 * block group lock of all groups for this page; do not hold the BG lock when
1092 * calling this routine!
1093 */
1094static noinline_for_stack int
1095ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1096                                        struct ext4_buddy *e4b)
1097{
1098        int blocks_per_page;
1099        int block;
1100        int pnum;
1101        int poff;
1102        struct page *page;
1103        int ret;
1104        struct ext4_group_info *grp;
1105        struct ext4_sb_info *sbi = EXT4_SB(sb);
1106        struct inode *inode = sbi->s_buddy_cache;
1107
1108        might_sleep();
1109        mb_debug(1, "load group %u\n", group);
1110
1111        blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize;
1112        grp = ext4_get_group_info(sb, group);
1113
1114        e4b->bd_blkbits = sb->s_blocksize_bits;
1115        e4b->bd_info = grp;
1116        e4b->bd_sb = sb;
1117        e4b->bd_group = group;
1118        e4b->bd_buddy_page = NULL;
1119        e4b->bd_bitmap_page = NULL;
1120
1121        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1122                /*
1123                 * we need full data about the group
1124                 * to make a good selection
1125                 */
1126                ret = ext4_mb_init_group(sb, group);
1127                if (ret)
1128                        return ret;
1129        }
1130
1131        /*
1132         * the buddy cache inode stores the block bitmap
1133         * and buddy information in consecutive blocks.
1134         * So for each group we need two blocks.
1135         */
1136        block = group * 2;
1137        pnum = block / blocks_per_page;
1138        poff = block % blocks_per_page;
1139
 1140        /* we could use find_or_create_page(), but it locks the page,
 1141         * which we'd like to avoid in the fast path ... */
1142        page = find_get_page(inode->i_mapping, pnum);
1143        if (page == NULL || !PageUptodate(page)) {
1144                if (page)
1145                        /*
1146                         * drop the page reference and try
1147                         * to get the page with lock. If we
1148                         * are not uptodate that implies
 1149                         * somebody just created the page but
 1150                         * has yet to initialize it. So
 1151                         * wait for it to be initialized.
1152                         */
1153                        page_cache_release(page);
1154                page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1155                if (page) {
1156                        BUG_ON(page->mapping != inode->i_mapping);
1157                        if (!PageUptodate(page)) {
1158                                ret = ext4_mb_init_cache(page, NULL);
1159                                if (ret) {
1160                                        unlock_page(page);
1161                                        goto err;
1162                                }
1163                                mb_cmp_bitmaps(e4b, page_address(page) +
1164                                               (poff * sb->s_blocksize));
1165                        }
1166                        unlock_page(page);
1167                }
1168        }
1169        if (page == NULL || !PageUptodate(page)) {
1170                ret = -EIO;
1171                goto err;
1172        }
1173        e4b->bd_bitmap_page = page;
1174        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1175        mark_page_accessed(page);
1176
1177        block++;
1178        pnum = block / blocks_per_page;
1179        poff = block % blocks_per_page;
1180
1181        page = find_get_page(inode->i_mapping, pnum);
1182        if (page == NULL || !PageUptodate(page)) {
1183                if (page)
1184                        page_cache_release(page);
1185                page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS);
1186                if (page) {
1187                        BUG_ON(page->mapping != inode->i_mapping);
1188                        if (!PageUptodate(page)) {
1189                                ret = ext4_mb_init_cache(page, e4b->bd_bitmap);
1190                                if (ret) {
1191                                        unlock_page(page);
1192                                        goto err;
1193                                }
1194                        }
1195                        unlock_page(page);
1196                }
1197        }
1198        if (page == NULL || !PageUptodate(page)) {
1199                ret = -EIO;
1200                goto err;
1201        }
1202        e4b->bd_buddy_page = page;
1203        e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1204        mark_page_accessed(page);
1205
1206        BUG_ON(e4b->bd_bitmap_page == NULL);
1207        BUG_ON(e4b->bd_buddy_page == NULL);
1208
1209        return 0;
1210
1211err:
1212        if (page)
1213                page_cache_release(page);
1214        if (e4b->bd_bitmap_page)
1215                page_cache_release(e4b->bd_bitmap_page);
1216        if (e4b->bd_buddy_page)
1217                page_cache_release(e4b->bd_buddy_page);
1218        e4b->bd_buddy = NULL;
1219        e4b->bd_bitmap = NULL;
1220        return ret;
1221}
1222
1223static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1224{
1225        if (e4b->bd_bitmap_page)
1226                page_cache_release(e4b->bd_bitmap_page);
1227        if (e4b->bd_buddy_page)
1228                page_cache_release(e4b->bd_buddy_page);
1229}
1230
1231
1232static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1233{
1234        int order = 1;
1235        void *bb;
1236
1237        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1238        BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1239
1240        bb = e4b->bd_buddy;
1241        while (order <= e4b->bd_blkbits + 1) {
1242                block = block >> 1;
1243                if (!mb_test_bit(block, bb)) {
1244                        /* this block is part of buddy of order 'order' */
1245                        return order;
1246                }
1247                bb += 1 << (e4b->bd_blkbits - order);
1248                order++;
1249        }
1250        return 0;
1251}
1252
1253static void mb_clear_bits(void *bm, int cur, int len)
1254{
1255        __u32 *addr;
1256
1257        len = cur + len;
1258        while (cur < len) {
1259                if ((cur & 31) == 0 && (len - cur) >= 32) {
1260                        /* fast path: clear whole word at once */
1261                        addr = bm + (cur >> 3);
1262                        *addr = 0;
1263                        cur += 32;
1264                        continue;
1265                }
1266                mb_clear_bit(cur, bm);
1267                cur++;
1268        }
1269}
1270
 1271/* Clear bits in the given range;
 1272 * returns the first zero bit found (i.e. a bit that was already clear), -1 otherwise
1273 */
1274static int mb_test_and_clear_bits(void *bm, int cur, int len)
1275{
1276        __u32 *addr;
1277        int zero_bit = -1;
1278
1279        len = cur + len;
1280        while (cur < len) {
1281                if ((cur & 31) == 0 && (len - cur) >= 32) {
1282                        /* fast path: clear whole word at once */
1283                        addr = bm + (cur >> 3);
1284                        if (*addr != (__u32)(-1) && zero_bit == -1)
1285                                zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1286                        *addr = 0;
1287                        cur += 32;
1288                        continue;
1289                }
1290                if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1291                        zero_bit = cur;
1292                cur++;
1293        }
1294
1295        return zero_bit;
1296}
1297
1298void ext4_set_bits(void *bm, int cur, int len)
1299{
1300        __u32 *addr;
1301
1302        len = cur + len;
1303        while (cur < len) {
1304                if ((cur & 31) == 0 && (len - cur) >= 32) {
1305                        /* fast path: set whole word at once */
1306                        addr = bm + (cur >> 3);
1307                        *addr = 0xffffffff;
1308                        cur += 32;
1309                        continue;
1310                }
1311                mb_set_bit(cur, bm);
1312                cur++;
1313        }
1314}
1315
1316/*
1317 * _________________________________________________________________ */
1318
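/*
 * Helper for mb_buddy_mark_free() below: fix up one border of a freed range
 * at the current buddy order.  @side is -1 for the left border and +1 for
 * the right one.  If the neighbouring bit is set (the buddy partner is in
 * use), the border bit itself is cleared, i.e. it becomes a free chunk of
 * this order, and the range is shrunk; otherwise the partner is absorbed
 * into the range (it will merge at a higher order) and its bit is set.
 * The return value is the delta to apply to bb_counters[order].
 */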
1319static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
1320{
1321        if (mb_test_bit(*bit + side, bitmap)) {
1322                mb_clear_bit(*bit, bitmap);
1323                (*bit) -= side;
1324                return 1;
1325        }
1326        else {
1327                (*bit) += side;
1328                mb_set_bit(*bit, bitmap);
1329                return -1;
1330        }
1331}
1332
1333static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1334{
1335        int max;
1336        int order = 1;
1337        void *buddy = mb_find_buddy(e4b, order, &max);
1338
1339        while (buddy) {
1340                void *buddy2;
1341
1342                /* Bits in range [first; last] are known to be set since
1343                 * corresponding blocks were allocated. Bits in range
1344                 * (first; last) will stay set because they form buddies on
1345                 * upper layer. We just deal with borders if they don't
1346                 * align with upper layer and then go up.
1347                 * Releasing entire group is all about clearing
1348                 * single bit of highest order buddy.
1349                 */
1350
1351                /* Example:
1352                 * ---------------------------------
1353                 * |   1   |   1   |   1   |   1   |
1354                 * ---------------------------------
1355                 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1356                 * ---------------------------------
1357                 *   0   1   2   3   4   5   6   7
1358                 *      \_____________________/
1359                 *
1360                 * Neither [1] nor [6] is aligned to above layer.
1361                 * Left neighbour [0] is free, so mark it busy,
1362                 * decrease bb_counters and extend range to
1363                 * [0; 6]
 1364                 * Right neighbour [7] is busy. It can't be coalesced with [6], so
1365                 * mark [6] free, increase bb_counters and shrink range to
1366                 * [0; 5].
1367                 * Then shift range to [0; 2], go up and do the same.
1368                 */
1369
1370
1371                if (first & 1)
1372                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1373                if (!(last & 1))
1374                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1375                if (first > last)
1376                        break;
1377                order++;
1378
1379                if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
1380                        mb_clear_bits(buddy, first, last - first + 1);
1381                        e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1382                        break;
1383                }
1384                first >>= 1;
1385                last >>= 1;
1386                buddy = buddy2;
1387        }
1388}
1389
1390static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1391                           int first, int count)
1392{
1393        int left_is_free = 0;
1394        int right_is_free = 0;
1395        int block;
1396        int last = first + count - 1;
1397        struct super_block *sb = e4b->bd_sb;
1398
1399        BUG_ON(last >= (sb->s_blocksize << 3));
1400        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1401        mb_check_buddy(e4b);
1402        mb_free_blocks_double(inode, e4b, first, count);
1403
1404        e4b->bd_info->bb_free += count;
1405        if (first < e4b->bd_info->bb_first_free)
1406                e4b->bd_info->bb_first_free = first;
1407
1408        /* access memory sequentially: check left neighbour,
1409         * clear range and then check right neighbour
1410         */
1411        if (first != 0)
1412                left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1413        block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1414        if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1415                right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1416
1417        if (unlikely(block != -1)) {
1418                ext4_fsblk_t blocknr;
1419
1420                blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1421                blocknr += EXT4_C2B(EXT4_SB(sb), block);
1422                ext4_grp_locked_error(sb, e4b->bd_group,
1423                                      inode ? inode->i_ino : 0,
1424                                      blocknr,
1425                                      "freeing already freed block "
1426                                      "(bit %u)", block);
1427                mb_regenerate_buddy(e4b);
1428                goto done;
1429        }
1430
1431        /* let's maintain fragments counter */
1432        if (left_is_free && right_is_free)
1433                e4b->bd_info->bb_fragments--;
1434        else if (!left_is_free && !right_is_free)
1435                e4b->bd_info->bb_fragments++;
1436
1437        /* buddy[0] == bd_bitmap is a special case, so handle
1438         * it right away and let mb_buddy_mark_free stay free of
1439         * zero order checks.
1440         * Check if neighbours are to be coalesced,
1441         * adjust bitmap, bb_counters and borders appropriately.
1442         */
1443        if (first & 1) {
1444                first += !left_is_free;
1445                e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1446        }
1447        if (!(last & 1)) {
1448                last -= !right_is_free;
1449                e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1450        }
1451
1452        if (first <= last)
1453                mb_buddy_mark_free(e4b, first >> 1, last >> 1);
1454
1455done:
1456        mb_set_largest_free_order(sb, e4b->bd_info);
1457        mb_check_buddy(e4b);
1458}
1459
1460static int mb_find_extent(struct ext4_buddy *e4b, int block,
1461                                int needed, struct ext4_free_extent *ex)
1462{
1463        int next = block;
1464        int max, order;
1465        void *buddy;
1466
1467        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1468        BUG_ON(ex == NULL);
1469
1470        buddy = mb_find_buddy(e4b, 0, &max);
1471        BUG_ON(buddy == NULL);
1472        BUG_ON(block >= max);
1473        if (mb_test_bit(block, buddy)) {
1474                ex->fe_len = 0;
1475                ex->fe_start = 0;
1476                ex->fe_group = 0;
1477                return 0;
1478        }
1479
1480        /* find actual order */
1481        order = mb_find_order_for_block(e4b, block);
1482        block = block >> order;
1483
1484        ex->fe_len = 1 << order;
1485        ex->fe_start = block << order;
1486        ex->fe_group = e4b->bd_group;
1487
1488        /* calc difference from given start */
1489        next = next - ex->fe_start;
1490        ex->fe_len -= next;
1491        ex->fe_start += next;
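        /*
         * Worked example with illustrative numbers only: suppose block 21
         * is free and belongs to a free buddy chunk of order 2 covering
         * blocks 20..23.  Then order = 2, block = 21 >> 2 = 5, fe_len = 4
         * and fe_start = 20.  The caller asked about block 21, so
         * next = 21 - 20 = 1 and the extent is trimmed to fe_start = 21,
         * fe_len = 3.  The loop below then extends the extent to the right
         * while further free buddies follow.
         */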
1492
1493        while (needed > ex->fe_len &&
1494               mb_find_buddy(e4b, order, &max)) {
1495
1496                if (block + 1 >= max)
1497                        break;
1498
1499                next = (block + 1) * (1 << order);
1500                if (mb_test_bit(next, e4b->bd_bitmap))
1501                        break;
1502
1503                order = mb_find_order_for_block(e4b, next);
1504
1505                block = next >> order;
1506                ex->fe_len += 1 << order;
1507        }
1508
1509        BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
1510        return ex->fe_len;
1511}
1512
1513static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1514{
1515        int ord;
1516        int mlen = 0;
1517        int max = 0;
1518        int cur;
1519        int start = ex->fe_start;
1520        int len = ex->fe_len;
1521        unsigned ret = 0;
1522        int len0 = len;
1523        void *buddy;
1524
1525        BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1526        BUG_ON(e4b->bd_group != ex->fe_group);
1527        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1528        mb_check_buddy(e4b);
1529        mb_mark_used_double(e4b, start, len);
1530
1531        e4b->bd_info->bb_free -= len;
1532        if (e4b->bd_info->bb_first_free == start)
1533                e4b->bd_info->bb_first_free += len;
1534
1535        /* let's maintain fragments counter */
1536        if (start != 0)
1537                mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
1538        if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1539                max = !mb_test_bit(start + len, e4b->bd_bitmap);
1540        if (mlen && max)
1541                e4b->bd_info->bb_fragments++;
1542        else if (!mlen && !max)
1543                e4b->bd_info->bb_fragments--;
1544
1545        /* let's maintain buddy itself */
1546        while (len) {
1547                ord = mb_find_order_for_block(e4b, start);
1548
1549                if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1550                        /* the whole chunk may be allocated at once! */
1551                        mlen = 1 << ord;
1552                        buddy = mb_find_buddy(e4b, ord, &max);
1553                        BUG_ON((start >> ord) >= max);
1554                        mb_set_bit(start >> ord, buddy);
1555                        e4b->bd_info->bb_counters[ord]--;
1556                        start += mlen;
1557                        len -= mlen;
1558                        BUG_ON(len < 0);
1559                        continue;
1560                }
1561
1562                /* store for history */
1563                if (ret == 0)
1564                        ret = len | (ord << 16);
1565
1566                /* we have to split large buddy */
1567                BUG_ON(ord <= 0);
1568                buddy = mb_find_buddy(e4b, ord, &max);
1569                mb_set_bit(start >> ord, buddy);
1570                e4b->bd_info->bb_counters[ord]--;
1571
1572                ord--;
1573                cur = (start >> ord) & ~1U;
1574                buddy = mb_find_buddy(e4b, ord, &max);
1575                mb_clear_bit(cur, buddy);
1576                mb_clear_bit(cur + 1, buddy);
1577                e4b->bd_info->bb_counters[ord]++;
1578                e4b->bd_info->bb_counters[ord]++;
1579        }
1580        mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1581
1582        ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
1583        mb_check_buddy(e4b);
1584
1585        return ret;
1586}
1587
1588/*
1589 * Must be called under group lock!
1590 */
1591static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1592                                        struct ext4_buddy *e4b)
1593{
1594        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1595        int ret;
1596
1597        BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
1598        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1599
1600        ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
1601        ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
1602        ret = mb_mark_used(e4b, &ac->ac_b_ex);
1603
1604        /* preallocation can change ac_b_ex, thus we store actually
1605         * allocated blocks for history */
1606        ac->ac_f_ex = ac->ac_b_ex;
1607
1608        ac->ac_status = AC_STATUS_FOUND;
1609        ac->ac_tail = ret & 0xffff;
1610        ac->ac_buddy = ret >> 16;
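        /*
         * mb_mark_used() packs two values into its return value: the low
         * 16 bits hold the remaining length at the moment the first buddy
         * split was needed (saved above as ac_tail) and the upper bits
         * hold the order of the buddy that was split (ac_buddy).  Purely
         * illustrative example: a split of an order-3 buddy with 5 blocks
         * still to mark gives ret = 5 | (3 << 16), i.e. ac_tail = 5 and
         * ac_buddy = 3.
         */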
1611
1612        /*
1613         * take the page reference. We want the page to be pinned
1614         * so that we don't get an ext4_mb_init_cache() call for this
1615         * group until we update the bitmap. That would mean we
1616         * double allocate blocks. The reference is dropped
1617         * in ext4_mb_release_context
1618         */
1619        ac->ac_bitmap_page = e4b->bd_bitmap_page;
1620        get_page(ac->ac_bitmap_page);
1621        ac->ac_buddy_page = e4b->bd_buddy_page;
1622        get_page(ac->ac_buddy_page);
1623        /* store last allocated for subsequent stream allocation */
1624        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1625                spin_lock(&sbi->s_md_lock);
1626                sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1627                sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
1628                spin_unlock(&sbi->s_md_lock);
1629        }
1630}
1631
1632/*
1633 * regular allocator, for general purposes allocation
1634 */
1635
1636static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1637                                        struct ext4_buddy *e4b,
1638                                        int finish_group)
1639{
1640        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1641        struct ext4_free_extent *bex = &ac->ac_b_ex;
1642        struct ext4_free_extent *gex = &ac->ac_g_ex;
1643        struct ext4_free_extent ex;
1644        int max;
1645
1646        if (ac->ac_status == AC_STATUS_FOUND)
1647                return;
1648        /*
1649         * We don't want to scan for a whole year
1650         */
1651        if (ac->ac_found > sbi->s_mb_max_to_scan &&
1652                        !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1653                ac->ac_status = AC_STATUS_BREAK;
1654                return;
1655        }
1656
1657        /*
1658         * Haven't found a good chunk so far, let's continue
1659         */
1660        if (bex->fe_len < gex->fe_len)
1661                return;
1662
1663        if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
1664                        && bex->fe_group == e4b->bd_group) {
1665                /* recheck chunk's availability - we don't know
1666                 * when it was found (within this lock-unlock
1667                 * period or not) */
1668                max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
1669                if (max >= gex->fe_len) {
1670                        ext4_mb_use_best_found(ac, e4b);
1671                        return;
1672                }
1673        }
1674}
1675
1676/*
1677 * The routine checks whether the found extent is good enough. If it is,
1678 * the extent gets marked used and a flag is set in the context
1679 * to stop scanning. Otherwise, the extent is compared with the
1680 * previously found extent and, if the new one is better, it is stored
1681 * in the context. Later, the best found extent will be used, if
1682 * mballoc can't find a good enough extent.
1683 *
1684 * FIXME: the real allocation policy is yet to be designed!
1685 */
1686static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1687                                        struct ext4_free_extent *ex,
1688                                        struct ext4_buddy *e4b)
1689{
1690        struct ext4_free_extent *bex = &ac->ac_b_ex;
1691        struct ext4_free_extent *gex = &ac->ac_g_ex;
1692
1693        BUG_ON(ex->fe_len <= 0);
1694        BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1695        BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1696        BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1697
1698        ac->ac_found++;
1699
1700        /*
1701         * The special case - take what you catch first
1702         */
1703        if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1704                *bex = *ex;
1705                ext4_mb_use_best_found(ac, e4b);
1706                return;
1707        }
1708
1709        /*
1710         * Let's check whether the chunk is good enough
1711         */
1712        if (ex->fe_len == gex->fe_len) {
1713                *bex = *ex;
1714                ext4_mb_use_best_found(ac, e4b);
1715                return;
1716        }
1717
1718        /*
1719         * If this is first found extent, just store it in the context
1720         */
1721        if (bex->fe_len == 0) {
1722                *bex = *ex;
1723                return;
1724        }
1725
1726        /*
1727         * If new found extent is better, store it in the context
1728         */
1729        if (bex->fe_len < gex->fe_len) {
1730                /* if the request isn't satisfied, any found extent
1731                 * larger than previous best one is better */
1732                if (ex->fe_len > bex->fe_len)
1733                        *bex = *ex;
1734        } else if (ex->fe_len > gex->fe_len) {
1735                /* if the request is satisfied, then we try to find
1736                 * an extent that still satisfies the request, but is
1737                 * smaller than the previous one */
1738                if (ex->fe_len < bex->fe_len)
1739                        *bex = *ex;
1740        }
1741
1742        ext4_mb_check_limits(ac, e4b, 0);
1743}
1744
1745static noinline_for_stack
1746int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1747                                        struct ext4_buddy *e4b)
1748{
1749        struct ext4_free_extent ex = ac->ac_b_ex;
1750        ext4_group_t group = ex.fe_group;
1751        int max;
1752        int err;
1753
1754        BUG_ON(ex.fe_len <= 0);
1755        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1756        if (err)
1757                return err;
1758
1759        ext4_lock_group(ac->ac_sb, group);
1760        max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
1761
1762        if (max > 0) {
1763                ac->ac_b_ex = ex;
1764                ext4_mb_use_best_found(ac, e4b);
1765        }
1766
1767        ext4_unlock_group(ac->ac_sb, group);
1768        ext4_mb_unload_buddy(e4b);
1769
1770        return 0;
1771}
1772
1773static noinline_for_stack
1774int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1775                                struct ext4_buddy *e4b)
1776{
1777        ext4_group_t group = ac->ac_g_ex.fe_group;
1778        int max;
1779        int err;
1780        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1781        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1782        struct ext4_free_extent ex;
1783
1784        if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1785                return 0;
1786        if (grp->bb_free == 0)
1787                return 0;
1788
1789        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1790        if (err)
1791                return err;
1792
1793        ext4_lock_group(ac->ac_sb, group);
1794        max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
1795                             ac->ac_g_ex.fe_len, &ex);
1796
1797        if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1798                ext4_fsblk_t start;
1799
1800                start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1801                        ex.fe_start;
1802                /* use do_div to get remainder (would be 64-bit modulo) */
1803                if (do_div(start, sbi->s_stripe) == 0) {
1804                        ac->ac_found++;
1805                        ac->ac_b_ex = ex;
1806                        ext4_mb_use_best_found(ac, e4b);
1807                }
1808        } else if (max >= ac->ac_g_ex.fe_len) {
1809                BUG_ON(ex.fe_len <= 0);
1810                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1811                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1812                ac->ac_found++;
1813                ac->ac_b_ex = ex;
1814                ext4_mb_use_best_found(ac, e4b);
1815        } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
1816                /* Sometimes, the caller may want to merge even a small
1817                 * number of blocks into an existing extent */
1818                BUG_ON(ex.fe_len <= 0);
1819                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1820                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1821                ac->ac_found++;
1822                ac->ac_b_ex = ex;
1823                ext4_mb_use_best_found(ac, e4b);
1824        }
1825        ext4_unlock_group(ac->ac_sb, group);
1826        ext4_mb_unload_buddy(e4b);
1827
1828        return 0;
1829}
1830
1831/*
1832 * The routine scans buddy structures (not the bitmap!) from the given order
1833 * up to the max order and tries to find a big enough chunk to satisfy the request
1834 */
1835static noinline_for_stack
1836void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1837                                        struct ext4_buddy *e4b)
1838{
1839        struct super_block *sb = ac->ac_sb;
1840        struct ext4_group_info *grp = e4b->bd_info;
1841        void *buddy;
1842        int i;
1843        int k;
1844        int max;
1845
1846        BUG_ON(ac->ac_2order <= 0);
1847        for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
1848                if (grp->bb_counters[i] == 0)
1849                        continue;
1850
1851                buddy = mb_find_buddy(e4b, i, &max);
1852                BUG_ON(buddy == NULL);
1853
1854                k = mb_find_next_zero_bit(buddy, max, 0);
1855                BUG_ON(k >= max);
1856
1857                ac->ac_found++;
1858
1859                ac->ac_b_ex.fe_len = 1 << i;
1860                ac->ac_b_ex.fe_start = k << i;
1861                ac->ac_b_ex.fe_group = e4b->bd_group;
1862
1863                ext4_mb_use_best_found(ac, e4b);
1864
1865                BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1866
1867                if (EXT4_SB(sb)->s_mb_stats)
1868                        atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
1869
1870                break;
1871        }
1872}
1873
1874/*
1875 * The routine scans the group and measures all found extents.
1876 * In order to optimize scanning, the caller must pass the number of
1877 * free blocks in the group, so the routine can know the upper limit.
1878 */
1879static noinline_for_stack
1880void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1881                                        struct ext4_buddy *e4b)
1882{
1883        struct super_block *sb = ac->ac_sb;
1884        void *bitmap = e4b->bd_bitmap;
1885        struct ext4_free_extent ex;
1886        int i;
1887        int free;
1888
1889        free = e4b->bd_info->bb_free;
1890        BUG_ON(free <= 0);
1891
1892        i = e4b->bd_info->bb_first_free;
1893
1894        while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1895                i = mb_find_next_zero_bit(bitmap,
1896                                                EXT4_CLUSTERS_PER_GROUP(sb), i);
1897                if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
1898                        /*
1899                         * If we have a corrupt bitmap, we won't find any
1900                         * free blocks even though the group info says we
1901                         * have free blocks
1902                         */
1903                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1904                                        "%d free clusters as per "
1905                                        "group info. But bitmap says 0",
1906                                        free);
1907                        break;
1908                }
1909
1910                mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
1911                BUG_ON(ex.fe_len <= 0);
1912                if (free < ex.fe_len) {
1913                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1914                                        "%d free clusters as per "
1915                                        "group info. But got %d blocks",
1916                                        free, ex.fe_len);
1917                        /*
1918                         * The number of free blocks differs. This mostly
1919                         * indicates that the bitmap is corrupt. So exit
1920                         * without claiming the space.
1921                         */
1922                        break;
1923                }
1924
1925                ext4_mb_measure_extent(ac, &ex, e4b);
1926
1927                i += ex.fe_len;
1928                free -= ex.fe_len;
1929        }
1930
1931        ext4_mb_check_limits(ac, e4b, 1);
1932}
1933
1934/*
1935 * This is a special case for storage like RAID5:
1936 * we try to find stripe-aligned chunks for requests whose length is a multiple of the stripe size
1937 */
1938static noinline_for_stack
1939void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1940                                 struct ext4_buddy *e4b)
1941{
1942        struct super_block *sb = ac->ac_sb;
1943        struct ext4_sb_info *sbi = EXT4_SB(sb);
1944        void *bitmap = e4b->bd_bitmap;
1945        struct ext4_free_extent ex;
1946        ext4_fsblk_t first_group_block;
1947        ext4_fsblk_t a;
1948        ext4_grpblk_t i;
1949        int max;
1950
1951        BUG_ON(sbi->s_stripe == 0);
1952
1953        /* find first stripe-aligned block in group */
1954        first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
1955
1956        a = first_group_block + sbi->s_stripe - 1;
1957        do_div(a, sbi->s_stripe);
1958        i = (a * sbi->s_stripe) - first_group_block;
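        /*
         * The three statements above round first_group_block up to the
         * next multiple of s_stripe and convert the result back to a
         * group-relative offset.  With made-up numbers: if the group
         * starts at block 8003 and s_stripe is 16, then
         * a = (8003 + 15) / 16 = 501, the first aligned block is
         * 501 * 16 = 8016, and the scan starts at i = 8016 - 8003 = 13.
         */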
1959
1960        while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
1961                if (!mb_test_bit(i, bitmap)) {
1962                        max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
1963                        if (max >= sbi->s_stripe) {
1964                                ac->ac_found++;
1965                                ac->ac_b_ex = ex;
1966                                ext4_mb_use_best_found(ac, e4b);
1967                                break;
1968                        }
1969                }
1970                i += sbi->s_stripe;
1971        }
1972}
1973
1974/* This is now called BEFORE we load the buddy bitmap. */
1975static int ext4_mb_good_group(struct ext4_allocation_context *ac,
1976                                ext4_group_t group, int cr)
1977{
1978        unsigned free, fragments;
1979        int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
1980        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1981
1982        BUG_ON(cr < 0 || cr >= 4);
1983
1984        free = grp->bb_free;
1985        if (free == 0)
1986                return 0;
1987        if (cr <= 2 && free < ac->ac_g_ex.fe_len)
1988                return 0;
1989
1990        /* We only do this if the grp has never been initialized */
1991        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1992                int ret = ext4_mb_init_group(ac->ac_sb, group);
1993                if (ret)
1994                        return 0;
1995        }
1996
1997        fragments = grp->bb_fragments;
1998        if (fragments == 0)
1999                return 0;
2000
2001        switch (cr) {
2002        case 0:
2003                BUG_ON(ac->ac_2order == 0);
2004
2005                /* Avoid using the first bg of a flexgroup for data files */
2006                if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
2007                    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
2008                    ((group % flex_size) == 0))
2009                        return 0;
2010
2011                if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
2012                    (free / fragments) >= ac->ac_g_ex.fe_len)
2013                        return 1;
2014
2015                if (grp->bb_largest_free_order < ac->ac_2order)
2016                        return 0;
2017
2018                return 1;
2019        case 1:
2020                if ((free / fragments) >= ac->ac_g_ex.fe_len)
2021                        return 1;
2022                break;
2023        case 2:
2024                if (free >= ac->ac_g_ex.fe_len)
2025                        return 1;
2026                break;
2027        case 3:
2028                return 1;
2029        default:
2030                BUG();
2031        }
2032
2033        return 0;
2034}
2035
2036static noinline_for_stack int
2037ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2038{
2039        ext4_group_t ngroups, group, i;
2040        int cr;
2041        int err = 0;
2042        struct ext4_sb_info *sbi;
2043        struct super_block *sb;
2044        struct ext4_buddy e4b;
2045
2046        sb = ac->ac_sb;
2047        sbi = EXT4_SB(sb);
2048        ngroups = ext4_get_groups_count(sb);
2049        /* non-extent files are limited to low blocks/groups */
2050        if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
2051                ngroups = sbi->s_blockfile_groups;
2052
2053        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2054
2055        /* first, try the goal */
2056        err = ext4_mb_find_by_goal(ac, &e4b);
2057        if (err || ac->ac_status == AC_STATUS_FOUND)
2058                goto out;
2059
2060        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2061                goto out;
2062
2063        /*
2064         * ac->ac_2order is set only if the fe_len is a power of 2;
2065         * if ac_2order is set we also set the criteria to 0 so that we
2066         * try exact allocation using the buddy.
2067         */
2068        i = fls(ac->ac_g_ex.fe_len);
2069        ac->ac_2order = 0;
2070        /*
2071         * We search using buddy data only if the order of the request
2072         * is greater than or equal to sbi->s_mb_order2_reqs.
2073         * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req.
2074         */
2075        if (i >= sbi->s_mb_order2_reqs) {
2076                /*
2077                 * This should tell if fe_len is exactly a power of 2
2078                 */
2079                if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2080                        ac->ac_2order = i - 1;
2081        }
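        /*
         * For illustration, assuming the default s_mb_order2_reqs of 2:
         * a request for 64 clusters gives i = fls(64) = 7 and
         * 64 & ~(1 << 6) == 0, so ac_2order = 6 and the buddy scan can
         * look for a single order-6 chunk.  A request for 96 clusters
         * also gives i = 7, but 96 & ~(1 << 6) == 32, so ac_2order stays
         * 0 and the ordinary (non power-of-two) scan is used instead.
         */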
2082
2083        /* if stream allocation is enabled, use global goal */
2084        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2085                /* TBD: may be a hot spot */
2086                spin_lock(&sbi->s_md_lock);
2087                ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2088                ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2089                spin_unlock(&sbi->s_md_lock);
2090        }
2091
2092        /* Let's just scan groups to find more or less suitable blocks */
2093        cr = ac->ac_2order ? 0 : 1;
2094        /*
2095         * cr == 0 try to get exact allocation,
2096         * cr == 3  try to get anything
2097         */
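        /*
         * Roughly, per ext4_mb_good_group() above, a group qualifies when:
         *   cr == 0: its largest free buddy order (or its average free
         *            fragment size) can satisfy the power-of-two request,
         *   cr == 1: its average fragment size (free / fragments) is at
         *            least the requested length,
         *   cr == 2: it simply has enough free clusters in total,
         *   cr == 3: it has any free clusters at all.
         */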
2098repeat:
2099        for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2100                ac->ac_criteria = cr;
2101                /*
2102                 * start searching for the right group
2103                 * from the specified goal value
2104                 */
2105                group = ac->ac_g_ex.fe_group;
2106
2107                for (i = 0; i < ngroups; group++, i++) {
2108                        cond_resched();
2109                        /*
2110                         * Artificially restricted ngroups for non-extent
2111                         * files makes group > ngroups possible on first loop.
2112                         */
2113                        if (group >= ngroups)
2114                                group = 0;
2115
2116                        /* This now checks without needing the buddy page */
2117                        if (!ext4_mb_good_group(ac, group, cr))
2118                                continue;
2119
2120                        err = ext4_mb_load_buddy(sb, group, &e4b);
2121                        if (err)
2122                                goto out;
2123
2124                        ext4_lock_group(sb, group);
2125
2126                        /*
2127                         * We need to check again after locking the
2128                         * block group
2129                         */
2130                        if (!ext4_mb_good_group(ac, group, cr)) {
2131                                ext4_unlock_group(sb, group);
2132                                ext4_mb_unload_buddy(&e4b);
2133                                continue;
2134                        }
2135
2136                        ac->ac_groups_scanned++;
2137                        if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)
2138                                ext4_mb_simple_scan_group(ac, &e4b);
2139                        else if (cr == 1 && sbi->s_stripe &&
2140                                        !(ac->ac_g_ex.fe_len % sbi->s_stripe))
2141                                ext4_mb_scan_aligned(ac, &e4b);
2142                        else
2143                                ext4_mb_complex_scan_group(ac, &e4b);
2144
2145                        ext4_unlock_group(sb, group);
2146                        ext4_mb_unload_buddy(&e4b);
2147
2148                        if (ac->ac_status != AC_STATUS_CONTINUE)
2149                                break;
2150                }
2151        }
2152
2153        if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2154            !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2155                /*
2156                 * We've been searching too long. Let's try to allocate
2157                 * the best chunk we've found so far
2158                 */
2159
2160                ext4_mb_try_best_found(ac, &e4b);
2161                if (ac->ac_status != AC_STATUS_FOUND) {
2162                        /*
2163                         * Someone more lucky has already allocated it.
2164                         * The only thing we can do is just take first
2165                         * found block(s)
2166                        printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
2167                         */
2168                        ac->ac_b_ex.fe_group = 0;
2169                        ac->ac_b_ex.fe_start = 0;
2170                        ac->ac_b_ex.fe_len = 0;
2171                        ac->ac_status = AC_STATUS_CONTINUE;
2172                        ac->ac_flags |= EXT4_MB_HINT_FIRST;
2173                        cr = 3;
2174                        atomic_inc(&sbi->s_mb_lost_chunks);
2175                        goto repeat;
2176                }
2177        }
2178out:
2179        return err;
2180}
2181
2182static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2183{
2184        struct super_block *sb = seq->private;
2185        ext4_group_t group;
2186
2187        if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2188                return NULL;
2189        group = *pos + 1;
2190        return (void *) ((unsigned long) group);
2191}
2192
2193static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2194{
2195        struct super_block *sb = seq->private;
2196        ext4_group_t group;
2197
2198        ++*pos;
2199        if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2200                return NULL;
2201        group = *pos + 1;
2202        return (void *) ((unsigned long) group);
2203}
2204
2205static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2206{
2207        struct super_block *sb = seq->private;
2208        ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2209        int i;
2210        int err, buddy_loaded = 0;
2211        struct ext4_buddy e4b;
2212        struct ext4_group_info *grinfo;
2213        struct sg {
2214                struct ext4_group_info info;
2215                ext4_grpblk_t counters[16];
2216        } sg;
2217
2218        group--;
2219        if (group == 0)
2220                seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
2221                                "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
2222                                  "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
2223                           "group", "free", "frags", "first",
2224                           "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
2225                           "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
2226
2227        i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2228                sizeof(struct ext4_group_info);
2229        grinfo = ext4_get_group_info(sb, group);
2230        /* Load the group info in memory only if not already loaded. */
2231        if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
2232                err = ext4_mb_load_buddy(sb, group, &e4b);
2233                if (err) {
2234                        seq_printf(seq, "#%-5u: I/O error\n", group);
2235                        return 0;
2236                }
2237                buddy_loaded = 1;
2238        }
2239
2240        memcpy(&sg, ext4_get_group_info(sb, group), i);
2241
2242        if (buddy_loaded)
2243                ext4_mb_unload_buddy(&e4b);
2244
2245        seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2246                        sg.info.bb_fragments, sg.info.bb_first_free);
2247        for (i = 0; i <= 13; i++)
2248                seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
2249                                sg.info.bb_counters[i] : 0);
2250        seq_printf(seq, " ]\n");
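        /*
         * A line of /proc/fs/ext4/<dev>/mb_groups therefore looks roughly
         * like the following (all values purely illustrative):
         *   #0    : 23000 2     5     [ 0     1     1     0     ... ]
         * i.e. free clusters, fragment count, first free cluster and the
         * per-order buddy counters for the group.
         */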
2251
2252        return 0;
2253}
2254
2255static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2256{
2257}
2258
2259static const struct seq_operations ext4_mb_seq_groups_ops = {
2260        .start  = ext4_mb_seq_groups_start,
2261        .next   = ext4_mb_seq_groups_next,
2262        .stop   = ext4_mb_seq_groups_stop,
2263        .show   = ext4_mb_seq_groups_show,
2264};
2265
2266static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2267{
2268        struct super_block *sb = PDE_DATA(inode);
2269        int rc;
2270
2271        rc = seq_open(file, &ext4_mb_seq_groups_ops);
2272        if (rc == 0) {
2273                struct seq_file *m = file->private_data;
2274                m->private = sb;
2275        }
2276        return rc;
2277
2278}
2279
2280static const struct file_operations ext4_mb_seq_groups_fops = {
2281        .owner          = THIS_MODULE,
2282        .open           = ext4_mb_seq_groups_open,
2283        .read           = seq_read,
2284        .llseek         = seq_lseek,
2285        .release        = seq_release,
2286};
2287
2288static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2289{
2290        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2291        struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2292
2293        BUG_ON(!cachep);
2294        return cachep;
2295}
2296
2297/*
2298 * Allocate the top-level s_group_info array for the specified number
2299 * of groups
2300 */
2301int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
2302{
2303        struct ext4_sb_info *sbi = EXT4_SB(sb);
2304        unsigned size;
2305        struct ext4_group_info ***new_groupinfo;
2306
2307        size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
2308                EXT4_DESC_PER_BLOCK_BITS(sb);
2309        if (size <= sbi->s_group_info_size)
2310                return 0;
2311
2312        size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
2313        new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
2314        if (!new_groupinfo) {
2315                ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2316                return -ENOMEM;
2317        }
2318        if (sbi->s_group_info) {
2319                memcpy(new_groupinfo, sbi->s_group_info,
2320                       sbi->s_group_info_size * sizeof(*sbi->s_group_info));
2321                ext4_kvfree(sbi->s_group_info);
2322        }
2323        sbi->s_group_info = new_groupinfo;
2324        sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
2325        ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 
2326                   sbi->s_group_info_size);
2327        return 0;
2328}
2329
2330/* Create and initialize ext4_group_info data for the given group. */
2331int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2332                          struct ext4_group_desc *desc)
2333{
2334        int i;
2335        int metalen = 0;
2336        struct ext4_sb_info *sbi = EXT4_SB(sb);
2337        struct ext4_group_info **meta_group_info;
2338        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2339
2340        /*
2341         * First check if this group is the first group of a descriptor block.
2342         * If so, we have to allocate a new table of pointers
2343         * to ext4_group_info structures
2344         */
2345        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2346                metalen = sizeof(*meta_group_info) <<
2347                        EXT4_DESC_PER_BLOCK_BITS(sb);
2348                meta_group_info = kmalloc(metalen, GFP_KERNEL);
2349                if (meta_group_info == NULL) {
2350                        ext4_msg(sb, KERN_ERR, "can't allocate mem "
2351                                 "for a buddy group");
2352                        goto exit_meta_group_info;
2353                }
2354                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
2355                        meta_group_info;
2356        }
2357
2358        meta_group_info =
2359                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2360        i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
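        /*
         * s_group_info is a two-level table: the group number is split
         * into a descriptor-block index (group >> EXT4_DESC_PER_BLOCK_BITS)
         * and an offset within that block's pointer table (computed above).
         * For illustration, assuming 4KiB blocks and 32-byte group
         * descriptors (EXT4_DESC_PER_BLOCK == 128), group 300 lives at
         * s_group_info[2][44], since 300 >> 7 == 2 and 300 & 127 == 44.
         */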
2361
2362        meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL);
2363        if (meta_group_info[i] == NULL) {
2364                ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2365                goto exit_group_info;
2366        }
2367        set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2368                &(meta_group_info[i]->bb_state));
2369
2370        /*
2371         * initialize bb_free to be able to skip
2372         * empty groups without initialization
2373         */
2374        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2375                meta_group_info[i]->bb_free =
2376                        ext4_free_clusters_after_init(sb, group, desc);
2377        } else {
2378                meta_group_info[i]->bb_free =
2379                        ext4_free_group_clusters(sb, desc);
2380        }
2381
2382        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2383        init_rwsem(&meta_group_info[i]->alloc_sem);
2384        meta_group_info[i]->bb_free_root = RB_ROOT;
2385        meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
2386
2387#ifdef DOUBLE_CHECK
2388        {
2389                struct buffer_head *bh;
2390                meta_group_info[i]->bb_bitmap =
2391                        kmalloc(sb->s_blocksize, GFP_KERNEL);
2392                BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2393                bh = ext4_read_block_bitmap(sb, group);
2394                BUG_ON(bh == NULL);
2395                memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2396                        sb->s_blocksize);
2397                put_bh(bh);
2398        }
2399#endif
2400
2401        return 0;
2402
2403exit_group_info:
2404        /* If a meta_group_info table has been allocated, release it now */
2405        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2406                kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
2407                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
2408        }
2409exit_meta_group_info:
2410        return -ENOMEM;
2411} /* ext4_mb_add_groupinfo */
2412
2413static int ext4_mb_init_backend(struct super_block *sb)
2414{
2415        ext4_group_t ngroups = ext4_get_groups_count(sb);
2416        ext4_group_t i;
2417        struct ext4_sb_info *sbi = EXT4_SB(sb);
2418        int err;
2419        struct ext4_group_desc *desc;
2420        struct kmem_cache *cachep;
2421
2422        err = ext4_mb_alloc_groupinfo(sb, ngroups);
2423        if (err)
2424                return err;
2425
2426        sbi->s_buddy_cache = new_inode(sb);
2427        if (sbi->s_buddy_cache == NULL) {
2428                ext4_msg(sb, KERN_ERR, "can't get new inode");
2429                goto err_freesgi;
2430        }
2431        /* To avoid potentially colliding with a valid on-disk inode number,
2432         * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
2433         * not in the inode hash, so it should never be found by iget(), but
2434         * this will avoid confusion if it ever shows up during debugging. */
2435        sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
2436        EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2437        for (i = 0; i < ngroups; i++) {
2438                desc = ext4_get_group_desc(sb, i, NULL);
2439                if (desc == NULL) {
2440                        ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
2441                        goto err_freebuddy;
2442                }
2443                if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
2444                        goto err_freebuddy;
2445        }
2446
2447        return 0;
2448
2449err_freebuddy:
2450        cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2451        while (i-- > 0)
2452                kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2453        i = sbi->s_group_info_size;
2454        while (i-- > 0)
2455                kfree(sbi->s_group_info[i]);
2456        iput(sbi->s_buddy_cache);
2457err_freesgi:
2458        ext4_kvfree(sbi->s_group_info);
2459        return -ENOMEM;
2460}
2461
2462static void ext4_groupinfo_destroy_slabs(void)
2463{
2464        int i;
2465
2466        for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2467                if (ext4_groupinfo_caches[i])
2468                        kmem_cache_destroy(ext4_groupinfo_caches[i]);
2469                ext4_groupinfo_caches[i] = NULL;
2470        }
2471}
2472
2473static int ext4_groupinfo_create_slab(size_t size)
2474{
2475        static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2476        int slab_size;
2477        int blocksize_bits = order_base_2(size);
2478        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2479        struct kmem_cache *cachep;
2480
2481        if (cache_index >= NR_GRPINFO_CACHES)
2482                return -EINVAL;
2483
2484        if (unlikely(cache_index < 0))
2485                cache_index = 0;
2486
2487        mutex_lock(&ext4_grpinfo_slab_create_mutex);
2488        if (ext4_groupinfo_caches[cache_index]) {
2489                mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2490                return 0;       /* Already created */
2491        }
2492
2493        slab_size = offsetof(struct ext4_group_info,
2494                                bb_counters[blocksize_bits + 2]);
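        /*
         * The slab only needs room for bb_counters[0] through
         * bb_counters[blocksize_bits + 1], one counter per buddy order.
         * For illustration, with a 4KiB block size (blocksize_bits == 12)
         * the group info carries counters for orders 0..13, so the
         * offsetof() above sizes the array at blocksize_bits + 2 == 14
         * entries.
         */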
2495
2496        cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2497                                        slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2498                                        NULL);
2499
2500        ext4_groupinfo_caches[cache_index] = cachep;
2501
2502        mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2503        if (!cachep) {
2504                printk(KERN_EMERG
2505                       "EXT4-fs: no memory for groupinfo slab cache\n");
2506                return -ENOMEM;
2507        }
2508
2509        return 0;
2510}
2511
2512int ext4_mb_init(struct super_block *sb)
2513{
2514        struct ext4_sb_info *sbi = EXT4_SB(sb);
2515        unsigned i, j;
2516        unsigned offset;
2517        unsigned max;
2518        int ret;
2519
2520        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2521
2522        sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2523        if (sbi->s_mb_offsets == NULL) {
2524                ret = -ENOMEM;
2525                goto out;
2526        }
2527
2528        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2529        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2530        if (sbi->s_mb_maxs == NULL) {
2531                ret = -ENOMEM;
2532                goto out;
2533        }
2534
2535        ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2536        if (ret < 0)
2537                goto out;
2538
2539        /* order 0 is regular bitmap */
2540        sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2541        sbi->s_mb_offsets[0] = 0;
2542
2543        i = 1;
2544        offset = 0;
2545        max = sb->s_blocksize << 2;
2546        do {
2547                sbi->s_mb_offsets[i] = offset;
2548                sbi->s_mb_maxs[i] = max;
2549                offset += 1 << (sb->s_blocksize_bits - i);
2550                max = max >> 1;
2551                i++;
2552        } while (i <= sb->s_blocksize_bits + 1);
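        /*
         * For illustration, with a 4KiB block size (s_blocksize_bits == 12)
         * the loop above lays the buddy bitmaps out inside one block as:
         *   order  1: offset    0 bytes, 16384 bits
         *   order  2: offset 2048 bytes,  8192 bits
         *   order  3: offset 3072 bytes,  4096 bits
         *   ...
         *   order 13: offset 4095 bytes,     4 bits
         * while order 0 (s_mb_maxs[0] == 32768 bits) is the block bitmap
         * itself.
         */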
2553
2554        spin_lock_init(&sbi->s_md_lock);
2555        spin_lock_init(&sbi->s_bal_lock);
2556
2557        sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
2558        sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
2559        sbi->s_mb_stats = MB_DEFAULT_STATS;
2560        sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2561        sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2562        /*
2563         * The default group preallocation is 512, which for 4k block
2564         * sizes translates to 2 megabytes.  However for bigalloc file
2565         * systems, this is probably too big (i.e., if the cluster size
2566         * is 1 megabyte, then group preallocation size becomes half a
2567         * gigabyte!).  As a default, we will keep a two megabyte
2568         * group prealloc size for cluster sizes up to 64k, and after
2569         * that, we will force a minimum group preallocation size of
2570         * 32 clusters.  This translates to 8 megs when the cluster
2571         * size is 256k, and 32 megs when the cluster size is 1 meg,
2572         * which seems reasonable as a default.
2573         */
2574        sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
2575                                       sbi->s_cluster_bits, 32);
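        /*
         * For illustration: MB_DEFAULT_GROUP_PREALLOC is 512, so with 4KiB
         * blocks and no bigalloc (s_cluster_bits == 0) this yields 512
         * clusters == 2MiB.  With 256KiB clusters (s_cluster_bits == 6),
         * 512 >> 6 == 8 is below the floor of 32 clusters, so the result
         * is 32 * 256KiB == 8MiB, and with 1MiB clusters the same floor
         * gives 32MiB, matching the comment above.
         */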
2576        /*
2577         * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
2578         * to the lowest multiple of s_stripe which is bigger than
2579         * the s_mb_group_prealloc as determined above. We want
2580         * the preallocation size to be an exact multiple of the
2581         * RAID stripe size so that preallocations don't fragment
2582         * the stripes.
2583         */
2584        if (sbi->s_stripe > 1) {
2585                sbi->s_mb_group_prealloc = roundup(
2586                        sbi->s_mb_group_prealloc, sbi->s_stripe);
2587        }
2588
2589        sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2590        if (sbi->s_locality_groups == NULL) {
2591                ret = -ENOMEM;
2592                goto out_free_groupinfo_slab;
2593        }
2594        for_each_possible_cpu(i) {
2595                struct ext4_locality_group *lg;
2596                lg = per_cpu_ptr(sbi->s_locality_groups, i);
2597                mutex_init(&lg->lg_mutex);
2598                for (j = 0; j < PREALLOC_TB_SIZE; j++)
2599                        INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
2600                spin_lock_init(&lg->lg_prealloc_lock);
2601        }
2602
2603        /* init file for buddy data */
2604        ret = ext4_mb_init_backend(sb);
2605        if (ret != 0)
2606                goto out_free_locality_groups;
2607
2608        if (sbi->s_proc)
2609                proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
2610                                 &ext4_mb_seq_groups_fops, sb);
2611
2612        return 0;
2613
2614out_free_locality_groups:
2615        free_percpu(sbi->s_locality_groups);
2616        sbi->s_locality_groups = NULL;
2617out_free_groupinfo_slab:
2618        ext4_groupinfo_destroy_slabs();
2619out:
2620        kfree(sbi->s_mb_offsets);
2621        sbi->s_mb_offsets = NULL;
2622        kfree(sbi->s_mb_maxs);
2623        sbi->s_mb_maxs = NULL;
2624        return ret;
2625}
2626
2627/* needs to be called with the ext4 group lock held */
2628static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2629{
2630        struct ext4_prealloc_space *pa;
2631        struct list_head *cur, *tmp;
2632        int count = 0;
2633
2634        list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
2635                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
2636                list_del(&pa->pa_group_list);
2637                count++;
2638                kmem_cache_free(ext4_pspace_cachep, pa);
2639        }
2640        if (count)
2641                mb_debug(1, "mballoc: %u PAs left\n", count);
2642
2643}
2644
2645int ext4_mb_release(struct super_block *sb)
2646{
2647        ext4_group_t ngroups = ext4_get_groups_count(sb);
2648        ext4_group_t i;
2649        int num_meta_group_infos;
2650        struct ext4_group_info *grinfo;
2651        struct ext4_sb_info *sbi = EXT4_SB(sb);
2652        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2653
2654        if (sbi->s_proc)
2655                remove_proc_entry("mb_groups", sbi->s_proc);
2656
2657        if (sbi->s_group_info) {
2658                for (i = 0; i < ngroups; i++) {
2659                        grinfo = ext4_get_group_info(sb, i);
2660#ifdef DOUBLE_CHECK
2661                        kfree(grinfo->bb_bitmap);
2662#endif
2663                        ext4_lock_group(sb, i);
2664                        ext4_mb_cleanup_pa(grinfo);
2665                        ext4_unlock_group(sb, i);
2666                        kmem_cache_free(cachep, grinfo);
2667                }
2668                num_meta_group_infos = (ngroups +
2669                                EXT4_DESC_PER_BLOCK(sb) - 1) >>
2670                        EXT4_DESC_PER_BLOCK_BITS(sb);
2671                for (i = 0; i < num_meta_group_infos; i++)
2672                        kfree(sbi->s_group_info[i]);
2673                ext4_kvfree(sbi->s_group_info);
2674        }
2675        kfree(sbi->s_mb_offsets);
2676        kfree(sbi->s_mb_maxs);
2677        if (sbi->s_buddy_cache)
2678                iput(sbi->s_buddy_cache);
2679        if (sbi->s_mb_stats) {
2680                ext4_msg(sb, KERN_INFO,
2681                       "mballoc: %u blocks %u reqs (%u success)",
2682                                atomic_read(&sbi->s_bal_allocated),
2683                                atomic_read(&sbi->s_bal_reqs),
2684                                atomic_read(&sbi->s_bal_success));
2685                ext4_msg(sb, KERN_INFO,
2686                      "mballoc: %u extents scanned, %u goal hits, "
2687                                "%u 2^N hits, %u breaks, %u lost",
2688                                atomic_read(&sbi->s_bal_ex_scanned),
2689                                atomic_read(&sbi->s_bal_goals),
2690                                atomic_read(&sbi->s_bal_2orders),
2691                                atomic_read(&sbi->s_bal_breaks),
2692                                atomic_read(&sbi->s_mb_lost_chunks));
2693                ext4_msg(sb, KERN_INFO,
2694                       "mballoc: %lu generated and it took %Lu",
2695                                sbi->s_mb_buddies_generated,
2696                                sbi->s_mb_generation_time);
2697                ext4_msg(sb, KERN_INFO,
2698                       "mballoc: %u preallocated, %u discarded",
2699                                atomic_read(&sbi->s_mb_preallocated),
2700                                atomic_read(&sbi->s_mb_discarded));
2701        }
2702
2703        free_percpu(sbi->s_locality_groups);
2704
2705        return 0;
2706}
2707
2708static inline int ext4_issue_discard(struct super_block *sb,
2709                ext4_group_t block_group, ext4_grpblk_t cluster, int count)
2710{
2711        ext4_fsblk_t discard_block;
2712
2713        discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
2714                         ext4_group_first_block_no(sb, block_group));
2715        count = EXT4_C2B(EXT4_SB(sb), count);
2716        trace_ext4_discard_blocks(sb,
2717                        (unsigned long long) discard_block, count);
2718        return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2719}
2720
2721/*
2722 * This function is called by the jbd2 layer once the commit has finished,
2723 * so we know we can free the blocks that were released with that commit.
2724 */
2725static void ext4_free_data_callback(struct super_block *sb,
2726                                    struct ext4_journal_cb_entry *jce,
2727                                    int rc)
2728{
2729        struct ext4_free_data *entry = (struct ext4_free_data *)jce;
2730        struct ext4_buddy e4b;
2731        struct ext4_group_info *db;
2732        int err, count = 0, count2 = 0;
2733
2734        mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2735                 entry->efd_count, entry->efd_group, entry);
2736
2737        if (test_opt(sb, DISCARD)) {
2738                err = ext4_issue_discard(sb, entry->efd_group,
2739                                         entry->efd_start_cluster,
2740                                         entry->efd_count);
2741                if (err && err != -EOPNOTSUPP)
2742                        ext4_msg(sb, KERN_WARNING, "discard request in"
2743                                 " group:%d block:%d count:%d failed"
2744                                 " with %d", entry->efd_group,
2745                                 entry->efd_start_cluster,
2746                                 entry->efd_count, err);
2747        }
2748
2749        err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
2750        /* we expect to find existing buddy because it's pinned */
2751        BUG_ON(err != 0);
2752
2753
2754        db = e4b.bd_info;
2755        /* there are blocks to put in buddy to make them really free */
2756        count += entry->efd_count;
2757        count2++;
2758        ext4_lock_group(sb, entry->efd_group);
2759        /* Take it out of per group rb tree */
2760        rb_erase(&entry->efd_node, &(db->bb_free_root));
2761        mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
2762
2763        /*
2764         * Clear the trimmed flag for the group so that the next
2765         * ext4_trim_fs can trim it.
2766         * If the volume is mounted with -o discard, online discard
2767         * is supported and the free blocks will be trimmed online.
2768         */
2769        if (!test_opt(sb, DISCARD))
2770                EXT4_MB_GRP_CLEAR_TRIMMED(db);
2771
2772        if (!db->bb_free_root.rb_node) {
2773                /* No more items in the per group rb tree;
2774                 * balance refcounts from ext4_mb_free_metadata()
2775                 */
2776                page_cache_release(e4b.bd_buddy_page);
2777                page_cache_release(e4b.bd_bitmap_page);
2778        }
2779        ext4_unlock_group(sb, entry->efd_group);
2780        kmem_cache_free(ext4_free_data_cachep, entry);
2781        ext4_mb_unload_buddy(&e4b);
2782
2783        mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2784}
2785
2786int __init ext4_init_mballoc(void)
2787{
2788        ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2789                                        SLAB_RECLAIM_ACCOUNT);
2790        if (ext4_pspace_cachep == NULL)
2791                return -ENOMEM;
2792
2793        ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2794                                    SLAB_RECLAIM_ACCOUNT);
2795        if (ext4_ac_cachep == NULL) {
2796                kmem_cache_destroy(ext4_pspace_cachep);
2797                return -ENOMEM;
2798        }
2799
2800        ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
2801                                           SLAB_RECLAIM_ACCOUNT);
2802        if (ext4_free_data_cachep == NULL) {
2803                kmem_cache_destroy(ext4_pspace_cachep);
2804                kmem_cache_destroy(ext4_ac_cachep);
2805                return -ENOMEM;
2806        }
2807        return 0;
2808}
2809
2810void ext4_exit_mballoc(void)
2811{
2812        /*
2813         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2814         * before destroying the slab cache.
2815         */
2816        rcu_barrier();
2817        kmem_cache_destroy(ext4_pspace_cachep);
2818        kmem_cache_destroy(ext4_ac_cachep);
2819        kmem_cache_destroy(ext4_free_data_cachep);
2820        ext4_groupinfo_destroy_slabs();
2821}
2822
2823
2824/*
2825 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
2826 * Returns 0 on success or an error code
2827 */
2828static noinline_for_stack int
2829ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2830                                handle_t *handle, unsigned int reserv_clstrs)
2831{
2832        struct buffer_head *bitmap_bh = NULL;
2833        struct ext4_group_desc *gdp;
2834        struct buffer_head *gdp_bh;
2835        struct ext4_sb_info *sbi;
2836        struct super_block *sb;
2837        ext4_fsblk_t block;
2838        int err, len;
2839
2840        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
2841        BUG_ON(ac->ac_b_ex.fe_len <= 0);
2842
2843        sb = ac->ac_sb;
2844        sbi = EXT4_SB(sb);
2845
2846        err = -EIO;
2847        bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
2848        if (!bitmap_bh)
2849                goto out_err;
2850
2851        err = ext4_journal_get_write_access(handle, bitmap_bh);
2852        if (err)
2853                goto out_err;
2854
2855        err = -EIO;
2856        gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
2857        if (!gdp)
2858                goto out_err;
2859
2860        ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2861                        ext4_free_group_clusters(sb, gdp));
2862
2863        err = ext4_journal_get_write_access(handle, gdp_bh);
2864        if (err)
2865                goto out_err;
2866
2867        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2868
2869        len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
2870        if (!ext4_data_block_valid(sbi, block, len)) {
2871                ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2872                           "fs metadata", block, block+len);
2873                /* File system is mounted not to panic on error.
2874                 * Fix the bitmap and repeat the block allocation.
2875                 * We leak some of the blocks here.
2876                 */
2877                ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2878                ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2879                              ac->ac_b_ex.fe_len);
2880                ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2881                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2882                if (!err)
2883                        err = -EAGAIN;
2884                goto out_err;
2885        }
2886
2887        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2888#ifdef AGGRESSIVE_CHECK
2889        {
2890                int i;
2891                for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
2892                        BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
2893                                                bitmap_bh->b_data));
2894                }
2895        }
2896#endif
2897        ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2898                      ac->ac_b_ex.fe_len);
2899        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2900                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2901                ext4_free_group_clusters_set(sb, gdp,
2902                                             ext4_free_clusters_after_init(sb,
2903                                                ac->ac_b_ex.fe_group, gdp));
2904        }
2905        len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
2906        ext4_free_group_clusters_set(sb, gdp, len);
2907        ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
2908        ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
2909
2910        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2911        percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
2912        /*
2913         * Now reduce the dirty block count also. Should not go negative
2914         */
2915        if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2916                /* release all the reserved blocks if non delalloc */
2917                percpu_counter_sub(&sbi->s_dirtyclusters_counter,
2918                                   reserv_clstrs);
2919
2920        if (sbi->s_log_groups_per_flex) {
2921                ext4_group_t flex_group = ext4_flex_group(sbi,
2922                                                          ac->ac_b_ex.fe_group);
2923                atomic64_sub(ac->ac_b_ex.fe_len,
2924                             &sbi->s_flex_groups[flex_group].free_clusters);
2925        }
2926
2927        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2928        if (err)
2929                goto out_err;
2930        err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
2931
2932out_err:
2933        brelse(bitmap_bh);
2934        return err;
2935}
2936
2937/*
2938 * here we normalize request for locality group
2939 * Group requests are normalized to s_mb_group_prealloc, which follows
2940 * the stripe size (s_stripe) if that is set via the mount option.
2941 * s_mb_group_prealloc can be configured via
2942 * /sys/fs/ext4/<partition>/mb_group_prealloc
2943 *
2944 * XXX: should we try to preallocate more than the group has now?
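 *
 * For example (hypothetical device name and value), an administrator
 * could raise the locality-group preallocation to 512 blocks with
 *   echo 512 > /sys/fs/ext4/sda1/mb_group_prealloc
 * after which every group request below is normalized to 512 blocks.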
2945 */
2946static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
2947{
2948        struct super_block *sb = ac->ac_sb;
2949        struct ext4_locality_group *lg = ac->ac_lg;
2950
2951        BUG_ON(lg == NULL);
2952        ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
2953        mb_debug(1, "#%u: goal %u blocks for locality group\n",
2954                current->pid, ac->ac_g_ex.fe_len);
2955}
2956
2957/*
2958 * Normalization means making request better in terms of
2959 * size and alignment
2960 */
2961static noinline_for_stack void
2962ext4_mb_normalize_request(struct ext4_allocation_context *ac,
2963                                struct ext4_allocation_request *ar)
2964{
2965        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
2966        int bsbits, max;
2967        ext4_lblk_t end;
2968        loff_t size, start_off;
2969        loff_t orig_size __maybe_unused;
2970        ext4_lblk_t start;
2971        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
2972        struct ext4_prealloc_space *pa;
2973
2974        /* only normalize data requests; metadata requests
2975           do not need preallocation */
2976        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
2977                return;
2978
2979        /* sometimes the caller may want exact blocks */
2980        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2981                return;
2982
2983        /* caller may indicate that preallocation isn't
2984         * required (it's a tail, for example) */
2985        if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
2986                return;
2987
2988        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
2989                ext4_mb_normalize_group_request(ac);
2990                return;
2991        }
2992
2993        bsbits = ac->ac_sb->s_blocksize_bits;
2994
2995        /* first, let's learn actual file size
2996         * given current request is allocated */
2997        size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
2998        size = size << bsbits;
2999        if (size < i_size_read(ac->ac_inode))
3000                size = i_size_read(ac->ac_inode);
3001        orig_size = size;
3002
3003        /* max size of free chunks */
3004        max = 2 << bsbits;
3005
3006#define NRL_CHECK_SIZE(req, size, max, chunk_size)      \
3007                (req <= (size) || max <= (chunk_size))
3008
3009        /* first, try to predict filesize */
3010        /* XXX: should this table be tunable? */
3011        start_off = 0;
3012        if (size <= 16 * 1024) {
3013                size = 16 * 1024;
3014        } else if (size <= 32 * 1024) {
3015                size = 32 * 1024;
3016        } else if (size <= 64 * 1024) {
3017                size = 64 * 1024;
3018        } else if (size <= 128 * 1024) {
3019                size = 128 * 1024;
3020        } else if (size <= 256 * 1024) {
3021                size = 256 * 1024;
3022        } else if (size <= 512 * 1024) {
3023                size = 512 * 1024;
3024        } else if (size <= 1024 * 1024) {
3025                size = 1024 * 1024;
3026        } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
3027                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3028                                                (21 - bsbits)) << 21;
3029                size = 2 * 1024 * 1024;
3030        } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
3031                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3032                                                        (22 - bsbits)) << 22;
3033                size = 4 * 1024 * 1024;
3034        } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
3035                                        (8<<20)>>bsbits, max, 8 * 1024)) {
3036                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3037                                                        (23 - bsbits)) << 23;
3038                size = 8 * 1024 * 1024;
3039        } else {
3040                start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
3041                size      = ac->ac_o_ex.fe_len << bsbits;
3042        }
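        /*
         * Worked example (hypothetical request): with 4 KiB blocks
         * (bsbits == 12) and a predicted size of 20 KiB, the table above
         * picks the 32 KiB bucket and leaves start_off at 0, so after the
         * shifts below the goal is 8 blocks starting at logical block 0.
         */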
3043        size = size >> bsbits;
3044        start = start_off >> bsbits;
3045
3046        /* don't cover already allocated blocks in selected range */
3047        if (ar->pleft && start <= ar->lleft) {
3048                size -= ar->lleft + 1 - start;
3049                start = ar->lleft + 1;
3050        }
3051        if (ar->pright && start + size - 1 >= ar->lright)
3052                size -= start + size - ar->lright;
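        /* e.g. (hypothetical): with start = 0, size = 8 and an allocated
         * neighbour at lright = 6, the adjustment above shrinks size to 6
         * so the window stops just before that block */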
3053
3054        end = start + size;
3055
3056        /* check we don't cross already preallocated blocks */
3057        rcu_read_lock();
3058        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3059                ext4_lblk_t pa_end;
3060
3061                if (pa->pa_deleted)
3062                        continue;
3063                spin_lock(&pa->pa_lock);
3064                if (pa->pa_deleted) {
3065                        spin_unlock(&pa->pa_lock);
3066                        continue;
3067                }
3068
3069                pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3070                                                  pa->pa_len);
3071
3072                /* PA must not overlap original request */
3073                BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3074                        ac->ac_o_ex.fe_logical < pa->pa_lstart));
3075
3076                /* skip PAs this normalized request doesn't overlap with */
3077                if (pa->pa_lstart >= end || pa_end <= start) {
3078                        spin_unlock(&pa->pa_lock);
3079                        continue;
3080                }
3081                BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3082
3083                /* adjust start or end to be adjacent to this pa */
3084                if (pa_end <= ac->ac_o_ex.fe_logical) {
3085                        BUG_ON(pa_end < start);
3086                        start = pa_end;
3087                } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3088                        BUG_ON(pa->pa_lstart > end);
3089                        end = pa->pa_lstart;
3090                }
3091                spin_unlock(&pa->pa_lock);
3092        }
3093        rcu_read_unlock();
3094        size = end - start;
3095
3096        /* XXX: extra loop to check we really don't overlap preallocations */
3097        rcu_read_lock();
3098        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3099                ext4_lblk_t pa_end;
3100
3101                spin_lock(&pa->pa_lock);
3102                if (pa->pa_deleted == 0) {
3103                        pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3104                                                          pa->pa_len);
3105                        BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3106                }
3107                spin_unlock(&pa->pa_lock);
3108        }
3109        rcu_read_unlock();
3110
3111        if (start + size <= ac->ac_o_ex.fe_logical &&
3112                        start > ac->ac_o_ex.fe_logical) {
3113                ext4_msg(ac->ac_sb, KERN_ERR,
3114                         "start %lu, size %lu, fe_logical %lu",
3115                         (unsigned long) start, (unsigned long) size,
3116                         (unsigned long) ac->ac_o_ex.fe_logical);
3117        }
3118        BUG_ON(start + size <= ac->ac_o_ex.fe_logical &&
3119                        start > ac->ac_o_ex.fe_logical);
3120        BUG_ON(size <= 0 || size > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
3121
3122        /* now prepare goal request */
3123
3124        /* XXX: is it better to align blocks with respect to logical
3125         * placement or to satisfy a big request as is */
3126        ac->ac_g_ex.fe_logical = start;
3127        ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
3128
3129        /* define goal start in order to merge */
3130        if (ar->pright && (ar->lright == (start + size))) {
3131                /* merge to the right */
3132                ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3133                                                &ac->ac_f_ex.fe_group,
3134                                                &ac->ac_f_ex.fe_start);
3135                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3136        }
3137        if (ar->pleft && (ar->lleft + 1 == start)) {
3138                /* merge to the left */
3139                ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3140                                                &ac->ac_f_ex.fe_group,
3141                                                &ac->ac_f_ex.fe_start);
3142                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3143        }
3144
3145        mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3146                (unsigned) orig_size, (unsigned) start);
3147}
3148
3149static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3150{
3151        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3152
3153        if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
3154                atomic_inc(&sbi->s_bal_reqs);
3155                atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
3156                if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
3157                        atomic_inc(&sbi->s_bal_success);
3158                atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3159                if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
3160                                ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
3161                        atomic_inc(&sbi->s_bal_goals);
3162                if (ac->ac_found > sbi->s_mb_max_to_scan)
3163                        atomic_inc(&sbi->s_bal_breaks);
3164        }
3165
3166        if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3167                trace_ext4_mballoc_alloc(ac);
3168        else
3169                trace_ext4_mballoc_prealloc(ac);
3170}
3171
3172/*
3173 * Called on failure; free up any blocks from the inode PA for this
3174 * context.  We don't need this for MB_GROUP_PA because we only change
3175 * pa_free in ext4_mb_release_context(), but on failure, we've already
3176 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
3177 */
3178static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3179{
3180        struct ext4_prealloc_space *pa = ac->ac_pa;
3181
3182        if (pa && pa->pa_type == MB_INODE_PA)
3183                pa->pa_free += ac->ac_b_ex.fe_len;
3184}
3185
3186/*
3187 * use blocks preallocated to inode
3188 */
3189static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3190                                struct ext4_prealloc_space *pa)
3191{
3192        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3193        ext4_fsblk_t start;
3194        ext4_fsblk_t end;
3195        int len;
3196
3197        /* found preallocated blocks, use them */
3198        start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3199        end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
3200                  start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
3201        len = EXT4_NUM_B2C(sbi, end - start);
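        /*
         * Example with hypothetical numbers (cluster ratio 1): a pa with
         * pa_lstart = 100, pa_pstart = 5000, pa_len = 16 and a request at
         * fe_logical = 104 for fe_len = 8 yields start = 5004,
         * end = min(5016, 5012) = 5012 and len = 8 clusters.
         */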
3202        ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3203                                        &ac->ac_b_ex.fe_start);
3204        ac->ac_b_ex.fe_len = len;
3205        ac->ac_status = AC_STATUS_FOUND;
3206        ac->ac_pa = pa;
3207
3208        BUG_ON(start < pa->pa_pstart);
3209        BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
3210        BUG_ON(pa->pa_free < len);
3211        pa->pa_free -= len;
3212
3213        mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3214}
3215
3216/*
3217 * use blocks preallocated to locality group
3218 */
3219static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3220                                struct ext4_prealloc_space *pa)
3221{
3222        unsigned int len = ac->ac_o_ex.fe_len;
3223
3224        ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
3225                                        &ac->ac_b_ex.fe_group,
3226                                        &ac->ac_b_ex.fe_start);
3227        ac->ac_b_ex.fe_len = len;
3228        ac->ac_status = AC_STATUS_FOUND;
3229        ac->ac_pa = pa;
3230
3231        /* we don't correct pa_pstart or pa_len here to avoid a
3232         * possible race when the group is being loaded concurrently;
3233         * instead we correct the pa later, after blocks are marked
3234         * in the on-disk bitmap -- see ext4_mb_release_context().
3235         * Other CPUs are prevented from allocating from this pa by lg_mutex
3236         */
3237        mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3238}
3239
3240/*
3241 * Return the prealloc space that has the minimal distance
3242 * from the goal block. @cpa is the prealloc
3243 * space with the currently known minimal distance
3244 * from the goal block.
3245 */
3246static struct ext4_prealloc_space *
3247ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3248                        struct ext4_prealloc_space *pa,
3249                        struct ext4_prealloc_space *cpa)
3250{
3251        ext4_fsblk_t cur_distance, new_distance;
3252
3253        if (cpa == NULL) {
3254                atomic_inc(&pa->pa_count);
3255                return pa;
3256        }
3257        cur_distance = abs(goal_block - cpa->pa_pstart);
3258        new_distance = abs(goal_block - pa->pa_pstart);
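        /* e.g. (hypothetical): goal_block = 2000, cpa at 2100 and pa at
         * 1950 give cur_distance = 100 and new_distance = 50, so the
         * check below drops cpa and switches to pa */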
3259
3260        if (cur_distance <= new_distance)
3261                return cpa;
3262
3263        /* drop the previous reference */
3264        atomic_dec(&cpa->pa_count);
3265        atomic_inc(&pa->pa_count);
3266        return pa;
3267}
3268
3269/*
3270 * search goal blocks in preallocated space
3271 */
3272static noinline_for_stack int
3273ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3274{
3275        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3276        int order, i;
3277        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3278        struct ext4_locality_group *lg;
3279        struct ext4_prealloc_space *pa, *cpa = NULL;
3280        ext4_fsblk_t goal_block;
3281
3282        /* only data can be preallocated */
3283        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3284                return 0;
3285
3286        /* first, try per-file preallocation */
3287        rcu_read_lock();
3288        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3289
3290                /* all fields in this condition don't change,
3291                 * so we can skip locking for them */
3292                if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3293                    ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
3294                                               EXT4_C2B(sbi, pa->pa_len)))
3295                        continue;
3296
3297                /* non-extent files can't have physical blocks past 2^32 */
3298                if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3299                    (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
3300                     EXT4_MAX_BLOCK_FILE_PHYS))
3301                        continue;
3302
3303                /* found preallocated blocks, use them */
3304                spin_lock(&pa->pa_lock);
3305                if (pa->pa_deleted == 0 && pa->pa_free) {
3306                        atomic_inc(&pa->pa_count);
3307                        ext4_mb_use_inode_pa(ac, pa);
3308                        spin_unlock(&pa->pa_lock);
3309                        ac->ac_criteria = 10;
3310                        rcu_read_unlock();
3311                        return 1;
3312                }
3313                spin_unlock(&pa->pa_lock);
3314        }
3315        rcu_read_unlock();
3316
3317        /* can we use group allocation? */
3318        if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3319                return 0;
3320
3321        /* inode may have no locality group for some reason */
3322        lg = ac->ac_lg;
3323        if (lg == NULL)
3324                return 0;
3325        order  = fls(ac->ac_o_ex.fe_len) - 1;
3326        if (order > PREALLOC_TB_SIZE - 1)
3327                /* The max size of hash table is PREALLOC_TB_SIZE */
3328                order = PREALLOC_TB_SIZE - 1;
3329
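        /* e.g. a request for 13 clusters gives order = fls(13) - 1 = 3,
         * so the lists for orders 3..PREALLOC_TB_SIZE-1 are scanned below */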
3330        goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3331        /*
3332         * search for the prealloc space that has the
3333         * minimal distance from the goal block.
3334         */
3335        for (i = order; i < PREALLOC_TB_SIZE; i++) {
3336                rcu_read_lock();
3337                list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
3338                                        pa_inode_list) {
3339                        spin_lock(&pa->pa_lock);
3340                        if (pa->pa_deleted == 0 &&
3341                                        pa->pa_free >= ac->ac_o_ex.fe_len) {
3342
3343                                cpa = ext4_mb_check_group_pa(goal_block,
3344                                                                pa, cpa);
3345                        }
3346                        spin_unlock(&pa->pa_lock);
3347                }
3348                rcu_read_unlock();
3349        }
3350        if (cpa) {
3351                ext4_mb_use_group_pa(ac, cpa);
3352                ac->ac_criteria = 20;
3353                return 1;
3354        }
3355        return 0;
3356}
3357
3358/*
3359 * The function goes through all blocks freed in the group
3360 * but not yet committed and marks them used in the in-core bitmap.
3361 * The buddy must be generated from this bitmap.
3362 * Must be called with the ext4 group lock held.
3363 */
3364static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3365                                                ext4_group_t group)
3366{
3367        struct rb_node *n;
3368        struct ext4_group_info *grp;
3369        struct ext4_free_data *entry;
3370
3371        grp = ext4_get_group_info(sb, group);
3372        n = rb_first(&(grp->bb_free_root));
3373
3374        while (n) {
3375                entry = rb_entry(n, struct ext4_free_data, efd_node);
3376                ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
3377                n = rb_next(n);
3378        }
3379        return;
3380}
3381
3382/*
3383 * The function goes through all preallocations in this group and marks them
3384 * used in the in-core bitmap. The buddy must be generated from this bitmap.
3385 * Must be called with the ext4 group lock held.
3386 */
3387static noinline_for_stack
3388void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3389                                        ext4_group_t group)
3390{
3391        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3392        struct ext4_prealloc_space *pa;
3393        struct list_head *cur;
3394        ext4_group_t groupnr;
3395        ext4_grpblk_t start;
3396        int preallocated = 0;
3397        int len;
3398
3399        /* All forms of preallocation discard first load the group,
3400         * so the only competing code is preallocation use.
3401         * We don't need any locking here.
3402         * Note that we do NOT skip preallocations with pa_deleted set;
3403         * otherwise we could leave used blocks available for
3404         * allocation in the buddy when a concurrent ext4_mb_put_pa()
3405         * is dropping the preallocation.
3406         */
3407        list_for_each(cur, &grp->bb_prealloc_list) {
3408                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3409                spin_lock(&pa->pa_lock);
3410                ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3411                                             &groupnr, &start);
3412                len = pa->pa_len;
3413                spin_unlock(&pa->pa_lock);
3414                if (unlikely(len == 0))
3415                        continue;
3416                BUG_ON(groupnr != group);
3417                ext4_set_bits(bitmap, start, len);
3418                preallocated += len;
3419        }
3420        mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
3421}
3422
3423static void ext4_mb_pa_callback(struct rcu_head *head)
3424{
3425        struct ext4_prealloc_space *pa;
3426        pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
3427        kmem_cache_free(ext4_pspace_cachep, pa);
3428}
3429
3430/*
3431 * drops a reference to the preallocated space descriptor and
3432 * frees it if this was the last reference and the space is consumed
3433 */
3434static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3435                        struct super_block *sb, struct ext4_prealloc_space *pa)
3436{
3437        ext4_group_t grp;
3438        ext4_fsblk_t grp_blk;
3439
3440        if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0)
3441                return;
3442
3443        /* in this short window concurrent discard can set pa_deleted */
3444        spin_lock(&pa->pa_lock);
3445        if (pa->pa_deleted == 1) {
3446                spin_unlock(&pa->pa_lock);
3447                return;
3448        }
3449
3450        pa->pa_deleted = 1;
3451        spin_unlock(&pa->pa_lock);
3452
3453        grp_blk = pa->pa_pstart;
3454        /*
3455         * If doing group-based preallocation, pa_pstart may be in the
3456         * next group when pa is used up
3457         */
3458        if (pa->pa_type == MB_GROUP_PA)
3459                grp_blk--;
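        /* stepping back one block keeps grp_blk inside the group the
         * pa originally belonged to */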
3460
3461        grp = ext4_get_group_number(sb, grp_blk);
3462
3463        /*
3464         * possible race:
3465         *
3466         *  P1 (buddy init)                     P2 (regular allocation)
3467         *                                      find block B in PA
3468         *  copy on-disk bitmap to buddy
3469         *                                      mark B in on-disk bitmap
3470         *                                      drop PA from group
3471         *  mark all PAs in buddy
3472         *
3473         * thus, P1 initializes buddy with B available. to prevent this
3474         * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
3475         * against that pair
3476         */
3477        ext4_lock_group(sb, grp);
3478        list_del(&pa->pa_group_list);
3479        ext4_unlock_group(sb, grp);
3480
3481        spin_lock(pa->pa_obj_lock);
3482        list_del_rcu(&pa->pa_inode_list);
3483        spin_unlock(pa->pa_obj_lock);
3484
3485        call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3486}
3487
3488/*
3489 * creates new preallocated space for given inode
3490 */
3491static noinline_for_stack int
3492ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3493{
3494        struct super_block *sb = ac->ac_sb;
3495        struct ext4_sb_info *sbi = EXT4_SB(sb);
3496        struct ext4_prealloc_space *pa;
3497        struct ext4_group_info *grp;
3498        struct ext4_inode_info *ei;
3499
3500        /* preallocate only when found space is larger than requested */
3501        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3502        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3503        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3504
3505        pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3506        if (pa == NULL)
3507                return -ENOMEM;
3508
3509        if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
3510                int winl;
3511                int wins;
3512                int win;
3513                int offs;
3514
3515                /* we can't allocate as much as the normalizer wants,
3516                 * so the found space must get a proper lstart
3517                 * to cover the original request */
3518                BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
3519                BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
3520
3521                /* we're limited by the original request in that
3522                 * its logical block must be covered either way;
3523                 * winl is the window we can move our chunk within */
3524                winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3525
3526                /* also, we should cover whole original request */
3527                wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
3528
3529                /* the smallest one defines real window */
3530                win = min(winl, wins);
3531
3532                offs = ac->ac_o_ex.fe_logical %
3533                        EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
3534                if (offs && offs < win)
3535                        win = offs;
3536
3537                ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
3538                        EXT4_NUM_B2C(sbi, win);
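                /*
                 * Worked example of the window above (hypothetical numbers,
                 * cluster ratio 1): o_ex.fe_logical = 100,
                 * g_ex.fe_logical = 96, b_ex.fe_len = 12, o_ex.fe_len = 4
                 * give winl = 4, wins = 8 and offs = 100 % 12 = 4, so win
                 * stays 4 and fe_logical becomes 96.
                 */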
3539                BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3540                BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3541        }
3542
3543        /* preallocation can change ac_b_ex, thus we store actually
3544         * allocated blocks for history */
3545        ac->ac_f_ex = ac->ac_b_ex;
3546
3547        pa->pa_lstart = ac->ac_b_ex.fe_logical;
3548        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3549        pa->pa_len = ac->ac_b_ex.fe_len;
3550        pa->pa_free = pa->pa_len;
3551        atomic_set(&pa->pa_count, 1);
3552        spin_lock_init(&pa->pa_lock);
3553        INIT_LIST_HEAD(&pa->pa_inode_list);
3554        INIT_LIST_HEAD(&pa->pa_group_list);
3555        pa->pa_deleted = 0;
3556        pa->pa_type = MB_INODE_PA;
3557
3558        mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3559                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3560        trace_ext4_mb_new_inode_pa(ac, pa);
3561
3562        ext4_mb_use_inode_pa(ac, pa);
3563        atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
3564
3565        ei = EXT4_I(ac->ac_inode);
3566        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3567
3568        pa->pa_obj_lock = &ei->i_prealloc_lock;
3569        pa->pa_inode = ac->ac_inode;
3570
3571        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3572        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3573        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3574
3575        spin_lock(pa->pa_obj_lock);
3576        list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
3577        spin_unlock(pa->pa_obj_lock);
3578
3579        return 0;
3580}
3581
3582/*
3583 * creates new preallocated space for the locality group this inode belongs to
3584 */
3585static noinline_for_stack int
3586ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3587{
3588        struct super_block *sb = ac->ac_sb;
3589        struct ext4_locality_group *lg;
3590        struct ext4_prealloc_space *pa;
3591        struct ext4_group_info *grp;
3592
3593        /* preallocate only when found space is larger than requested */
3594        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3595        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3596        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3597
3598        BUG_ON(ext4_pspace_cachep == NULL);
3599        pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3600        if (pa == NULL)
3601                return -ENOMEM;
3602
3603        /* preallocation can change ac_b_ex, thus we store actually
3604         * allocated blocks for history */
3605        ac->ac_f_ex = ac->ac_b_ex;
3606
3607        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3608        pa->pa_lstart = pa->pa_pstart;
3609        pa->pa_len = ac->ac_b_ex.fe_len;
3610        pa->pa_free = pa->pa_len;
3611        atomic_set(&pa->pa_count, 1);
3612        spin_lock_init(&pa->pa_lock);
3613        INIT_LIST_HEAD(&pa->pa_inode_list);
3614        INIT_LIST_HEAD(&pa->pa_group_list);
3615        pa->pa_deleted = 0;
3616        pa->pa_type = MB_GROUP_PA;
3617
3618        mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3619                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3620        trace_ext4_mb_new_group_pa(ac, pa);
3621
3622        ext4_mb_use_group_pa(ac, pa);
3623        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3624
3625        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3626        lg = ac->ac_lg;
3627        BUG_ON(lg == NULL);
3628
3629        pa->pa_obj_lock = &lg->lg_prealloc_lock;
3630        pa->pa_inode = NULL;
3631
3632        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3633        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3634        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3635
3636        /*
3637         * We will later add the new pa to the right bucket
3638         * after updating the pa_free in ext4_mb_release_context
3639         */
3640        return 0;
3641}
3642
3643static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3644{
3645        int err;
3646
3647        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3648                err = ext4_mb_new_group_pa(ac);
3649        else
3650                err = ext4_mb_new_inode_pa(ac);
3651        return err;
3652}
3653
3654/*
3655 * finds all unused blocks in on-disk bitmap, frees them in
3656 * in-core bitmap and buddy.
3657 * @pa must be unlinked from inode and group lists, so that
3658 * nobody else can find/use it.
3659 * the caller MUST hold group/inode locks.
3660 * TODO: optimize the case when there are no in-core structures yet
3661 */
3662static noinline_for_stack int
3663ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3664                        struct ext4_prealloc_space *pa)
3665{
3666        struct super_block *sb = e4b->bd_sb;
3667        struct ext4_sb_info *sbi = EXT4_SB(sb);
3668        unsigned int end;
3669        unsigned int next;
3670        ext4_group_t group;
3671        ext4_grpblk_t bit;
3672        unsigned long long grp_blk_start;
3673        int err = 0;
3674        int free = 0;
3675
3676        BUG_ON(pa->pa_deleted == 0);
3677        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3678        grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
3679        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3680        end = bit + pa->pa_len;
3681
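        /*
         * E.g. (hypothetical layout): if the pa covers bits 100..115 and
         * only bits 104..109 are still clear in the on-disk bitmap, the
         * loop below issues a single mb_free_blocks(.., 104, 6) call and
         * free ends up as 6.
         */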
3682        while (bit < end) {
3683                bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3684                if (bit >= end)
3685                        break;
3686                next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3687                mb_debug(1, "    free preallocated %u/%u in group %u\n",
3688                         (unsigned) ext4_group_first_block_no(sb, group) + bit,
3689                         (unsigned) next - bit, (unsigned) group);
3690                free += next - bit;
3691
3692                trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3693                trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
3694                                                    EXT4_C2B(sbi, bit)),
3695                                               next - bit);
3696                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3697                bit = next + 1;
3698        }
3699        if (free != pa->pa_free) {
3700                ext4_msg(e4b->bd_sb, KERN_CRIT,
3701                         "pa %p: logic %lu, phys. %lu, len %lu",
3702                         pa, (unsigned long) pa->pa_lstart,
3703                         (unsigned long) pa->pa_pstart,
3704                         (unsigned long) pa->pa_len);
3705                ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3706                                        free, pa->pa_free);
3707                /*
3708                 * pa is already deleted so we use the value obtained
3709                 * from the bitmap and continue.
3710                 */
3711        }
3712        atomic_add(free, &sbi->s_mb_discarded);
3713
3714        return err;
3715}
3716
3717static noinline_for_stack int
3718ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3719                                struct ext4_prealloc_space *pa)
3720{
3721        struct super_block *sb = e4b->bd_sb;
3722        ext4_group_t group;
3723        ext4_grpblk_t bit;
3724
3725        trace_ext4_mb_release_group_pa(sb, pa);
3726        BUG_ON(pa->pa_deleted == 0);
3727        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3728        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3729        mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3730        atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3731        trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
3732
3733        return 0;
3734}
3735
3736/*
3737 * releases all preallocations in given group
3738 *
3739 * first, we need to decide discard policy:
3740 * - when do we discard
3741 *   1) ENOSPC
3742 * - how many do we discard
3743 *   1) how many requested
3744 */
3745static noinline_for_stack int
3746ext4_mb_discard_group_preallocations(struct super_block *sb,
3747                                        ext4_group_t group, int needed)
3748{
3749        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3750        struct buffer_head *bitmap_bh = NULL;
3751        struct ext4_prealloc_space *pa, *tmp;
3752        struct list_head list;
3753        struct ext4_buddy e4b;
3754        int err;
3755        int busy = 0;
3756        int free = 0;
3757
3758        mb_debug(1, "discard preallocation for group %u\n", group);
3759
3760        if (list_empty(&grp->bb_prealloc_list))
3761                return 0;
3762
3763        bitmap_bh = ext4_read_block_bitmap(sb, group);
3764        if (bitmap_bh == NULL) {
3765                ext4_error(sb, "Error reading block bitmap for %u", group);
3766                return 0;
3767        }
3768
3769        err = ext4_mb_load_buddy(sb, group, &e4b);
3770        if (err) {
3771                ext4_error(sb, "Error loading buddy information for %u", group);
3772                put_bh(bitmap_bh);
3773                return 0;
3774        }
3775
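        /* needed == 0 means "discard as much as possible": raising it past
         * the largest possible free count keeps the retry below going while
         * any PA in the group is still busy */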
3776        if (needed == 0)
3777                needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
3778
3779        INIT_LIST_HEAD(&list);
3780repeat:
3781        ext4_lock_group(sb, group);
3782        list_for_each_entry_safe(pa, tmp,
3783                                &grp->bb_prealloc_list, pa_group_list) {
3784                spin_lock(&pa->pa_lock);
3785                if (atomic_read(&pa->pa_count)) {
3786                        spin_unlock(&pa->pa_lock);
3787                        busy = 1;
3788                        continue;
3789                }
3790                if (pa->pa_deleted) {
3791                        spin_unlock(&pa->pa_lock);
3792                        continue;
3793                }
3794
3795                /* seems this one can be freed ... */
3796                pa->pa_deleted = 1;
3797
3798                /* we can trust pa_free ... */
3799                free += pa->pa_free;
3800
3801                spin_unlock(&pa->pa_lock);
3802
3803                list_del(&pa->pa_group_list);
3804                list_add(&pa->u.pa_tmp_list, &list);
3805        }
3806
3807        /* if we still need more blocks and some PAs were used, try again */
3808        if (free < needed && busy) {
3809                busy = 0;
3810                ext4_unlock_group(sb, group);
3811                cond_resched();
3812                goto repeat;
3813        }
3814
3815        /* found anything to free? */
3816        if (list_empty(&list)) {
3817                BUG_ON(free != 0);
3818                goto out;
3819        }
3820
3821        /* now free all selected PAs */
3822        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3823
3824                /* remove from object (inode or locality group) */
3825                spin_lock(pa->pa_obj_lock);
3826                list_del_rcu(&pa->pa_inode_list);
3827                spin_unlock(pa->pa_obj_lock);
3828
3829                if (pa->pa_type == MB_GROUP_PA)
3830                        ext4_mb_release_group_pa(&e4b, pa);
3831                else
3832                        ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3833
3834                list_del(&pa->u.pa_tmp_list);
3835                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3836        }
3837
3838out:
3839        ext4_unlock_group(sb, group);
3840        ext4_mb_unload_buddy(&e4b);
3841        put_bh(bitmap_bh);
3842        return free;
3843}
3844
3845/*
3846 * releases all unused preallocated blocks for the given inode
3847 *
3848 * It's important to discard preallocations under i_data_sem
3849 * We don't want another block to be served from the prealloc
3850 * space when we are discarding the inode prealloc space.
3851 *
3852 * FIXME!! Make sure it is valid at all the call sites
3853 */
3854void ext4_discard_preallocations(struct inode *inode)
3855{
3856        struct ext4_inode_info *ei = EXT4_I(inode);
3857        struct super_block *sb = inode->i_sb;
3858        struct buffer_head *bitmap_bh = NULL;
3859        struct ext4_prealloc_space *pa, *tmp;
3860        ext4_group_t group = 0;
3861        struct list_head list;
3862        struct ext4_buddy e4b;
3863        int err;
3864
3865        if (!S_ISREG(inode->i_mode)) {
3866                /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3867                return;
3868        }
3869
3870        mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
3871        trace_ext4_discard_preallocations(inode);
3872
3873        INIT_LIST_HEAD(&list);
3874
3875repeat:
3876        /* first, collect all pa's in the inode */
3877        spin_lock(&ei->i_prealloc_lock);
3878        while (!list_empty(&ei->i_prealloc_list)) {
3879                pa = list_entry(ei->i_prealloc_list.next,
3880                                struct ext4_prealloc_space, pa_inode_list);
3881                BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
3882                spin_lock(&pa->pa_lock);
3883                if (atomic_read(&pa->pa_count)) {
3884                        /* this shouldn't happen often - nobody should
3885                         * use preallocation while we're discarding it */
3886                        spin_unlock(&pa->pa_lock);
3887                        spin_unlock(&ei->i_prealloc_lock);
3888                        ext4_msg(sb, KERN_ERR,
3889                                 "uh-oh! used pa while discarding");
3890                        WARN_ON(1);
3891                        schedule_timeout_uninterruptible(HZ);
3892                        goto repeat;
3893
3894                }
3895                if (pa->pa_deleted == 0) {
3896                        pa->pa_deleted = 1;
3897                        spin_unlock(&pa->pa_lock);
3898                        list_del_rcu(&pa->pa_inode_list);
3899                        list_add(&pa->u.pa_tmp_list, &list);
3900                        continue;
3901                }
3902
3903                /* someone is deleting pa right now */
3904                spin_unlock(&pa->pa_lock);
3905                spin_unlock(&ei->i_prealloc_lock);
3906
3907                /* we have to wait here because pa_deleted
3908                 * doesn't mean the pa is already unlinked from
3909                 * the list. As we might be called from
3910                 * ->clear_inode(), the inode will get freed
3911                 * and a concurrent thread which is unlinking
3912                 * the pa from the inode's list may access already
3913                 * freed memory, bad-bad-bad */
3914
3915                /* XXX: if this happens too often, we can
3916                 * add a flag to force wait only in case
3917                 * of ->clear_inode(), but not in case of
3918                 * regular truncate */
3919                schedule_timeout_uninterruptible(HZ);
3920                goto repeat;
3921        }
3922        spin_unlock(&ei->i_prealloc_lock);
3923
3924        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3925                BUG_ON(pa->pa_type != MB_INODE_PA);
3926                group = ext4_get_group_number(sb, pa->pa_pstart);
3927
3928                err = ext4_mb_load_buddy(sb, group, &e4b);
3929                if (err) {
3930                        ext4_error(sb, "Error loading buddy information for %u",
3931                                        group);
3932                        continue;
3933                }
3934
3935                bitmap_bh = ext4_read_block_bitmap(sb, group);
3936                if (bitmap_bh == NULL) {
3937                        ext4_error(sb, "Error reading block bitmap for %u",
3938                                        group);
3939                        ext4_mb_unload_buddy(&e4b);
3940                        continue;
3941                }
3942
3943                ext4_lock_group(sb, group);
3944                list_del(&pa->pa_group_list);
3945                ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3946                ext4_unlock_group(sb, group);
3947
3948                ext4_mb_unload_buddy(&e4b);
3949                put_bh(bitmap_bh);
3950
3951                list_del(&pa->u.pa_tmp_list);
3952                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3953        }
3954}
3955
3956#ifdef CONFIG_EXT4_DEBUG
3957static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
3958{
3959        struct super_block *sb = ac->ac_sb;
3960        ext4_group_t ngroups, i;
3961
3962        if (!ext4_mballoc_debug ||
3963            (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
3964                return;
3965
3966        ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
3967                        " Allocation context details:");
3968        ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
3969                        ac->ac_status, ac->ac_flags);
3970        ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
3971                        "goal %lu/%lu/%lu@%lu, "
3972                        "best %lu/%lu/%lu@%lu cr %d",
3973                        (unsigned long)ac->ac_o_ex.fe_group,
3974                        (unsigned long)ac->ac_o_ex.fe_start,
3975                        (unsigned long)ac->ac_o_ex.fe_len,
3976                        (unsigned long)ac->ac_o_ex.fe_logical,
3977                        (unsigned long)ac->ac_g_ex.fe_group,
3978                        (unsigned long)ac->ac_g_ex.fe_start,
3979                        (unsigned long)ac->ac_g_ex.fe_len,
3980                        (unsigned long)ac->ac_g_ex.fe_logical,
3981                        (unsigned long)ac->ac_b_ex.fe_group,
3982                        (unsigned long)ac->ac_b_ex.fe_start,
3983                        (unsigned long)ac->ac_b_ex.fe_len,
3984                        (unsigned long)ac->ac_b_ex.fe_logical,
3985                        (int)ac->ac_criteria);
3986        ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found",
3987                 ac->ac_ex_scanned, ac->ac_found);
3988        ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
3989        ngroups = ext4_get_groups_count(sb);
3990        for (i = 0; i < ngroups; i++) {
3991                struct ext4_group_info *grp = ext4_get_group_info(sb, i);
3992                struct ext4_prealloc_space *pa;
3993                ext4_grpblk_t start;
3994                struct list_head *cur;
3995                ext4_lock_group(sb, i);
3996                list_for_each(cur, &grp->bb_prealloc_list) {
3997                        pa = list_entry(cur, struct ext4_prealloc_space,
3998                                        pa_group_list);
3999                        spin_lock(&pa->pa_lock);
4000                        ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4001                                                     NULL, &start);
4002                        spin_unlock(&pa->pa_lock);
4003                        printk(KERN_ERR "PA:%u:%d:%u \n", i,
4004                               start, pa->pa_len);
4005                }
4006                ext4_unlock_group(sb, i);
4007
4008                if (grp->bb_free == 0)
4009                        continue;
4010                printk(KERN_ERR "%u: %d/%d \n",
4011                       i, grp->bb_free, grp->bb_fragments);
4012        }
4013        printk(KERN_ERR "\n");
4014}
4015#else
4016static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4017{
4018        return;
4019}
4020#endif
4021
4022/*
4023 * We use locality group preallocation for small files. The size of the
4024 * file is determined by the current size or the resulting size after
4025 * allocation, whichever is larger
4026 *
4027 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
4028 */
4029static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4030{
4031        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4032        int bsbits = ac->ac_sb->s_blocksize_bits;
4033        loff_t size, isize;
4034
4035        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4036                return;
4037
4038        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4039                return;
4040
4041        size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
4042        isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4043                >> bsbits;
4044
4045        if ((size == isize) &&
4046            !ext4_fs_is_busy(sbi) &&
4047            (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
4048                ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
4049                return;
4050        }
4051
4052        if (sbi->s_mb_group_prealloc <= 0) {
4053                ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4054                return;
4055        }
4056
4057        /* don't use group allocation for large files */
4058        size = max(size, isize);
4059        if (size > sbi->s_mb_stream_request) {
4060                ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4061                return;
4062        }
4063
4064        BUG_ON(ac->ac_lg != NULL);
4065        /*
4066         * locality group prealloc space is per-CPU. The reason for having
4067         * a per-CPU locality group is to reduce the contention between block
4068         * requests from multiple CPUs.
4069         */
4070        ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups);
4071
4072        /* we're going to use group allocation */
4073        ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
4074
4075        /* serialize all allocations in the group */
4076        mutex_lock(&ac->ac_lg->lg_mutex);
4077}
4078
4079static noinline_for_stack int
4080ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4081                                struct ext4_allocation_request *ar)
4082{
4083        struct super_block *sb = ar->inode->i_sb;
4084        struct ext4_sb_info *sbi = EXT4_SB(sb);
4085        struct ext4_super_block *es = sbi->s_es;
4086        ext4_group_t group;
4087        unsigned int len;
4088        ext4_fsblk_t goal;
4089        ext4_grpblk_t block;
4090
4091        /* we can't allocate > group size */
4092        len = ar->len;
4093
4094        /* just a dirty hack to filter too big requests  */
4095        if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
4096                len = EXT4_CLUSTERS_PER_GROUP(sb);
4097
4098        /* start searching from the goal */
4099        goal = ar->goal;
4100        if (goal < le32_to_cpu(es->s_first_data_block) ||
4101                        goal >= ext4_blocks_count(es))
4102                goal = le32_to_cpu(es->s_first_data_block);
4103        ext4_get_group_no_and_offset(sb, goal, &group, &block);
4104
4105        /* set up allocation goals */
4106        ac->ac_b_ex.fe_logical = ar->logical & ~(sbi->s_cluster_ratio - 1);
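        /* e.g. with a (hypothetical) bigalloc cluster ratio of 16, logical
         * block 37 is rounded down to 32, the start of its cluster */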
4107        ac->ac_status = AC_STATUS_CONTINUE;
4108        ac->ac_sb = sb;
4109        ac->ac_inode = ar->inode;
4110        ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
4111        ac->ac_o_ex.fe_group = group;
4112        ac->ac_o_ex.fe_start = block;
4113        ac->ac_o_ex.fe_len = len;
4114        ac->ac_g_ex = ac->ac_o_ex;
4115        ac->ac_flags = ar->flags;
4116
4117        /* we have to define the context: will we work with a file or
4118         * a locality group. This is a policy decision, actually */
4119        ext4_mb_group_or_file(ac);
4120
4121        mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4122                        "left: %u/%u, right %u/%u to %swritable\n",
4123                        (unsigned) ar->len, (unsigned) ar->logical,
4124                        (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
4125                        (unsigned) ar->lleft, (unsigned) ar->pleft,
4126                        (unsigned) ar->lright, (unsigned) ar->pright,
4127                        atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4128        return 0;
4129
4130}
4131
4132static noinline_for_stack void
4133ext4_mb_discard_lg_preallocations(struct super_block *sb,
4134                                        struct ext4_locality_group *lg,
4135                                        int order, int total_entries)
4136{
4137        ext4_group_t group = 0;
4138        struct ext4_buddy e4b;
4139        struct list_head discard_list;
4140        struct ext4_prealloc_space *pa, *tmp;
4141
4142        mb_debug(1, "discard locality group preallocation\n");
4143
4144        INIT_LIST_HEAD(&discard_list);
4145
4146        spin_lock(&lg->lg_prealloc_lock);
4147        list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
4148                                                pa_inode_list) {
4149                spin_lock(&pa->pa_lock);
4150                if (atomic_read(&pa->pa_count)) {
4151                        /*
4152                         * This is the pa that we just used
4153                         * for block allocation. So don't
4154                         * free that
4155                         */
4156                        spin_unlock(&pa->pa_lock);
4157                        continue;
4158                }
4159                if (pa->pa_deleted) {
4160                        spin_unlock(&pa->pa_lock);
4161                        continue;
4162                }
4163                /* only lg prealloc space */
4164                BUG_ON(pa->pa_type != MB_GROUP_PA);
4165
4166                /* seems this one can be freed ... */
4167                pa->pa_deleted = 1;
4168                spin_unlock(&pa->pa_lock);
4169
4170                list_del_rcu(&pa->pa_inode_list);
4171                list_add(&pa->u.pa_tmp_list, &discard_list);
4172
4173                total_entries--;
4174                if (total_entries <= 5) {
4175                        /*
4176                         * we want to keep only 5 entries,
4177                         * allowing it to grow to 8. This
4178                         * makes sure we don't call discard
4179                         * too soon for this list.
4180                         */
4181                        break;
4182                }
4183        }
4184        spin_unlock(&lg->lg_prealloc_lock);
4185
4186        list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
4187
4188                group = ext4_get_group_number(sb, pa->pa_pstart);
4189                if (ext4_mb_load_buddy(sb, group, &e4b)) {
4190                        ext4_error(sb, "Error loading buddy information for %u",
4191                                        group);
4192                        continue;
4193                }
4194                ext4_lock_group(sb, group);
4195                list_del(&pa->pa_group_list);
4196                ext4_mb_release_group_pa(&e4b, pa);
4197                ext4_unlock_group(sb, group);
4198
4199                ext4_mb_unload_buddy(&e4b);
4200                list_del(&pa->u.pa_tmp_list);
4201                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4202        }
4203}
4204
4205/*
4206 * We have incremented pa_count. So it cannot be freed at this
4207 * point. Also we hold lg_mutex. So no parallel allocation is
4208 * possible from this lg. That means pa_free cannot be updated.
4209 *
4210 * A parallel ext4_mb_discard_group_preallocations() is possible, however,
4211 * which can cause the lg_prealloc_list to be updated.
4212 */
4213
4214static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4215{
4216        int order, added = 0, lg_prealloc_count = 1;
4217        struct super_block *sb = ac->ac_sb;
4218        struct ext4_locality_group *lg = ac->ac_lg;
4219        struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
4220
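            /*
             * Pick the lg_prealloc_list bucket from the size of the
             * remaining preallocation: order is the log2 bucket index,
             * e.g. pa_free == 12 gives fls(12) - 1 == 3, capped at
             * PREALLOC_TB_SIZE - 1.
             */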
4221        order = fls(pa->pa_free) - 1;
4222        if (order > PREALLOC_TB_SIZE - 1)
4223                /* The max size of hash table is PREALLOC_TB_SIZE */
4224                order = PREALLOC_TB_SIZE - 1;
4225        /* Add the prealloc space to lg */
4226        spin_lock(&lg->lg_prealloc_lock);
4227        list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
4228                                                pa_inode_list) {
4229                spin_lock(&tmp_pa->pa_lock);
4230                if (tmp_pa->pa_deleted) {
4231                        spin_unlock(&tmp_pa->pa_lock);
4232                        continue;
4233                }
4234                if (!added && pa->pa_free < tmp_pa->pa_free) {
4235                        /* Add to the tail of the previous entry */
4236                        list_add_tail_rcu(&pa->pa_inode_list,
4237                                                &tmp_pa->pa_inode_list);
4238                        added = 1;
4239                        /*
4240                         * we want to count the total
4241                         * number of entries in the list
4242                         */
4243                }
4244                spin_unlock(&tmp_pa->pa_lock);
4245                lg_prealloc_count++;
4246        }
4247        if (!added)
4248                list_add_tail_rcu(&pa->pa_inode_list,
4249                                        &lg->lg_prealloc_list[order]);
4250        spin_unlock(&lg->lg_prealloc_lock);
4251
4252        /* Now trim the list to be not more than 8 elements */
4253        if (lg_prealloc_count > 8) {
4254                ext4_mb_discard_lg_preallocations(sb, lg,
4255                                                  order, lg_prealloc_count);
4256                return;
4257        }
4258        return;
4259}
4260
4261/*
4262 * release all the resources we used in the allocation
4263 */
4264static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4265{
4266        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4267        struct ext4_prealloc_space *pa = ac->ac_pa;
4268        if (pa) {
4269                if (pa->pa_type == MB_GROUP_PA) {
4270                        /* see comment in ext4_mb_use_group_pa() */
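                            /*
                             * The group PA acts as a moving window over the
                             * preallocated clusters: slide its start (in
                             * blocks) past the chunk we just consumed and
                             * shrink pa_free/pa_len (in clusters) by the
                             * length of that chunk.
                             */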
4271                        spin_lock(&pa->pa_lock);
4272                        pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4273                        pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4274                        pa->pa_free -= ac->ac_b_ex.fe_len;
4275                        pa->pa_len -= ac->ac_b_ex.fe_len;
4276                        spin_unlock(&pa->pa_lock);
4277                }
4278        }
4279        if (pa) {
4280                /*
4281                 * We want to add the pa to the right bucket.
4282                 * Remove it from the list and, while adding it
4283                 * back, make sure the list to which we are adding
4284                 * doesn't grow too big.
4285                 */
4286                if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4287                        spin_lock(pa->pa_obj_lock);
4288                        list_del_rcu(&pa->pa_inode_list);
4289                        spin_unlock(pa->pa_obj_lock);
4290                        ext4_mb_add_n_trim(ac);
4291                }
4292                ext4_mb_put_pa(ac, ac->ac_sb, pa);
4293        }
4294        if (ac->ac_bitmap_page)
4295                page_cache_release(ac->ac_bitmap_page);
4296        if (ac->ac_buddy_page)
4297                page_cache_release(ac->ac_buddy_page);
4298        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4299                mutex_unlock(&ac->ac_lg->lg_mutex);
4300        ext4_mb_collect_stats(ac);
4301        return 0;
4302}
4303
4304static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4305{
4306        ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4307        int ret;
4308        int freed = 0;
4309
4310        trace_ext4_mb_discard_preallocations(sb, needed);
4311        for (i = 0; i < ngroups && needed > 0; i++) {
4312                ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4313                freed += ret;
4314                needed -= ret;
4315        }
4316
4317        return freed;
4318}
4319
4320/*
4321 * Main entry point into mballoc to allocate blocks
4322 * it tries to use preallocation first, then falls back
4323 * to usual allocation
4324 */
4325ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4326                                struct ext4_allocation_request *ar, int *errp)
4327{
4328        int freed;
4329        struct ext4_allocation_context *ac = NULL;
4330        struct ext4_sb_info *sbi;
4331        struct super_block *sb;
4332        ext4_fsblk_t block = 0;
4333        unsigned int inquota = 0;
4334        unsigned int reserv_clstrs = 0;
4335
4336        might_sleep();
4337        sb = ar->inode->i_sb;
4338        sbi = EXT4_SB(sb);
4339
4340        trace_ext4_request_blocks(ar);
4341
4342        /* Allow to use superuser reservation for quota file */
4343        if (IS_NOQUOTA(ar->inode))
4344                ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
4345
4346        /*
4347         * For delayed allocation, we could skip the ENOSPC and
4348         * EDQUOT check, as blocks and quotas have already been
4349         * reserved when the data was copied into the pagecache.
4350         */
4351        if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED))
4352                ar->flags |= EXT4_MB_DELALLOC_RESERVED;
4353        else {
4354                /* Without delayed allocation we need to verify
4355                 * there are enough free blocks to do the block allocation
4356                 * and that the allocation doesn't exceed the quota limits.
4357                 */
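                    /*
                     * If the full request cannot be reserved, keep
                     * halving it (e.g. 256 -> 128 -> 64 clusters) until
                     * ext4_claim_free_clusters() succeeds or the length
                     * drops to zero, in which case we return -ENOSPC.
                     */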
4358                while (ar->len &&
4359                        ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
4360
4361                        /* let others free up some space */
4362                        cond_resched();
4363                        ar->len = ar->len >> 1;
4364                }
4365                if (!ar->len) {
4366                        *errp = -ENOSPC;
4367                        return 0;
4368                }
4369                reserv_clstrs = ar->len;
4370                if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4371                        dquot_alloc_block_nofail(ar->inode,
4372                                                 EXT4_C2B(sbi, ar->len));
4373                } else {
4374                        while (ar->len &&
4375                                dquot_alloc_block(ar->inode,
4376                                                  EXT4_C2B(sbi, ar->len))) {
4377
4378                                ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4379                                ar->len--;
4380                        }
4381                }
4382                inquota = ar->len;
4383                if (ar->len == 0) {
4384                        *errp = -EDQUOT;
4385                        goto out;
4386                }
4387        }
4388
4389        ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
4390        if (!ac) {
4391                ar->len = 0;
4392                *errp = -ENOMEM;
4393                goto out;
4394        }
4395
4396        *errp = ext4_mb_initialize_context(ac, ar);
4397        if (*errp) {
4398                ar->len = 0;
4399                goto out;
4400        }
4401
4402        ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
4403        if (!ext4_mb_use_preallocated(ac)) {
4404                ac->ac_op = EXT4_MB_HISTORY_ALLOC;
4405                ext4_mb_normalize_request(ac, ar);
4406repeat:
4407                /* allocate space in core */
4408                *errp = ext4_mb_regular_allocator(ac);
4409                if (*errp)
4410                        goto discard_and_exit;
4411
4412                /* as we've just preallocated more space than the
4413                 * user originally requested, we store the allocated
4414                 * space in a special descriptor */
4415                if (ac->ac_status == AC_STATUS_FOUND &&
4416                    ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4417                        *errp = ext4_mb_new_preallocation(ac);
4418                if (*errp) {
4419                discard_and_exit:
4420                        ext4_discard_allocated_blocks(ac);
4421                        goto errout;
4422                }
4423        }
4424        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4425                *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
4426                if (*errp == -EAGAIN) {
4427                        /*
4428                         * drop the reference that we took
4429                         * in ext4_mb_use_best_found
4430                         */
4431                        ext4_mb_release_context(ac);
4432                        ac->ac_b_ex.fe_group = 0;
4433                        ac->ac_b_ex.fe_start = 0;
4434                        ac->ac_b_ex.fe_len = 0;
4435                        ac->ac_status = AC_STATUS_CONTINUE;
4436                        goto repeat;
4437                } else if (*errp) {
4438                        ext4_discard_allocated_blocks(ac);
4439                        goto errout;
4440                } else {
4441                        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4442                        ar->len = ac->ac_b_ex.fe_len;
4443                }
4444        } else {
4445                freed  = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
4446                if (freed)
4447                        goto repeat;
4448                *errp = -ENOSPC;
4449        }
4450
4451errout:
4452        if (*errp) {
4453                ac->ac_b_ex.fe_len = 0;
4454                ar->len = 0;
4455                ext4_mb_show_ac(ac);
4456        }
4457        ext4_mb_release_context(ac);
4458out:
4459        if (ac)
4460                kmem_cache_free(ext4_ac_cachep, ac);
4461        if (inquota && ar->len < inquota)
4462                dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
4463        if (!ar->len) {
4464                if (!ext4_test_inode_state(ar->inode,
4465                                           EXT4_STATE_DELALLOC_RESERVED))
4466                        /* release all the reserved blocks if non-delalloc */
4467                        percpu_counter_sub(&sbi->s_dirtyclusters_counter,
4468                                                reserv_clstrs);
4469        }
4470
4471        trace_ext4_allocate_blocks(ar, (unsigned long long)block);
4472
4473        return block;
4474}
4475
4476/*
4477 * We can merge two free data extents only if the physical blocks
4478 * are contiguous, AND the extents were freed by the same transaction,
4479 * AND the blocks are associated with the same group.
4480 */
4481static int can_merge(struct ext4_free_data *entry1,
4482                        struct ext4_free_data *entry2)
4483{
4484        if ((entry1->efd_tid == entry2->efd_tid) &&
4485            (entry1->efd_group == entry2->efd_group) &&
4486            ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
4487                return 1;
4488        return 0;
4489}
4490
4491static noinline_for_stack int
4492ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4493                      struct ext4_free_data *new_entry)
4494{
4495        ext4_group_t group = e4b->bd_group;
4496        ext4_grpblk_t cluster;
4497        struct ext4_free_data *entry;
4498        struct ext4_group_info *db = e4b->bd_info;
4499        struct super_block *sb = e4b->bd_sb;
4500        struct ext4_sb_info *sbi = EXT4_SB(sb);
4501        struct rb_node **n = &db->bb_free_root.rb_node, *node;
4502        struct rb_node *parent = NULL, *new_node;
4503
4504        BUG_ON(!ext4_handle_valid(handle));
4505        BUG_ON(e4b->bd_bitmap_page == NULL);
4506        BUG_ON(e4b->bd_buddy_page == NULL);
4507
4508        new_node = &new_entry->efd_node;
4509        cluster = new_entry->efd_start_cluster;
4510
4511        if (!*n) {
4512                /* first free block extent. We need to
4513                 * protect the buddy cache from being freed,
4514                 * otherwise we'll refresh it from the
4515                 * on-disk bitmap and lose not-yet-available
4516                 * blocks */
4517                page_cache_get(e4b->bd_buddy_page);
4518                page_cache_get(e4b->bd_bitmap_page);
4519        }
4520        while (*n) {
4521                parent = *n;
4522                entry = rb_entry(parent, struct ext4_free_data, efd_node);
4523                if (cluster < entry->efd_start_cluster)
4524                        n = &(*n)->rb_left;
4525                else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
4526                        n = &(*n)->rb_right;
4527                else {
4528                        ext4_grp_locked_error(sb, group, 0,
4529                                ext4_group_first_block_no(sb, group) +
4530                                EXT4_C2B(sbi, cluster),
4531                                "Block already on to-be-freed list");
4532                        return 0;
4533                }
4534        }
4535
4536        rb_link_node(new_node, parent, n);
4537        rb_insert_color(new_node, &db->bb_free_root);
4538
4539        /* Now see if the extent can be merged with its left and right neighbors */
4540        node = rb_prev(new_node);
4541        if (node) {
4542                entry = rb_entry(node, struct ext4_free_data, efd_node);
4543                if (can_merge(entry, new_entry) &&
4544                    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4545                        new_entry->efd_start_cluster = entry->efd_start_cluster;
4546                        new_entry->efd_count += entry->efd_count;
4547                        rb_erase(node, &(db->bb_free_root));
4548                        kmem_cache_free(ext4_free_data_cachep, entry);
4549                }
4550        }
4551
4552        node = rb_next(new_node);
4553        if (node) {
4554                entry = rb_entry(node, struct ext4_free_data, efd_node);
4555                if (can_merge(new_entry, entry) &&
4556                    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4557                        new_entry->efd_count += entry->efd_count;
4558                        rb_erase(node, &(db->bb_free_root));
4559                        kmem_cache_free(ext4_free_data_cachep, entry);
4560                }
4561        }
4562        /* Add the extent to transaction's private list */
4563        ext4_journal_callback_add(handle, ext4_free_data_callback,
4564                                  &new_entry->efd_jce);
4565        return 0;
4566}
4567
4568/**
4569 * ext4_free_blocks() -- Free given blocks and update quota
4570 * @handle:             handle for this transaction
4571 * @inode:              inode
4572 * @block:              start physical block to free
4573 * @count:              number of blocks to free
4574 * @flags:              flags used by ext4_free_blocks
4575 */
4576void ext4_free_blocks(handle_t *handle, struct inode *inode,
4577                      struct buffer_head *bh, ext4_fsblk_t block,
4578                      unsigned long count, int flags)
4579{
4580        struct buffer_head *bitmap_bh = NULL;
4581        struct super_block *sb = inode->i_sb;
4582        struct ext4_group_desc *gdp;
4583        unsigned int overflow;
4584        ext4_grpblk_t bit;
4585        struct buffer_head *gd_bh;
4586        ext4_group_t block_group;
4587        struct ext4_sb_info *sbi;
4588        struct ext4_buddy e4b;
4589        unsigned int count_clusters;
4590        int err = 0;
4591        int ret;
4592
4593        might_sleep();
4594        if (bh) {
4595                if (block)
4596                        BUG_ON(block != bh->b_blocknr);
4597                else
4598                        block = bh->b_blocknr;
4599        }
4600
4601        sbi = EXT4_SB(sb);
4602        if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4603            !ext4_data_block_valid(sbi, block, count)) {
4604                ext4_error(sb, "Freeing blocks not in datazone - "
4605                           "block = %llu, count = %lu", block, count);
4606                goto error_return;
4607        }
4608
4609        ext4_debug("freeing block %llu\n", block);
4610        trace_ext4_free_blocks(inode, block, count, flags);
4611
4612        if (flags & EXT4_FREE_BLOCKS_FORGET) {
4613                struct buffer_head *tbh = bh;
4614                int i;
4615
4616                BUG_ON(bh && (count > 1));
4617
4618                for (i = 0; i < count; i++) {
4619                        cond_resched();
4620                        if (!bh)
4621                                tbh = sb_find_get_block(inode->i_sb,
4622                                                        block + i);
4623                        if (!tbh)
4624                                continue;
4625                        ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4626                                    inode, tbh, block + i);
4627                }
4628        }
4629
4630        /*
4631         * We need to make sure we don't reuse the freed block until
4632         * after the transaction is committed, which we can do by
4633         * treating the block as metadata, below.  We make an
4634         * exception if the inode is to be written in writeback mode
4635         * since writeback mode has weak data consistency guarantees.
4636         */
4637        if (!ext4_should_writeback_data(inode))
4638                flags |= EXT4_FREE_BLOCKS_METADATA;
4639
4640        /*
4641         * If the extent to be freed does not begin on a cluster
4642         * boundary, we need to deal with partial clusters at the
4643         * beginning and end of the extent.  Normally we will free
4644         * blocks at the beginning or the end unless we are explicitly
4645         * requested to avoid doing so.
4646         */
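            /*
             * For example, with a cluster ratio of 16, freeing blocks
             * 20..50 (count 31) normally rounds the start down to block
             * 16 and the tail up to block 63, freeing clusters 1-3 in
             * full; with EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER the start
             * is instead advanced to block 32, the next cluster boundary.
             */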
4647        overflow = block & (sbi->s_cluster_ratio - 1);
4648        if (overflow) {
4649                if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
4650                        overflow = sbi->s_cluster_ratio - overflow;
4651                        block += overflow;
4652                        if (count > overflow)
4653                                count -= overflow;
4654                        else
4655                                return;
4656                } else {
4657                        block -= overflow;
4658                        count += overflow;
4659                }
4660        }
4661        overflow = count & (sbi->s_cluster_ratio - 1);
4662        if (overflow) {
4663                if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
4664                        if (count > overflow)
4665                                count -= overflow;
4666                        else
4667                                return;
4668                } else
4669                        count += sbi->s_cluster_ratio - overflow;
4670        }
4671
4672do_more:
4673        overflow = 0;
4674        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4675
4676        /*
4677         * Check to see if we are freeing blocks across a group
4678         * boundary.
4679         */
4680        if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4681                overflow = EXT4_C2B(sbi, bit) + count -
4682                        EXT4_BLOCKS_PER_GROUP(sb);
4683                count -= overflow;
4684        }
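            /*
             * Any 'overflow' blocks that spill into the next group are
             * handled by jumping back to do_more once this group's
             * portion has been freed (see the end of this function).
             */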
4685        count_clusters = EXT4_NUM_B2C(sbi, count);
4686        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4687        if (!bitmap_bh) {
4688                err = -EIO;
4689                goto error_return;
4690        }
4691        gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
4692        if (!gdp) {
4693                err = -EIO;
4694                goto error_return;
4695        }
4696
4697        if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4698            in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4699            in_range(block, ext4_inode_table(sb, gdp),
4700                     EXT4_SB(sb)->s_itb_per_group) ||
4701            in_range(block + count - 1, ext4_inode_table(sb, gdp),
4702                     EXT4_SB(sb)->s_itb_per_group)) {
4703
4704                ext4_error(sb, "Freeing blocks in system zone - "
4705                           "Block = %llu, count = %lu", block, count);
4706                /* err = 0. ext4_std_error should be a no op */
4707                goto error_return;
4708        }
4709
4710        BUFFER_TRACE(bitmap_bh, "getting write access");
4711        err = ext4_journal_get_write_access(handle, bitmap_bh);
4712        if (err)
4713                goto error_return;
4714
4715        /*
4716         * We are about to modify some metadata.  Call the journal APIs
4717         * to unshare ->b_data if a currently-committing transaction is
4718         * using it
4719         */
4720        BUFFER_TRACE(gd_bh, "get_write_access");
4721        err = ext4_journal_get_write_access(handle, gd_bh);
4722        if (err)
4723                goto error_return;
4724#ifdef AGGRESSIVE_CHECK
4725        {
4726                int i;
4727                for (i = 0; i < count_clusters; i++)
4728                        BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4729        }
4730#endif
4731        trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
4732
4733        err = ext4_mb_load_buddy(sb, block_group, &e4b);
4734        if (err)
4735                goto error_return;
4736
4737        if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
4738                struct ext4_free_data *new_entry;
4739                /*
4740                 * Blocks being freed are metadata. These blocks shouldn't
4741                 * be used until this transaction is committed.
4742                 */
4743        retry:
4744                new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
4745                if (!new_entry) {
4746                        /*
4747                         * We use a retry loop because
4748                         * ext4_free_blocks() is not allowed to fail.
4749                         */
4750                        cond_resched();
4751                        congestion_wait(BLK_RW_ASYNC, HZ/50);
4752                        goto retry;
4753                }
4754                new_entry->efd_start_cluster = bit;
4755                new_entry->efd_group = block_group;
4756                new_entry->efd_count = count_clusters;
4757                new_entry->efd_tid = handle->h_transaction->t_tid;
4758
4759                ext4_lock_group(sb, block_group);
4760                mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4761                ext4_mb_free_metadata(handle, &e4b, new_entry);
4762        } else {
4763                /* need to update group_info->bb_free and the bitmap
4764                 * with the group lock held; generate_buddy looks at
4765                 * them with the group lock held
4766                 */
4767                if (test_opt(sb, DISCARD)) {
4768                        err = ext4_issue_discard(sb, block_group, bit, count);
4769                        if (err && err != -EOPNOTSUPP)
4770                                ext4_msg(sb, KERN_WARNING, "discard request in"
4771                                         " group:%d block:%d count:%lu failed"
4772                                         " with %d", block_group, bit, count,
4773                                         err);
4774                }
4775
4776
4777                ext4_lock_group(sb, block_group);
4778                mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4779                mb_free_blocks(inode, &e4b, bit, count_clusters);
4780        }
4781
4782        ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
4783        ext4_free_group_clusters_set(sb, gdp, ret);
4784        ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
4785        ext4_group_desc_csum_set(sb, block_group, gdp);
4786        ext4_unlock_group(sb, block_group);
4787        percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4788
4789        if (sbi->s_log_groups_per_flex) {
4790                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4791                atomic64_add(count_clusters,
4792                             &sbi->s_flex_groups[flex_group].free_clusters);
4793        }
4794
4795        ext4_mb_unload_buddy(&e4b);
4796
4797        if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4798                dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4799
4800        /* We dirtied the bitmap block */
4801        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4802        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4803
4804        /* And the group descriptor block */
4805        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4806        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4807        if (!err)
4808                err = ret;
4809
4810        if (overflow && !err) {
4811                block += count;
4812                count = overflow;
4813                put_bh(bitmap_bh);
4814                goto do_more;
4815        }
4816error_return:
4817        brelse(bitmap_bh);
4818        ext4_std_error(sb, err);
4819        return;
4820}
4821
4822/**
4823 * ext4_group_add_blocks() -- Add given blocks to an existing group
4824 * @handle:                     handle to this transaction
4825 * @sb:                         super block
4826 * @block:                      start physical block to add to the block group
4827 * @count:                      number of blocks to add (mark as free)
4828 *
4829 * This marks the blocks as free in the bitmap and buddy.
4830 */
4831int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4832                         ext4_fsblk_t block, unsigned long count)
4833{
4834        struct buffer_head *bitmap_bh = NULL;
4835        struct buffer_head *gd_bh;
4836        ext4_group_t block_group;
4837        ext4_grpblk_t bit;
4838        unsigned int i;
4839        struct ext4_group_desc *desc;
4840        struct ext4_sb_info *sbi = EXT4_SB(sb);
4841        struct ext4_buddy e4b;
4842        int err = 0, ret, blk_free_count;
4843        ext4_grpblk_t blocks_freed;
4844
4845        ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4846
4847        if (count == 0)
4848                return 0;
4849
4850        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4851        /*
4852         * Check to see if we are freeing blocks across a group
4853         * boundary.
4854         */
4855        if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4856                ext4_warning(sb, "too many blocks added to group %u\n",
4857                             block_group);
4858                err = -EINVAL;
4859                goto error_return;
4860        }
4861
4862        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4863        if (!bitmap_bh) {
4864                err = -EIO;
4865                goto error_return;
4866        }
4867
4868        desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4869        if (!desc) {
4870                err = -EIO;
4871                goto error_return;
4872        }
4873
4874        if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4875            in_range(ext4_inode_bitmap(sb, desc), block, count) ||
4876            in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
4877            in_range(block + count - 1, ext4_inode_table(sb, desc),
4878                     sbi->s_itb_per_group)) {
4879                ext4_error(sb, "Adding blocks in system zones - "
4880                           "Block = %llu, count = %lu",
4881                           block, count);
4882                err = -EINVAL;
4883                goto error_return;
4884        }
4885
4886        BUFFER_TRACE(bitmap_bh, "getting write access");
4887        err = ext4_journal_get_write_access(handle, bitmap_bh);
4888        if (err)
4889                goto error_return;
4890
4891        /*
4892         * We are about to modify some metadata.  Call the journal APIs
4893         * to unshare ->b_data if a currently-committing transaction is
4894         * using it
4895         */
4896        BUFFER_TRACE(gd_bh, "get_write_access");
4897        err = ext4_journal_get_write_access(handle, gd_bh);
4898        if (err)
4899                goto error_return;
4900
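            /*
             * Count how many of the blocks are actually marked in use in
             * the bitmap; bits that are already clear are reported as
             * errors and not counted towards blocks_freed.
             */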
4901        for (i = 0, blocks_freed = 0; i < count; i++) {
4902                BUFFER_TRACE(bitmap_bh, "clear bit");
4903                if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
4904                        ext4_error(sb, "bit already cleared for block %llu",
4905                                   (ext4_fsblk_t)(block + i));
4906                        BUFFER_TRACE(bitmap_bh, "bit already cleared");
4907                } else {
4908                        blocks_freed++;
4909                }
4910        }
4911
4912        err = ext4_mb_load_buddy(sb, block_group, &e4b);
4913        if (err)
4914                goto error_return;
4915
4916        /*
4917         * need to update group_info->bb_free and the bitmap
4918         * with the group lock held; generate_buddy looks at
4919         * them with the group lock held
4920         */
4921        ext4_lock_group(sb, block_group);
4922        mb_clear_bits(bitmap_bh->b_data, bit, count);
4923        mb_free_blocks(NULL, &e4b, bit, count);
4924        blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
4925        ext4_free_group_clusters_set(sb, desc, blk_free_count);
4926        ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
4927        ext4_group_desc_csum_set(sb, block_group, desc);
4928        ext4_unlock_group(sb, block_group);
4929        percpu_counter_add(&sbi->s_freeclusters_counter,
4930                           EXT4_NUM_B2C(sbi, blocks_freed));
4931
4932        if (sbi->s_log_groups_per_flex) {
4933                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4934                atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
4935                             &sbi->s_flex_groups[flex_group].free_clusters);
4936        }
4937
4938        ext4_mb_unload_buddy(&e4b);
4939
4940        /* We dirtied the bitmap block */
4941        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4942        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4943
4944        /* And the group descriptor block */
4945        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4946        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4947        if (!err)
4948                err = ret;
4949
4950error_return:
4951        brelse(bitmap_bh);
4952        ext4_std_error(sb, err);
4953        return err;
4954}
4955
4956/**
4957 * ext4_trim_extent -- function to TRIM one single free extent in the group
4958 * @sb:         super block for the file system
4959 * @start:      starting block of the free extent in the alloc. group
4960 * @count:      number of blocks to TRIM
4961 * @group:      alloc. group we are working with
4962 * @e4b:        ext4 buddy for the group
4963 *
4964 * Trim "count" blocks starting at "start" in the "group". To ensure that no
4965 * one will allocate those blocks, mark them as used in the buddy bitmap. This
4966 * must be called with the group lock held (it is dropped around the discard).
4967 */
4968static int ext4_trim_extent(struct super_block *sb, int start, int count,
4969                             ext4_group_t group, struct ext4_buddy *e4b)
4970{
4971        struct ext4_free_extent ex;
4972        int ret = 0;
4973
4974        trace_ext4_trim_extent(sb, group, start, count);
4975
4976        assert_spin_locked(ext4_group_lock_ptr(sb, group));
4977
4978        ex.fe_start = start;
4979        ex.fe_group = group;
4980        ex.fe_len = count;
4981
4982        /*
4983         * Mark blocks used, so no one can reuse them while
4984         * being trimmed.
4985         */
4986        mb_mark_used(e4b, &ex);
4987        ext4_unlock_group(sb, group);
4988        ret = ext4_issue_discard(sb, group, start, count);
4989        ext4_lock_group(sb, group);
4990        mb_free_blocks(NULL, e4b, start, ex.fe_len);
4991        return ret;
4992}
4993
4994/**
4995 * ext4_trim_all_free -- function to trim all free space in alloc. group
4996 * @sb:                 super block for file system
4997 * @group:              group to be trimmed
4998 * @start:              first group block to examine
4999 * @max:                last group block to examine
5000 * @minblocks:          minimum extent block count
5001 *
5002 * ext4_trim_all_free walks through the group's buddy bitmap searching for
5003 * free extents. When a free extent of at least minblocks clusters is found,
5004 * ext4_trim_extent is called to mark it as used in the group buddy bitmap,
5005 * issue a TRIM command on it, and free it again in the buddy bitmap. This is
5006 * repeated until the whole group has been scanned.
5011 */
5012static ext4_grpblk_t
5013ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
5014                   ext4_grpblk_t start, ext4_grpblk_t max,
5015                   ext4_grpblk_t minblocks)
5016{
5017        void *bitmap;
5018        ext4_grpblk_t next, count = 0, free_count = 0;
5019        struct ext4_buddy e4b;
5020        int ret = 0;
5021
5022        trace_ext4_trim_all_free(sb, group, start, max);
5023
5024        ret = ext4_mb_load_buddy(sb, group, &e4b);
5025        if (ret) {
5026                ext4_error(sb, "Error in loading buddy "
5027                                "information for %u", group);
5028                return ret;
5029        }
5030        bitmap = e4b.bd_bitmap;
5031
5032        ext4_lock_group(sb, group);
5033        if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
5034            minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
5035                goto out;
5036
5037        start = (e4b.bd_info->bb_first_free > start) ?
5038                e4b.bd_info->bb_first_free : start;
5039
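            /*
             * Walk the buddy bitmap: each free extent starts at the next
             * zero bit and ends at the following set bit.  Extents of at
             * least minblocks are handed to ext4_trim_extent().
             */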
5040        while (start <= max) {
5041                start = mb_find_next_zero_bit(bitmap, max + 1, start);
5042                if (start > max)
5043                        break;
5044                next = mb_find_next_bit(bitmap, max + 1, start);
5045
5046                if ((next - start) >= minblocks) {
5047                        ret = ext4_trim_extent(sb, start,
5048                                               next - start, group, &e4b);
5049                        if (ret && ret != -EOPNOTSUPP)
5050                                break;
5051                        ret = 0;
5052                        count += next - start;
5053                }
5054                free_count += next - start;
5055                start = next + 1;
5056
5057                if (fatal_signal_pending(current)) {
5058                        count = -ERESTARTSYS;
5059                        break;
5060                }
5061
5062                if (need_resched()) {
5063                        ext4_unlock_group(sb, group);
5064                        cond_resched();
5065                        ext4_lock_group(sb, group);
5066                }
5067
5068                if ((e4b.bd_info->bb_free - free_count) < minblocks)
5069                        break;
5070        }
5071
5072        if (!ret) {
5073                ret = count;
5074                EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
5075        }
5076out:
5077        ext4_unlock_group(sb, group);
5078        ext4_mb_unload_buddy(&e4b);
5079
5080        ext4_debug("trimmed %d blocks in the group %d\n",
5081                count, group);
5082
5083        return ret;
5084}
5085
5086/**
5087 * ext4_trim_fs() -- trim ioctl handle function
5088 * @sb:                 superblock for filesystem
5089 * @range:              fstrim_range structure
5090 *
5091 * start:       first byte to trim
5092 * len:         number of bytes to trim from start
5093 * minlen:      minimum extent length in bytes
5094 * ext4_trim_fs goes through all the allocation groups containing bytes from
5095 * start to start+len. For each such group the ext4_trim_all_free function
5096 * is invoked to trim all free space.
5097 */
5098int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5099{
5100        struct ext4_group_info *grp;
5101        ext4_group_t group, first_group, last_group;
5102        ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
5103        uint64_t start, end, minlen, trimmed = 0;
5104        ext4_fsblk_t first_data_blk =
5105                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
5106        ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
5107        int ret = 0;
5108
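            /*
             * range->start, ->len and ->minlen are in bytes; convert them
             * to block and cluster units, e.g. with 4KiB blocks
             * (s_blocksize_bits == 12) a 1MiB range->len spans 256 blocks.
             */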
5109        start = range->start >> sb->s_blocksize_bits;
5110        end = start + (range->len >> sb->s_blocksize_bits) - 1;
5111        minlen = EXT4_NUM_B2C(EXT4_SB(sb),
5112                              range->minlen >> sb->s_blocksize_bits);
5113
5114        if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
5115            start >= max_blks ||
5116            range->len < sb->s_blocksize)
5117                return -EINVAL;
5118        if (end >= max_blks)
5119                end = max_blks - 1;
5120        if (end <= first_data_blk)
5121                goto out;
5122        if (start < first_data_blk)
5123                start = first_data_blk;
5124
5125        /* Determine first and last group to examine based on start and end */
5126        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
5127                                     &first_group, &first_cluster);
5128        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
5129                                     &last_group, &last_cluster);
5130
5131        /* end now represents the last cluster to discard in this group */
5132        end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
5133
5134        for (group = first_group; group <= last_group; group++) {
5135                grp = ext4_get_group_info(sb, group);
5136                /* We only do this if the grp has never been initialized */
5137                if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
5138                        ret = ext4_mb_init_group(sb, group);
5139                        if (ret)
5140                                break;
5141                }
5142
5143                /*
5144                 * For all the groups except the last one, the last cluster will
5145                 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
5146                 * change it for the last group; note that last_cluster was
5147                 * already computed earlier by ext4_get_group_no_and_offset()
5148                 */
5149                if (group == last_group)
5150                        end = last_cluster;
5151
5152                if (grp->bb_free >= minlen) {
5153                        cnt = ext4_trim_all_free(sb, group, first_cluster,
5154                                                end, minlen);
5155                        if (cnt < 0) {
5156                                ret = cnt;
5157                                break;
5158                        }
5159                        trimmed += cnt;
5160                }
5161
5162                /*
5163                 * For every group except the first one, we are sure
5164                 * that the first cluster to discard will be cluster #0.
5165                 */
5166                first_cluster = 0;
5167        }
5168
5169        if (!ret)
5170                atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5171
5172out:
5173        range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
5174        return ret;
5175}
5176