linux/fs/ext4/mballoc.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
   3 * Written by Alex Tomas <alex@clusterfs.com>
   4 *
   5 * This program is free software; you can redistribute it and/or modify
   6 * it under the terms of the GNU General Public License version 2 as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it will be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write to the Free Software
  16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
  17 */
  18
  19
  20/*
  21 * mballoc.c contains the multiblocks allocation routines
  22 */
  23
  24#include "ext4_jbd2.h"
  25#include "mballoc.h"
  26#include <linux/log2.h>
  27#include <linux/module.h>
  28#include <linux/slab.h>
  29#include <linux/backing-dev.h>
  30#include <trace/events/ext4.h>
  31
  32#ifdef CONFIG_EXT4_DEBUG
  33ushort ext4_mballoc_debug __read_mostly;
  34
  35module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
  36MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
  37#endif
  38
  39/*
  40 * MUSTDO:
  41 *   - test ext4_ext_search_left() and ext4_ext_search_right()
  42 *   - search for metadata in few groups
  43 *
  44 * TODO v4:
  45 *   - normalization should take into account whether file is still open
  46 *   - discard preallocations if no free space left (policy?)
  47 *   - don't normalize tails
  48 *   - quota
  49 *   - reservation for superuser
  50 *
  51 * TODO v3:
  52 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
  53 *   - track min/max extents in each group for better group selection
  54 *   - mb_mark_used() may allocate chunk right after splitting buddy
  55 *   - tree of groups sorted by number of free blocks
  56 *   - error handling
  57 */
  58
  59/*
  60 * The allocation request involve request for multiple number of blocks
  61 * near to the goal(block) value specified.
  62 *
  63 * During initialization phase of the allocator we decide to use the
  64 * group preallocation or inode preallocation depending on the size of
  65 * the file. The size of the file could be the resulting file size we
  66 * would have after allocation, or the current file size, which ever
  67 * is larger. If the size is less than sbi->s_mb_stream_request we
  68 * select to use the group preallocation. The default value of
  69 * s_mb_stream_request is 16 blocks. This can also be tuned via
  70 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
  71 * terms of number of blocks.
  72 *
  73 * The main motivation for having small file use group preallocation is to
  74 * ensure that we have small files closer together on the disk.
  75 *
  76 * First stage the allocator looks at the inode prealloc list,
  77 * ext4_inode_info->i_prealloc_list, which contains list of prealloc
  78 * spaces for this particular inode. The inode prealloc space is
  79 * represented as:
  80 *
  81 * pa_lstart -> the logical start block for this prealloc space
  82 * pa_pstart -> the physical start block for this prealloc space
  83 * pa_len    -> length for this prealloc space (in clusters)
  84 * pa_free   ->  free space available in this prealloc space (in clusters)
  85 *
  86 * The inode preallocation space is used looking at the _logical_ start
  87 * block. If only the logical file block falls within the range of prealloc
  88 * space we will consume the particular prealloc space. This makes sure that
  89 * we have contiguous physical blocks representing the file blocks
  90 *
  91 * The important thing to be noted in case of inode prealloc space is that
  92 * we don't modify the values associated to inode prealloc space except
  93 * pa_free.
  94 *
  95 * If we are not able to find blocks in the inode prealloc space and if we
  96 * have the group allocation flag set then we look at the locality group
  97 * prealloc space. These are per CPU prealloc list represented as
  98 *
  99 * ext4_sb_info.s_locality_groups[smp_processor_id()]
 100 *
 101 * The reason for having a per cpu locality group is to reduce the contention
 102 * between CPUs. It is possible to get scheduled at this point.
 103 *
 104 * The locality group prealloc space is used looking at whether we have
 105 * enough free space (pa_free) within the prealloc space.
 106 *
 107 * If we can't allocate blocks via inode prealloc or/and locality group
 108 * prealloc then we look at the buddy cache. The buddy cache is represented
 109 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
 110 * mapped to the buddy and bitmap information regarding different
 111 * groups. The buddy information is attached to buddy cache inode so that
 112 * we can access them through the page cache. The information regarding
 113 * each group is loaded via ext4_mb_load_buddy.  The information involve
 114 * block bitmap and buddy information. The information are stored in the
 115 * inode as:
 116 *
 117 *  {                        page                        }
 118 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 119 *
 120 *
 121 * one block each for bitmap and buddy information.  So for each group we
 122 * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
 123 * blocksize) blocks.  So it can have information regarding groups_per_page
 124 * which is blocks_per_page/2
 125 *
 126 * The buddy cache inode is not stored on disk. The inode is thrown
 127 * away when the filesystem is unmounted.
 128 *
 129 * We look for count number of blocks in the buddy cache. If we were able
 130 * to locate that many free blocks we return with additional information
 131 * regarding rest of the contiguous physical block available
 132 *
 133 * Before allocating blocks via buddy cache we normalize the request
 134 * blocks. This ensure we ask for more blocks that we needed. The extra
 135 * blocks that we get after allocation is added to the respective prealloc
 136 * list. In case of inode preallocation we follow a list of heuristics
 137 * based on file size. This can be found in ext4_mb_normalize_request. If
 138 * we are doing a group prealloc we try to normalize the request to
 139 * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
 140 * dependent on the cluster size; for non-bigalloc file systems, it is
 141 * 512 blocks. This can be tuned via
 142 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
 143 * terms of number of blocks. If we have mounted the file system with -O
 144 * stripe=<value> option the group prealloc request is normalized to the
 145 * the smallest multiple of the stripe value (sbi->s_stripe) which is
 146 * greater than the default mb_group_prealloc.
 147 *
 148 * The regular allocator (using the buddy cache) supports a few tunables.
 149 *
 150 * /sys/fs/ext4/<partition>/mb_min_to_scan
 151 * /sys/fs/ext4/<partition>/mb_max_to_scan
 152 * /sys/fs/ext4/<partition>/mb_order2_req
 153 *
 154 * The regular allocator uses buddy scan only if the request len is power of
 155 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
 156 * value of s_mb_order2_reqs can be tuned via
 157 * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
 158 * stripe size (sbi->s_stripe), we try to search for contiguous block in
 159 * stripe size. This should result in better allocation on RAID setups. If
 160 * not, we search in the specific group using bitmap for best extents. The
 161 * tunable min_to_scan and max_to_scan control the behaviour here.
 162 * min_to_scan indicate how long the mballoc __must__ look for a best
 163 * extent and max_to_scan indicates how long the mballoc __can__ look for a
 164 * best extent in the found extents. Searching for the blocks starts with
 165 * the group specified as the goal value in allocation context via
 166 * ac_g_ex. Each group is first checked based on the criteria whether it
 167 * can be used for allocation. ext4_mb_good_group explains how the groups are
 168 * checked.
 169 *
 170 * Both the prealloc space are getting populated as above. So for the first
 171 * request we will hit the buddy cache which will result in this prealloc
 172 * space getting filled. The prealloc space is then later used for the
 173 * subsequent request.
 174 */
 175
 176/*
 177 * mballoc operates on the following data:
 178 *  - on-disk bitmap
 179 *  - in-core buddy (actually includes buddy and bitmap)
 180 *  - preallocation descriptors (PAs)
 181 *
 182 * there are two types of preallocations:
 183 *  - inode
 184 *    assiged to specific inode and can be used for this inode only.
 185 *    it describes part of inode's space preallocated to specific
 186 *    physical blocks. any block from that preallocated can be used
 187 *    independent. the descriptor just tracks number of blocks left
 188 *    unused. so, before taking some block from descriptor, one must
 189 *    make sure corresponded logical block isn't allocated yet. this
 190 *    also means that freeing any block within descriptor's range
 191 *    must discard all preallocated blocks.
 192 *  - locality group
 193 *    assigned to specific locality group which does not translate to
 194 *    permanent set of inodes: inode can join and leave group. space
 195 *    from this type of preallocation can be used for any inode. thus
 196 *    it's consumed from the beginning to the end.
 197 *
 198 * relation between them can be expressed as:
 199 *    in-core buddy = on-disk bitmap + preallocation descriptors
 200 *
 201 * this mean blocks mballoc considers used are:
 202 *  - allocated blocks (persistent)
 203 *  - preallocated blocks (non-persistent)
 204 *
 205 * consistency in mballoc world means that at any time a block is either
 206 * free or used in ALL structures. notice: "any time" should not be read
 207 * literally -- time is discrete and delimited by locks.
 208 *
 209 *  to keep it simple, we don't use block numbers, instead we count number of
 210 *  blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA.
 211 *
 212 * all operations can be expressed as:
 213 *  - init buddy:                       buddy = on-disk + PAs
 214 *  - new PA:                           buddy += N; PA = N
 215 *  - use inode PA:                     on-disk += N; PA -= N
 216 *  - discard inode PA                  buddy -= on-disk - PA; PA = 0
 217 *  - use locality group PA             on-disk += N; PA -= N
 218 *  - discard locality group PA         buddy -= PA; PA = 0
 219 *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
 220 *        is used in real operation because we can't know actual used
 221 *        bits from PA, only from on-disk bitmap
 222 *
 223 * if we follow this strict logic, then all operations above should be atomic.
 224 * given some of them can block, we'd have to use something like semaphores
 225 * killing performance on high-end SMP hardware. let's try to relax it using
 226 * the following knowledge:
 227 *  1) if buddy is referenced, it's already initialized
 228 *  2) while block is used in buddy and the buddy is referenced,
 229 *     nobody can re-allocate that block
 230 *  3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has
 231 *     bit set and PA claims same block, it's OK. IOW, one can set bit in
 232 *     on-disk bitmap if buddy has same bit set or/and PA covers corresponded
 233 *     block
 234 *
 235 * so, now we're building a concurrency table:
 236 *  - init buddy vs.
 237 *    - new PA
 238 *      blocks for PA are allocated in the buddy, buddy must be referenced
 239 *      until PA is linked to allocation group to avoid concurrent buddy init
 240 *    - use inode PA
 241 *      we need to make sure that either on-disk bitmap or PA has uptodate data
 242 *      given (3) we care that PA-=N operation doesn't interfere with init
 243 *    - discard inode PA
 244 *      the simplest way would be to have buddy initialized by the discard
 245 *    - use locality group PA
 246 *      again PA-=N must be serialized with init
 247 *    - discard locality group PA
 248 *      the simplest way would be to have buddy initialized by the discard
 249 *  - new PA vs.
 250 *    - use inode PA
 251 *      i_data_sem serializes them
 252 *    - discard inode PA
 253 *      discard process must wait until PA isn't used by another process
 254 *    - use locality group PA
 255 *      some mutex should serialize them
 256 *    - discard locality group PA
 257 *      discard process must wait until PA isn't used by another process
 258 *  - use inode PA
 259 *    - use inode PA
 260 *      i_data_sem or another mutex should serializes them
 261 *    - discard inode PA
 262 *      discard process must wait until PA isn't used by another process
 263 *    - use locality group PA
 264 *      nothing wrong here -- they're different PAs covering different blocks
 265 *    - discard locality group PA
 266 *      discard process must wait until PA isn't used by another process
 267 *
 268 * now we're ready to make few consequences:
 269 *  - PA is referenced and while it is no discard is possible
 270 *  - PA is referenced until block isn't marked in on-disk bitmap
 271 *  - PA changes only after on-disk bitmap
 272 *  - discard must not compete with init. either init is done before
 273 *    any discard or they're serialized somehow
 274 *  - buddy init as sum of on-disk bitmap and PAs is done atomically
 275 *
 276 * a special case when we've used PA to emptiness. no need to modify buddy
 277 * in this case, but we should care about concurrent init
 278 *
 279 */
 280
 281 /*
 282 * Logic in few words:
 283 *
 284 *  - allocation:
 285 *    load group
 286 *    find blocks
 287 *    mark bits in on-disk bitmap
 288 *    release group
 289 *
 290 *  - use preallocation:
 291 *    find proper PA (per-inode or group)
 292 *    load group
 293 *    mark bits in on-disk bitmap
 294 *    release group
 295 *    release PA
 296 *
 297 *  - free:
 298 *    load group
 299 *    mark bits in on-disk bitmap
 300 *    release group
 301 *
 302 *  - discard preallocations in group:
 303 *    mark PAs deleted
 304 *    move them onto local list
 305 *    load on-disk bitmap
 306 *    load group
 307 *    remove PA from object (inode or locality group)
 308 *    mark free blocks in-core
 309 *
 310 *  - discard inode's preallocations:
 311 */
 312
 313/*
 314 * Locking rules
 315 *
 316 * Locks:
 317 *  - bitlock on a group        (group)
 318 *  - object (inode/locality)   (object)
 319 *  - per-pa lock               (pa)
 320 *
 321 * Paths:
 322 *  - new pa
 323 *    object
 324 *    group
 325 *
 326 *  - find and use pa:
 327 *    pa
 328 *
 329 *  - release consumed pa:
 330 *    pa
 331 *    group
 332 *    object
 333 *
 334 *  - generate in-core bitmap:
 335 *    group
 336 *        pa
 337 *
 338 *  - discard all for given object (inode, locality group):
 339 *    object
 340 *        pa
 341 *    group
 342 *
 343 *  - discard all for given group:
 344 *    group
 345 *        pa
 346 *    group
 347 *        object
 348 *
 349 */
 350static struct kmem_cache *ext4_pspace_cachep;
 351static struct kmem_cache *ext4_ac_cachep;
 352static struct kmem_cache *ext4_free_data_cachep;
 353
 354/* We create slab caches for groupinfo data structures based on the
 355 * superblock block size.  There will be one per mounted filesystem for
 356 * each unique s_blocksize_bits */
 357#define NR_GRPINFO_CACHES 8
 358static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
 359
 360static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
 361        "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
 362        "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
 363        "ext4_groupinfo_64k", "ext4_groupinfo_128k"
 364};
 365
 366static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 367                                        ext4_group_t group);
 368static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 369                                                ext4_group_t group);
 370static void ext4_free_data_callback(struct super_block *sb,
 371                                struct ext4_journal_cb_entry *jce, int rc);
 372
 373static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 374{
 375#if BITS_PER_LONG == 64
 376        *bit += ((unsigned long) addr & 7UL) << 3;
 377        addr = (void *) ((unsigned long) addr & ~7UL);
 378#elif BITS_PER_LONG == 32
 379        *bit += ((unsigned long) addr & 3UL) << 3;
 380        addr = (void *) ((unsigned long) addr & ~3UL);
 381#else
 382#error "how many bits you are?!"
 383#endif
 384        return addr;
 385}
 386
 387static inline int mb_test_bit(int bit, void *addr)
 388{
 389        /*
 390         * ext4_test_bit on architecture like powerpc
 391         * needs unsigned long aligned address
 392         */
 393        addr = mb_correct_addr_and_bit(&bit, addr);
 394        return ext4_test_bit(bit, addr);
 395}
 396
 397static inline void mb_set_bit(int bit, void *addr)
 398{
 399        addr = mb_correct_addr_and_bit(&bit, addr);
 400        ext4_set_bit(bit, addr);
 401}
 402
 403static inline void mb_clear_bit(int bit, void *addr)
 404{
 405        addr = mb_correct_addr_and_bit(&bit, addr);
 406        ext4_clear_bit(bit, addr);
 407}
 408
 409static inline int mb_test_and_clear_bit(int bit, void *addr)
 410{
 411        addr = mb_correct_addr_and_bit(&bit, addr);
 412        return ext4_test_and_clear_bit(bit, addr);
 413}
 414
 415static inline int mb_find_next_zero_bit(void *addr, int max, int start)
 416{
 417        int fix = 0, ret, tmpmax;
 418        addr = mb_correct_addr_and_bit(&fix, addr);
 419        tmpmax = max + fix;
 420        start += fix;
 421
 422        ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
 423        if (ret > max)
 424                return max;
 425        return ret;
 426}
 427
 428static inline int mb_find_next_bit(void *addr, int max, int start)
 429{
 430        int fix = 0, ret, tmpmax;
 431        addr = mb_correct_addr_and_bit(&fix, addr);
 432        tmpmax = max + fix;
 433        start += fix;
 434
 435        ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
 436        if (ret > max)
 437                return max;
 438        return ret;
 439}
 440
 441static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
 442{
 443        char *bb;
 444
 445        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
 446        BUG_ON(max == NULL);
 447
 448        if (order > e4b->bd_blkbits + 1) {
 449                *max = 0;
 450                return NULL;
 451        }
 452
 453        /* at order 0 we see each particular block */
 454        if (order == 0) {
 455                *max = 1 << (e4b->bd_blkbits + 3);
 456                return e4b->bd_bitmap;
 457        }
 458
 459        bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
 460        *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
 461
 462        return bb;
 463}
 464
 465#ifdef DOUBLE_CHECK
 466static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 467                           int first, int count)
 468{
 469        int i;
 470        struct super_block *sb = e4b->bd_sb;
 471
 472        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 473                return;
 474        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
 475        for (i = 0; i < count; i++) {
 476                if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
 477                        ext4_fsblk_t blocknr;
 478
 479                        blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 480                        blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
 481                        ext4_grp_locked_error(sb, e4b->bd_group,
 482                                              inode ? inode->i_ino : 0,
 483                                              blocknr,
 484                                              "freeing block already freed "
 485                                              "(bit %u)",
 486                                              first + i);
 487                }
 488                mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
 489        }
 490}
 491
 492static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
 493{
 494        int i;
 495
 496        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 497                return;
 498        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
 499        for (i = 0; i < count; i++) {
 500                BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
 501                mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
 502        }
 503}
 504
 505static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 506{
 507        if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
 508                unsigned char *b1, *b2;
 509                int i;
 510                b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
 511                b2 = (unsigned char *) bitmap;
 512                for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
 513                        if (b1[i] != b2[i]) {
 514                                ext4_msg(e4b->bd_sb, KERN_ERR,
 515                                         "corruption in group %u "
 516                                         "at byte %u(%u): %x in copy != %x "
 517                                         "on disk/prealloc",
 518                                         e4b->bd_group, i, i * 8, b1[i], b2[i]);
 519                                BUG();
 520                        }
 521                }
 522        }
 523}
 524
 525#else
 526static inline void mb_free_blocks_double(struct inode *inode,
 527                                struct ext4_buddy *e4b, int first, int count)
 528{
 529        return;
 530}
 531static inline void mb_mark_used_double(struct ext4_buddy *e4b,
 532                                                int first, int count)
 533{
 534        return;
 535}
 536static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 537{
 538        return;
 539}
 540#endif
 541
 542#ifdef AGGRESSIVE_CHECK
 543
 544#define MB_CHECK_ASSERT(assert)                                         \
 545do {                                                                    \
 546        if (!(assert)) {                                                \
 547                printk(KERN_EMERG                                       \
 548                        "Assertion failure in %s() at %s:%d: \"%s\"\n", \
 549                        function, file, line, # assert);                \
 550                BUG();                                                  \
 551        }                                                               \
 552} while (0)
 553
 554static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 555                                const char *function, int line)
 556{
 557        struct super_block *sb = e4b->bd_sb;
 558        int order = e4b->bd_blkbits + 1;
 559        int max;
 560        int max2;
 561        int i;
 562        int j;
 563        int k;
 564        int count;
 565        struct ext4_group_info *grp;
 566        int fragments = 0;
 567        int fstart;
 568        struct list_head *cur;
 569        void *buddy;
 570        void *buddy2;
 571
 572        {
 573                static int mb_check_counter;
 574                if (mb_check_counter++ % 100 != 0)
 575                        return 0;
 576        }
 577
 578        while (order > 1) {
 579                buddy = mb_find_buddy(e4b, order, &max);
 580                MB_CHECK_ASSERT(buddy);
 581                buddy2 = mb_find_buddy(e4b, order - 1, &max2);
 582                MB_CHECK_ASSERT(buddy2);
 583                MB_CHECK_ASSERT(buddy != buddy2);
 584                MB_CHECK_ASSERT(max * 2 == max2);
 585
 586                count = 0;
 587                for (i = 0; i < max; i++) {
 588
 589                        if (mb_test_bit(i, buddy)) {
 590                                /* only single bit in buddy2 may be 1 */
 591                                if (!mb_test_bit(i << 1, buddy2)) {
 592                                        MB_CHECK_ASSERT(
 593                                                mb_test_bit((i<<1)+1, buddy2));
 594                                } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
 595                                        MB_CHECK_ASSERT(
 596                                                mb_test_bit(i << 1, buddy2));
 597                                }
 598                                continue;
 599                        }
 600
 601                        /* both bits in buddy2 must be 1 */
 602                        MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
 603                        MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
 604
 605                        for (j = 0; j < (1 << order); j++) {
 606                                k = (i * (1 << order)) + j;
 607                                MB_CHECK_ASSERT(
 608                                        !mb_test_bit(k, e4b->bd_bitmap));
 609                        }
 610                        count++;
 611                }
 612                MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
 613                order--;
 614        }
 615
 616        fstart = -1;
 617        buddy = mb_find_buddy(e4b, 0, &max);
 618        for (i = 0; i < max; i++) {
 619                if (!mb_test_bit(i, buddy)) {
 620                        MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
 621                        if (fstart == -1) {
 622                                fragments++;
 623                                fstart = i;
 624                        }
 625                        continue;
 626                }
 627                fstart = -1;
 628                /* check used bits only */
 629                for (j = 0; j < e4b->bd_blkbits + 1; j++) {
 630                        buddy2 = mb_find_buddy(e4b, j, &max2);
 631                        k = i >> j;
 632                        MB_CHECK_ASSERT(k < max2);
 633                        MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
 634                }
 635        }
 636        MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
 637        MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
 638
 639        grp = ext4_get_group_info(sb, e4b->bd_group);
 640        list_for_each(cur, &grp->bb_prealloc_list) {
 641                ext4_group_t groupnr;
 642                struct ext4_prealloc_space *pa;
 643                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
 644                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
 645                MB_CHECK_ASSERT(groupnr == e4b->bd_group);
 646                for (i = 0; i < pa->pa_len; i++)
 647                        MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
 648        }
 649        return 0;
 650}
 651#undef MB_CHECK_ASSERT
 652#define mb_check_buddy(e4b) __mb_check_buddy(e4b,       \
 653                                        __FILE__, __func__, __LINE__)
 654#else
 655#define mb_check_buddy(e4b)
 656#endif
 657
 658/*
 659 * Divide blocks started from @first with length @len into
 660 * smaller chunks with power of 2 blocks.
 661 * Clear the bits in bitmap which the blocks of the chunk(s) covered,
 662 * then increase bb_counters[] for corresponded chunk size.
 663 */
 664static void ext4_mb_mark_free_simple(struct super_block *sb,
 665                                void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
 666                                        struct ext4_group_info *grp)
 667{
 668        struct ext4_sb_info *sbi = EXT4_SB(sb);
 669        ext4_grpblk_t min;
 670        ext4_grpblk_t max;
 671        ext4_grpblk_t chunk;
 672        unsigned short border;
 673
 674        BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
 675
 676        border = 2 << sb->s_blocksize_bits;
 677
 678        while (len > 0) {
 679                /* find how many blocks can be covered since this position */
 680                max = ffs(first | border) - 1;
 681
 682                /* find how many blocks of power 2 we need to mark */
 683                min = fls(len) - 1;
 684
 685                if (max < min)
 686                        min = max;
 687                chunk = 1 << min;
 688
 689                /* mark multiblock chunks only */
 690                grp->bb_counters[min]++;
 691                if (min > 0)
 692                        mb_clear_bit(first >> min,
 693                                     buddy + sbi->s_mb_offsets[min]);
 694
 695                len -= chunk;
 696                first += chunk;
 697        }
 698}
 699
 700/*
 701 * Cache the order of the largest free extent we have available in this block
 702 * group.
 703 */
 704static void
 705mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
 706{
 707        int i;
 708        int bits;
 709
 710        grp->bb_largest_free_order = -1; /* uninit */
 711
 712        bits = sb->s_blocksize_bits + 1;
 713        for (i = bits; i >= 0; i--) {
 714                if (grp->bb_counters[i] > 0) {
 715                        grp->bb_largest_free_order = i;
 716                        break;
 717                }
 718        }
 719}
 720
 721static noinline_for_stack
 722void ext4_mb_generate_buddy(struct super_block *sb,
 723                                void *buddy, void *bitmap, ext4_group_t group)
 724{
 725        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 726        struct ext4_sb_info *sbi = EXT4_SB(sb);
 727        ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
 728        ext4_grpblk_t i = 0;
 729        ext4_grpblk_t first;
 730        ext4_grpblk_t len;
 731        unsigned free = 0;
 732        unsigned fragments = 0;
 733        unsigned long long period = get_cycles();
 734
 735        /* initialize buddy from bitmap which is aggregation
 736         * of on-disk bitmap and preallocations */
 737        i = mb_find_next_zero_bit(bitmap, max, 0);
 738        grp->bb_first_free = i;
 739        while (i < max) {
 740                fragments++;
 741                first = i;
 742                i = mb_find_next_bit(bitmap, max, i);
 743                len = i - first;
 744                free += len;
 745                if (len > 1)
 746                        ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
 747                else
 748                        grp->bb_counters[0]++;
 749                if (i < max)
 750                        i = mb_find_next_zero_bit(bitmap, max, i);
 751        }
 752        grp->bb_fragments = fragments;
 753
 754        if (free != grp->bb_free) {
 755                ext4_grp_locked_error(sb, group, 0, 0,
 756                                      "block bitmap and bg descriptor "
 757                                      "inconsistent: %u vs %u free clusters",
 758                                      free, grp->bb_free);
 759                /*
 760                 * If we intend to continue, we consider group descriptor
 761                 * corrupt and update bb_free using bitmap value
 762                 */
 763                grp->bb_free = free;
 764                if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
 765                        percpu_counter_sub(&sbi->s_freeclusters_counter,
 766                                           grp->bb_free);
 767                set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
 768        }
 769        mb_set_largest_free_order(sb, grp);
 770
 771        clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
 772
 773        period = get_cycles() - period;
 774        spin_lock(&EXT4_SB(sb)->s_bal_lock);
 775        EXT4_SB(sb)->s_mb_buddies_generated++;
 776        EXT4_SB(sb)->s_mb_generation_time += period;
 777        spin_unlock(&EXT4_SB(sb)->s_bal_lock);
 778}
 779
 780static void mb_regenerate_buddy(struct ext4_buddy *e4b)
 781{
 782        int count;
 783        int order = 1;
 784        void *buddy;
 785
 786        while ((buddy = mb_find_buddy(e4b, order++, &count))) {
 787                ext4_set_bits(buddy, 0, count);
 788        }
 789        e4b->bd_info->bb_fragments = 0;
 790        memset(e4b->bd_info->bb_counters, 0,
 791                sizeof(*e4b->bd_info->bb_counters) *
 792                (e4b->bd_sb->s_blocksize_bits + 2));
 793
 794        ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
 795                e4b->bd_bitmap, e4b->bd_group);
 796}
 797
 798/* The buddy information is attached the buddy cache inode
 799 * for convenience. The information regarding each group
 800 * is loaded via ext4_mb_load_buddy. The information involve
 801 * block bitmap and buddy information. The information are
 802 * stored in the inode as
 803 *
 804 * {                        page                        }
 805 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 806 *
 807 *
 808 * one block each for bitmap and buddy information.
 809 * So for each group we take up 2 blocks. A page can
 810 * contain blocks_per_page (PAGE_SIZE / blocksize)  blocks.
 811 * So it can have information regarding groups_per_page which
 812 * is blocks_per_page/2
 813 *
 814 * Locking note:  This routine takes the block group lock of all groups
 815 * for this page; do not hold this lock when calling this routine!
 816 */
 817
 818static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
 819{
 820        ext4_group_t ngroups;
 821        int blocksize;
 822        int blocks_per_page;
 823        int groups_per_page;
 824        int err = 0;
 825        int i;
 826        ext4_group_t first_group, group;
 827        int first_block;
 828        struct super_block *sb;
 829        struct buffer_head *bhs;
 830        struct buffer_head **bh = NULL;
 831        struct inode *inode;
 832        char *data;
 833        char *bitmap;
 834        struct ext4_group_info *grinfo;
 835
 836        mb_debug(1, "init page %lu\n", page->index);
 837
 838        inode = page->mapping->host;
 839        sb = inode->i_sb;
 840        ngroups = ext4_get_groups_count(sb);
 841        blocksize = 1 << inode->i_blkbits;
 842        blocks_per_page = PAGE_SIZE / blocksize;
 843
 844        groups_per_page = blocks_per_page >> 1;
 845        if (groups_per_page == 0)
 846                groups_per_page = 1;
 847
 848        /* allocate buffer_heads to read bitmaps */
 849        if (groups_per_page > 1) {
 850                i = sizeof(struct buffer_head *) * groups_per_page;
 851                bh = kzalloc(i, gfp);
 852                if (bh == NULL) {
 853                        err = -ENOMEM;
 854                        goto out;
 855                }
 856        } else
 857                bh = &bhs;
 858
 859        first_group = page->index * blocks_per_page / 2;
 860
 861        /* read all groups the page covers into the cache */
 862        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
 863                if (group >= ngroups)
 864                        break;
 865
 866                grinfo = ext4_get_group_info(sb, group);
 867                /*
 868                 * If page is uptodate then we came here after online resize
 869                 * which added some new uninitialized group info structs, so
 870                 * we must skip all initialized uptodate buddies on the page,
 871                 * which may be currently in use by an allocating task.
 872                 */
 873                if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
 874                        bh[i] = NULL;
 875                        continue;
 876                }
 877                bh[i] = ext4_read_block_bitmap_nowait(sb, group);
 878                if (IS_ERR(bh[i])) {
 879                        err = PTR_ERR(bh[i]);
 880                        bh[i] = NULL;
 881                        goto out;
 882                }
 883                mb_debug(1, "read bitmap for group %u\n", group);
 884        }
 885
 886        /* wait for I/O completion */
 887        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
 888                int err2;
 889
 890                if (!bh[i])
 891                        continue;
 892                err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
 893                if (!err)
 894                        err = err2;
 895        }
 896
 897        first_block = page->index * blocks_per_page;
 898        for (i = 0; i < blocks_per_page; i++) {
 899                group = (first_block + i) >> 1;
 900                if (group >= ngroups)
 901                        break;
 902
 903                if (!bh[group - first_group])
 904                        /* skip initialized uptodate buddy */
 905                        continue;
 906
 907                if (!buffer_verified(bh[group - first_group]))
 908                        /* Skip faulty bitmaps */
 909                        continue;
 910                err = 0;
 911
 912                /*
 913                 * data carry information regarding this
 914                 * particular group in the format specified
 915                 * above
 916                 *
 917                 */
 918                data = page_address(page) + (i * blocksize);
 919                bitmap = bh[group - first_group]->b_data;
 920
 921                /*
 922                 * We place the buddy block and bitmap block
 923                 * close together
 924                 */
 925                if ((first_block + i) & 1) {
 926                        /* this is block of buddy */
 927                        BUG_ON(incore == NULL);
 928                        mb_debug(1, "put buddy for group %u in page %lu/%x\n",
 929                                group, page->index, i * blocksize);
 930                        trace_ext4_mb_buddy_bitmap_load(sb, group);
 931                        grinfo = ext4_get_group_info(sb, group);
 932                        grinfo->bb_fragments = 0;
 933                        memset(grinfo->bb_counters, 0,
 934                               sizeof(*grinfo->bb_counters) *
 935                                (sb->s_blocksize_bits+2));
 936                        /*
 937                         * incore got set to the group block bitmap below
 938                         */
 939                        ext4_lock_group(sb, group);
 940                        /* init the buddy */
 941                        memset(data, 0xff, blocksize);
 942                        ext4_mb_generate_buddy(sb, data, incore, group);
 943                        ext4_unlock_group(sb, group);
 944                        incore = NULL;
 945                } else {
 946                        /* this is block of bitmap */
 947                        BUG_ON(incore != NULL);
 948                        mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
 949                                group, page->index, i * blocksize);
 950                        trace_ext4_mb_bitmap_load(sb, group);
 951
 952                        /* see comments in ext4_mb_put_pa() */
 953                        ext4_lock_group(sb, group);
 954                        memcpy(data, bitmap, blocksize);
 955
 956                        /* mark all preallocated blks used in in-core bitmap */
 957                        ext4_mb_generate_from_pa(sb, data, group);
 958                        ext4_mb_generate_from_freelist(sb, data, group);
 959                        ext4_unlock_group(sb, group);
 960
 961                        /* set incore so that the buddy information can be
 962                         * generated using this
 963                         */
 964                        incore = data;
 965                }
 966        }
 967        SetPageUptodate(page);
 968
 969out:
 970        if (bh) {
 971                for (i = 0; i < groups_per_page; i++)
 972                        brelse(bh[i]);
 973                if (bh != &bhs)
 974                        kfree(bh);
 975        }
 976        return err;
 977}
 978
 979/*
 980 * Lock the buddy and bitmap pages. This make sure other parallel init_group
 981 * on the same buddy page doesn't happen whild holding the buddy page lock.
 982 * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap
 983 * are on the same page e4b->bd_buddy_page is NULL and return value is 0.
 984 */
 985static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
 986                ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
 987{
 988        struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
 989        int block, pnum, poff;
 990        int blocks_per_page;
 991        struct page *page;
 992
 993        e4b->bd_buddy_page = NULL;
 994        e4b->bd_bitmap_page = NULL;
 995
 996        blocks_per_page = PAGE_SIZE / sb->s_blocksize;
 997        /*
 998         * the buddy cache inode stores the block bitmap
 999         * and buddy information in consecutive blocks.
1000         * So for each group we need two blocks.
1001         */
1002        block = group * 2;
1003        pnum = block / blocks_per_page;
1004        poff = block % blocks_per_page;
1005        page = find_or_create_page(inode->i_mapping, pnum, gfp);
1006        if (!page)
1007                return -ENOMEM;
1008        BUG_ON(page->mapping != inode->i_mapping);
1009        e4b->bd_bitmap_page = page;
1010        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1011
1012        if (blocks_per_page >= 2) {
1013                /* buddy and bitmap are on the same page */
1014                return 0;
1015        }
1016
1017        block++;
1018        pnum = block / blocks_per_page;
1019        page = find_or_create_page(inode->i_mapping, pnum, gfp);
1020        if (!page)
1021                return -ENOMEM;
1022        BUG_ON(page->mapping != inode->i_mapping);
1023        e4b->bd_buddy_page = page;
1024        return 0;
1025}
1026
1027static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1028{
1029        if (e4b->bd_bitmap_page) {
1030                unlock_page(e4b->bd_bitmap_page);
1031                put_page(e4b->bd_bitmap_page);
1032        }
1033        if (e4b->bd_buddy_page) {
1034                unlock_page(e4b->bd_buddy_page);
1035                put_page(e4b->bd_buddy_page);
1036        }
1037}
1038
1039/*
1040 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1041 * block group lock of all groups for this page; do not hold the BG lock when
1042 * calling this routine!
1043 */
1044static noinline_for_stack
1045int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
1046{
1047
1048        struct ext4_group_info *this_grp;
1049        struct ext4_buddy e4b;
1050        struct page *page;
1051        int ret = 0;
1052
1053        might_sleep();
1054        mb_debug(1, "init group %u\n", group);
1055        this_grp = ext4_get_group_info(sb, group);
1056        /*
1057         * This ensures that we don't reinit the buddy cache
1058         * page which map to the group from which we are already
1059         * allocating. If we are looking at the buddy cache we would
1060         * have taken a reference using ext4_mb_load_buddy and that
1061         * would have pinned buddy page to page cache.
1062         * The call to ext4_mb_get_buddy_page_lock will mark the
1063         * page accessed.
1064         */
1065        ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
1066        if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1067                /*
1068                 * somebody initialized the group
1069                 * return without doing anything
1070                 */
1071                goto err;
1072        }
1073
1074        page = e4b.bd_bitmap_page;
1075        ret = ext4_mb_init_cache(page, NULL, gfp);
1076        if (ret)
1077                goto err;
1078        if (!PageUptodate(page)) {
1079                ret = -EIO;
1080                goto err;
1081        }
1082
1083        if (e4b.bd_buddy_page == NULL) {
1084                /*
1085                 * If both the bitmap and buddy are in
1086                 * the same page we don't need to force
1087                 * init the buddy
1088                 */
1089                ret = 0;
1090                goto err;
1091        }
1092        /* init buddy cache */
1093        page = e4b.bd_buddy_page;
1094        ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
1095        if (ret)
1096                goto err;
1097        if (!PageUptodate(page)) {
1098                ret = -EIO;
1099                goto err;
1100        }
1101err:
1102        ext4_mb_put_buddy_page_lock(&e4b);
1103        return ret;
1104}
1105
1106/*
1107 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1108 * block group lock of all groups for this page; do not hold the BG lock when
1109 * calling this routine!
1110 */
1111static noinline_for_stack int
1112ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
1113                       struct ext4_buddy *e4b, gfp_t gfp)
1114{
1115        int blocks_per_page;
1116        int block;
1117        int pnum;
1118        int poff;
1119        struct page *page;
1120        int ret;
1121        struct ext4_group_info *grp;
1122        struct ext4_sb_info *sbi = EXT4_SB(sb);
1123        struct inode *inode = sbi->s_buddy_cache;
1124
1125        might_sleep();
1126        mb_debug(1, "load group %u\n", group);
1127
1128        blocks_per_page = PAGE_SIZE / sb->s_blocksize;
1129        grp = ext4_get_group_info(sb, group);
1130
1131        e4b->bd_blkbits = sb->s_blocksize_bits;
1132        e4b->bd_info = grp;
1133        e4b->bd_sb = sb;
1134        e4b->bd_group = group;
1135        e4b->bd_buddy_page = NULL;
1136        e4b->bd_bitmap_page = NULL;
1137
1138        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1139                /*
1140                 * we need full data about the group
1141                 * to make a good selection
1142                 */
1143                ret = ext4_mb_init_group(sb, group, gfp);
1144                if (ret)
1145                        return ret;
1146        }
1147
1148        /*
1149         * the buddy cache inode stores the block bitmap
1150         * and buddy information in consecutive blocks.
1151         * So for each group we need two blocks.
1152         */
1153        block = group * 2;
1154        pnum = block / blocks_per_page;
1155        poff = block % blocks_per_page;
1156
1157        /* we could use find_or_create_page(), but it locks page
1158         * what we'd like to avoid in fast path ... */
1159        page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1160        if (page == NULL || !PageUptodate(page)) {
1161                if (page)
1162                        /*
1163                         * drop the page reference and try
1164                         * to get the page with lock. If we
1165                         * are not uptodate that implies
1166                         * somebody just created the page but
1167                         * is yet to initialize the same. So
1168                         * wait for it to initialize.
1169                         */
1170                        put_page(page);
1171                page = find_or_create_page(inode->i_mapping, pnum, gfp);
1172                if (page) {
1173                        BUG_ON(page->mapping != inode->i_mapping);
1174                        if (!PageUptodate(page)) {
1175                                ret = ext4_mb_init_cache(page, NULL, gfp);
1176                                if (ret) {
1177                                        unlock_page(page);
1178                                        goto err;
1179                                }
1180                                mb_cmp_bitmaps(e4b, page_address(page) +
1181                                               (poff * sb->s_blocksize));
1182                        }
1183                        unlock_page(page);
1184                }
1185        }
1186        if (page == NULL) {
1187                ret = -ENOMEM;
1188                goto err;
1189        }
1190        if (!PageUptodate(page)) {
1191                ret = -EIO;
1192                goto err;
1193        }
1194
1195        /* Pages marked accessed already */
1196        e4b->bd_bitmap_page = page;
1197        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1198
1199        block++;
1200        pnum = block / blocks_per_page;
1201        poff = block % blocks_per_page;
1202
1203        page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1204        if (page == NULL || !PageUptodate(page)) {
1205                if (page)
1206                        put_page(page);
1207                page = find_or_create_page(inode->i_mapping, pnum, gfp);
1208                if (page) {
1209                        BUG_ON(page->mapping != inode->i_mapping);
1210                        if (!PageUptodate(page)) {
1211                                ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
1212                                                         gfp);
1213                                if (ret) {
1214                                        unlock_page(page);
1215                                        goto err;
1216                                }
1217                        }
1218                        unlock_page(page);
1219                }
1220        }
1221        if (page == NULL) {
1222                ret = -ENOMEM;
1223                goto err;
1224        }
1225        if (!PageUptodate(page)) {
1226                ret = -EIO;
1227                goto err;
1228        }
1229
1230        /* Pages marked accessed already */
1231        e4b->bd_buddy_page = page;
1232        e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1233
1234        BUG_ON(e4b->bd_bitmap_page == NULL);
1235        BUG_ON(e4b->bd_buddy_page == NULL);
1236
1237        return 0;
1238
1239err:
1240        if (page)
1241                put_page(page);
1242        if (e4b->bd_bitmap_page)
1243                put_page(e4b->bd_bitmap_page);
1244        if (e4b->bd_buddy_page)
1245                put_page(e4b->bd_buddy_page);
1246        e4b->bd_buddy = NULL;
1247        e4b->bd_bitmap = NULL;
1248        return ret;
1249}
1250
1251static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1252                              struct ext4_buddy *e4b)
1253{
1254        return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
1255}
1256
1257static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1258{
1259        if (e4b->bd_bitmap_page)
1260                put_page(e4b->bd_bitmap_page);
1261        if (e4b->bd_buddy_page)
1262                put_page(e4b->bd_buddy_page);
1263}
1264
1265
1266static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1267{
1268        int order = 1;
1269        int bb_incr = 1 << (e4b->bd_blkbits - 1);
1270        void *bb;
1271
1272        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1273        BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1274
1275        bb = e4b->bd_buddy;
1276        while (order <= e4b->bd_blkbits + 1) {
1277                block = block >> 1;
1278                if (!mb_test_bit(block, bb)) {
1279                        /* this block is part of buddy of order 'order' */
1280                        return order;
1281                }
1282                bb += bb_incr;
1283                bb_incr >>= 1;
1284                order++;
1285        }
1286        return 0;
1287}
1288
1289static void mb_clear_bits(void *bm, int cur, int len)
1290{
1291        __u32 *addr;
1292
1293        len = cur + len;
1294        while (cur < len) {
1295                if ((cur & 31) == 0 && (len - cur) >= 32) {
1296                        /* fast path: clear whole word at once */
1297                        addr = bm + (cur >> 3);
1298                        *addr = 0;
1299                        cur += 32;
1300                        continue;
1301                }
1302                mb_clear_bit(cur, bm);
1303                cur++;
1304        }
1305}
1306
1307/* clear bits in given range
1308 * will return first found zero bit if any, -1 otherwise
1309 */
1310static int mb_test_and_clear_bits(void *bm, int cur, int len)
1311{
1312        __u32 *addr;
1313        int zero_bit = -1;
1314
1315        len = cur + len;
1316        while (cur < len) {
1317                if ((cur & 31) == 0 && (len - cur) >= 32) {
1318                        /* fast path: clear whole word at once */
1319                        addr = bm + (cur >> 3);
1320                        if (*addr != (__u32)(-1) && zero_bit == -1)
1321                                zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1322                        *addr = 0;
1323                        cur += 32;
1324                        continue;
1325                }
1326                if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1327                        zero_bit = cur;
1328                cur++;
1329        }
1330
1331        return zero_bit;
1332}
1333
1334void ext4_set_bits(void *bm, int cur, int len)
1335{
1336        __u32 *addr;
1337
1338        len = cur + len;
1339        while (cur < len) {
1340                if ((cur & 31) == 0 && (len - cur) >= 32) {
1341                        /* fast path: set whole word at once */
1342                        addr = bm + (cur >> 3);
1343                        *addr = 0xffffffff;
1344                        cur += 32;
1345                        continue;
1346                }
1347                mb_set_bit(cur, bm);
1348                cur++;
1349        }
1350}
1351
1352/*
1353 * _________________________________________________________________ */
1354
1355static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
1356{
1357        if (mb_test_bit(*bit + side, bitmap)) {
1358                mb_clear_bit(*bit, bitmap);
1359                (*bit) -= side;
1360                return 1;
1361        }
1362        else {
1363                (*bit) += side;
1364                mb_set_bit(*bit, bitmap);
1365                return -1;
1366        }
1367}
1368
1369static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1370{
1371        int max;
1372        int order = 1;
1373        void *buddy = mb_find_buddy(e4b, order, &max);
1374
1375        while (buddy) {
1376                void *buddy2;
1377
1378                /* Bits in range [first; last] are known to be set since
1379                 * corresponding blocks were allocated. Bits in range
1380                 * (first; last) will stay set because they form buddies on
1381                 * upper layer. We just deal with borders if they don't
1382                 * align with upper layer and then go up.
1383                 * Releasing entire group is all about clearing
1384                 * single bit of highest order buddy.
1385                 */
1386
1387                /* Example:
1388                 * ---------------------------------
1389                 * |   1   |   1   |   1   |   1   |
1390                 * ---------------------------------
1391                 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1392                 * ---------------------------------
1393                 *   0   1   2   3   4   5   6   7
1394                 *      \_____________________/
1395                 *
1396                 * Neither [1] nor [6] is aligned to above layer.
1397                 * Left neighbour [0] is free, so mark it busy,
1398                 * decrease bb_counters and extend range to
1399                 * [0; 6]
1400                 * Right neighbour [7] is busy. It can't be coaleasced with [6], so
1401                 * mark [6] free, increase bb_counters and shrink range to
1402                 * [0; 5].
1403                 * Then shift range to [0; 2], go up and do the same.
1404                 */
1405
1406
1407                if (first & 1)
1408                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1409                if (!(last & 1))
1410                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1411                if (first > last)
1412                        break;
1413                order++;
1414
1415                if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
1416                        mb_clear_bits(buddy, first, last - first + 1);
1417                        e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1418                        break;
1419                }
1420                first >>= 1;
1421                last >>= 1;
1422                buddy = buddy2;
1423        }
1424}
1425
1426static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1427                           int first, int count)
1428{
1429        int left_is_free = 0;
1430        int right_is_free = 0;
1431        int block;
1432        int last = first + count - 1;
1433        struct super_block *sb = e4b->bd_sb;
1434
1435        if (WARN_ON(count == 0))
1436                return;
1437        BUG_ON(last >= (sb->s_blocksize << 3));
1438        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1439        /* Don't bother if the block group is corrupt. */
1440        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1441                return;
1442
1443        mb_check_buddy(e4b);
1444        mb_free_blocks_double(inode, e4b, first, count);
1445
1446        e4b->bd_info->bb_free += count;
1447        if (first < e4b->bd_info->bb_first_free)
1448                e4b->bd_info->bb_first_free = first;
1449
1450        /* access memory sequentially: check left neighbour,
1451         * clear range and then check right neighbour
1452         */
1453        if (first != 0)
1454                left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1455        block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1456        if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1457                right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1458
1459        if (unlikely(block != -1)) {
1460                struct ext4_sb_info *sbi = EXT4_SB(sb);
1461                ext4_fsblk_t blocknr;
1462
1463                blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1464                blocknr += EXT4_C2B(EXT4_SB(sb), block);
1465                ext4_grp_locked_error(sb, e4b->bd_group,
1466                                      inode ? inode->i_ino : 0,
1467                                      blocknr,
1468                                      "freeing already freed block "
1469                                      "(bit %u); block bitmap corrupt.",
1470                                      block);
1471                if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
1472                        percpu_counter_sub(&sbi->s_freeclusters_counter,
1473                                           e4b->bd_info->bb_free);
1474                /* Mark the block group as corrupt. */
1475                set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
1476                        &e4b->bd_info->bb_state);
1477                mb_regenerate_buddy(e4b);
1478                goto done;
1479        }
1480
1481        /* let's maintain fragments counter */
1482        if (left_is_free && right_is_free)
1483                e4b->bd_info->bb_fragments--;
1484        else if (!left_is_free && !right_is_free)
1485                e4b->bd_info->bb_fragments++;
1486
1487        /* buddy[0] == bd_bitmap is a special case, so handle
1488         * it right away and let mb_buddy_mark_free stay free of
1489         * zero order checks.
1490         * Check if neighbours are to be coaleasced,
1491         * adjust bitmap bb_counters and borders appropriately.
1492         */
1493        if (first & 1) {
1494                first += !left_is_free;
1495                e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1496        }
1497        if (!(last & 1)) {
1498                last -= !right_is_free;
1499                e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1500        }
1501
1502        if (first <= last)
1503                mb_buddy_mark_free(e4b, first >> 1, last >> 1);
1504
1505done:
1506        mb_set_largest_free_order(sb, e4b->bd_info);
1507        mb_check_buddy(e4b);
1508}
1509
1510static int mb_find_extent(struct ext4_buddy *e4b, int block,
1511                                int needed, struct ext4_free_extent *ex)
1512{
1513        int next = block;
1514        int max, order;
1515        void *buddy;
1516
1517        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1518        BUG_ON(ex == NULL);
1519
1520        buddy = mb_find_buddy(e4b, 0, &max);
1521        BUG_ON(buddy == NULL);
1522        BUG_ON(block >= max);
1523        if (mb_test_bit(block, buddy)) {
1524                ex->fe_len = 0;
1525                ex->fe_start = 0;
1526                ex->fe_group = 0;
1527                return 0;
1528        }
1529
1530        /* find actual order */
1531        order = mb_find_order_for_block(e4b, block);
1532        block = block >> order;
1533
1534        ex->fe_len = 1 << order;
1535        ex->fe_start = block << order;
1536        ex->fe_group = e4b->bd_group;
1537
1538        /* calc difference from given start */
1539        next = next - ex->fe_start;
1540        ex->fe_len -= next;
1541        ex->fe_start += next;
1542
1543        while (needed > ex->fe_len &&
1544               mb_find_buddy(e4b, order, &max)) {
1545
1546                if (block + 1 >= max)
1547                        break;
1548
1549                next = (block + 1) * (1 << order);
1550                if (mb_test_bit(next, e4b->bd_bitmap))
1551                        break;
1552
1553                order = mb_find_order_for_block(e4b, next);
1554
1555                block = next >> order;
1556                ex->fe_len += 1 << order;
1557        }
1558
1559        BUG_ON(ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3)));
1560        return ex->fe_len;
1561}
1562
1563static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1564{
1565        int ord;
1566        int mlen = 0;
1567        int max = 0;
1568        int cur;
1569        int start = ex->fe_start;
1570        int len = ex->fe_len;
1571        unsigned ret = 0;
1572        int len0 = len;
1573        void *buddy;
1574
1575        BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1576        BUG_ON(e4b->bd_group != ex->fe_group);
1577        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1578        mb_check_buddy(e4b);
1579        mb_mark_used_double(e4b, start, len);
1580
1581        e4b->bd_info->bb_free -= len;
1582        if (e4b->bd_info->bb_first_free == start)
1583                e4b->bd_info->bb_first_free += len;
1584
1585        /* let's maintain fragments counter */
1586        if (start != 0)
1587                mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
1588        if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1589                max = !mb_test_bit(start + len, e4b->bd_bitmap);
1590        if (mlen && max)
1591                e4b->bd_info->bb_fragments++;
1592        else if (!mlen && !max)
1593                e4b->bd_info->bb_fragments--;
1594
1595        /* let's maintain buddy itself */
1596        while (len) {
1597                ord = mb_find_order_for_block(e4b, start);
1598
1599                if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1600                        /* the whole chunk may be allocated at once! */
1601                        mlen = 1 << ord;
1602                        buddy = mb_find_buddy(e4b, ord, &max);
1603                        BUG_ON((start >> ord) >= max);
1604                        mb_set_bit(start >> ord, buddy);
1605                        e4b->bd_info->bb_counters[ord]--;
1606                        start += mlen;
1607                        len -= mlen;
1608                        BUG_ON(len < 0);
1609                        continue;
1610                }
1611
1612                /* store for history */
1613                if (ret == 0)
1614                        ret = len | (ord << 16);
1615
1616                /* we have to split large buddy */
1617                BUG_ON(ord <= 0);
1618                buddy = mb_find_buddy(e4b, ord, &max);
1619                mb_set_bit(start >> ord, buddy);
1620                e4b->bd_info->bb_counters[ord]--;
1621
1622                ord--;
1623                cur = (start >> ord) & ~1U;
1624                buddy = mb_find_buddy(e4b, ord, &max);
1625                mb_clear_bit(cur, buddy);
1626                mb_clear_bit(cur + 1, buddy);
1627                e4b->bd_info->bb_counters[ord]++;
1628                e4b->bd_info->bb_counters[ord]++;
1629        }
1630        mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1631
1632        ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
1633        mb_check_buddy(e4b);
1634
1635        return ret;
1636}
1637
1638/*
1639 * Must be called under group lock!
1640 */
1641static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1642                                        struct ext4_buddy *e4b)
1643{
1644        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1645        int ret;
1646
1647        BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
1648        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1649
1650        ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
1651        ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
1652        ret = mb_mark_used(e4b, &ac->ac_b_ex);
1653
1654        /* preallocation can change ac_b_ex, thus we store actually
1655         * allocated blocks for history */
1656        ac->ac_f_ex = ac->ac_b_ex;
1657
1658        ac->ac_status = AC_STATUS_FOUND;
1659        ac->ac_tail = ret & 0xffff;
1660        ac->ac_buddy = ret >> 16;
1661
1662        /*
1663         * take the page reference. We want the page to be pinned
1664         * so that we don't get a ext4_mb_init_cache_call for this
1665         * group until we update the bitmap. That would mean we
1666         * double allocate blocks. The reference is dropped
1667         * in ext4_mb_release_context
1668         */
1669        ac->ac_bitmap_page = e4b->bd_bitmap_page;
1670        get_page(ac->ac_bitmap_page);
1671        ac->ac_buddy_page = e4b->bd_buddy_page;
1672        get_page(ac->ac_buddy_page);
1673        /* store last allocated for subsequent stream allocation */
1674        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1675                spin_lock(&sbi->s_md_lock);
1676                sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1677                sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
1678                spin_unlock(&sbi->s_md_lock);
1679        }
1680}
1681
1682/*
1683 * regular allocator, for general purposes allocation
1684 */
1685
1686static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1687                                        struct ext4_buddy *e4b,
1688                                        int finish_group)
1689{
1690        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1691        struct ext4_free_extent *bex = &ac->ac_b_ex;
1692        struct ext4_free_extent *gex = &ac->ac_g_ex;
1693        struct ext4_free_extent ex;
1694        int max;
1695
1696        if (ac->ac_status == AC_STATUS_FOUND)
1697                return;
1698        /*
1699         * We don't want to scan for a whole year
1700         */
1701        if (ac->ac_found > sbi->s_mb_max_to_scan &&
1702                        !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1703                ac->ac_status = AC_STATUS_BREAK;
1704                return;
1705        }
1706
1707        /*
1708         * Haven't found good chunk so far, let's continue
1709         */
1710        if (bex->fe_len < gex->fe_len)
1711                return;
1712
1713        if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
1714                        && bex->fe_group == e4b->bd_group) {
1715                /* recheck chunk's availability - we don't know
1716                 * when it was found (within this lock-unlock
1717                 * period or not) */
1718                max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
1719                if (max >= gex->fe_len) {
1720                        ext4_mb_use_best_found(ac, e4b);
1721                        return;
1722                }
1723        }
1724}
1725
1726/*
1727 * The routine checks whether found extent is good enough. If it is,
1728 * then the extent gets marked used and flag is set to the context
1729 * to stop scanning. Otherwise, the extent is compared with the
1730 * previous found extent and if new one is better, then it's stored
1731 * in the context. Later, the best found extent will be used, if
1732 * mballoc can't find good enough extent.
1733 *
1734 * FIXME: real allocation policy is to be designed yet!
1735 */
1736static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1737                                        struct ext4_free_extent *ex,
1738                                        struct ext4_buddy *e4b)
1739{
1740        struct ext4_free_extent *bex = &ac->ac_b_ex;
1741        struct ext4_free_extent *gex = &ac->ac_g_ex;
1742
1743        BUG_ON(ex->fe_len <= 0);
1744        BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1745        BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1746        BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1747
1748        ac->ac_found++;
1749
1750        /*
1751         * The special case - take what you catch first
1752         */
1753        if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1754                *bex = *ex;
1755                ext4_mb_use_best_found(ac, e4b);
1756                return;
1757        }
1758
1759        /*
1760         * Let's check whether the chuck is good enough
1761         */
1762        if (ex->fe_len == gex->fe_len) {
1763                *bex = *ex;
1764                ext4_mb_use_best_found(ac, e4b);
1765                return;
1766        }
1767
1768        /*
1769         * If this is first found extent, just store it in the context
1770         */
1771        if (bex->fe_len == 0) {
1772                *bex = *ex;
1773                return;
1774        }
1775
1776        /*
1777         * If new found extent is better, store it in the context
1778         */
1779        if (bex->fe_len < gex->fe_len) {
1780                /* if the request isn't satisfied, any found extent
1781                 * larger than previous best one is better */
1782                if (ex->fe_len > bex->fe_len)
1783                        *bex = *ex;
1784        } else if (ex->fe_len > gex->fe_len) {
1785                /* if the request is satisfied, then we try to find
1786                 * an extent that still satisfy the request, but is
1787                 * smaller than previous one */
1788                if (ex->fe_len < bex->fe_len)
1789                        *bex = *ex;
1790        }
1791
1792        ext4_mb_check_limits(ac, e4b, 0);
1793}
1794
1795static noinline_for_stack
1796int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1797                                        struct ext4_buddy *e4b)
1798{
1799        struct ext4_free_extent ex = ac->ac_b_ex;
1800        ext4_group_t group = ex.fe_group;
1801        int max;
1802        int err;
1803
1804        BUG_ON(ex.fe_len <= 0);
1805        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1806        if (err)
1807                return err;
1808
1809        ext4_lock_group(ac->ac_sb, group);
1810        max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
1811
1812        if (max > 0) {
1813                ac->ac_b_ex = ex;
1814                ext4_mb_use_best_found(ac, e4b);
1815        }
1816
1817        ext4_unlock_group(ac->ac_sb, group);
1818        ext4_mb_unload_buddy(e4b);
1819
1820        return 0;
1821}
1822
1823static noinline_for_stack
1824int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1825                                struct ext4_buddy *e4b)
1826{
1827        ext4_group_t group = ac->ac_g_ex.fe_group;
1828        int max;
1829        int err;
1830        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1831        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1832        struct ext4_free_extent ex;
1833
1834        if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1835                return 0;
1836        if (grp->bb_free == 0)
1837                return 0;
1838
1839        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1840        if (err)
1841                return err;
1842
1843        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
1844                ext4_mb_unload_buddy(e4b);
1845                return 0;
1846        }
1847
1848        ext4_lock_group(ac->ac_sb, group);
1849        max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
1850                             ac->ac_g_ex.fe_len, &ex);
1851        ex.fe_logical = 0xDEADFA11; /* debug value */
1852
1853        if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1854                ext4_fsblk_t start;
1855
1856                start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1857                        ex.fe_start;
1858                /* use do_div to get remainder (would be 64-bit modulo) */
1859                if (do_div(start, sbi->s_stripe) == 0) {
1860                        ac->ac_found++;
1861                        ac->ac_b_ex = ex;
1862                        ext4_mb_use_best_found(ac, e4b);
1863                }
1864        } else if (max >= ac->ac_g_ex.fe_len) {
1865                BUG_ON(ex.fe_len <= 0);
1866                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1867                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1868                ac->ac_found++;
1869                ac->ac_b_ex = ex;
1870                ext4_mb_use_best_found(ac, e4b);
1871        } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
1872                /* Sometimes, caller may want to merge even small
1873                 * number of blocks to an existing extent */
1874                BUG_ON(ex.fe_len <= 0);
1875                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1876                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1877                ac->ac_found++;
1878                ac->ac_b_ex = ex;
1879                ext4_mb_use_best_found(ac, e4b);
1880        }
1881        ext4_unlock_group(ac->ac_sb, group);
1882        ext4_mb_unload_buddy(e4b);
1883
1884        return 0;
1885}
1886
1887/*
1888 * The routine scans buddy structures (not bitmap!) from given order
1889 * to max order and tries to find big enough chunk to satisfy the req
1890 */
1891static noinline_for_stack
1892void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1893                                        struct ext4_buddy *e4b)
1894{
1895        struct super_block *sb = ac->ac_sb;
1896        struct ext4_group_info *grp = e4b->bd_info;
1897        void *buddy;
1898        int i;
1899        int k;
1900        int max;
1901
1902        BUG_ON(ac->ac_2order <= 0);
1903        for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
1904                if (grp->bb_counters[i] == 0)
1905                        continue;
1906
1907                buddy = mb_find_buddy(e4b, i, &max);
1908                BUG_ON(buddy == NULL);
1909
1910                k = mb_find_next_zero_bit(buddy, max, 0);
1911                BUG_ON(k >= max);
1912
1913                ac->ac_found++;
1914
1915                ac->ac_b_ex.fe_len = 1 << i;
1916                ac->ac_b_ex.fe_start = k << i;
1917                ac->ac_b_ex.fe_group = e4b->bd_group;
1918
1919                ext4_mb_use_best_found(ac, e4b);
1920
1921                BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1922
1923                if (EXT4_SB(sb)->s_mb_stats)
1924                        atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
1925
1926                break;
1927        }
1928}
1929
1930/*
1931 * The routine scans the group and measures all found extents.
1932 * In order to optimize scanning, caller must pass number of
1933 * free blocks in the group, so the routine can know upper limit.
1934 */
1935static noinline_for_stack
1936void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1937                                        struct ext4_buddy *e4b)
1938{
1939        struct super_block *sb = ac->ac_sb;
1940        void *bitmap = e4b->bd_bitmap;
1941        struct ext4_free_extent ex;
1942        int i;
1943        int free;
1944
1945        free = e4b->bd_info->bb_free;
1946        BUG_ON(free <= 0);
1947
1948        i = e4b->bd_info->bb_first_free;
1949
1950        while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1951                i = mb_find_next_zero_bit(bitmap,
1952                                                EXT4_CLUSTERS_PER_GROUP(sb), i);
1953                if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
1954                        /*
1955                         * IF we have corrupt bitmap, we won't find any
1956                         * free blocks even though group info says we
1957                         * we have free blocks
1958                         */
1959                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1960                                        "%d free clusters as per "
1961                                        "group info. But bitmap says 0",
1962                                        free);
1963                        break;
1964                }
1965
1966                mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
1967                BUG_ON(ex.fe_len <= 0);
1968                if (free < ex.fe_len) {
1969                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1970                                        "%d free clusters as per "
1971                                        "group info. But got %d blocks",
1972                                        free, ex.fe_len);
1973                        /*
1974                         * The number of free blocks differs. This mostly
1975                         * indicate that the bitmap is corrupt. So exit
1976                         * without claiming the space.
1977                         */
1978                        break;
1979                }
1980                ex.fe_logical = 0xDEADC0DE; /* debug value */
1981                ext4_mb_measure_extent(ac, &ex, e4b);
1982
1983                i += ex.fe_len;
1984                free -= ex.fe_len;
1985        }
1986
1987        ext4_mb_check_limits(ac, e4b, 1);
1988}
1989
1990/*
1991 * This is a special case for storages like raid5
1992 * we try to find stripe-aligned chunks for stripe-size-multiple requests
1993 */
1994static noinline_for_stack
1995void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
1996                                 struct ext4_buddy *e4b)
1997{
1998        struct super_block *sb = ac->ac_sb;
1999        struct ext4_sb_info *sbi = EXT4_SB(sb);
2000        void *bitmap = e4b->bd_bitmap;
2001        struct ext4_free_extent ex;
2002        ext4_fsblk_t first_group_block;
2003        ext4_fsblk_t a;
2004        ext4_grpblk_t i;
2005        int max;
2006
2007        BUG_ON(sbi->s_stripe == 0);
2008
2009        /* find first stripe-aligned block in group */
2010        first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
2011
2012        a = first_group_block + sbi->s_stripe - 1;
2013        do_div(a, sbi->s_stripe);
2014        i = (a * sbi->s_stripe) - first_group_block;
2015
2016        while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
2017                if (!mb_test_bit(i, bitmap)) {
2018                        max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
2019                        if (max >= sbi->s_stripe) {
2020                                ac->ac_found++;
2021                                ex.fe_logical = 0xDEADF00D; /* debug value */
2022                                ac->ac_b_ex = ex;
2023                                ext4_mb_use_best_found(ac, e4b);
2024                                break;
2025                        }
2026                }
2027                i += sbi->s_stripe;
2028        }
2029}
2030
2031/*
2032 * This is now called BEFORE we load the buddy bitmap.
2033 * Returns either 1 or 0 indicating that the group is either suitable
2034 * for the allocation or not. In addition it can also return negative
2035 * error code when something goes wrong.
2036 */
2037static int ext4_mb_good_group(struct ext4_allocation_context *ac,
2038                                ext4_group_t group, int cr)
2039{
2040        unsigned free, fragments;
2041        int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
2042        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2043
2044        BUG_ON(cr < 0 || cr >= 4);
2045
2046        free = grp->bb_free;
2047        if (free == 0)
2048                return 0;
2049        if (cr <= 2 && free < ac->ac_g_ex.fe_len)
2050                return 0;
2051
2052        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2053                return 0;
2054
2055        /* We only do this if the grp has never been initialized */
2056        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2057                int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
2058                if (ret)
2059                        return ret;
2060        }
2061
2062        fragments = grp->bb_fragments;
2063        if (fragments == 0)
2064                return 0;
2065
2066        switch (cr) {
2067        case 0:
2068                BUG_ON(ac->ac_2order == 0);
2069
2070                /* Avoid using the first bg of a flexgroup for data files */
2071                if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
2072                    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
2073                    ((group % flex_size) == 0))
2074                        return 0;
2075
2076                if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
2077                    (free / fragments) >= ac->ac_g_ex.fe_len)
2078                        return 1;
2079
2080                if (grp->bb_largest_free_order < ac->ac_2order)
2081                        return 0;
2082
2083                return 1;
2084        case 1:
2085                if ((free / fragments) >= ac->ac_g_ex.fe_len)
2086                        return 1;
2087                break;
2088        case 2:
2089                if (free >= ac->ac_g_ex.fe_len)
2090                        return 1;
2091                break;
2092        case 3:
2093                return 1;
2094        default:
2095                BUG();
2096        }
2097
2098        return 0;
2099}
2100
2101static noinline_for_stack int
2102ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2103{
2104        ext4_group_t ngroups, group, i;
2105        int cr;
2106        int err = 0, first_err = 0;
2107        struct ext4_sb_info *sbi;
2108        struct super_block *sb;
2109        struct ext4_buddy e4b;
2110
2111        sb = ac->ac_sb;
2112        sbi = EXT4_SB(sb);
2113        ngroups = ext4_get_groups_count(sb);
2114        /* non-extent files are limited to low blocks/groups */
2115        if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
2116                ngroups = sbi->s_blockfile_groups;
2117
2118        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2119
2120        /* first, try the goal */
2121        err = ext4_mb_find_by_goal(ac, &e4b);
2122        if (err || ac->ac_status == AC_STATUS_FOUND)
2123                goto out;
2124
2125        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2126                goto out;
2127
2128        /*
2129         * ac->ac2_order is set only if the fe_len is a power of 2
2130         * if ac2_order is set we also set criteria to 0 so that we
2131         * try exact allocation using buddy.
2132         */
2133        i = fls(ac->ac_g_ex.fe_len);
2134        ac->ac_2order = 0;
2135        /*
2136         * We search using buddy data only if the order of the request
2137         * is greater than equal to the sbi_s_mb_order2_reqs
2138         * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
2139         */
2140        if (i >= sbi->s_mb_order2_reqs) {
2141                /*
2142                 * This should tell if fe_len is exactly power of 2
2143                 */
2144                if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2145                        ac->ac_2order = i - 1;
2146        }
2147
2148        /* if stream allocation is enabled, use global goal */
2149        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2150                /* TBD: may be hot point */
2151                spin_lock(&sbi->s_md_lock);
2152                ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2153                ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2154                spin_unlock(&sbi->s_md_lock);
2155        }
2156
2157        /* Let's just scan groups to find more-less suitable blocks */
2158        cr = ac->ac_2order ? 0 : 1;
2159        /*
2160         * cr == 0 try to get exact allocation,
2161         * cr == 3  try to get anything
2162         */
2163repeat:
2164        for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2165                ac->ac_criteria = cr;
2166                /*
2167                 * searching for the right group start
2168                 * from the goal value specified
2169                 */
2170                group = ac->ac_g_ex.fe_group;
2171
2172                for (i = 0; i < ngroups; group++, i++) {
2173                        int ret = 0;
2174                        cond_resched();
2175                        /*
2176                         * Artificially restricted ngroups for non-extent
2177                         * files makes group > ngroups possible on first loop.
2178                         */
2179                        if (group >= ngroups)
2180                                group = 0;
2181
2182                        /* This now checks without needing the buddy page */
2183                        ret = ext4_mb_good_group(ac, group, cr);
2184                        if (ret <= 0) {
2185                                if (!first_err)
2186                                        first_err = ret;
2187                                continue;
2188                        }
2189
2190                        err = ext4_mb_load_buddy(sb, group, &e4b);
2191                        if (err)
2192                                goto out;
2193
2194                        ext4_lock_group(sb, group);
2195
2196                        /*
2197                         * We need to check again after locking the
2198                         * block group
2199                         */
2200                        ret = ext4_mb_good_group(ac, group, cr);
2201                        if (ret <= 0) {
2202                                ext4_unlock_group(sb, group);
2203                                ext4_mb_unload_buddy(&e4b);
2204                                if (!first_err)
2205                                        first_err = ret;
2206                                continue;
2207                        }
2208
2209                        ac->ac_groups_scanned++;
2210                        if (cr == 0 && ac->ac_2order < sb->s_blocksize_bits+2)
2211                                ext4_mb_simple_scan_group(ac, &e4b);
2212                        else if (cr == 1 && sbi->s_stripe &&
2213                                        !(ac->ac_g_ex.fe_len % sbi->s_stripe))
2214                                ext4_mb_scan_aligned(ac, &e4b);
2215                        else
2216                                ext4_mb_complex_scan_group(ac, &e4b);
2217
2218                        ext4_unlock_group(sb, group);
2219                        ext4_mb_unload_buddy(&e4b);
2220
2221                        if (ac->ac_status != AC_STATUS_CONTINUE)
2222                                break;
2223                }
2224        }
2225
2226        if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2227            !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2228                /*
2229                 * We've been searching too long. Let's try to allocate
2230                 * the best chunk we've found so far
2231                 */
2232
2233                ext4_mb_try_best_found(ac, &e4b);
2234                if (ac->ac_status != AC_STATUS_FOUND) {
2235                        /*
2236                         * Someone more lucky has already allocated it.
2237                         * The only thing we can do is just take first
2238                         * found block(s)
2239                        printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
2240                         */
2241                        ac->ac_b_ex.fe_group = 0;
2242                        ac->ac_b_ex.fe_start = 0;
2243                        ac->ac_b_ex.fe_len = 0;
2244                        ac->ac_status = AC_STATUS_CONTINUE;
2245                        ac->ac_flags |= EXT4_MB_HINT_FIRST;
2246                        cr = 3;
2247                        atomic_inc(&sbi->s_mb_lost_chunks);
2248                        goto repeat;
2249                }
2250        }
2251out:
2252        if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
2253                err = first_err;
2254        return err;
2255}
2256
2257static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2258{
2259        struct super_block *sb = seq->private;
2260        ext4_group_t group;
2261
2262        if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2263                return NULL;
2264        group = *pos + 1;
2265        return (void *) ((unsigned long) group);
2266}
2267
2268static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2269{
2270        struct super_block *sb = seq->private;
2271        ext4_group_t group;
2272
2273        ++*pos;
2274        if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2275                return NULL;
2276        group = *pos + 1;
2277        return (void *) ((unsigned long) group);
2278}
2279
2280static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2281{
2282        struct super_block *sb = seq->private;
2283        ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2284        int i;
2285        int err, buddy_loaded = 0;
2286        struct ext4_buddy e4b;
2287        struct ext4_group_info *grinfo;
2288        struct sg {
2289                struct ext4_group_info info;
2290                ext4_grpblk_t counters[16];
2291        } sg;
2292
2293        group--;
2294        if (group == 0)
2295                seq_puts(seq, "#group: free  frags first ["
2296                              " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
2297                              " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");
2298
2299        i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2300                sizeof(struct ext4_group_info);
2301        grinfo = ext4_get_group_info(sb, group);
2302        /* Load the group info in memory only if not already loaded. */
2303        if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
2304                err = ext4_mb_load_buddy(sb, group, &e4b);
2305                if (err) {
2306                        seq_printf(seq, "#%-5u: I/O error\n", group);
2307                        return 0;
2308                }
2309                buddy_loaded = 1;
2310        }
2311
2312        memcpy(&sg, ext4_get_group_info(sb, group), i);
2313
2314        if (buddy_loaded)
2315                ext4_mb_unload_buddy(&e4b);
2316
2317        seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2318                        sg.info.bb_fragments, sg.info.bb_first_free);
2319        for (i = 0; i <= 13; i++)
2320                seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ?
2321                                sg.info.bb_counters[i] : 0);
2322        seq_printf(seq, " ]\n");
2323
2324        return 0;
2325}
2326
2327static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2328{
2329}
2330
2331static const struct seq_operations ext4_mb_seq_groups_ops = {
2332        .start  = ext4_mb_seq_groups_start,
2333        .next   = ext4_mb_seq_groups_next,
2334        .stop   = ext4_mb_seq_groups_stop,
2335        .show   = ext4_mb_seq_groups_show,
2336};
2337
2338static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2339{
2340        struct super_block *sb = PDE_DATA(inode);
2341        int rc;
2342
2343        rc = seq_open(file, &ext4_mb_seq_groups_ops);
2344        if (rc == 0) {
2345                struct seq_file *m = file->private_data;
2346                m->private = sb;
2347        }
2348        return rc;
2349
2350}
2351
2352const struct file_operations ext4_seq_mb_groups_fops = {
2353        .open           = ext4_mb_seq_groups_open,
2354        .read           = seq_read,
2355        .llseek         = seq_lseek,
2356        .release        = seq_release,
2357};
2358
2359static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2360{
2361        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2362        struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2363
2364        BUG_ON(!cachep);
2365        return cachep;
2366}
2367
2368/*
2369 * Allocate the top-level s_group_info array for the specified number
2370 * of groups
2371 */
2372int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
2373{
2374        struct ext4_sb_info *sbi = EXT4_SB(sb);
2375        unsigned size;
2376        struct ext4_group_info ***new_groupinfo;
2377
2378        size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
2379                EXT4_DESC_PER_BLOCK_BITS(sb);
2380        if (size <= sbi->s_group_info_size)
2381                return 0;
2382
2383        size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
2384        new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL);
2385        if (!new_groupinfo) {
2386                ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2387                return -ENOMEM;
2388        }
2389        if (sbi->s_group_info) {
2390                memcpy(new_groupinfo, sbi->s_group_info,
2391                       sbi->s_group_info_size * sizeof(*sbi->s_group_info));
2392                kvfree(sbi->s_group_info);
2393        }
2394        sbi->s_group_info = new_groupinfo;
2395        sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
2396        ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 
2397                   sbi->s_group_info_size);
2398        return 0;
2399}
2400
2401/* Create and initialize ext4_group_info data for the given group. */
2402int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2403                          struct ext4_group_desc *desc)
2404{
2405        int i;
2406        int metalen = 0;
2407        struct ext4_sb_info *sbi = EXT4_SB(sb);
2408        struct ext4_group_info **meta_group_info;
2409        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2410
2411        /*
2412         * First check if this group is the first of a reserved block.
2413         * If it's true, we have to allocate a new table of pointers
2414         * to ext4_group_info structures
2415         */
2416        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2417                metalen = sizeof(*meta_group_info) <<
2418                        EXT4_DESC_PER_BLOCK_BITS(sb);
2419                meta_group_info = kmalloc(metalen, GFP_NOFS);
2420                if (meta_group_info == NULL) {
2421                        ext4_msg(sb, KERN_ERR, "can't allocate mem "
2422                                 "for a buddy group");
2423                        goto exit_meta_group_info;
2424                }
2425                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
2426                        meta_group_info;
2427        }
2428
2429        meta_group_info =
2430                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2431        i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2432
2433        meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
2434        if (meta_group_info[i] == NULL) {
2435                ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2436                goto exit_group_info;
2437        }
2438        set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2439                &(meta_group_info[i]->bb_state));
2440
2441        /*
2442         * initialize bb_free to be able to skip
2443         * empty groups without initialization
2444         */
2445        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2446                meta_group_info[i]->bb_free =
2447                        ext4_free_clusters_after_init(sb, group, desc);
2448        } else {
2449                meta_group_info[i]->bb_free =
2450                        ext4_free_group_clusters(sb, desc);
2451        }
2452
2453        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2454        init_rwsem(&meta_group_info[i]->alloc_sem);
2455        meta_group_info[i]->bb_free_root = RB_ROOT;
2456        meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
2457
2458#ifdef DOUBLE_CHECK
2459        {
2460                struct buffer_head *bh;
2461                meta_group_info[i]->bb_bitmap =
2462                        kmalloc(sb->s_blocksize, GFP_NOFS);
2463                BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2464                bh = ext4_read_block_bitmap(sb, group);
2465                BUG_ON(IS_ERR_OR_NULL(bh));
2466                memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2467                        sb->s_blocksize);
2468                put_bh(bh);
2469        }
2470#endif
2471
2472        return 0;
2473
2474exit_group_info:
2475        /* If a meta_group_info table has been allocated, release it now */
2476        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2477                kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
2478                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
2479        }
2480exit_meta_group_info:
2481        return -ENOMEM;
2482} /* ext4_mb_add_groupinfo */
2483
2484static int ext4_mb_init_backend(struct super_block *sb)
2485{
2486        ext4_group_t ngroups = ext4_get_groups_count(sb);
2487        ext4_group_t i;
2488        struct ext4_sb_info *sbi = EXT4_SB(sb);
2489        int err;
2490        struct ext4_group_desc *desc;
2491        struct kmem_cache *cachep;
2492
2493        err = ext4_mb_alloc_groupinfo(sb, ngroups);
2494        if (err)
2495                return err;
2496
2497        sbi->s_buddy_cache = new_inode(sb);
2498        if (sbi->s_buddy_cache == NULL) {
2499                ext4_msg(sb, KERN_ERR, "can't get new inode");
2500                goto err_freesgi;
2501        }
2502        /* To avoid potentially colliding with an valid on-disk inode number,
2503         * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
2504         * not in the inode hash, so it should never be found by iget(), but
2505         * this will avoid confusion if it ever shows up during debugging. */
2506        sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
2507        EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2508        for (i = 0; i < ngroups; i++) {
2509                desc = ext4_get_group_desc(sb, i, NULL);
2510                if (desc == NULL) {
2511                        ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
2512                        goto err_freebuddy;
2513                }
2514                if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
2515                        goto err_freebuddy;
2516        }
2517
2518        return 0;
2519
2520err_freebuddy:
2521        cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2522        while (i-- > 0)
2523                kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2524        i = sbi->s_group_info_size;
2525        while (i-- > 0)
2526                kfree(sbi->s_group_info[i]);
2527        iput(sbi->s_buddy_cache);
2528err_freesgi:
2529        kvfree(sbi->s_group_info);
2530        return -ENOMEM;
2531}
2532
2533static void ext4_groupinfo_destroy_slabs(void)
2534{
2535        int i;
2536
2537        for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2538                if (ext4_groupinfo_caches[i])
2539                        kmem_cache_destroy(ext4_groupinfo_caches[i]);
2540                ext4_groupinfo_caches[i] = NULL;
2541        }
2542}
2543
2544static int ext4_groupinfo_create_slab(size_t size)
2545{
2546        static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2547        int slab_size;
2548        int blocksize_bits = order_base_2(size);
2549        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2550        struct kmem_cache *cachep;
2551
2552        if (cache_index >= NR_GRPINFO_CACHES)
2553                return -EINVAL;
2554
2555        if (unlikely(cache_index < 0))
2556                cache_index = 0;
2557
2558        mutex_lock(&ext4_grpinfo_slab_create_mutex);
2559        if (ext4_groupinfo_caches[cache_index]) {
2560                mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2561                return 0;       /* Already created */
2562        }
2563
2564        slab_size = offsetof(struct ext4_group_info,
2565                                bb_counters[blocksize_bits + 2]);
2566
2567        cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2568                                        slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2569                                        NULL);
2570
2571        ext4_groupinfo_caches[cache_index] = cachep;
2572
2573        mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2574        if (!cachep) {
2575                printk(KERN_EMERG
2576                       "EXT4-fs: no memory for groupinfo slab cache\n");
2577                return -ENOMEM;
2578        }
2579
2580        return 0;
2581}
2582
2583int ext4_mb_init(struct super_block *sb)
2584{
2585        struct ext4_sb_info *sbi = EXT4_SB(sb);
2586        unsigned i, j;
2587        unsigned offset, offset_incr;
2588        unsigned max;
2589        int ret;
2590
2591        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2592
2593        sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2594        if (sbi->s_mb_offsets == NULL) {
2595                ret = -ENOMEM;
2596                goto out;
2597        }
2598
2599        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2600        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2601        if (sbi->s_mb_maxs == NULL) {
2602                ret = -ENOMEM;
2603                goto out;
2604        }
2605
2606        ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2607        if (ret < 0)
2608                goto out;
2609
2610        /* order 0 is regular bitmap */
2611        sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2612        sbi->s_mb_offsets[0] = 0;
2613
2614        i = 1;
2615        offset = 0;
2616        offset_incr = 1 << (sb->s_blocksize_bits - 1);
2617        max = sb->s_blocksize << 2;
2618        do {
2619                sbi->s_mb_offsets[i] = offset;
2620                sbi->s_mb_maxs[i] = max;
2621                offset += offset_incr;
2622                offset_incr = offset_incr >> 1;
2623                max = max >> 1;
2624                i++;
2625        } while (i <= sb->s_blocksize_bits + 1);
2626
2627        spin_lock_init(&sbi->s_md_lock);
2628        spin_lock_init(&sbi->s_bal_lock);
2629        sbi->s_mb_free_pending = 0;
2630
2631        sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
2632        sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
2633        sbi->s_mb_stats = MB_DEFAULT_STATS;
2634        sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2635        sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2636        /*
2637         * The default group preallocation is 512, which for 4k block
2638         * sizes translates to 2 megabytes.  However for bigalloc file
2639         * systems, this is probably too big (i.e, if the cluster size
2640         * is 1 megabyte, then group preallocation size becomes half a
2641         * gigabyte!).  As a default, we will keep a two megabyte
2642         * group pralloc size for cluster sizes up to 64k, and after
2643         * that, we will force a minimum group preallocation size of
2644         * 32 clusters.  This translates to 8 megs when the cluster
2645         * size is 256k, and 32 megs when the cluster size is 1 meg,
2646         * which seems reasonable as a default.
2647         */
2648        sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
2649                                       sbi->s_cluster_bits, 32);
2650        /*
2651         * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
2652         * to the lowest multiple of s_stripe which is bigger than
2653         * the s_mb_group_prealloc as determined above. We want
2654         * the preallocation size to be an exact multiple of the
2655         * RAID stripe size so that preallocations don't fragment
2656         * the stripes.
2657         */
2658        if (sbi->s_stripe > 1) {
2659                sbi->s_mb_group_prealloc = roundup(
2660                        sbi->s_mb_group_prealloc, sbi->s_stripe);
2661        }
2662
2663        sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2664        if (sbi->s_locality_groups == NULL) {
2665                ret = -ENOMEM;
2666                goto out;
2667        }
2668        for_each_possible_cpu(i) {
2669                struct ext4_locality_group *lg;
2670                lg = per_cpu_ptr(sbi->s_locality_groups, i);
2671                mutex_init(&lg->lg_mutex);
2672                for (j = 0; j < PREALLOC_TB_SIZE; j++)
2673                        INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
2674                spin_lock_init(&lg->lg_prealloc_lock);
2675        }
2676
2677        /* init file for buddy data */
2678        ret = ext4_mb_init_backend(sb);
2679        if (ret != 0)
2680                goto out_free_locality_groups;
2681
2682        return 0;
2683
2684out_free_locality_groups:
2685        free_percpu(sbi->s_locality_groups);
2686        sbi->s_locality_groups = NULL;
2687out:
2688        kfree(sbi->s_mb_offsets);
2689        sbi->s_mb_offsets = NULL;
2690        kfree(sbi->s_mb_maxs);
2691        sbi->s_mb_maxs = NULL;
2692        return ret;
2693}
2694
2695/* need to called with the ext4 group lock held */
2696static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2697{
2698        struct ext4_prealloc_space *pa;
2699        struct list_head *cur, *tmp;
2700        int count = 0;
2701
2702        list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
2703                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
2704                list_del(&pa->pa_group_list);
2705                count++;
2706                kmem_cache_free(ext4_pspace_cachep, pa);
2707        }
2708        if (count)
2709                mb_debug(1, "mballoc: %u PAs left\n", count);
2710
2711}
2712
2713int ext4_mb_release(struct super_block *sb)
2714{
2715        ext4_group_t ngroups = ext4_get_groups_count(sb);
2716        ext4_group_t i;
2717        int num_meta_group_infos;
2718        struct ext4_group_info *grinfo;
2719        struct ext4_sb_info *sbi = EXT4_SB(sb);
2720        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2721
2722        if (sbi->s_group_info) {
2723                for (i = 0; i < ngroups; i++) {
2724                        grinfo = ext4_get_group_info(sb, i);
2725#ifdef DOUBLE_CHECK
2726                        kfree(grinfo->bb_bitmap);
2727#endif
2728                        ext4_lock_group(sb, i);
2729                        ext4_mb_cleanup_pa(grinfo);
2730                        ext4_unlock_group(sb, i);
2731                        kmem_cache_free(cachep, grinfo);
2732                }
2733                num_meta_group_infos = (ngroups +
2734                                EXT4_DESC_PER_BLOCK(sb) - 1) >>
2735                        EXT4_DESC_PER_BLOCK_BITS(sb);
2736                for (i = 0; i < num_meta_group_infos; i++)
2737                        kfree(sbi->s_group_info[i]);
2738                kvfree(sbi->s_group_info);
2739        }
2740        kfree(sbi->s_mb_offsets);
2741        kfree(sbi->s_mb_maxs);
2742        iput(sbi->s_buddy_cache);
2743        if (sbi->s_mb_stats) {
2744                ext4_msg(sb, KERN_INFO,
2745                       "mballoc: %u blocks %u reqs (%u success)",
2746                                atomic_read(&sbi->s_bal_allocated),
2747                                atomic_read(&sbi->s_bal_reqs),
2748                                atomic_read(&sbi->s_bal_success));
2749                ext4_msg(sb, KERN_INFO,
2750                      "mballoc: %u extents scanned, %u goal hits, "
2751                                "%u 2^N hits, %u breaks, %u lost",
2752                                atomic_read(&sbi->s_bal_ex_scanned),
2753                                atomic_read(&sbi->s_bal_goals),
2754                                atomic_read(&sbi->s_bal_2orders),
2755                                atomic_read(&sbi->s_bal_breaks),
2756                                atomic_read(&sbi->s_mb_lost_chunks));
2757                ext4_msg(sb, KERN_INFO,
2758                       "mballoc: %lu generated and it took %Lu",
2759                                sbi->s_mb_buddies_generated,
2760                                sbi->s_mb_generation_time);
2761                ext4_msg(sb, KERN_INFO,
2762                       "mballoc: %u preallocated, %u discarded",
2763                                atomic_read(&sbi->s_mb_preallocated),
2764                                atomic_read(&sbi->s_mb_discarded));
2765        }
2766
2767        free_percpu(sbi->s_locality_groups);
2768
2769        return 0;
2770}
2771
2772static inline int ext4_issue_discard(struct super_block *sb,
2773                ext4_group_t block_group, ext4_grpblk_t cluster, int count)
2774{
2775        ext4_fsblk_t discard_block;
2776
2777        discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
2778                         ext4_group_first_block_no(sb, block_group));
2779        count = EXT4_C2B(EXT4_SB(sb), count);
2780        trace_ext4_discard_blocks(sb,
2781                        (unsigned long long) discard_block, count);
2782        return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2783}
2784
2785/*
2786 * This function is called by the jbd2 layer once the commit has finished,
2787 * so we know we can free the blocks that were released with that commit.
2788 */
2789static void ext4_free_data_callback(struct super_block *sb,
2790                                    struct ext4_journal_cb_entry *jce,
2791                                    int rc)
2792{
2793        struct ext4_free_data *entry = (struct ext4_free_data *)jce;
2794        struct ext4_buddy e4b;
2795        struct ext4_group_info *db;
2796        int err, count = 0, count2 = 0;
2797
2798        mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2799                 entry->efd_count, entry->efd_group, entry);
2800
2801        if (test_opt(sb, DISCARD)) {
2802                err = ext4_issue_discard(sb, entry->efd_group,
2803                                         entry->efd_start_cluster,
2804                                         entry->efd_count);
2805                if (err && err != -EOPNOTSUPP)
2806                        ext4_msg(sb, KERN_WARNING, "discard request in"
2807                                 " group:%d block:%d count:%d failed"
2808                                 " with %d", entry->efd_group,
2809                                 entry->efd_start_cluster,
2810                                 entry->efd_count, err);
2811        }
2812
2813        err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
2814        /* we expect to find existing buddy because it's pinned */
2815        BUG_ON(err != 0);
2816
2817        spin_lock(&EXT4_SB(sb)->s_md_lock);
2818        EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
2819        spin_unlock(&EXT4_SB(sb)->s_md_lock);
2820
2821        db = e4b.bd_info;
2822        /* there are blocks to put in buddy to make them really free */
2823        count += entry->efd_count;
2824        count2++;
2825        ext4_lock_group(sb, entry->efd_group);
2826        /* Take it out of per group rb tree */
2827        rb_erase(&entry->efd_node, &(db->bb_free_root));
2828        mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
2829
2830        /*
2831         * Clear the trimmed flag for the group so that the next
2832         * ext4_trim_fs can trim it.
2833         * If the volume is mounted with -o discard, online discard
2834         * is supported and the free blocks will be trimmed online.
2835         */
2836        if (!test_opt(sb, DISCARD))
2837                EXT4_MB_GRP_CLEAR_TRIMMED(db);
2838
2839        if (!db->bb_free_root.rb_node) {
2840                /* No more items in the per group rb tree
2841                 * balance refcounts from ext4_mb_free_metadata()
2842                 */
2843                put_page(e4b.bd_buddy_page);
2844                put_page(e4b.bd_bitmap_page);
2845        }
2846        ext4_unlock_group(sb, entry->efd_group);
2847        kmem_cache_free(ext4_free_data_cachep, entry);
2848        ext4_mb_unload_buddy(&e4b);
2849
2850        mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2851}
2852
2853int __init ext4_init_mballoc(void)
2854{
2855        ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2856                                        SLAB_RECLAIM_ACCOUNT);
2857        if (ext4_pspace_cachep == NULL)
2858                return -ENOMEM;
2859
2860        ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2861                                    SLAB_RECLAIM_ACCOUNT);
2862        if (ext4_ac_cachep == NULL) {
2863                kmem_cache_destroy(ext4_pspace_cachep);
2864                return -ENOMEM;
2865        }
2866
2867        ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
2868                                           SLAB_RECLAIM_ACCOUNT);
2869        if (ext4_free_data_cachep == NULL) {
2870                kmem_cache_destroy(ext4_pspace_cachep);
2871                kmem_cache_destroy(ext4_ac_cachep);
2872                return -ENOMEM;
2873        }
2874        return 0;
2875}
2876
2877void ext4_exit_mballoc(void)
2878{
2879        /*
2880         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2881         * before destroying the slab cache.
2882         */
2883        rcu_barrier();
2884        kmem_cache_destroy(ext4_pspace_cachep);
2885        kmem_cache_destroy(ext4_ac_cachep);
2886        kmem_cache_destroy(ext4_free_data_cachep);
2887        ext4_groupinfo_destroy_slabs();
2888}
2889
2890
2891/*
2892 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
2893 * Returns 0 if success or error code
2894 */
2895static noinline_for_stack int
2896ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2897                                handle_t *handle, unsigned int reserv_clstrs)
2898{
2899        struct buffer_head *bitmap_bh = NULL;
2900        struct ext4_group_desc *gdp;
2901        struct buffer_head *gdp_bh;
2902        struct ext4_sb_info *sbi;
2903        struct super_block *sb;
2904        ext4_fsblk_t block;
2905        int err, len;
2906
2907        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
2908        BUG_ON(ac->ac_b_ex.fe_len <= 0);
2909
2910        sb = ac->ac_sb;
2911        sbi = EXT4_SB(sb);
2912
2913        bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
2914        if (IS_ERR(bitmap_bh)) {
2915                err = PTR_ERR(bitmap_bh);
2916                bitmap_bh = NULL;
2917                goto out_err;
2918        }
2919
2920        BUFFER_TRACE(bitmap_bh, "getting write access");
2921        err = ext4_journal_get_write_access(handle, bitmap_bh);
2922        if (err)
2923                goto out_err;
2924
2925        err = -EIO;
2926        gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
2927        if (!gdp)
2928                goto out_err;
2929
2930        ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2931                        ext4_free_group_clusters(sb, gdp));
2932
2933        BUFFER_TRACE(gdp_bh, "get_write_access");
2934        err = ext4_journal_get_write_access(handle, gdp_bh);
2935        if (err)
2936                goto out_err;
2937
2938        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2939
2940        len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
2941        if (!ext4_data_block_valid(sbi, block, len)) {
2942                ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2943                           "fs metadata", block, block+len);
2944                /* File system mounted not to panic on error
2945                 * Fix the bitmap and return EFSCORRUPTED
2946                 * We leak some of the blocks here.
2947                 */
2948                ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2949                ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2950                              ac->ac_b_ex.fe_len);
2951                ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2952                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2953                if (!err)
2954                        err = -EFSCORRUPTED;
2955                goto out_err;
2956        }
2957
2958        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
2959#ifdef AGGRESSIVE_CHECK
2960        {
2961                int i;
2962                for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
2963                        BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
2964                                                bitmap_bh->b_data));
2965                }
2966        }
2967#endif
2968        ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
2969                      ac->ac_b_ex.fe_len);
2970        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2971                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
2972                ext4_free_group_clusters_set(sb, gdp,
2973                                             ext4_free_clusters_after_init(sb,
2974                                                ac->ac_b_ex.fe_group, gdp));
2975        }
2976        len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
2977        ext4_free_group_clusters_set(sb, gdp, len);
2978        ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
2979        ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
2980
2981        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
2982        percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
2983        /*
2984         * Now reduce the dirty block count also. Should not go negative
2985         */
2986        if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
2987                /* release all the reserved blocks if non delalloc */
2988                percpu_counter_sub(&sbi->s_dirtyclusters_counter,
2989                                   reserv_clstrs);
2990
2991        if (sbi->s_log_groups_per_flex) {
2992                ext4_group_t flex_group = ext4_flex_group(sbi,
2993                                                          ac->ac_b_ex.fe_group);
2994                atomic64_sub(ac->ac_b_ex.fe_len,
2995                             &sbi->s_flex_groups[flex_group].free_clusters);
2996        }
2997
2998        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
2999        if (err)
3000                goto out_err;
3001        err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
3002
3003out_err:
3004        brelse(bitmap_bh);
3005        return err;
3006}
3007
3008/*
3009 * here we normalize request for locality group
3010 * Group request are normalized to s_mb_group_prealloc, which goes to
3011 * s_strip if we set the same via mount option.
3012 * s_mb_group_prealloc can be configured via
3013 * /sys/fs/ext4/<partition>/mb_group_prealloc
3014 *
3015 * XXX: should we try to preallocate more than the group has now?
3016 */
3017static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3018{
3019        struct super_block *sb = ac->ac_sb;
3020        struct ext4_locality_group *lg = ac->ac_lg;
3021
3022        BUG_ON(lg == NULL);
3023        ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3024        mb_debug(1, "#%u: goal %u blocks for locality group\n",
3025                current->pid, ac->ac_g_ex.fe_len);
3026}
3027
3028/*
3029 * Normalization means making request better in terms of
3030 * size and alignment
3031 */
3032static noinline_for_stack void
3033ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3034                                struct ext4_allocation_request *ar)
3035{
3036        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3037        int bsbits, max;
3038        ext4_lblk_t end;
3039        loff_t size, start_off;
3040        loff_t orig_size __maybe_unused;
3041        ext4_lblk_t start;
3042        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3043        struct ext4_prealloc_space *pa;
3044
3045        /* do normalize only data requests, metadata requests
3046           do not need preallocation */
3047        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3048                return;
3049
3050        /* sometime caller may want exact blocks */
3051        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3052                return;
3053
3054        /* caller may indicate that preallocation isn't
3055         * required (it's a tail, for example) */
3056        if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
3057                return;
3058
3059        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
3060                ext4_mb_normalize_group_request(ac);
3061                return ;
3062        }
3063
3064        bsbits = ac->ac_sb->s_blocksize_bits;
3065
3066        /* first, let's learn actual file size
3067         * given current request is allocated */
3068        size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
3069        size = size << bsbits;
3070        if (size < i_size_read(ac->ac_inode))
3071                size = i_size_read(ac->ac_inode);
3072        orig_size = size;
3073
3074        /* max size of free chunks */
3075        max = 2 << bsbits;
3076
3077#define NRL_CHECK_SIZE(req, size, max, chunk_size)      \
3078                (req <= (size) || max <= (chunk_size))
3079
3080        /* first, try to predict filesize */
3081        /* XXX: should this table be tunable? */
3082        start_off = 0;
3083        if (size <= 16 * 1024) {
3084                size = 16 * 1024;
3085        } else if (size <= 32 * 1024) {
3086                size = 32 * 1024;
3087        } else if (size <= 64 * 1024) {
3088                size = 64 * 1024;
3089        } else if (size <= 128 * 1024) {
3090                size = 128 * 1024;
3091        } else if (size <= 256 * 1024) {
3092                size = 256 * 1024;
3093        } else if (size <= 512 * 1024) {
3094                size = 512 * 1024;
3095        } else if (size <= 1024 * 1024) {
3096                size = 1024 * 1024;
3097        } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
3098                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3099                                                (21 - bsbits)) << 21;
3100                size = 2 * 1024 * 1024;
3101        } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
3102                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3103                                                        (22 - bsbits)) << 22;
3104                size = 4 * 1024 * 1024;
3105        } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
3106                                        (8<<20)>>bsbits, max, 8 * 1024)) {
3107                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3108                                                        (23 - bsbits)) << 23;
3109                size = 8 * 1024 * 1024;
3110        } else {
3111                start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
3112                size      = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
3113                                              ac->ac_o_ex.fe_len) << bsbits;
3114        }
3115        size = size >> bsbits;
3116        start = start_off >> bsbits;
3117
3118        /* don't cover already allocated blocks in selected range */
3119        if (ar->pleft && start <= ar->lleft) {
3120                size -= ar->lleft + 1 - start;
3121                start = ar->lleft + 1;
3122        }
3123        if (ar->pright && start + size - 1 >= ar->lright)
3124                size -= start + size - ar->lright;
3125
3126        end = start + size;
3127
3128        /* check we don't cross already preallocated blocks */
3129        rcu_read_lock();
3130        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3131                ext4_lblk_t pa_end;
3132
3133                if (pa->pa_deleted)
3134                        continue;
3135                spin_lock(&pa->pa_lock);
3136                if (pa->pa_deleted) {
3137                        spin_unlock(&pa->pa_lock);
3138                        continue;
3139                }
3140
3141                pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3142                                                  pa->pa_len);
3143
3144                /* PA must not overlap original request */
3145                BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3146                        ac->ac_o_ex.fe_logical < pa->pa_lstart));
3147
3148                /* skip PAs this normalized request doesn't overlap with */
3149                if (pa->pa_lstart >= end || pa_end <= start) {
3150                        spin_unlock(&pa->pa_lock);
3151                        continue;
3152                }
3153                BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3154
3155                /* adjust start or end to be adjacent to this pa */
3156                if (pa_end <= ac->ac_o_ex.fe_logical) {
3157                        BUG_ON(pa_end < start);
3158                        start = pa_end;
3159                } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3160                        BUG_ON(pa->pa_lstart > end);
3161                        end = pa->pa_lstart;
3162                }
3163                spin_unlock(&pa->pa_lock);
3164        }
3165        rcu_read_unlock();
3166        size = end - start;
3167
3168        /* XXX: extra loop to check we really don't overlap preallocations */
3169        rcu_read_lock();
3170        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3171                ext4_lblk_t pa_end;
3172
3173                spin_lock(&pa->pa_lock);
3174                if (pa->pa_deleted == 0) {
3175                        pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3176                                                          pa->pa_len);
3177                        BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3178                }
3179                spin_unlock(&pa->pa_lock);
3180        }
3181        rcu_read_unlock();
3182
3183        if (start + size <= ac->ac_o_ex.fe_logical &&
3184                        start > ac->ac_o_ex.fe_logical) {
3185                ext4_msg(ac->ac_sb, KERN_ERR,
3186                         "start %lu, size %lu, fe_logical %lu",
3187                         (unsigned long) start, (unsigned long) size,
3188                         (unsigned long) ac->ac_o_ex.fe_logical);
3189                BUG();
3190        }
3191        BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
3192
3193        /* now prepare goal request */
3194
3195        /* XXX: is it better to align blocks WRT to logical
3196         * placement or satisfy big request as is */
3197        ac->ac_g_ex.fe_logical = start;
3198        ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
3199
3200        /* define goal start in order to merge */
3201        if (ar->pright && (ar->lright == (start + size))) {
3202                /* merge to the right */
3203                ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3204                                                &ac->ac_f_ex.fe_group,
3205                                                &ac->ac_f_ex.fe_start);
3206                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3207        }
3208        if (ar->pleft && (ar->lleft + 1 == start)) {
3209                /* merge to the left */
3210                ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3211                                                &ac->ac_f_ex.fe_group,
3212                                                &ac->ac_f_ex.fe_start);
3213                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3214        }
3215
3216        mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3217                (unsigned) orig_size, (unsigned) start);
3218}
3219
3220static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3221{
3222        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3223
3224        if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
3225                atomic_inc(&sbi->s_bal_reqs);
3226                atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
3227                if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
3228                        atomic_inc(&sbi->s_bal_success);
3229                atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3230                if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
3231                                ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
3232                        atomic_inc(&sbi->s_bal_goals);
3233                if (ac->ac_found > sbi->s_mb_max_to_scan)
3234                        atomic_inc(&sbi->s_bal_breaks);
3235        }
3236
3237        if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3238                trace_ext4_mballoc_alloc(ac);
3239        else
3240                trace_ext4_mballoc_prealloc(ac);
3241}
3242
3243/*
3244 * Called on failure; free up any blocks from the inode PA for this
3245 * context.  We don't need this for MB_GROUP_PA because we only change
3246 * pa_free in ext4_mb_release_context(), but on failure, we've already
3247 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
3248 */
3249static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3250{
3251        struct ext4_prealloc_space *pa = ac->ac_pa;
3252        struct ext4_buddy e4b;
3253        int err;
3254
3255        if (pa == NULL) {
3256                if (ac->ac_f_ex.fe_len == 0)
3257                        return;
3258                err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
3259                if (err) {
3260                        /*
3261                         * This should never happen since we pin the
3262                         * pages in the ext4_allocation_context so
3263                         * ext4_mb_load_buddy() should never fail.
3264                         */
3265                        WARN(1, "mb_load_buddy failed (%d)", err);
3266                        return;
3267                }
3268                ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3269                mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
3270                               ac->ac_f_ex.fe_len);
3271                ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3272                ext4_mb_unload_buddy(&e4b);
3273                return;
3274        }
3275        if (pa->pa_type == MB_INODE_PA)
3276                pa->pa_free += ac->ac_b_ex.fe_len;
3277}
3278
3279/*
3280 * use blocks preallocated to inode
3281 */
3282static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3283                                struct ext4_prealloc_space *pa)
3284{
3285        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3286        ext4_fsblk_t start;
3287        ext4_fsblk_t end;
3288        int len;
3289
3290        /* found preallocated blocks, use them */
3291        start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3292        end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
3293                  start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
3294        len = EXT4_NUM_B2C(sbi, end - start);
3295        ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3296                                        &ac->ac_b_ex.fe_start);
3297        ac->ac_b_ex.fe_len = len;
3298        ac->ac_status = AC_STATUS_FOUND;
3299        ac->ac_pa = pa;
3300
3301        BUG_ON(start < pa->pa_pstart);
3302        BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
3303        BUG_ON(pa->pa_free < len);
3304        pa->pa_free -= len;
3305
3306        mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3307}
3308
3309/*
3310 * use blocks preallocated to locality group
3311 */
3312static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3313                                struct ext4_prealloc_space *pa)
3314{
3315        unsigned int len = ac->ac_o_ex.fe_len;
3316
3317        ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
3318                                        &ac->ac_b_ex.fe_group,
3319                                        &ac->ac_b_ex.fe_start);
3320        ac->ac_b_ex.fe_len = len;
3321        ac->ac_status = AC_STATUS_FOUND;
3322        ac->ac_pa = pa;
3323
3324        /* we don't correct pa_pstart or pa_plen here to avoid
3325         * possible race when the group is being loaded concurrently
3326         * instead we correct pa later, after blocks are marked
3327         * in on-disk bitmap -- see ext4_mb_release_context()
3328         * Other CPUs are prevented from allocating from this pa by lg_mutex
3329         */
3330        mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3331}
3332
3333/*
3334 * Return the prealloc space that have minimal distance
3335 * from the goal block. @cpa is the prealloc
3336 * space that is having currently known minimal distance
3337 * from the goal block.
3338 */
3339static struct ext4_prealloc_space *
3340ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3341                        struct ext4_prealloc_space *pa,
3342                        struct ext4_prealloc_space *cpa)
3343{
3344        ext4_fsblk_t cur_distance, new_distance;
3345
3346        if (cpa == NULL) {
3347                atomic_inc(&pa->pa_count);
3348                return pa;
3349        }
3350        cur_distance = abs(goal_block - cpa->pa_pstart);
3351        new_distance = abs(goal_block - pa->pa_pstart);
3352
3353        if (cur_distance <= new_distance)
3354                return cpa;
3355
3356        /* drop the previous reference */
3357        atomic_dec(&cpa->pa_count);
3358        atomic_inc(&pa->pa_count);
3359        return pa;
3360}
3361
3362/*
3363 * search goal blocks in preallocated space
3364 */
3365static noinline_for_stack int
3366ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3367{
3368        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3369        int order, i;
3370        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3371        struct ext4_locality_group *lg;
3372        struct ext4_prealloc_space *pa, *cpa = NULL;
3373        ext4_fsblk_t goal_block;
3374
3375        /* only data can be preallocated */
3376        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3377                return 0;
3378
3379        /* first, try per-file preallocation */
3380        rcu_read_lock();
3381        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3382
3383                /* all fields in this condition don't change,
3384                 * so we can skip locking for them */
3385                if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3386                    ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
3387                                               EXT4_C2B(sbi, pa->pa_len)))
3388                        continue;
3389
3390                /* non-extent files can't have physical blocks past 2^32 */
3391                if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3392                    (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
3393                     EXT4_MAX_BLOCK_FILE_PHYS))
3394                        continue;
3395
3396                /* found preallocated blocks, use them */
3397                spin_lock(&pa->pa_lock);
3398                if (pa->pa_deleted == 0 && pa->pa_free) {
3399                        atomic_inc(&pa->pa_count);
3400                        ext4_mb_use_inode_pa(ac, pa);
3401                        spin_unlock(&pa->pa_lock);
3402                        ac->ac_criteria = 10;
3403                        rcu_read_unlock();
3404                        return 1;
3405                }
3406                spin_unlock(&pa->pa_lock);
3407        }
3408        rcu_read_unlock();
3409
3410        /* can we use group allocation? */
3411        if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3412                return 0;
3413
3414        /* inode may have no locality group for some reason */
3415        lg = ac->ac_lg;
3416        if (lg == NULL)
3417                return 0;
3418        order  = fls(ac->ac_o_ex.fe_len) - 1;
3419        if (order > PREALLOC_TB_SIZE - 1)
3420                /* The max size of hash table is PREALLOC_TB_SIZE */
3421                order = PREALLOC_TB_SIZE - 1;
3422
3423        goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3424        /*
3425         * search for the prealloc space that is having
3426         * minimal distance from the goal block.
3427         */
3428        for (i = order; i < PREALLOC_TB_SIZE; i++) {
3429                rcu_read_lock();
3430                list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
3431                                        pa_inode_list) {
3432                        spin_lock(&pa->pa_lock);
3433                        if (pa->pa_deleted == 0 &&
3434                                        pa->pa_free >= ac->ac_o_ex.fe_len) {
3435
3436                                cpa = ext4_mb_check_group_pa(goal_block,
3437                                                                pa, cpa);
3438                        }
3439                        spin_unlock(&pa->pa_lock);
3440                }
3441                rcu_read_unlock();
3442        }
3443        if (cpa) {
3444                ext4_mb_use_group_pa(ac, cpa);
3445                ac->ac_criteria = 20;
3446                return 1;
3447        }
3448        return 0;
3449}
3450
3451/*
3452 * the function goes through all block freed in the group
3453 * but not yet committed and marks them used in in-core bitmap.
3454 * buddy must be generated from this bitmap
3455 * Need to be called with the ext4 group lock held
3456 */
3457static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3458                                                ext4_group_t group)
3459{
3460        struct rb_node *n;
3461        struct ext4_group_info *grp;
3462        struct ext4_free_data *entry;
3463
3464        grp = ext4_get_group_info(sb, group);
3465        n = rb_first(&(grp->bb_free_root));
3466
3467        while (n) {
3468                entry = rb_entry(n, struct ext4_free_data, efd_node);
3469                ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
3470                n = rb_next(n);
3471        }
3472        return;
3473}
3474
3475/*
3476 * the function goes through all preallocation in this group and marks them
3477 * used in in-core bitmap. buddy must be generated from this bitmap
3478 * Need to be called with ext4 group lock held
3479 */
3480static noinline_for_stack
3481void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3482                                        ext4_group_t group)
3483{
3484        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3485        struct ext4_prealloc_space *pa;
3486        struct list_head *cur;
3487        ext4_group_t groupnr;
3488        ext4_grpblk_t start;
3489        int preallocated = 0;
3490        int len;
3491
3492        /* all form of preallocation discards first load group,
3493         * so the only competing code is preallocation use.
3494         * we don't need any locking here
3495         * notice we do NOT ignore preallocations with pa_deleted
3496         * otherwise we could leave used blocks available for
3497         * allocation in buddy when concurrent ext4_mb_put_pa()
3498         * is dropping preallocation
3499         */
3500        list_for_each(cur, &grp->bb_prealloc_list) {
3501                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3502                spin_lock(&pa->pa_lock);
3503                ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3504                                             &groupnr, &start);
3505                len = pa->pa_len;
3506                spin_unlock(&pa->pa_lock);
3507                if (unlikely(len == 0))
3508                        continue;
3509                BUG_ON(groupnr != group);
3510                ext4_set_bits(bitmap, start, len);
3511                preallocated += len;
3512        }
3513        mb_debug(1, "prellocated %u for group %u\n", preallocated, group);
3514}
3515
3516static void ext4_mb_pa_callback(struct rcu_head *head)
3517{
3518        struct ext4_prealloc_space *pa;
3519        pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
3520
3521        BUG_ON(atomic_read(&pa->pa_count));
3522        BUG_ON(pa->pa_deleted == 0);
3523        kmem_cache_free(ext4_pspace_cachep, pa);
3524}
3525
3526/*
3527 * drops a reference to preallocated space descriptor
3528 * if this was the last reference and the space is consumed
3529 */
3530static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3531                        struct super_block *sb, struct ext4_prealloc_space *pa)
3532{
3533        ext4_group_t grp;
3534        ext4_fsblk_t grp_blk;
3535
3536        /* in this short window concurrent discard can set pa_deleted */
3537        spin_lock(&pa->pa_lock);
3538        if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
3539                spin_unlock(&pa->pa_lock);
3540                return;
3541        }
3542
3543        if (pa->pa_deleted == 1) {
3544                spin_unlock(&pa->pa_lock);
3545                return;
3546        }
3547
3548        pa->pa_deleted = 1;
3549        spin_unlock(&pa->pa_lock);
3550
3551        grp_blk = pa->pa_pstart;
3552        /*
3553         * If doing group-based preallocation, pa_pstart may be in the
3554         * next group when pa is used up
3555         */
3556        if (pa->pa_type == MB_GROUP_PA)
3557                grp_blk--;
3558
3559        grp = ext4_get_group_number(sb, grp_blk);
3560
3561        /*
3562         * possible race:
3563         *
3564         *  P1 (buddy init)                     P2 (regular allocation)
3565         *                                      find block B in PA
3566         *  copy on-disk bitmap to buddy
3567         *                                      mark B in on-disk bitmap
3568         *                                      drop PA from group
3569         *  mark all PAs in buddy
3570         *
3571         * thus, P1 initializes buddy with B available. to prevent this
3572         * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
3573         * against that pair
3574         */
3575        ext4_lock_group(sb, grp);
3576        list_del(&pa->pa_group_list);
3577        ext4_unlock_group(sb, grp);
3578
3579        spin_lock(pa->pa_obj_lock);
3580        list_del_rcu(&pa->pa_inode_list);
3581        spin_unlock(pa->pa_obj_lock);
3582
3583        call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3584}
3585
3586/*
3587 * creates new preallocated space for given inode
3588 */
3589static noinline_for_stack int
3590ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3591{
3592        struct super_block *sb = ac->ac_sb;
3593        struct ext4_sb_info *sbi = EXT4_SB(sb);
3594        struct ext4_prealloc_space *pa;
3595        struct ext4_group_info *grp;
3596        struct ext4_inode_info *ei;
3597
3598        /* preallocate only when found space is larger then requested */
3599        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3600        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3601        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3602
3603        pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3604        if (pa == NULL)
3605                return -ENOMEM;
3606
3607        if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
3608                int winl;
3609                int wins;
3610                int win;
3611                int offs;
3612
3613                /* we can't allocate as much as normalizer wants.
3614                 * so, found space must get proper lstart
3615                 * to cover original request */
3616                BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
3617                BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
3618
3619                /* we're limited by original request in that
3620                 * logical block must be covered any way
3621                 * winl is window we can move our chunk within */
3622                winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3623
3624                /* also, we should cover whole original request */
3625                wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
3626
3627                /* the smallest one defines real window */
3628                win = min(winl, wins);
3629
3630                offs = ac->ac_o_ex.fe_logical %
3631                        EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
3632                if (offs && offs < win)
3633                        win = offs;
3634
3635                ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
3636                        EXT4_NUM_B2C(sbi, win);
3637                BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3638                BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3639        }
3640
3641        /* preallocation can change ac_b_ex, thus we store actually
3642         * allocated blocks for history */
3643        ac->ac_f_ex = ac->ac_b_ex;
3644
3645        pa->pa_lstart = ac->ac_b_ex.fe_logical;
3646        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3647        pa->pa_len = ac->ac_b_ex.fe_len;
3648        pa->pa_free = pa->pa_len;
3649        atomic_set(&pa->pa_count, 1);
3650        spin_lock_init(&pa->pa_lock);
3651        INIT_LIST_HEAD(&pa->pa_inode_list);
3652        INIT_LIST_HEAD(&pa->pa_group_list);
3653        pa->pa_deleted = 0;
3654        pa->pa_type = MB_INODE_PA;
3655
3656        mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3657                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3658        trace_ext4_mb_new_inode_pa(ac, pa);
3659
3660        ext4_mb_use_inode_pa(ac, pa);
3661        atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
3662
3663        ei = EXT4_I(ac->ac_inode);
3664        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3665
3666        pa->pa_obj_lock = &ei->i_prealloc_lock;
3667        pa->pa_inode = ac->ac_inode;
3668
3669        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3670        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3671        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3672
3673        spin_lock(pa->pa_obj_lock);
3674        list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
3675        spin_unlock(pa->pa_obj_lock);
3676
3677        return 0;
3678}
3679
3680/*
3681 * creates new preallocated space for locality group inodes belongs to
3682 */
3683static noinline_for_stack int
3684ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3685{
3686        struct super_block *sb = ac->ac_sb;
3687        struct ext4_locality_group *lg;
3688        struct ext4_prealloc_space *pa;
3689        struct ext4_group_info *grp;
3690
3691        /* preallocate only when found space is larger then requested */
3692        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3693        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3694        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3695
3696        BUG_ON(ext4_pspace_cachep == NULL);
3697        pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3698        if (pa == NULL)
3699                return -ENOMEM;
3700
3701        /* preallocation can change ac_b_ex, thus we store actually
3702         * allocated blocks for history */
3703        ac->ac_f_ex = ac->ac_b_ex;
3704
3705        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3706        pa->pa_lstart = pa->pa_pstart;
3707        pa->pa_len = ac->ac_b_ex.fe_len;
3708        pa->pa_free = pa->pa_len;
3709        atomic_set(&pa->pa_count, 1);
3710        spin_lock_init(&pa->pa_lock);
3711        INIT_LIST_HEAD(&pa->pa_inode_list);
3712        INIT_LIST_HEAD(&pa->pa_group_list);
3713        pa->pa_deleted = 0;
3714        pa->pa_type = MB_GROUP_PA;
3715
3716        mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3717                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3718        trace_ext4_mb_new_group_pa(ac, pa);
3719
3720        ext4_mb_use_group_pa(ac, pa);
3721        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3722
3723        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3724        lg = ac->ac_lg;
3725        BUG_ON(lg == NULL);
3726
3727        pa->pa_obj_lock = &lg->lg_prealloc_lock;
3728        pa->pa_inode = NULL;
3729
3730        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3731        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3732        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3733
3734        /*
3735         * We will later add the new pa to the right bucket
3736         * after updating the pa_free in ext4_mb_release_context
3737         */
3738        return 0;
3739}
3740
3741static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3742{
3743        int err;
3744
3745        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3746                err = ext4_mb_new_group_pa(ac);
3747        else
3748                err = ext4_mb_new_inode_pa(ac);
3749        return err;
3750}
3751
3752/*
3753 * finds all unused blocks in on-disk bitmap, frees them in
3754 * in-core bitmap and buddy.
3755 * @pa must be unlinked from inode and group lists, so that
3756 * nobody else can find/use it.
3757 * the caller MUST hold group/inode locks.
3758 * TODO: optimize the case when there are no in-core structures yet
3759 */
3760static noinline_for_stack int
3761ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3762                        struct ext4_prealloc_space *pa)
3763{
3764        struct super_block *sb = e4b->bd_sb;
3765        struct ext4_sb_info *sbi = EXT4_SB(sb);
3766        unsigned int end;
3767        unsigned int next;
3768        ext4_group_t group;
3769        ext4_grpblk_t bit;
3770        unsigned long long grp_blk_start;
3771        int err = 0;
3772        int free = 0;
3773
3774        BUG_ON(pa->pa_deleted == 0);
3775        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3776        grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
3777        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3778        end = bit + pa->pa_len;
3779
3780        while (bit < end) {
3781                bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3782                if (bit >= end)
3783                        break;
3784                next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3785                mb_debug(1, "    free preallocated %u/%u in group %u\n",
3786                         (unsigned) ext4_group_first_block_no(sb, group) + bit,
3787                         (unsigned) next - bit, (unsigned) group);
3788                free += next - bit;
3789
3790                trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3791                trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
3792                                                    EXT4_C2B(sbi, bit)),
3793                                               next - bit);
3794                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3795                bit = next + 1;
3796        }
3797        if (free != pa->pa_free) {
3798                ext4_msg(e4b->bd_sb, KERN_CRIT,
3799                         "pa %p: logic %lu, phys. %lu, len %lu",
3800                         pa, (unsigned long) pa->pa_lstart,
3801                         (unsigned long) pa->pa_pstart,
3802                         (unsigned long) pa->pa_len);
3803                ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3804                                        free, pa->pa_free);
3805                /*
3806                 * pa is already deleted so we use the value obtained
3807                 * from the bitmap and continue.
3808                 */
3809        }
3810        atomic_add(free, &sbi->s_mb_discarded);
3811
3812        return err;
3813}
3814
3815static noinline_for_stack int
3816ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3817                                struct ext4_prealloc_space *pa)
3818{
3819        struct super_block *sb = e4b->bd_sb;
3820        ext4_group_t group;
3821        ext4_grpblk_t bit;
3822
3823        trace_ext4_mb_release_group_pa(sb, pa);
3824        BUG_ON(pa->pa_deleted == 0);
3825        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3826        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3827        mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3828        atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3829        trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
3830
3831        return 0;
3832}
3833
3834/*
3835 * releases all preallocations in given group
3836 *
3837 * first, we need to decide discard policy:
3838 * - when do we discard
3839 *   1) ENOSPC
3840 * - how many do we discard
3841 *   1) how many requested
3842 */
3843static noinline_for_stack int
3844ext4_mb_discard_group_preallocations(struct super_block *sb,
3845                                        ext4_group_t group, int needed)
3846{
3847        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3848        struct buffer_head *bitmap_bh = NULL;
3849        struct ext4_prealloc_space *pa, *tmp;
3850        struct list_head list;
3851        struct ext4_buddy e4b;
3852        int err;
3853        int busy = 0;
3854        int free = 0;
3855
3856        mb_debug(1, "discard preallocation for group %u\n", group);
3857
3858        if (list_empty(&grp->bb_prealloc_list))
3859                return 0;
3860
3861        bitmap_bh = ext4_read_block_bitmap(sb, group);
3862        if (IS_ERR(bitmap_bh)) {
3863                err = PTR_ERR(bitmap_bh);
3864                ext4_error(sb, "Error %d reading block bitmap for %u",
3865                           err, group);
3866                return 0;
3867        }
3868
3869        err = ext4_mb_load_buddy(sb, group, &e4b);
3870        if (err) {
3871                ext4_error(sb, "Error loading buddy information for %u", group);
3872                put_bh(bitmap_bh);
3873                return 0;
3874        }
3875
3876        if (needed == 0)
3877                needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
3878
3879        INIT_LIST_HEAD(&list);
3880repeat:
3881        ext4_lock_group(sb, group);
3882        list_for_each_entry_safe(pa, tmp,
3883                                &grp->bb_prealloc_list, pa_group_list) {
3884                spin_lock(&pa->pa_lock);
3885                if (atomic_read(&pa->pa_count)) {
3886                        spin_unlock(&pa->pa_lock);
3887                        busy = 1;
3888                        continue;
3889                }
3890                if (pa->pa_deleted) {
3891                        spin_unlock(&pa->pa_lock);
3892                        continue;
3893                }
3894
3895                /* seems this one can be freed ... */
3896                pa->pa_deleted = 1;
3897
3898                /* we can trust pa_free ... */
3899                free += pa->pa_free;
3900
3901                spin_unlock(&pa->pa_lock);
3902
3903                list_del(&pa->pa_group_list);
3904                list_add(&pa->u.pa_tmp_list, &list);
3905        }
3906
3907        /* if we still need more blocks and some PAs were used, try again */
3908        if (free < needed && busy) {
3909                busy = 0;
3910                ext4_unlock_group(sb, group);
3911                cond_resched();
3912                goto repeat;
3913        }
3914
3915        /* found anything to free? */
3916        if (list_empty(&list)) {
3917                BUG_ON(free != 0);
3918                goto out;
3919        }
3920
3921        /* now free all selected PAs */
3922        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3923
3924                /* remove from object (inode or locality group) */
3925                spin_lock(pa->pa_obj_lock);
3926                list_del_rcu(&pa->pa_inode_list);
3927                spin_unlock(pa->pa_obj_lock);
3928
3929                if (pa->pa_type == MB_GROUP_PA)
3930                        ext4_mb_release_group_pa(&e4b, pa);
3931                else
3932                        ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3933
3934                list_del(&pa->u.pa_tmp_list);
3935                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3936        }
3937
3938out:
3939        ext4_unlock_group(sb, group);
3940        ext4_mb_unload_buddy(&e4b);
3941        put_bh(bitmap_bh);
3942        return free;
3943}
3944
3945/*
3946 * releases all non-used preallocated blocks for given inode
3947 *
3948 * It's important to discard preallocations under i_data_sem
3949 * We don't want another block to be served from the prealloc
3950 * space when we are discarding the inode prealloc space.
3951 *
3952 * FIXME!! Make sure it is valid at all the call sites
3953 */
3954void ext4_discard_preallocations(struct inode *inode)
3955{
3956        struct ext4_inode_info *ei = EXT4_I(inode);
3957        struct super_block *sb = inode->i_sb;
3958        struct buffer_head *bitmap_bh = NULL;
3959        struct ext4_prealloc_space *pa, *tmp;
3960        ext4_group_t group = 0;
3961        struct list_head list;
3962        struct ext4_buddy e4b;
3963        int err;
3964
3965        if (!S_ISREG(inode->i_mode)) {
3966                /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
3967                return;
3968        }
3969
3970        mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
3971        trace_ext4_discard_preallocations(inode);
3972
3973        INIT_LIST_HEAD(&list);
3974
3975repeat:
3976        /* first, collect all pa's in the inode */
3977        spin_lock(&ei->i_prealloc_lock);
3978        while (!list_empty(&ei->i_prealloc_list)) {
3979                pa = list_entry(ei->i_prealloc_list.next,
3980                                struct ext4_prealloc_space, pa_inode_list);
3981                BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
3982                spin_lock(&pa->pa_lock);
3983                if (atomic_read(&pa->pa_count)) {
3984                        /* this shouldn't happen often - nobody should
3985                         * use preallocation while we're discarding it */
3986                        spin_unlock(&pa->pa_lock);
3987                        spin_unlock(&ei->i_prealloc_lock);
3988                        ext4_msg(sb, KERN_ERR,
3989                                 "uh-oh! used pa while discarding");
3990                        WARN_ON(1);
3991                        schedule_timeout_uninterruptible(HZ);
3992                        goto repeat;
3993
3994                }
3995                if (pa->pa_deleted == 0) {
3996                        pa->pa_deleted = 1;
3997                        spin_unlock(&pa->pa_lock);
3998                        list_del_rcu(&pa->pa_inode_list);
3999                        list_add(&pa->u.pa_tmp_list, &list);
4000                        continue;
4001                }
4002
4003                /* someone is deleting pa right now */
4004                spin_unlock(&pa->pa_lock);
4005                spin_unlock(&ei->i_prealloc_lock);
4006
4007                /* we have to wait here because pa_deleted
4008                 * doesn't mean pa is already unlinked from
4009                 * the list. as we might be called from
4010                 * ->clear_inode() the inode will get freed
4011                 * and concurrent thread which is unlinking
4012                 * pa from inode's list may access already
4013                 * freed memory, bad-bad-bad */
4014
4015                /* XXX: if this happens too often, we can
4016                 * add a flag to force wait only in case
4017                 * of ->clear_inode(), but not in case of
4018                 * regular truncate */
4019                schedule_timeout_uninterruptible(HZ);
4020                goto repeat;
4021        }
4022        spin_unlock(&ei->i_prealloc_lock);
4023
4024        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
4025                BUG_ON(pa->pa_type != MB_INODE_PA);
4026                group = ext4_get_group_number(sb, pa->pa_pstart);
4027
4028                err = ext4_mb_load_buddy(sb, group, &e4b);
4029                if (err) {
4030                        ext4_error(sb, "Error loading buddy information for %u",
4031                                        group);
4032                        continue;
4033                }
4034
4035                bitmap_bh = ext4_read_block_bitmap(sb, group);
4036                if (IS_ERR(bitmap_bh)) {
4037                        err = PTR_ERR(bitmap_bh);
4038                        ext4_error(sb, "Error %d reading block bitmap for %u",
4039                                        err, group);
4040                        ext4_mb_unload_buddy(&e4b);
4041                        continue;
4042                }
4043
4044                ext4_lock_group(sb, group);
4045                list_del(&pa->pa_group_list);
4046                ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
4047                ext4_unlock_group(sb, group);
4048
4049                ext4_mb_unload_buddy(&e4b);
4050                put_bh(bitmap_bh);
4051
4052                list_del(&pa->u.pa_tmp_list);
4053                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4054        }
4055}
4056
4057#ifdef CONFIG_EXT4_DEBUG
4058static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4059{
4060        struct super_block *sb = ac->ac_sb;
4061        ext4_group_t ngroups, i;
4062
4063        if (!ext4_mballoc_debug ||
4064            (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
4065                return;
4066
4067        ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
4068                        " Allocation context details:");
4069        ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
4070                        ac->ac_status, ac->ac_flags);
4071        ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
4072                        "goal %lu/%lu/%lu@%lu, "
4073                        "best %lu/%lu/%lu@%lu cr %d",
4074                        (unsigned long)ac->ac_o_ex.fe_group,
4075                        (unsigned long)ac->ac_o_ex.fe_start,
4076                        (unsigned long)ac->ac_o_ex.fe_len,
4077                        (unsigned long)ac->ac_o_ex.fe_logical,
4078                        (unsigned long)ac->ac_g_ex.fe_group,
4079                        (unsigned long)ac->ac_g_ex.fe_start,
4080                        (unsigned long)ac->ac_g_ex.fe_len,
4081                        (unsigned long)ac->ac_g_ex.fe_logical,
4082                        (unsigned long)ac->ac_b_ex.fe_group,
4083                        (unsigned long)ac->ac_b_ex.fe_start,
4084                        (unsigned long)ac->ac_b_ex.fe_len,
4085                        (unsigned long)ac->ac_b_ex.fe_logical,
4086                        (int)ac->ac_criteria);
4087        ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
4088        ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
4089        ngroups = ext4_get_groups_count(sb);
4090        for (i = 0; i < ngroups; i++) {
4091                struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4092                struct ext4_prealloc_space *pa;
4093                ext4_grpblk_t start;
4094                struct list_head *cur;
4095                ext4_lock_group(sb, i);
4096                list_for_each(cur, &grp->bb_prealloc_list) {
4097                        pa = list_entry(cur, struct ext4_prealloc_space,
4098                                        pa_group_list);
4099                        spin_lock(&pa->pa_lock);
4100                        ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4101                                                     NULL, &start);
4102                        spin_unlock(&pa->pa_lock);
4103                        printk(KERN_ERR "PA:%u:%d:%u \n", i,
4104                               start, pa->pa_len);
4105                }
4106                ext4_unlock_group(sb, i);
4107
4108                if (grp->bb_free == 0)
4109                        continue;
4110                printk(KERN_ERR "%u: %d/%d \n",
4111                       i, grp->bb_free, grp->bb_fragments);
4112        }
4113        printk(KERN_ERR "\n");
4114}
4115#else
4116static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4117{
4118        return;
4119}
4120#endif
4121
4122/*
4123 * We use locality group preallocation for small size file. The size of the
4124 * file is determined by the current size or the resulting size after
4125 * allocation which ever is larger
4126 *
4127 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
4128 */
4129static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4130{
4131        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4132        int bsbits = ac->ac_sb->s_blocksize_bits;
4133        loff_t size, isize;
4134
4135        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4136                return;
4137
4138        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4139                return;
4140
4141        size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
4142        isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4143                >> bsbits;
4144
4145        if ((size == isize) &&
4146            !ext4_fs_is_busy(sbi) &&
4147            (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
4148                ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
4149                return;
4150        }
4151
4152        if (sbi->s_mb_group_prealloc <= 0) {
4153                ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4154                return;
4155        }
4156
4157        /* don't use group allocation for large files */
4158        size = max(size, isize);
4159        if (size > sbi->s_mb_stream_request) {
4160                ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4161                return;
4162        }
4163
4164        BUG_ON(ac->ac_lg != NULL);
4165        /*
4166         * locality group prealloc space are per cpu. The reason for having
4167         * per cpu locality group is to reduce the contention between block
4168         * request from multiple CPUs.
4169         */
4170        ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
4171
4172        /* we're going to use group allocation */
4173        ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
4174
4175        /* serialize all allocations in the group */
4176        mutex_lock(&ac->ac_lg->lg_mutex);
4177}
4178
4179static noinline_for_stack int
4180ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4181                                struct ext4_allocation_request *ar)
4182{
4183        struct super_block *sb = ar->inode->i_sb;
4184        struct ext4_sb_info *sbi = EXT4_SB(sb);
4185        struct ext4_super_block *es = sbi->s_es;
4186        ext4_group_t group;
4187        unsigned int len;
4188        ext4_fsblk_t goal;
4189        ext4_grpblk_t block;
4190
4191        /* we can't allocate > group size */
4192        len = ar->len;
4193
4194        /* just a dirty hack to filter too big requests  */
4195        if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
4196                len = EXT4_CLUSTERS_PER_GROUP(sb);
4197
4198        /* start searching from the goal */
4199        goal = ar->goal;
4200        if (goal < le32_to_cpu(es->s_first_data_block) ||
4201                        goal >= ext4_blocks_count(es))
4202                goal = le32_to_cpu(es->s_first_data_block);
4203        ext4_get_group_no_and_offset(sb, goal, &group, &block);
4204
4205        /* set up allocation goals */
4206        ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
4207        ac->ac_status = AC_STATUS_CONTINUE;
4208        ac->ac_sb = sb;
4209        ac->ac_inode = ar->inode;
4210        ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
4211        ac->ac_o_ex.fe_group = group;
4212        ac->ac_o_ex.fe_start = block;
4213        ac->ac_o_ex.fe_len = len;
4214        ac->ac_g_ex = ac->ac_o_ex;
4215        ac->ac_flags = ar->flags;
4216
4217        /* we have to define context: we'll we work with a file or
4218         * locality group. this is a policy, actually */
4219        ext4_mb_group_or_file(ac);
4220
4221        mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4222                        "left: %u/%u, right %u/%u to %swritable\n",
4223                        (unsigned) ar->len, (unsigned) ar->logical,
4224                        (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
4225                        (unsigned) ar->lleft, (unsigned) ar->pleft,
4226                        (unsigned) ar->lright, (unsigned) ar->pright,
4227                        atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4228        return 0;
4229
4230}
4231
4232static noinline_for_stack void
4233ext4_mb_discard_lg_preallocations(struct super_block *sb,
4234                                        struct ext4_locality_group *lg,
4235                                        int order, int total_entries)
4236{
4237        ext4_group_t group = 0;
4238        struct ext4_buddy e4b;
4239        struct list_head discard_list;
4240        struct ext4_prealloc_space *pa, *tmp;
4241
4242        mb_debug(1, "discard locality group preallocation\n");
4243
4244        INIT_LIST_HEAD(&discard_list);
4245
4246        spin_lock(&lg->lg_prealloc_lock);
4247        list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
4248                                                pa_inode_list) {
4249                spin_lock(&pa->pa_lock);
4250                if (atomic_read(&pa->pa_count)) {
4251                        /*
4252                         * This is the pa that we just used
4253                         * for block allocation. So don't
4254                         * free that
4255                         */
4256                        spin_unlock(&pa->pa_lock);
4257                        continue;
4258                }
4259                if (pa->pa_deleted) {
4260                        spin_unlock(&pa->pa_lock);
4261                        continue;
4262                }
4263                /* only lg prealloc space */
4264                BUG_ON(pa->pa_type != MB_GROUP_PA);
4265
4266                /* seems this one can be freed ... */
4267                pa->pa_deleted = 1;
4268                spin_unlock(&pa->pa_lock);
4269
4270                list_del_rcu(&pa->pa_inode_list);
4271                list_add(&pa->u.pa_tmp_list, &discard_list);
4272
4273                total_entries--;
4274                if (total_entries <= 5) {
4275                        /*
4276                         * we want to keep only 5 entries
4277                         * allowing it to grow to 8. This
4278                         * mak sure we don't call discard
4279                         * soon for this list.
4280                         */
4281                        break;
4282                }
4283        }
4284        spin_unlock(&lg->lg_prealloc_lock);
4285
4286        list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
4287
4288                group = ext4_get_group_number(sb, pa->pa_pstart);
4289                if (ext4_mb_load_buddy(sb, group, &e4b)) {
4290                        ext4_error(sb, "Error loading buddy information for %u",
4291                                        group);
4292                        continue;
4293                }
4294                ext4_lock_group(sb, group);
4295                list_del(&pa->pa_group_list);
4296                ext4_mb_release_group_pa(&e4b, pa);
4297                ext4_unlock_group(sb, group);
4298
4299                ext4_mb_unload_buddy(&e4b);
4300                list_del(&pa->u.pa_tmp_list);
4301                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4302        }
4303}
4304
4305/*
4306 * We have incremented pa_count. So it cannot be freed at this
4307 * point. Also we hold lg_mutex. So no parallel allocation is
4308 * possible from this lg. That means pa_free cannot be updated.
4309 *
4310 * A parallel ext4_mb_discard_group_preallocations is possible.
4311 * which can cause the lg_prealloc_list to be updated.
4312 */
4313
4314static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4315{
4316        int order, added = 0, lg_prealloc_count = 1;
4317        struct super_block *sb = ac->ac_sb;
4318        struct ext4_locality_group *lg = ac->ac_lg;
4319        struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
4320
4321        order = fls(pa->pa_free) - 1;
4322        if (order > PREALLOC_TB_SIZE - 1)
4323                /* The max size of hash table is PREALLOC_TB_SIZE */
4324                order = PREALLOC_TB_SIZE - 1;
4325        /* Add the prealloc space to lg */
4326        spin_lock(&lg->lg_prealloc_lock);
4327        list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
4328                                                pa_inode_list) {
4329                spin_lock(&tmp_pa->pa_lock);
4330                if (tmp_pa->pa_deleted) {
4331                        spin_unlock(&tmp_pa->pa_lock);
4332                        continue;
4333                }
4334                if (!added && pa->pa_free < tmp_pa->pa_free) {
4335                        /* Add to the tail of the previous entry */
4336                        list_add_tail_rcu(&pa->pa_inode_list,
4337                                                &tmp_pa->pa_inode_list);
4338                        added = 1;
4339                        /*
4340                         * we want to count the total
4341                         * number of entries in the list
4342                         */
4343                }
4344                spin_unlock(&tmp_pa->pa_lock);
4345                lg_prealloc_count++;
4346        }
4347        if (!added)
4348                list_add_tail_rcu(&pa->pa_inode_list,
4349                                        &lg->lg_prealloc_list[order]);
4350        spin_unlock(&lg->lg_prealloc_lock);
4351
4352        /* Now trim the list to be not more than 8 elements */
4353        if (lg_prealloc_count > 8) {
4354                ext4_mb_discard_lg_preallocations(sb, lg,
4355                                                  order, lg_prealloc_count);
4356                return;
4357        }
4358        return ;
4359}
4360
4361/*
4362 * release all resource we used in allocation
4363 */
4364static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4365{
4366        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4367        struct ext4_prealloc_space *pa = ac->ac_pa;
4368        if (pa) {
4369                if (pa->pa_type == MB_GROUP_PA) {
4370                        /* see comment in ext4_mb_use_group_pa() */
4371                        spin_lock(&pa->pa_lock);
4372                        pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4373                        pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4374                        pa->pa_free -= ac->ac_b_ex.fe_len;
4375                        pa->pa_len -= ac->ac_b_ex.fe_len;
4376                        spin_unlock(&pa->pa_lock);
4377                }
4378        }
4379        if (pa) {
4380                /*
4381                 * We want to add the pa to the right bucket.
4382                 * Remove it from the list and while adding
4383                 * make sure the list to which we are adding
4384                 * doesn't grow big.
4385                 */
4386                if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4387                        spin_lock(pa->pa_obj_lock);
4388                        list_del_rcu(&pa->pa_inode_list);
4389                        spin_unlock(pa->pa_obj_lock);
4390                        ext4_mb_add_n_trim(ac);
4391                }
4392                ext4_mb_put_pa(ac, ac->ac_sb, pa);
4393        }
4394        if (ac->ac_bitmap_page)
4395                put_page(ac->ac_bitmap_page);
4396        if (ac->ac_buddy_page)
4397                put_page(ac->ac_buddy_page);
4398        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4399                mutex_unlock(&ac->ac_lg->lg_mutex);
4400        ext4_mb_collect_stats(ac);
4401        return 0;
4402}
4403
4404static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4405{
4406        ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4407        int ret;
4408        int freed = 0;
4409
4410        trace_ext4_mb_discard_preallocations(sb, needed);
4411        for (i = 0; i < ngroups && needed > 0; i++) {
4412                ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4413                freed += ret;
4414                needed -= ret;
4415        }
4416
4417        return freed;
4418}
4419
4420/*
4421 * Main entry point into mballoc to allocate blocks
4422 * it tries to use preallocation first, then falls back
4423 * to usual allocation
4424 */
4425ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4426                                struct ext4_allocation_request *ar, int *errp)
4427{
4428        int freed;
4429        struct ext4_allocation_context *ac = NULL;
4430        struct ext4_sb_info *sbi;
4431        struct super_block *sb;
4432        ext4_fsblk_t block = 0;
4433        unsigned int inquota = 0;
4434        unsigned int reserv_clstrs = 0;
4435
4436        might_sleep();
4437        sb = ar->inode->i_sb;
4438        sbi = EXT4_SB(sb);
4439
4440        trace_ext4_request_blocks(ar);
4441
4442        /* Allow to use superuser reservation for quota file */
4443        if (IS_NOQUOTA(ar->inode))
4444                ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
4445
4446        if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
4447                /* Without delayed allocation we need to verify
4448                 * there is enough free blocks to do block allocation
4449                 * and verify allocation doesn't exceed the quota limits.
4450                 */
4451                while (ar->len &&
4452                        ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
4453
4454                        /* let others to free the space */
4455                        cond_resched();
4456                        ar->len = ar->len >> 1;
4457                }
4458                if (!ar->len) {
4459                        *errp = -ENOSPC;
4460                        return 0;
4461                }
4462                reserv_clstrs = ar->len;
4463                if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4464                        dquot_alloc_block_nofail(ar->inode,
4465                                                 EXT4_C2B(sbi, ar->len));
4466                } else {
4467                        while (ar->len &&
4468                                dquot_alloc_block(ar->inode,
4469                                                  EXT4_C2B(sbi, ar->len))) {
4470
4471                                ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4472                                ar->len--;
4473                        }
4474                }
4475                inquota = ar->len;
4476                if (ar->len == 0) {
4477                        *errp = -EDQUOT;
4478                        goto out;
4479                }
4480        }
4481
4482        ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
4483        if (!ac) {
4484                ar->len = 0;
4485                *errp = -ENOMEM;
4486                goto out;
4487        }
4488
4489        *errp = ext4_mb_initialize_context(ac, ar);
4490        if (*errp) {
4491                ar->len = 0;
4492                goto out;
4493        }
4494
4495        ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
4496        if (!ext4_mb_use_preallocated(ac)) {
4497                ac->ac_op = EXT4_MB_HISTORY_ALLOC;
4498                ext4_mb_normalize_request(ac, ar);
4499repeat:
4500                /* allocate space in core */
4501                *errp = ext4_mb_regular_allocator(ac);
4502                if (*errp)
4503                        goto discard_and_exit;
4504
4505                /* as we've just preallocated more space than
4506                 * user requested originally, we store allocated
4507                 * space in a special descriptor */
4508                if (ac->ac_status == AC_STATUS_FOUND &&
4509                    ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4510                        *errp = ext4_mb_new_preallocation(ac);
4511                if (*errp) {
4512                discard_and_exit:
4513                        ext4_discard_allocated_blocks(ac);
4514                        goto errout;
4515                }
4516        }
4517        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4518                *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
4519                if (*errp) {
4520                        ext4_discard_allocated_blocks(ac);
4521                        goto errout;
4522                } else {
4523                        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4524                        ar->len = ac->ac_b_ex.fe_len;
4525                }
4526        } else {
4527                freed  = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
4528                if (freed)
4529                        goto repeat;
4530                *errp = -ENOSPC;
4531        }
4532
4533errout:
4534        if (*errp) {
4535                ac->ac_b_ex.fe_len = 0;
4536                ar->len = 0;
4537                ext4_mb_show_ac(ac);
4538        }
4539        ext4_mb_release_context(ac);
4540out:
4541        if (ac)
4542                kmem_cache_free(ext4_ac_cachep, ac);
4543        if (inquota && ar->len < inquota)
4544                dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
4545        if (!ar->len) {
4546                if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
4547                        /* release all the reserved blocks if non delalloc */
4548                        percpu_counter_sub(&sbi->s_dirtyclusters_counter,
4549                                                reserv_clstrs);
4550        }
4551
4552        trace_ext4_allocate_blocks(ar, (unsigned long long)block);
4553
4554        return block;
4555}
4556
4557/*
4558 * We can merge two free data extents only if the physical blocks
4559 * are contiguous, AND the extents were freed by the same transaction,
4560 * AND the blocks are associated with the same group.
4561 */
4562static int can_merge(struct ext4_free_data *entry1,
4563                        struct ext4_free_data *entry2)
4564{
4565        if ((entry1->efd_tid == entry2->efd_tid) &&
4566            (entry1->efd_group == entry2->efd_group) &&
4567            ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
4568                return 1;
4569        return 0;
4570}
4571
4572static noinline_for_stack int
4573ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4574                      struct ext4_free_data *new_entry)
4575{
4576        ext4_group_t group = e4b->bd_group;
4577        ext4_grpblk_t cluster;
4578        ext4_grpblk_t clusters = new_entry->efd_count;
4579        struct ext4_free_data *entry;
4580        struct ext4_group_info *db = e4b->bd_info;
4581        struct super_block *sb = e4b->bd_sb;
4582        struct ext4_sb_info *sbi = EXT4_SB(sb);
4583        struct rb_node **n = &db->bb_free_root.rb_node, *node;
4584        struct rb_node *parent = NULL, *new_node;
4585
4586        BUG_ON(!ext4_handle_valid(handle));
4587        BUG_ON(e4b->bd_bitmap_page == NULL);
4588        BUG_ON(e4b->bd_buddy_page == NULL);
4589
4590        new_node = &new_entry->efd_node;
4591        cluster = new_entry->efd_start_cluster;
4592
4593        if (!*n) {
4594                /* first free block exent. We need to
4595                   protect buddy cache from being freed,
4596                 * otherwise we'll refresh it from
4597                 * on-disk bitmap and lose not-yet-available
4598                 * blocks */
4599                get_page(e4b->bd_buddy_page);
4600                get_page(e4b->bd_bitmap_page);
4601        }
4602        while (*n) {
4603                parent = *n;
4604                entry = rb_entry(parent, struct ext4_free_data, efd_node);
4605                if (cluster < entry->efd_start_cluster)
4606                        n = &(*n)->rb_left;
4607                else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
4608                        n = &(*n)->rb_right;
4609                else {
4610                        ext4_grp_locked_error(sb, group, 0,
4611                                ext4_group_first_block_no(sb, group) +
4612                                EXT4_C2B(sbi, cluster),
4613                                "Block already on to-be-freed list");
4614                        return 0;
4615                }
4616        }
4617
4618        rb_link_node(new_node, parent, n);
4619        rb_insert_color(new_node, &db->bb_free_root);
4620
4621        /* Now try to see the extent can be merged to left and right */
4622        node = rb_prev(new_node);
4623        if (node) {
4624                entry = rb_entry(node, struct ext4_free_data, efd_node);
4625                if (can_merge(entry, new_entry) &&
4626                    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4627                        new_entry->efd_start_cluster = entry->efd_start_cluster;
4628                        new_entry->efd_count += entry->efd_count;
4629                        rb_erase(node, &(db->bb_free_root));
4630                        kmem_cache_free(ext4_free_data_cachep, entry);
4631                }
4632        }
4633
4634        node = rb_next(new_node);
4635        if (node) {
4636                entry = rb_entry(node, struct ext4_free_data, efd_node);
4637                if (can_merge(new_entry, entry) &&
4638                    ext4_journal_callback_try_del(handle, &entry->efd_jce)) {
4639                        new_entry->efd_count += entry->efd_count;
4640                        rb_erase(node, &(db->bb_free_root));
4641                        kmem_cache_free(ext4_free_data_cachep, entry);
4642                }
4643        }
4644        /* Add the extent to transaction's private list */
4645        new_entry->efd_jce.jce_func = ext4_free_data_callback;
4646        spin_lock(&sbi->s_md_lock);
4647        _ext4_journal_callback_add(handle, &new_entry->efd_jce);
4648        sbi->s_mb_free_pending += clusters;
4649        spin_unlock(&sbi->s_md_lock);
4650        return 0;
4651}
4652
4653/**
4654 * ext4_free_blocks() -- Free given blocks and update quota
4655 * @handle:             handle for this transaction
4656 * @inode:              inode
4657 * @block:              start physical block to free
4658 * @count:              number of blocks to count
4659 * @flags:              flags used by ext4_free_blocks
4660 */
4661void ext4_free_blocks(handle_t *handle, struct inode *inode,
4662                      struct buffer_head *bh, ext4_fsblk_t block,
4663                      unsigned long count, int flags)
4664{
4665        struct buffer_head *bitmap_bh = NULL;
4666        struct super_block *sb = inode->i_sb;
4667        struct ext4_group_desc *gdp;
4668        unsigned int overflow;
4669        ext4_grpblk_t bit;
4670        struct buffer_head *gd_bh;
4671        ext4_group_t block_group;
4672        struct ext4_sb_info *sbi;
4673        struct ext4_buddy e4b;
4674        unsigned int count_clusters;
4675        int err = 0;
4676        int ret;
4677
4678        might_sleep();
4679        if (bh) {
4680                if (block)
4681                        BUG_ON(block != bh->b_blocknr);
4682                else
4683                        block = bh->b_blocknr;
4684        }
4685
4686        sbi = EXT4_SB(sb);
4687        if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4688            !ext4_data_block_valid(sbi, block, count)) {
4689                ext4_error(sb, "Freeing blocks not in datazone - "
4690                           "block = %llu, count = %lu", block, count);
4691                goto error_return;
4692        }
4693
4694        ext4_debug("freeing block %llu\n", block);
4695        trace_ext4_free_blocks(inode, block, count, flags);
4696
4697        if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
4698                BUG_ON(count > 1);
4699
4700                ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4701                            inode, bh, block);
4702        }
4703
4704        /*
4705         * If the extent to be freed does not begin on a cluster
4706         * boundary, we need to deal with partial clusters at the
4707         * beginning and end of the extent.  Normally we will free
4708         * blocks at the beginning or the end unless we are explicitly
4709         * requested to avoid doing so.
4710         */
4711        overflow = EXT4_PBLK_COFF(sbi, block);
4712        if (overflow) {
4713                if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
4714                        overflow = sbi->s_cluster_ratio - overflow;
4715                        block += overflow;
4716                        if (count > overflow)
4717                                count -= overflow;
4718                        else
4719                                return;
4720                } else {
4721                        block -= overflow;
4722                        count += overflow;
4723                }
4724        }
4725        overflow = EXT4_LBLK_COFF(sbi, count);
4726        if (overflow) {
4727                if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
4728                        if (count > overflow)
4729                                count -= overflow;
4730                        else
4731                                return;
4732                } else
4733                        count += sbi->s_cluster_ratio - overflow;
4734        }
4735
4736        if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
4737                int i;
4738                int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
4739
4740                for (i = 0; i < count; i++) {
4741                        cond_resched();
4742                        if (is_metadata)
4743                                bh = sb_find_get_block(inode->i_sb, block + i);
4744                        ext4_forget(handle, is_metadata, inode, bh, block + i);
4745                }
4746        }
4747
4748do_more:
4749        overflow = 0;
4750        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4751
4752        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
4753                        ext4_get_group_info(sb, block_group))))
4754                return;
4755
4756        /*
4757         * Check to see if we are freeing blocks across a group
4758         * boundary.
4759         */
4760        if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4761                overflow = EXT4_C2B(sbi, bit) + count -
4762                        EXT4_BLOCKS_PER_GROUP(sb);
4763                count -= overflow;
4764        }
4765        count_clusters = EXT4_NUM_B2C(sbi, count);
4766        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4767        if (IS_ERR(bitmap_bh)) {
4768                err = PTR_ERR(bitmap_bh);
4769                bitmap_bh = NULL;
4770                goto error_return;
4771        }
4772        gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
4773        if (!gdp) {
4774                err = -EIO;
4775                goto error_return;
4776        }
4777
4778        if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4779            in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4780            in_range(block, ext4_inode_table(sb, gdp),
4781                     EXT4_SB(sb)->s_itb_per_group) ||
4782            in_range(block + count - 1, ext4_inode_table(sb, gdp),
4783                     EXT4_SB(sb)->s_itb_per_group)) {
4784
4785                ext4_error(sb, "Freeing blocks in system zone - "
4786                           "Block = %llu, count = %lu", block, count);
4787                /* err = 0. ext4_std_error should be a no op */
4788                goto error_return;
4789        }
4790
4791        BUFFER_TRACE(bitmap_bh, "getting write access");
4792        err = ext4_journal_get_write_access(handle, bitmap_bh);
4793        if (err)
4794                goto error_return;
4795
4796        /*
4797         * We are about to modify some metadata.  Call the journal APIs
4798         * to unshare ->b_data if a currently-committing transaction is
4799         * using it
4800         */
4801        BUFFER_TRACE(gd_bh, "get_write_access");
4802        err = ext4_journal_get_write_access(handle, gd_bh);
4803        if (err)
4804                goto error_return;
4805#ifdef AGGRESSIVE_CHECK
4806        {
4807                int i;
4808                for (i = 0; i < count_clusters; i++)
4809                        BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4810        }
4811#endif
4812        trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
4813
4814        /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
4815        err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
4816                                     GFP_NOFS|__GFP_NOFAIL);
4817        if (err)
4818                goto error_return;
4819
4820        /*
4821         * We need to make sure we don't reuse the freed block until after the
4822         * transaction is committed. We make an exception if the inode is to be
4823         * written in writeback mode since writeback mode has weak data
4824         * consistency guarantees.
4825         */
4826        if (ext4_handle_valid(handle) &&
4827            ((flags & EXT4_FREE_BLOCKS_METADATA) ||
4828             !ext4_should_writeback_data(inode))) {
4829                struct ext4_free_data *new_entry;
4830                /*
4831                 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
4832                 * to fail.
4833                 */
4834                new_entry = kmem_cache_alloc(ext4_free_data_cachep,
4835                                GFP_NOFS|__GFP_NOFAIL);
4836                new_entry->efd_start_cluster = bit;
4837                new_entry->efd_group = block_group;
4838                new_entry->efd_count = count_clusters;
4839                new_entry->efd_tid = handle->h_transaction->t_tid;
4840
4841                ext4_lock_group(sb, block_group);
4842                mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4843                ext4_mb_free_metadata(handle, &e4b, new_entry);
4844        } else {
4845                /* need to update group_info->bb_free and bitmap
4846                 * with group lock held. generate_buddy look at
4847                 * them with group lock_held
4848                 */
4849                if (test_opt(sb, DISCARD)) {
4850                        err = ext4_issue_discard(sb, block_group, bit, count);
4851                        if (err && err != -EOPNOTSUPP)
4852                                ext4_msg(sb, KERN_WARNING, "discard request in"
4853                                         " group:%d block:%d count:%lu failed"
4854                                         " with %d", block_group, bit, count,
4855                                         err);
4856                } else
4857                        EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
4858
4859                ext4_lock_group(sb, block_group);
4860                mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4861                mb_free_blocks(inode, &e4b, bit, count_clusters);
4862        }
4863
4864        ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
4865        ext4_free_group_clusters_set(sb, gdp, ret);
4866        ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
4867        ext4_group_desc_csum_set(sb, block_group, gdp);
4868        ext4_unlock_group(sb, block_group);
4869
4870        if (sbi->s_log_groups_per_flex) {
4871                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4872                atomic64_add(count_clusters,
4873                             &sbi->s_flex_groups[flex_group].free_clusters);
4874        }
4875
4876        if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4877                dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4878        percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4879
4880        ext4_mb_unload_buddy(&e4b);
4881
4882        /* We dirtied the bitmap block */
4883        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4884        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4885
4886        /* And the group descriptor block */
4887        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4888        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4889        if (!err)
4890                err = ret;
4891
4892        if (overflow && !err) {
4893                block += count;
4894                count = overflow;
4895                put_bh(bitmap_bh);
4896                goto do_more;
4897        }
4898error_return:
4899        brelse(bitmap_bh);
4900        ext4_std_error(sb, err);
4901        return;
4902}
4903
4904/**
4905 * ext4_group_add_blocks() -- Add given blocks to an existing group
4906 * @handle:                     handle to this transaction
4907 * @sb:                         super block
4908 * @block:                      start physical block to add to the block group
4909 * @count:                      number of blocks to free
4910 *
4911 * This marks the blocks as free in the bitmap and buddy.
4912 */
4913int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4914                         ext4_fsblk_t block, unsigned long count)
4915{
4916        struct buffer_head *bitmap_bh = NULL;
4917        struct buffer_head *gd_bh;
4918        ext4_group_t block_group;
4919        ext4_grpblk_t bit;
4920        unsigned int i;
4921        struct ext4_group_desc *desc;
4922        struct ext4_sb_info *sbi = EXT4_SB(sb);
4923        struct ext4_buddy e4b;
4924        int err = 0, ret, blk_free_count;
4925        ext4_grpblk_t blocks_freed;
4926
4927        ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
4928
4929        if (count == 0)
4930                return 0;
4931
4932        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4933        /*
4934         * Check to see if we are freeing blocks across a group
4935         * boundary.
4936         */
4937        if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4938                ext4_warning(sb, "too much blocks added to group %u",
4939                             block_group);
4940                err = -EINVAL;
4941                goto error_return;
4942        }
4943
4944        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4945        if (IS_ERR(bitmap_bh)) {
4946                err = PTR_ERR(bitmap_bh);
4947                bitmap_bh = NULL;
4948                goto error_return;
4949        }
4950
4951        desc = ext4_get_group_desc(sb, block_group, &gd_bh);
4952        if (!desc) {
4953                err = -EIO;
4954                goto error_return;
4955        }
4956
4957        if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
4958            in_range(ext4_inode_bitmap(sb, desc), block, count) ||
4959            in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
4960            in_range(block + count - 1, ext4_inode_table(sb, desc),
4961                     sbi->s_itb_per_group)) {
4962                ext4_error(sb, "Adding blocks in system zones - "
4963                           "Block = %llu, count = %lu",
4964                           block, count);
4965                err = -EINVAL;
4966                goto error_return;
4967        }
4968
4969        BUFFER_TRACE(bitmap_bh, "getting write access");
4970        err = ext4_journal_get_write_access(handle, bitmap_bh);
4971        if (err)
4972                goto error_return;
4973
4974        /*
4975         * We are about to modify some metadata.  Call the journal APIs
4976         * to unshare ->b_data if a currently-committing transaction is
4977         * using it
4978         */
4979        BUFFER_TRACE(gd_bh, "get_write_access");
4980        err = ext4_journal_get_write_access(handle, gd_bh);
4981        if (err)
4982                goto error_return;
4983
4984        for (i = 0, blocks_freed = 0; i < count; i++) {
4985                BUFFER_TRACE(bitmap_bh, "clear bit");
4986                if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
4987                        ext4_error(sb, "bit already cleared for block %llu",
4988                                   (ext4_fsblk_t)(block + i));
4989                        BUFFER_TRACE(bitmap_bh, "bit already cleared");
4990                } else {
4991                        blocks_freed++;
4992                }
4993        }
4994
4995        err = ext4_mb_load_buddy(sb, block_group, &e4b);
4996        if (err)
4997                goto error_return;
4998
4999        /*
5000         * need to update group_info->bb_free and bitmap
5001         * with group lock held. generate_buddy look at
5002         * them with group lock_held
5003         */
5004        ext4_lock_group(sb, block_group);
5005        mb_clear_bits(bitmap_bh->b_data, bit, count);
5006        mb_free_blocks(NULL, &e4b, bit, count);
5007        blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
5008        ext4_free_group_clusters_set(sb, desc, blk_free_count);
5009        ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
5010        ext4_group_desc_csum_set(sb, block_group, desc);
5011        ext4_unlock_group(sb, block_group);
5012        percpu_counter_add(&sbi->s_freeclusters_counter,
5013                           EXT4_NUM_B2C(sbi, blocks_freed));
5014
5015        if (sbi->s_log_groups_per_flex) {
5016                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
5017                atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
5018                             &sbi->s_flex_groups[flex_group].free_clusters);
5019        }
5020
5021        ext4_mb_unload_buddy(&e4b);
5022
5023        /* We dirtied the bitmap block */
5024        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
5025        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
5026
5027        /* And the group descriptor block */
5028        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
5029        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
5030        if (!err)
5031                err = ret;
5032
5033error_return:
5034        brelse(bitmap_bh);
5035        ext4_std_error(sb, err);
5036        return err;
5037}
5038
5039/**
5040 * ext4_trim_extent -- function to TRIM one single free extent in the group
5041 * @sb:         super block for the file system
5042 * @start:      starting block of the free extent in the alloc. group
5043 * @count:      number of blocks to TRIM
5044 * @group:      alloc. group we are working with
5045 * @e4b:        ext4 buddy for the group
5046 *
5047 * Trim "count" blocks starting at "start" in the "group". To assure that no
5048 * one will allocate those blocks, mark it as used in buddy bitmap. This must
5049 * be called with under the group lock.
5050 */
5051static int ext4_trim_extent(struct super_block *sb, int start, int count,
5052                             ext4_group_t group, struct ext4_buddy *e4b)
5053__releases(bitlock)
5054__acquires(bitlock)
5055{
5056        struct ext4_free_extent ex;
5057        int ret = 0;
5058
5059        trace_ext4_trim_extent(sb, group, start, count);
5060
5061        assert_spin_locked(ext4_group_lock_ptr(sb, group));
5062
5063        ex.fe_start = start;
5064        ex.fe_group = group;
5065        ex.fe_len = count;
5066
5067        /*
5068         * Mark blocks used, so no one can reuse them while
5069         * being trimmed.
5070         */
5071        mb_mark_used(e4b, &ex);
5072        ext4_unlock_group(sb, group);
5073        ret = ext4_issue_discard(sb, group, start, count);
5074        ext4_lock_group(sb, group);
5075        mb_free_blocks(NULL, e4b, start, ex.fe_len);
5076        return ret;
5077}
5078
5079/**
5080 * ext4_trim_all_free -- function to trim all free space in alloc. group
5081 * @sb:                 super block for file system
5082 * @group:              group to be trimmed
5083 * @start:              first group block to examine
5084 * @max:                last group block to examine
5085 * @minblocks:          minimum extent block count
5086 *
5087 * ext4_trim_all_free walks through group's buddy bitmap searching for free
5088 * extents. When the free block is found, ext4_trim_extent is called to TRIM
5089 * the extent.
5090 *
5091 *
5092 * ext4_trim_all_free walks through group's block bitmap searching for free
5093 * extents. When the free extent is found, mark it as used in group buddy
5094 * bitmap. Then issue a TRIM command on this extent and free the extent in
5095 * the group buddy bitmap. This is done until whole group is scanned.
5096 */
5097static ext4_grpblk_t
5098ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
5099                   ext4_grpblk_t start, ext4_grpblk_t max,
5100                   ext4_grpblk_t minblocks)
5101{
5102        void *bitmap;
5103        ext4_grpblk_t next, count = 0, free_count = 0;
5104        struct ext4_buddy e4b;
5105        int ret = 0;
5106
5107        trace_ext4_trim_all_free(sb, group, start, max);
5108
5109        ret = ext4_mb_load_buddy(sb, group, &e4b);
5110        if (ret) {
5111                ext4_error(sb, "Error in loading buddy "
5112                                "information for %u", group);
5113                return ret;
5114        }
5115        bitmap = e4b.bd_bitmap;
5116
5117        ext4_lock_group(sb, group);
5118        if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
5119            minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
5120                goto out;
5121
5122        start = (e4b.bd_info->bb_first_free > start) ?
5123                e4b.bd_info->bb_first_free : start;
5124
5125        while (start <= max) {
5126                start = mb_find_next_zero_bit(bitmap, max + 1, start);
5127                if (start > max)
5128                        break;
5129                next = mb_find_next_bit(bitmap, max + 1, start);
5130
5131                if ((next - start) >= minblocks) {
5132                        ret = ext4_trim_extent(sb, start,
5133                                               next - start, group, &e4b);
5134                        if (ret && ret != -EOPNOTSUPP)
5135                                break;
5136                        ret = 0;
5137                        count += next - start;
5138                }
5139                free_count += next - start;
5140                start = next + 1;
5141
5142                if (fatal_signal_pending(current)) {
5143                        count = -ERESTARTSYS;
5144                        break;
5145                }
5146
5147                if (need_resched()) {
5148                        ext4_unlock_group(sb, group);
5149                        cond_resched();
5150                        ext4_lock_group(sb, group);
5151                }
5152
5153                if ((e4b.bd_info->bb_free - free_count) < minblocks)
5154                        break;
5155        }
5156
5157        if (!ret) {
5158                ret = count;
5159                EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
5160        }
5161out:
5162        ext4_unlock_group(sb, group);
5163        ext4_mb_unload_buddy(&e4b);
5164
5165        ext4_debug("trimmed %d blocks in the group %d\n",
5166                count, group);
5167
5168        return ret;
5169}
5170
5171/**
5172 * ext4_trim_fs() -- trim ioctl handle function
5173 * @sb:                 superblock for filesystem
5174 * @range:              fstrim_range structure
5175 *
5176 * start:       First Byte to trim
5177 * len:         number of Bytes to trim from start
5178 * minlen:      minimum extent length in Bytes
5179 * ext4_trim_fs goes through all allocation groups containing Bytes from
5180 * start to start+len. For each such a group ext4_trim_all_free function
5181 * is invoked to trim all free space.
5182 */
5183int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5184{
5185        struct ext4_group_info *grp;
5186        ext4_group_t group, first_group, last_group;
5187        ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
5188        uint64_t start, end, minlen, trimmed = 0;
5189        ext4_fsblk_t first_data_blk =
5190                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
5191        ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
5192        int ret = 0;
5193
5194        start = range->start >> sb->s_blocksize_bits;
5195        end = start + (range->len >> sb->s_blocksize_bits) - 1;
5196        minlen = EXT4_NUM_B2C(EXT4_SB(sb),
5197                              range->minlen >> sb->s_blocksize_bits);
5198
5199        if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
5200            start >= max_blks ||
5201            range->len < sb->s_blocksize)
5202                return -EINVAL;
5203        if (end >= max_blks)
5204                end = max_blks - 1;
5205        if (end <= first_data_blk)
5206                goto out;
5207        if (start < first_data_blk)
5208                start = first_data_blk;
5209
5210        /* Determine first and last group to examine based on start and end */
5211        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
5212                                     &first_group, &first_cluster);
5213        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
5214                                     &last_group, &last_cluster);
5215
5216        /* end now represents the last cluster to discard in this group */
5217        end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
5218
5219        for (group = first_group; group <= last_group; group++) {
5220                grp = ext4_get_group_info(sb, group);
5221                /* We only do this if the grp has never been initialized */
5222                if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
5223                        ret = ext4_mb_init_group(sb, group, GFP_NOFS);
5224                        if (ret)
5225                                break;
5226                }
5227
5228                /*
5229                 * For all the groups except the last one, last cluster will
5230                 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
5231                 * change it for the last group, note that last_cluster is
5232                 * already computed earlier by ext4_get_group_no_and_offset()
5233                 */
5234                if (group == last_group)
5235                        end = last_cluster;
5236
5237                if (grp->bb_free >= minlen) {
5238                        cnt = ext4_trim_all_free(sb, group, first_cluster,
5239                                                end, minlen);
5240                        if (cnt < 0) {
5241                                ret = cnt;
5242                                break;
5243                        }
5244                        trimmed += cnt;
5245                }
5246
5247                /*
5248                 * For every group except the first one, we are sure
5249                 * that the first cluster to discard will be cluster #0.
5250                 */
5251                first_cluster = 0;
5252        }
5253
5254        if (!ret)
5255                atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5256
5257out:
5258        range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
5259        return ret;
5260}
5261