linux/fs/ext4/mballoc.c
   1/*
   2 * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
   3 * Written by Alex Tomas <alex@clusterfs.com>
   4 *
   5 * This program is free software; you can redistribute it and/or modify
   6 * it under the terms of the GNU General Public License version 2 as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it will be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write to the Free Software
  16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
  17 */
  18
  19
  20/*
  21 * mballoc.c contains the multiblock allocation routines
  22 */
  23
  24#include "ext4_jbd2.h"
  25#include "mballoc.h"
  26#include <linux/log2.h>
  27#include <linux/module.h>
  28#include <linux/slab.h>
  29#include <linux/backing-dev.h>
  30#include <trace/events/ext4.h>
  31
  32#ifdef CONFIG_EXT4_DEBUG
  33ushort ext4_mballoc_debug __read_mostly;
  34
  35module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644);
  36MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc");
  37#endif
  38
  39/*
  40 * MUSTDO:
  41 *   - test ext4_ext_search_left() and ext4_ext_search_right()
  42 *   - search for metadata in few groups
  43 *
  44 * TODO v4:
  45 *   - normalization should take into account whether file is still open
  46 *   - discard preallocations if no free space left (policy?)
  47 *   - don't normalize tails
  48 *   - quota
  49 *   - reservation for superuser
  50 *
  51 * TODO v3:
  52 *   - bitmap read-ahead (proposed by Oleg Drokin aka green)
  53 *   - track min/max extents in each group for better group selection
  54 *   - mb_mark_used() may allocate chunk right after splitting buddy
  55 *   - tree of groups sorted by number of free blocks
  56 *   - error handling
  57 */
  58
  59/*
  60 * An allocation request involves a request for multiple blocks
  61 * near the specified goal (block) value.
  62 *
  63 * During the initialization phase of the allocator we decide to use
  64 * group preallocation or inode preallocation depending on the size of
  65 * the file. The size of the file could be the resulting file size we
  66 * would have after allocation, or the current file size, whichever
  67 * is larger. If the size is less than sbi->s_mb_stream_request we
  68 * use group preallocation. The default value of
  69 * s_mb_stream_request is 16 blocks. This can also be tuned via
  70 * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
  71 * terms of number of blocks.
  72 *
  73 * The main motivation for having small file use group preallocation is to
  74 * ensure that we have small files closer together on the disk.
  75 *
  76 * In the first stage, the allocator looks at the inode prealloc list,
  77 * ext4_inode_info->i_prealloc_list, which contains list of prealloc
  78 * spaces for this particular inode. The inode prealloc space is
  79 * represented as:
  80 *
  81 * pa_lstart -> the logical start block for this prealloc space
  82 * pa_pstart -> the physical start block for this prealloc space
  83 * pa_len    -> length for this prealloc space (in clusters)
  84 * pa_free   ->  free space available in this prealloc space (in clusters)
  85 *
  86 * The inode preallocation space is selected based on the _logical_ start
  87 * block. Only if the logical file block falls within the range of a prealloc
  88 * space do we consume that particular prealloc space. This makes sure that
  89 * we have contiguous physical blocks representing the file blocks.
  90 *
  91 * The important thing to note about inode prealloc space is that
  92 * we don't modify the values associated with it except
  93 * pa_free.
  94 *
  95 * If we are not able to find blocks in the inode prealloc space and if we
  96 * have the group allocation flag set then we look at the locality group
  97 * prealloc space. This is a per-CPU prealloc list, represented as
  98 *
  99 * ext4_sb_info.s_locality_groups[smp_processor_id()]
 100 *
 101 * The reason for having a per-CPU locality group is to reduce the contention
 102 * between CPUs. It is possible to get scheduled at this point.
 103 *
 104 * The locality group prealloc space is used based on whether we have
 105 * enough free space (pa_free) within the prealloc space.
 106 *
 107 * If we can't allocate blocks via inode prealloc and/or locality group
 108 * prealloc then we look at the buddy cache. The buddy cache is represented
 109 * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets
 110 * mapped to the buddy and bitmap information regarding different
 111 * groups. The buddy information is attached to the buddy cache inode so
 112 * that we can access it through the page cache. The information regarding
 113 * each group is loaded via ext4_mb_load_buddy.  The information comprises
 114 * the block bitmap and the buddy map, and is stored in the
 115 * inode as:
 116 *
 117 *  {                        page                        }
 118 *  [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 119 *
 120 *
 121 * one block each for bitmap and buddy information.  So for each group we
 122 * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE /
 123 * blocksize) blocks.  So it can have information regarding groups_per_page
 124 * groups, i.e. blocks_per_page/2 (see the sketch after this comment).
 125 *
 126 * The buddy cache inode is not stored on disk. The inode is thrown
 127 * away when the filesystem is unmounted.
 128 *
 129 * We look for the requested number of blocks in the buddy cache. If we were
 130 * able to locate that many free blocks we return with additional information
 131 * regarding the rest of the contiguous physical blocks available.
 132 *
 133 * Before allocating blocks via buddy cache we normalize the request
 134 * blocks. This ensures we ask for more blocks than we need. The extra
 135 * blocks that we get after allocation are added to the respective prealloc
 136 * list. In case of inode preallocation we follow a list of heuristics
 137 * based on file size. This can be found in ext4_mb_normalize_request. If
 138 * we are doing a group prealloc we try to normalize the request to
 139 * sbi->s_mb_group_prealloc.  The default value of s_mb_group_prealloc is
 140 * dependent on the cluster size; for non-bigalloc file systems, it is
 141 * 512 blocks. This can be tuned via
 142 * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
 143 * terms of number of blocks. If we have mounted the file system with the
 144 * -o stripe=<value> option, the group prealloc request is normalized to
 145 * the smallest multiple of the stripe value (sbi->s_stripe) which is
 146 * greater than the default mb_group_prealloc.
 147 *
 148 * The regular allocator (using the buddy cache) supports a few tunables.
 149 *
 150 * /sys/fs/ext4/<partition>/mb_min_to_scan
 151 * /sys/fs/ext4/<partition>/mb_max_to_scan
 152 * /sys/fs/ext4/<partition>/mb_order2_req
 153 *
 154 * The regular allocator uses buddy scan only if the request len is a power
 155 * of 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
 156 * value of s_mb_order2_reqs can be tuned via
 157 * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
 158 * the stripe size (sbi->s_stripe), we try to search for contiguous blocks
 159 * in stripe-sized units. This should result in better allocation on RAID
 160 * setups. If not, we search in the specific group using the bitmap for the
 161 * best extents. The tunables min_to_scan and max_to_scan control the
 162 * behaviour here. min_to_scan indicates how long mballoc __must__ look for
 163 * a best extent and max_to_scan indicates how long mballoc __can__ look for
 164 * a best extent among the found extents. Searching for blocks starts with
 165 * the group specified as the goal value in allocation context via
 166 * ac_g_ex. Each group is first checked to determine whether it
 167 * can be used for allocation. ext4_mb_good_group explains how the groups are
 168 * checked.
 169 *
 170 * Both prealloc spaces are populated as described above. So for the first
 171 * request we will hit the buddy cache, which will result in the prealloc
 172 * space getting filled. The prealloc space is then later used for
 173 * subsequent requests.
 174 */
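/*
 * Illustrative sketch (not part of mballoc): the buddy-cache page layout
 * math described above, written out as a plain-C helper.  The function
 * name and parameters are hypothetical; the real code derives the same
 * values from PAGE_SIZE and the superblock inside ext4_mb_init_cache().
 */
static inline int example_groups_per_page(unsigned int page_size,
					  unsigned int blocksize)
{
	unsigned int blocks_per_page = page_size / blocksize;

	/* one bitmap block plus one buddy block per group */
	return blocks_per_page >= 2 ? blocks_per_page / 2 : 1;
}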
 175
 176/*
 177 * mballoc operates on the following data:
 178 *  - on-disk bitmap
 179 *  - in-core buddy (actually includes buddy and bitmap)
 180 *  - preallocation descriptors (PAs)
 181 *
 182 * there are two types of preallocations:
 183 *  - inode
 184 *    assigned to a specific inode and can be used for this inode only.
 185 *    it describes part of the inode's space preallocated to specific
 186 *    physical blocks. any block from that preallocation can be used
 187 *    independently. the descriptor just tracks the number of blocks left
 188 *    unused. so, before taking some block from the descriptor, one must
 189 *    make sure the corresponding logical block isn't allocated yet. this
 190 *    also means that freeing any block within the descriptor's range
 191 *    must discard all preallocated blocks.
 192 *  - locality group
 193 *    assigned to a specific locality group which does not translate to a
 194 *    permanent set of inodes: an inode can join and leave the group. space
 195 *    from this type of preallocation can be used for any inode. thus
 196 *    it's consumed from the beginning to the end.
 197 *
 198 * relation between them can be expressed as:
 199 *    in-core buddy = on-disk bitmap + preallocation descriptors
 200 *
 201 * this means the blocks mballoc considers used are:
 202 *  - allocated blocks (persistent)
 203 *  - preallocated blocks (non-persistent)
 204 *
 205 * consistency in mballoc world means that at any time a block is either
 206 * free or used in ALL structures. notice: "any time" should not be read
 207 * literally -- time is discrete and delimited by locks.
 208 *
 209 *  to keep it simple, we don't use block numbers, instead we count the
 210 *  number of blocks marked used/free in the on-disk bitmap, buddy and PA.
 211 *
 212 * all operations can be expressed as (see the sketch after this comment):
 213 *  - init buddy:                       buddy = on-disk + PAs
 214 *  - new PA:                           buddy += N; PA = N
 215 *  - use inode PA:                     on-disk += N; PA -= N
 216 *  - discard inode PA                  buddy -= on-disk - PA; PA = 0
 217 *  - use locality group PA             on-disk += N; PA -= N
 218 *  - discard locality group PA         buddy -= PA; PA = 0
 219 *  note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap
 220 *        is used in real operation because we can't know actual used
 221 *        bits from PA, only from on-disk bitmap
 222 *
 223 * if we follow this strict logic, then all operations above should be atomic.
 224 * given some of them can block, we'd have to use something like semaphores
 225 * killing performance on high-end SMP hardware. let's try to relax it using
 226 * the following knowledge:
 227 *  1) if buddy is referenced, it's already initialized
 228 *  2) while block is used in buddy and the buddy is referenced,
 229 *     nobody can re-allocate that block
 230 *  3) we work on bitmaps and '+' actually means 'set bits'. if the on-disk
 231 *     bitmap has a bit set and a PA claims the same block, it's OK. IOW, one
 232 *     can set a bit in the on-disk bitmap if the buddy has the same bit set
 233 *     and/or a PA covers the corresponding block
 234 *
 235 * so, now we're building a concurrency table:
 236 *  - init buddy vs.
 237 *    - new PA
 238 *      blocks for PA are allocated in the buddy, buddy must be referenced
 239 *      until PA is linked to allocation group to avoid concurrent buddy init
 240 *    - use inode PA
 241 *      we need to make sure that either on-disk bitmap or PA has uptodate data
 242 *      given (3) we care that the PA -= N operation doesn't interfere with init
 243 *    - discard inode PA
 244 *      the simplest way would be to have buddy initialized by the discard
 245 *    - use locality group PA
 246 *      again PA-=N must be serialized with init
 247 *    - discard locality group PA
 248 *      the simplest way would be to have buddy initialized by the discard
 249 *  - new PA vs.
 250 *    - use inode PA
 251 *      i_data_sem serializes them
 252 *    - discard inode PA
 253 *      discard process must wait until PA isn't used by another process
 254 *    - use locality group PA
 255 *      some mutex should serialize them
 256 *    - discard locality group PA
 257 *      discard process must wait until PA isn't used by another process
 258 *  - use inode PA
 259 *    - use inode PA
 260 *      i_data_sem or another mutex should serialize them
 261 *    - discard inode PA
 262 *      discard process must wait until PA isn't used by another process
 263 *    - use locality group PA
 264 *      nothing wrong here -- they're different PAs covering different blocks
 265 *    - discard locality group PA
 266 *      discard process must wait until PA isn't used by another process
 267 *
 268 * now we're ready to draw a few consequences:
 269 *  - while a PA is referenced, no discard is possible
 270 *  - a PA stays referenced until its blocks are marked in the on-disk bitmap
 271 *  - a PA changes only after the on-disk bitmap does
 272 *  - discard must not compete with init. either init is done before
 273 *    any discard or they're serialized somehow
 274 *  - buddy init as sum of on-disk bitmap and PAs is done atomically
 275 *
 276 * a special case is when we've used a PA to emptiness. no need to modify the
 277 * buddy in this case, but we should still care about concurrent init
 278 *
 279 */
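/*
 * Illustrative sketch (not part of mballoc): the accounting relations
 * listed above, expressed as plain per-group counters.  All names here
 * are hypothetical; the real code manipulates bitmaps, and the "discard
 * inode PA" case additionally consults the on-disk bitmap, so only the
 * simpler cases are shown.
 */
struct example_mb_counts {
	int ondisk;	/* blocks marked used in the on-disk bitmap */
	int pa;		/* blocks still held by preallocations */
	int buddy;	/* blocks marked used in the in-core buddy */
};

static void example_init_buddy(struct example_mb_counts *c)
{
	c->buddy = c->ondisk + c->pa;	/* init buddy: buddy = on-disk + PAs */
}

static void example_new_pa(struct example_mb_counts *c, int n)
{
	c->buddy += n;			/* new PA: buddy += N ... */
	c->pa += n;			/* ... and the new PA holds N blocks */
}

static void example_use_pa(struct example_mb_counts *c, int n)
{
	c->ondisk += n;			/* use PA: on-disk += N ... */
	c->pa -= n;			/* ... PA -= N (buddy is unchanged) */
}

static void example_discard_group_pa(struct example_mb_counts *c)
{
	c->buddy -= c->pa;		/* discard locality group PA: buddy -= PA */
	c->pa = 0;			/* ... PA = 0 */
}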
 280
 281/*
 282 * Logic in a few words:
 283 *
 284 *  - allocation:
 285 *    load group
 286 *    find blocks
 287 *    mark bits in on-disk bitmap
 288 *    release group
 289 *
 290 *  - use preallocation:
 291 *    find proper PA (per-inode or group)
 292 *    load group
 293 *    mark bits in on-disk bitmap
 294 *    release group
 295 *    release PA
 296 *
 297 *  - free:
 298 *    load group
 299 *    mark bits in on-disk bitmap
 300 *    release group
 301 *
 302 *  - discard preallocations in group:
 303 *    mark PAs deleted
 304 *    move them onto local list
 305 *    load on-disk bitmap
 306 *    load group
 307 *    remove PA from object (inode or locality group)
 308 *    mark free blocks in-core
 309 *
 310 *  - discard inode's preallocations:
 311 */
 312
 313/*
 314 * Locking rules
 315 *
 316 * Locks:
 317 *  - bitlock on a group        (group)
 318 *  - object (inode/locality)   (object)
 319 *  - per-pa lock               (pa)
 320 *
 321 * Paths:
 322 *  - new pa
 323 *    object
 324 *    group
 325 *
 326 *  - find and use pa:
 327 *    pa
 328 *
 329 *  - release consumed pa:
 330 *    pa
 331 *    group
 332 *    object
 333 *
 334 *  - generate in-core bitmap:
 335 *    group
 336 *        pa
 337 *
 338 *  - discard all for given object (inode, locality group):
 339 *    object
 340 *        pa
 341 *    group
 342 *
 343 *  - discard all for given group:
 344 *    group
 345 *        pa
 346 *    group
 347 *        object
 348 *
 349 */
 350static struct kmem_cache *ext4_pspace_cachep;
 351static struct kmem_cache *ext4_ac_cachep;
 352static struct kmem_cache *ext4_free_data_cachep;
 353
 354/* We create slab caches for groupinfo data structures based on the
 355 * superblock block size.  There will be one per mounted filesystem for
 356 * each unique s_blocksize_bits */
 357#define NR_GRPINFO_CACHES 8
 358static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES];
 359
 360static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = {
 361        "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k",
 362        "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k",
 363        "ext4_groupinfo_64k", "ext4_groupinfo_128k"
 364};
 365
 366static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 367                                        ext4_group_t group);
 368static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 369                                                ext4_group_t group);
 370
 371static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 372{
 373#if BITS_PER_LONG == 64
 374        *bit += ((unsigned long) addr & 7UL) << 3;
 375        addr = (void *) ((unsigned long) addr & ~7UL);
 376#elif BITS_PER_LONG == 32
 377        *bit += ((unsigned long) addr & 3UL) << 3;
 378        addr = (void *) ((unsigned long) addr & ~3UL);
 379#else
 380#error "how many bits you are?!"
 381#endif
 382        return addr;
 383}
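/*
 * Worked example (hypothetical values) for mb_correct_addr_and_bit():
 * on a 64-bit machine, bit 3 counted from the byte at address 0x1006
 * becomes bit 3 + (0x1006 & 7) * 8 = 51 of the aligned address 0x1000,
 * so the ext4_*_bit() helpers always see an unsigned-long-aligned base.
 *
 *	int bit = 3;
 *	void *addr = (void *)0x1006;
 *	addr = mb_correct_addr_and_bit(&bit, addr);
 *	// addr == (void *)0x1000, bit == 51
 */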
 384
 385static inline int mb_test_bit(int bit, void *addr)
 386{
 387        /*
 388         * ext4_test_bit on architecture like powerpc
 389         * needs unsigned long aligned address
 390         */
 391        addr = mb_correct_addr_and_bit(&bit, addr);
 392        return ext4_test_bit(bit, addr);
 393}
 394
 395static inline void mb_set_bit(int bit, void *addr)
 396{
 397        addr = mb_correct_addr_and_bit(&bit, addr);
 398        ext4_set_bit(bit, addr);
 399}
 400
 401static inline void mb_clear_bit(int bit, void *addr)
 402{
 403        addr = mb_correct_addr_and_bit(&bit, addr);
 404        ext4_clear_bit(bit, addr);
 405}
 406
 407static inline int mb_test_and_clear_bit(int bit, void *addr)
 408{
 409        addr = mb_correct_addr_and_bit(&bit, addr);
 410        return ext4_test_and_clear_bit(bit, addr);
 411}
 412
 413static inline int mb_find_next_zero_bit(void *addr, int max, int start)
 414{
 415        int fix = 0, ret, tmpmax;
 416        addr = mb_correct_addr_and_bit(&fix, addr);
 417        tmpmax = max + fix;
 418        start += fix;
 419
 420        ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix;
 421        if (ret > max)
 422                return max;
 423        return ret;
 424}
 425
 426static inline int mb_find_next_bit(void *addr, int max, int start)
 427{
 428        int fix = 0, ret, tmpmax;
 429        addr = mb_correct_addr_and_bit(&fix, addr);
 430        tmpmax = max + fix;
 431        start += fix;
 432
 433        ret = ext4_find_next_bit(addr, tmpmax, start) - fix;
 434        if (ret > max)
 435                return max;
 436        return ret;
 437}
 438
 439static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max)
 440{
 441        char *bb;
 442
 443        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
 444        BUG_ON(max == NULL);
 445
 446        if (order > e4b->bd_blkbits + 1) {
 447                *max = 0;
 448                return NULL;
 449        }
 450
 451        /* at order 0 we see each particular block */
 452        if (order == 0) {
 453                *max = 1 << (e4b->bd_blkbits + 3);
 454                return e4b->bd_bitmap;
 455        }
 456
 457        bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
 458        *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
 459
 460        return bb;
 461}
 462
 463#ifdef DOUBLE_CHECK
 464static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 465                           int first, int count)
 466{
 467        int i;
 468        struct super_block *sb = e4b->bd_sb;
 469
 470        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 471                return;
 472        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
 473        for (i = 0; i < count; i++) {
 474                if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
 475                        ext4_fsblk_t blocknr;
 476
 477                        blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
 478                        blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
 479                        ext4_grp_locked_error(sb, e4b->bd_group,
 480                                              inode ? inode->i_ino : 0,
 481                                              blocknr,
 482                                              "freeing block already freed "
 483                                              "(bit %u)",
 484                                              first + i);
 485                }
 486                mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
 487        }
 488}
 489
 490static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
 491{
 492        int i;
 493
 494        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 495                return;
 496        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
 497        for (i = 0; i < count; i++) {
 498                BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
 499                mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
 500        }
 501}
 502
 503static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 504{
 505        if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) {
 506                unsigned char *b1, *b2;
 507                int i;
 508                b1 = (unsigned char *) e4b->bd_info->bb_bitmap;
 509                b2 = (unsigned char *) bitmap;
 510                for (i = 0; i < e4b->bd_sb->s_blocksize; i++) {
 511                        if (b1[i] != b2[i]) {
 512                                ext4_msg(e4b->bd_sb, KERN_ERR,
 513                                         "corruption in group %u "
 514                                         "at byte %u(%u): %x in copy != %x "
 515                                         "on disk/prealloc",
 516                                         e4b->bd_group, i, i * 8, b1[i], b2[i]);
 517                                BUG();
 518                        }
 519                }
 520        }
 521}
 522
 523#else
 524static inline void mb_free_blocks_double(struct inode *inode,
 525                                struct ext4_buddy *e4b, int first, int count)
 526{
 527        return;
 528}
 529static inline void mb_mark_used_double(struct ext4_buddy *e4b,
 530                                                int first, int count)
 531{
 532        return;
 533}
 534static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap)
 535{
 536        return;
 537}
 538#endif
 539
 540#ifdef AGGRESSIVE_CHECK
 541
 542#define MB_CHECK_ASSERT(assert)                                         \
 543do {                                                                    \
 544        if (!(assert)) {                                                \
 545                printk(KERN_EMERG                                       \
 546                        "Assertion failure in %s() at %s:%d: \"%s\"\n", \
 547                        function, file, line, # assert);                \
 548                BUG();                                                  \
 549        }                                                               \
 550} while (0)
 551
 552static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
 553                                const char *function, int line)
 554{
 555        struct super_block *sb = e4b->bd_sb;
 556        int order = e4b->bd_blkbits + 1;
 557        int max;
 558        int max2;
 559        int i;
 560        int j;
 561        int k;
 562        int count;
 563        struct ext4_group_info *grp;
 564        int fragments = 0;
 565        int fstart;
 566        struct list_head *cur;
 567        void *buddy;
 568        void *buddy2;
 569
 570        {
 571                static int mb_check_counter;
 572                if (mb_check_counter++ % 100 != 0)
 573                        return 0;
 574        }
 575
 576        while (order > 1) {
 577                buddy = mb_find_buddy(e4b, order, &max);
 578                MB_CHECK_ASSERT(buddy);
 579                buddy2 = mb_find_buddy(e4b, order - 1, &max2);
 580                MB_CHECK_ASSERT(buddy2);
 581                MB_CHECK_ASSERT(buddy != buddy2);
 582                MB_CHECK_ASSERT(max * 2 == max2);
 583
 584                count = 0;
 585                for (i = 0; i < max; i++) {
 586
 587                        if (mb_test_bit(i, buddy)) {
 588                                /* only single bit in buddy2 may be 1 */
 589                                if (!mb_test_bit(i << 1, buddy2)) {
 590                                        MB_CHECK_ASSERT(
 591                                                mb_test_bit((i<<1)+1, buddy2));
 592                                } else if (!mb_test_bit((i << 1) + 1, buddy2)) {
 593                                        MB_CHECK_ASSERT(
 594                                                mb_test_bit(i << 1, buddy2));
 595                                }
 596                                continue;
 597                        }
 598
 599                        /* both bits in buddy2 must be 1 */
 600                        MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2));
 601                        MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2));
 602
 603                        for (j = 0; j < (1 << order); j++) {
 604                                k = (i * (1 << order)) + j;
 605                                MB_CHECK_ASSERT(
 606                                        !mb_test_bit(k, e4b->bd_bitmap));
 607                        }
 608                        count++;
 609                }
 610                MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count);
 611                order--;
 612        }
 613
 614        fstart = -1;
 615        buddy = mb_find_buddy(e4b, 0, &max);
 616        for (i = 0; i < max; i++) {
 617                if (!mb_test_bit(i, buddy)) {
 618                        MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free);
 619                        if (fstart == -1) {
 620                                fragments++;
 621                                fstart = i;
 622                        }
 623                        continue;
 624                }
 625                fstart = -1;
 626                /* check used bits only */
 627                for (j = 0; j < e4b->bd_blkbits + 1; j++) {
 628                        buddy2 = mb_find_buddy(e4b, j, &max2);
 629                        k = i >> j;
 630                        MB_CHECK_ASSERT(k < max2);
 631                        MB_CHECK_ASSERT(mb_test_bit(k, buddy2));
 632                }
 633        }
 634        MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info));
 635        MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments);
 636
 637        grp = ext4_get_group_info(sb, e4b->bd_group);
 638        list_for_each(cur, &grp->bb_prealloc_list) {
 639                ext4_group_t groupnr;
 640                struct ext4_prealloc_space *pa;
 641                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
 642                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k);
 643                MB_CHECK_ASSERT(groupnr == e4b->bd_group);
 644                for (i = 0; i < pa->pa_len; i++)
 645                        MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
 646        }
 647        return 0;
 648}
 649#undef MB_CHECK_ASSERT
 650#define mb_check_buddy(e4b) __mb_check_buddy(e4b,       \
 651                                        __FILE__, __func__, __LINE__)
 652#else
 653#define mb_check_buddy(e4b)
 654#endif
 655
 656/*
 657 * Divide the blocks starting at @first with length @len into
 658 * smaller chunks, each a power-of-2 number of blocks.
 659 * Clear the bits in the buddy bitmap which the blocks of the chunk(s) cover,
 660 * then increase bb_counters[] for the corresponding chunk size.
 661 */
 662static void ext4_mb_mark_free_simple(struct super_block *sb,
 663                                void *buddy, ext4_grpblk_t first, ext4_grpblk_t len,
 664                                        struct ext4_group_info *grp)
 665{
 666        struct ext4_sb_info *sbi = EXT4_SB(sb);
 667        ext4_grpblk_t min;
 668        ext4_grpblk_t max;
 669        ext4_grpblk_t chunk;
 670        unsigned int border;
 671
 672        BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb));
 673
 674        border = 2 << sb->s_blocksize_bits;
 675
 676        while (len > 0) {
 677                /* find how many blocks can be covered since this position */
 678                max = ffs(first | border) - 1;
 679
 680                /* find how many blocks of power 2 we need to mark */
 681                min = fls(len) - 1;
 682
 683                if (max < min)
 684                        min = max;
 685                chunk = 1 << min;
 686
 687                /* mark multiblock chunks only */
 688                grp->bb_counters[min]++;
 689                if (min > 0)
 690                        mb_clear_bit(first >> min,
 691                                     buddy + sbi->s_mb_offsets[min]);
 692
 693                len -= chunk;
 694                first += chunk;
 695        }
 696}
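/*
 * Worked example (hypothetical numbers) for ext4_mb_mark_free_simple():
 * a free range of clusters [5, 12) is split into
 *	{5}      -> order-0 chunk, only bb_counters[0] is bumped
 *	{6, 7}   -> order-1 chunk, clear bit 6 >> 1 = 3 in the order-1 buddy
 *	{8..11}  -> order-2 chunk, clear bit 8 >> 2 = 2 in the order-2 buddy
 * i.e. each chunk is the largest power-of-2 run that is aligned at its
 * own size and still fits in the remaining length.
 */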
 697
 698/*
 699 * Cache the order of the largest free extent we have available in this block
 700 * group.
 701 */
 702static void
 703mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
 704{
 705        int i;
 706        int bits;
 707
 708        grp->bb_largest_free_order = -1; /* uninit */
 709
 710        bits = sb->s_blocksize_bits + 1;
 711        for (i = bits; i >= 0; i--) {
 712                if (grp->bb_counters[i] > 0) {
 713                        grp->bb_largest_free_order = i;
 714                        break;
 715                }
 716        }
 717}
 718
 719static noinline_for_stack
 720void ext4_mb_generate_buddy(struct super_block *sb,
 721                                void *buddy, void *bitmap, ext4_group_t group)
 722{
 723        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 724        struct ext4_sb_info *sbi = EXT4_SB(sb);
 725        ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
 726        ext4_grpblk_t i = 0;
 727        ext4_grpblk_t first;
 728        ext4_grpblk_t len;
 729        unsigned free = 0;
 730        unsigned fragments = 0;
 731        unsigned long long period = get_cycles();
 732
 733        /* initialize buddy from bitmap which is aggregation
 734         * of on-disk bitmap and preallocations */
 735        i = mb_find_next_zero_bit(bitmap, max, 0);
 736        grp->bb_first_free = i;
 737        while (i < max) {
 738                fragments++;
 739                first = i;
 740                i = mb_find_next_bit(bitmap, max, i);
 741                len = i - first;
 742                free += len;
 743                if (len > 1)
 744                        ext4_mb_mark_free_simple(sb, buddy, first, len, grp);
 745                else
 746                        grp->bb_counters[0]++;
 747                if (i < max)
 748                        i = mb_find_next_zero_bit(bitmap, max, i);
 749        }
 750        grp->bb_fragments = fragments;
 751
 752        if (free != grp->bb_free) {
 753                ext4_grp_locked_error(sb, group, 0, 0,
 754                                      "block bitmap and bg descriptor "
 755                                      "inconsistent: %u vs %u free clusters",
 756                                      free, grp->bb_free);
 757                /*
 758                 * If we intend to continue, we consider group descriptor
 759                 * corrupt and update bb_free using bitmap value
 760                 */
 761                grp->bb_free = free;
 762                if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp))
 763                        percpu_counter_sub(&sbi->s_freeclusters_counter,
 764                                           grp->bb_free);
 765                set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &grp->bb_state);
 766        }
 767        mb_set_largest_free_order(sb, grp);
 768
 769        clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
 770
 771        period = get_cycles() - period;
 772        spin_lock(&EXT4_SB(sb)->s_bal_lock);
 773        EXT4_SB(sb)->s_mb_buddies_generated++;
 774        EXT4_SB(sb)->s_mb_generation_time += period;
 775        spin_unlock(&EXT4_SB(sb)->s_bal_lock);
 776}
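/*
 * Worked example (hypothetical bitmap) for ext4_mb_generate_buddy():
 * with an 8-bit incore bitmap 1 1 0 0 0 1 0 1 (bit 0 first), the scan
 * above finds the free extents [2..4] and [6], so bb_first_free = 2,
 * bb_fragments = 2 and free = 4; the 3-cluster extent is handed to
 * ext4_mb_mark_free_simple() while the single cluster only bumps
 * bb_counters[0].
 */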
 777
 778static void mb_regenerate_buddy(struct ext4_buddy *e4b)
 779{
 780        int count;
 781        int order = 1;
 782        void *buddy;
 783
 784        while ((buddy = mb_find_buddy(e4b, order++, &count))) {
 785                ext4_set_bits(buddy, 0, count);
 786        }
 787        e4b->bd_info->bb_fragments = 0;
 788        memset(e4b->bd_info->bb_counters, 0,
 789                sizeof(*e4b->bd_info->bb_counters) *
 790                (e4b->bd_sb->s_blocksize_bits + 2));
 791
 792        ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
 793                e4b->bd_bitmap, e4b->bd_group);
 794}
 795
 796/* The buddy information is attached to the buddy cache inode
 797 * for convenience. The information regarding each group
 798 * is loaded via ext4_mb_load_buddy. The information comprises
 799 * the block bitmap and the buddy map, and is
 800 * stored in the inode as
 801 *
 802 * {                        page                        }
 803 * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]...
 804 *
 805 *
 806 * one block each for bitmap and buddy information.
 807 * So for each group we take up 2 blocks. A page can
 808 * contain blocks_per_page (PAGE_SIZE / blocksize)  blocks.
 809 * So it can have information regarding groups_per_page groups,
 810 * which is blocks_per_page/2.
 811 *
 812 * Locking note:  This routine takes the block group lock of all groups
 813 * for this page; do not hold this lock when calling this routine!
 814 */
 815
 816static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp)
 817{
 818        ext4_group_t ngroups;
 819        int blocksize;
 820        int blocks_per_page;
 821        int groups_per_page;
 822        int err = 0;
 823        int i;
 824        ext4_group_t first_group, group;
 825        int first_block;
 826        struct super_block *sb;
 827        struct buffer_head *bhs;
 828        struct buffer_head **bh = NULL;
 829        struct inode *inode;
 830        char *data;
 831        char *bitmap;
 832        struct ext4_group_info *grinfo;
 833
 834        mb_debug(1, "init page %lu\n", page->index);
 835
 836        inode = page->mapping->host;
 837        sb = inode->i_sb;
 838        ngroups = ext4_get_groups_count(sb);
 839        blocksize = i_blocksize(inode);
 840        blocks_per_page = PAGE_SIZE / blocksize;
 841
 842        groups_per_page = blocks_per_page >> 1;
 843        if (groups_per_page == 0)
 844                groups_per_page = 1;
 845
 846        /* allocate buffer_heads to read bitmaps */
 847        if (groups_per_page > 1) {
 848                i = sizeof(struct buffer_head *) * groups_per_page;
 849                bh = kzalloc(i, gfp);
 850                if (bh == NULL) {
 851                        err = -ENOMEM;
 852                        goto out;
 853                }
 854        } else
 855                bh = &bhs;
 856
 857        first_group = page->index * blocks_per_page / 2;
 858
 859        /* read all groups the page covers into the cache */
 860        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
 861                if (group >= ngroups)
 862                        break;
 863
 864                grinfo = ext4_get_group_info(sb, group);
 865                /*
 866                 * If page is uptodate then we came here after online resize
 867                 * which added some new uninitialized group info structs, so
 868                 * we must skip all initialized uptodate buddies on the page,
 869                 * which may be currently in use by an allocating task.
 870                 */
 871                if (PageUptodate(page) && !EXT4_MB_GRP_NEED_INIT(grinfo)) {
 872                        bh[i] = NULL;
 873                        continue;
 874                }
 875                bh[i] = ext4_read_block_bitmap_nowait(sb, group);
 876                if (IS_ERR(bh[i])) {
 877                        err = PTR_ERR(bh[i]);
 878                        bh[i] = NULL;
 879                        goto out;
 880                }
 881                mb_debug(1, "read bitmap for group %u\n", group);
 882        }
 883
 884        /* wait for I/O completion */
 885        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
 886                int err2;
 887
 888                if (!bh[i])
 889                        continue;
 890                err2 = ext4_wait_block_bitmap(sb, group, bh[i]);
 891                if (!err)
 892                        err = err2;
 893        }
 894
 895        first_block = page->index * blocks_per_page;
 896        for (i = 0; i < blocks_per_page; i++) {
 897                group = (first_block + i) >> 1;
 898                if (group >= ngroups)
 899                        break;
 900
 901                if (!bh[group - first_group])
 902                        /* skip initialized uptodate buddy */
 903                        continue;
 904
 905                if (!buffer_verified(bh[group - first_group]))
 906                        /* Skip faulty bitmaps */
 907                        continue;
 908                err = 0;
 909
 910                /*
 911                 * data carries information regarding this
 912                 * particular group in the format specified
 913                 * above
 914                 *
 915                 */
 916                data = page_address(page) + (i * blocksize);
 917                bitmap = bh[group - first_group]->b_data;
 918
 919                /*
 920                 * We place the buddy block and bitmap block
 921                 * close together
 922                 */
 923                if ((first_block + i) & 1) {
 924                        /* this is block of buddy */
 925                        BUG_ON(incore == NULL);
 926                        mb_debug(1, "put buddy for group %u in page %lu/%x\n",
 927                                group, page->index, i * blocksize);
 928                        trace_ext4_mb_buddy_bitmap_load(sb, group);
 929                        grinfo = ext4_get_group_info(sb, group);
 930                        grinfo->bb_fragments = 0;
 931                        memset(grinfo->bb_counters, 0,
 932                               sizeof(*grinfo->bb_counters) *
 933                                (sb->s_blocksize_bits+2));
 934                        /*
 935                         * incore got set to the group block bitmap below
 936                         */
 937                        ext4_lock_group(sb, group);
 938                        /* init the buddy */
 939                        memset(data, 0xff, blocksize);
 940                        ext4_mb_generate_buddy(sb, data, incore, group);
 941                        ext4_unlock_group(sb, group);
 942                        incore = NULL;
 943                } else {
 944                        /* this is block of bitmap */
 945                        BUG_ON(incore != NULL);
 946                        mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
 947                                group, page->index, i * blocksize);
 948                        trace_ext4_mb_bitmap_load(sb, group);
 949
 950                        /* see comments in ext4_mb_put_pa() */
 951                        ext4_lock_group(sb, group);
 952                        memcpy(data, bitmap, blocksize);
 953
 954                        /* mark all preallocated blks used in in-core bitmap */
 955                        ext4_mb_generate_from_pa(sb, data, group);
 956                        ext4_mb_generate_from_freelist(sb, data, group);
 957                        ext4_unlock_group(sb, group);
 958
 959                        /* set incore so that the buddy information can be
 960                         * generated using this
 961                         */
 962                        incore = data;
 963                }
 964        }
 965        SetPageUptodate(page);
 966
 967out:
 968        if (bh) {
 969                for (i = 0; i < groups_per_page; i++)
 970                        brelse(bh[i]);
 971                if (bh != &bhs)
 972                        kfree(bh);
 973        }
 974        return err;
 975}
 976
 977/*
 978 * Lock the buddy and bitmap pages. This makes sure that a parallel init_group
 979 * on the same buddy page doesn't happen while we hold the buddy page lock.
 980 * Return the locked buddy and bitmap pages in the e4b struct. If buddy and
 981 * bitmap are on the same page, e4b->bd_buddy_page is NULL and the return value is 0.
 982 */
 983static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
 984                ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp)
 985{
 986        struct inode *inode = EXT4_SB(sb)->s_buddy_cache;
 987        int block, pnum, poff;
 988        int blocks_per_page;
 989        struct page *page;
 990
 991        e4b->bd_buddy_page = NULL;
 992        e4b->bd_bitmap_page = NULL;
 993
 994        blocks_per_page = PAGE_SIZE / sb->s_blocksize;
 995        /*
 996         * the buddy cache inode stores the block bitmap
 997         * and buddy information in consecutive blocks.
 998         * So for each group we need two blocks.
 999         */
1000        block = group * 2;
1001        pnum = block / blocks_per_page;
1002        poff = block % blocks_per_page;
1003        page = find_or_create_page(inode->i_mapping, pnum, gfp);
1004        if (!page)
1005                return -ENOMEM;
1006        BUG_ON(page->mapping != inode->i_mapping);
1007        e4b->bd_bitmap_page = page;
1008        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1009
1010        if (blocks_per_page >= 2) {
1011                /* buddy and bitmap are on the same page */
1012                return 0;
1013        }
1014
1015        block++;
1016        pnum = block / blocks_per_page;
1017        page = find_or_create_page(inode->i_mapping, pnum, gfp);
1018        if (!page)
1019                return -ENOMEM;
1020        BUG_ON(page->mapping != inode->i_mapping);
1021        e4b->bd_buddy_page = page;
1022        return 0;
1023}
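/*
 * Illustrative sketch (not part of mballoc): the buddy-cache index math
 * used above and again in ext4_mb_load_buddy_gfp().  Group g keeps its
 * bitmap at logical block 2*g and its buddy at block 2*g + 1; each block
 * is then mapped to a (page index, offset-in-page) pair.  The names and
 * parameters here are hypothetical.
 */
struct example_buddy_pos {
	unsigned long pnum;	/* page index in the buddy cache inode */
	unsigned int poff;	/* block offset within that page */
};

static struct example_buddy_pos
example_bitmap_pos(unsigned int group, unsigned int page_size,
		   unsigned int blocksize)
{
	unsigned int blocks_per_page = page_size / blocksize;
	unsigned int block = group * 2;		/* 2*g + 1 for the buddy block */
	struct example_buddy_pos pos = {
		.pnum = block / blocks_per_page,
		.poff = block % blocks_per_page,
	};

	return pos;
}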
1024
1025static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b)
1026{
1027        if (e4b->bd_bitmap_page) {
1028                unlock_page(e4b->bd_bitmap_page);
1029                put_page(e4b->bd_bitmap_page);
1030        }
1031        if (e4b->bd_buddy_page) {
1032                unlock_page(e4b->bd_buddy_page);
1033                put_page(e4b->bd_buddy_page);
1034        }
1035}
1036
1037/*
1038 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1039 * block group lock of all groups for this page; do not hold the BG lock when
1040 * calling this routine!
1041 */
1042static noinline_for_stack
1043int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp)
1044{
1045
1046        struct ext4_group_info *this_grp;
1047        struct ext4_buddy e4b;
1048        struct page *page;
1049        int ret = 0;
1050
1051        might_sleep();
1052        mb_debug(1, "init group %u\n", group);
1053        this_grp = ext4_get_group_info(sb, group);
1054        /*
1055         * This ensures that we don't reinit the buddy cache
1056         * page which maps to the group from which we are already
1057         * allocating. If we were looking at the buddy cache we would
1058         * have taken a reference using ext4_mb_load_buddy and that
1059         * would have pinned the buddy page in the page cache.
1060         * The call to ext4_mb_get_buddy_page_lock will mark the
1061         * page accessed.
1062         */
1063        ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp);
1064        if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) {
1065                /*
1066                 * somebody initialized the group
1067                 * return without doing anything
1068                 */
1069                goto err;
1070        }
1071
1072        page = e4b.bd_bitmap_page;
1073        ret = ext4_mb_init_cache(page, NULL, gfp);
1074        if (ret)
1075                goto err;
1076        if (!PageUptodate(page)) {
1077                ret = -EIO;
1078                goto err;
1079        }
1080
1081        if (e4b.bd_buddy_page == NULL) {
1082                /*
1083                 * If both the bitmap and buddy are in
1084                 * the same page we don't need to force
1085                 * init the buddy
1086                 */
1087                ret = 0;
1088                goto err;
1089        }
1090        /* init buddy cache */
1091        page = e4b.bd_buddy_page;
1092        ret = ext4_mb_init_cache(page, e4b.bd_bitmap, gfp);
1093        if (ret)
1094                goto err;
1095        if (!PageUptodate(page)) {
1096                ret = -EIO;
1097                goto err;
1098        }
1099err:
1100        ext4_mb_put_buddy_page_lock(&e4b);
1101        return ret;
1102}
1103
1104/*
1105 * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
1106 * block group lock of all groups for this page; do not hold the BG lock when
1107 * calling this routine!
1108 */
1109static noinline_for_stack int
1110ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group,
1111                       struct ext4_buddy *e4b, gfp_t gfp)
1112{
1113        int blocks_per_page;
1114        int block;
1115        int pnum;
1116        int poff;
1117        struct page *page;
1118        int ret;
1119        struct ext4_group_info *grp;
1120        struct ext4_sb_info *sbi = EXT4_SB(sb);
1121        struct inode *inode = sbi->s_buddy_cache;
1122
1123        might_sleep();
1124        mb_debug(1, "load group %u\n", group);
1125
1126        blocks_per_page = PAGE_SIZE / sb->s_blocksize;
1127        grp = ext4_get_group_info(sb, group);
1128
1129        e4b->bd_blkbits = sb->s_blocksize_bits;
1130        e4b->bd_info = grp;
1131        e4b->bd_sb = sb;
1132        e4b->bd_group = group;
1133        e4b->bd_buddy_page = NULL;
1134        e4b->bd_bitmap_page = NULL;
1135
1136        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
1137                /*
1138                 * we need full data about the group
1139                 * to make a good selection
1140                 */
1141                ret = ext4_mb_init_group(sb, group, gfp);
1142                if (ret)
1143                        return ret;
1144        }
1145
1146        /*
1147         * the buddy cache inode stores the block bitmap
1148         * and buddy information in consecutive blocks.
1149         * So for each group we need two blocks.
1150         */
1151        block = group * 2;
1152        pnum = block / blocks_per_page;
1153        poff = block % blocks_per_page;
1154
1155        /* we could use find_or_create_page(), but it locks the page,
1156         * which we'd like to avoid in the fast path ... */
1157        page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1158        if (page == NULL || !PageUptodate(page)) {
1159                if (page)
1160                        /*
1161                         * drop the page reference and try
1162                         * to get the page with the lock. If we
1163                         * are not uptodate, that implies
1164                         * somebody just created the page but
1165                         * has yet to initialize it. So
1166                         * wait for it to be initialized.
1167                         */
1168                        put_page(page);
1169                page = find_or_create_page(inode->i_mapping, pnum, gfp);
1170                if (page) {
1171                        BUG_ON(page->mapping != inode->i_mapping);
1172                        if (!PageUptodate(page)) {
1173                                ret = ext4_mb_init_cache(page, NULL, gfp);
1174                                if (ret) {
1175                                        unlock_page(page);
1176                                        goto err;
1177                                }
1178                                mb_cmp_bitmaps(e4b, page_address(page) +
1179                                               (poff * sb->s_blocksize));
1180                        }
1181                        unlock_page(page);
1182                }
1183        }
1184        if (page == NULL) {
1185                ret = -ENOMEM;
1186                goto err;
1187        }
1188        if (!PageUptodate(page)) {
1189                ret = -EIO;
1190                goto err;
1191        }
1192
1193        /* Pages marked accessed already */
1194        e4b->bd_bitmap_page = page;
1195        e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize);
1196
1197        block++;
1198        pnum = block / blocks_per_page;
1199        poff = block % blocks_per_page;
1200
1201        page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED);
1202        if (page == NULL || !PageUptodate(page)) {
1203                if (page)
1204                        put_page(page);
1205                page = find_or_create_page(inode->i_mapping, pnum, gfp);
1206                if (page) {
1207                        BUG_ON(page->mapping != inode->i_mapping);
1208                        if (!PageUptodate(page)) {
1209                                ret = ext4_mb_init_cache(page, e4b->bd_bitmap,
1210                                                         gfp);
1211                                if (ret) {
1212                                        unlock_page(page);
1213                                        goto err;
1214                                }
1215                        }
1216                        unlock_page(page);
1217                }
1218        }
1219        if (page == NULL) {
1220                ret = -ENOMEM;
1221                goto err;
1222        }
1223        if (!PageUptodate(page)) {
1224                ret = -EIO;
1225                goto err;
1226        }
1227
1228        /* Pages marked accessed already */
1229        e4b->bd_buddy_page = page;
1230        e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize);
1231
1232        BUG_ON(e4b->bd_bitmap_page == NULL);
1233        BUG_ON(e4b->bd_buddy_page == NULL);
1234
1235        return 0;
1236
1237err:
1238        if (page)
1239                put_page(page);
1240        if (e4b->bd_bitmap_page)
1241                put_page(e4b->bd_bitmap_page);
1242        if (e4b->bd_buddy_page)
1243                put_page(e4b->bd_buddy_page);
1244        e4b->bd_buddy = NULL;
1245        e4b->bd_bitmap = NULL;
1246        return ret;
1247}
1248
1249static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
1250                              struct ext4_buddy *e4b)
1251{
1252        return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS);
1253}
1254
1255static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
1256{
1257        if (e4b->bd_bitmap_page)
1258                put_page(e4b->bd_bitmap_page);
1259        if (e4b->bd_buddy_page)
1260                put_page(e4b->bd_buddy_page);
1261}
1262
1263
1264static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
1265{
1266        int order = 1;
1267        int bb_incr = 1 << (e4b->bd_blkbits - 1);
1268        void *bb;
1269
1270        BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
1271        BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
1272
1273        bb = e4b->bd_buddy;
1274        while (order <= e4b->bd_blkbits + 1) {
1275                block = block >> 1;
1276                if (!mb_test_bit(block, bb)) {
1277                        /* this block is part of buddy of order 'order' */
1278                        return order;
1279                }
1280                bb += bb_incr;
1281                bb_incr >>= 1;
1282                order++;
1283        }
1284        return 0;
1285}
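/*
 * Illustrative note (hypothetical numbers): with 4k blocks
 * (bd_blkbits == 12) the order-1 buddy bitmap occupies
 * 1 << (12 - 1) = 2048 bytes, the order-2 bitmap 1024 bytes, and so on,
 * which is why the walk above advances bb by bb_incr bytes and halves
 * bb_incr for every order it climbs.
 */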
1286
1287static void mb_clear_bits(void *bm, int cur, int len)
1288{
1289        __u32 *addr;
1290
1291        len = cur + len;
1292        while (cur < len) {
1293                if ((cur & 31) == 0 && (len - cur) >= 32) {
1294                        /* fast path: clear whole word at once */
1295                        addr = bm + (cur >> 3);
1296                        *addr = 0;
1297                        cur += 32;
1298                        continue;
1299                }
1300                mb_clear_bit(cur, bm);
1301                cur++;
1302        }
1303}
1304
1305/* clear bits in the given range;
1306 * returns the first zero bit found, if any, -1 otherwise
1307 */
1308static int mb_test_and_clear_bits(void *bm, int cur, int len)
1309{
1310        __u32 *addr;
1311        int zero_bit = -1;
1312
1313        len = cur + len;
1314        while (cur < len) {
1315                if ((cur & 31) == 0 && (len - cur) >= 32) {
1316                        /* fast path: clear whole word at once */
1317                        addr = bm + (cur >> 3);
1318                        if (*addr != (__u32)(-1) && zero_bit == -1)
1319                                zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0);
1320                        *addr = 0;
1321                        cur += 32;
1322                        continue;
1323                }
1324                if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1)
1325                        zero_bit = cur;
1326                cur++;
1327        }
1328
1329        return zero_bit;
1330}
1331
1332void ext4_set_bits(void *bm, int cur, int len)
1333{
1334        __u32 *addr;
1335
1336        len = cur + len;
1337        while (cur < len) {
1338                if ((cur & 31) == 0 && (len - cur) >= 32) {
1339                        /* fast path: set whole word at once */
1340                        addr = bm + (cur >> 3);
1341                        *addr = 0xffffffff;
1342                        cur += 32;
1343                        continue;
1344                }
1345                mb_set_bit(cur, bm);
1346                cur++;
1347        }
1348}
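/*
 * Worked example (hypothetical numbers) for ext4_set_bits(): a call
 * ext4_set_bits(bm, 30, 70) sets bits 30..99 -- bits 30 and 31 one at a
 * time, bits 32..95 as two whole 32-bit words via the fast path, and
 * bits 96..99 individually again.
 */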
1349
1350/*
1351 * _________________________________________________________________ */
1352
1353static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side)
1354{
1355        if (mb_test_bit(*bit + side, bitmap)) {
1356                mb_clear_bit(*bit, bitmap);
1357                (*bit) -= side;
1358                return 1;
1359        }
1360        else {
1361                (*bit) += side;
1362                mb_set_bit(*bit, bitmap);
1363                return -1;
1364        }
1365}
1366
1367static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last)
1368{
1369        int max;
1370        int order = 1;
1371        void *buddy = mb_find_buddy(e4b, order, &max);
1372
1373        while (buddy) {
1374                void *buddy2;
1375
1376                /* Bits in range [first; last] are known to be set since
1377                 * corresponding blocks were allocated. Bits in range
1378                 * (first; last) will stay set because they form buddies on
1379                 * the upper layer. We just deal with borders if they don't
1380                 * align with the upper layer and then go up.
1381                 * Releasing the entire group is all about clearing a
1382                 * single bit of the highest-order buddy.
1383                 */
1384
1385                /* Example:
1386                 * ---------------------------------
1387                 * |   1   |   1   |   1   |   1   |
1388                 * ---------------------------------
1389                 * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
1390                 * ---------------------------------
1391                 *   0   1   2   3   4   5   6   7
1392                 *      \_____________________/
1393                 *
1394                 * Neither [1] nor [6] is aligned to the layer above.
1395                 * Left neighbour [0] is free, so mark it busy,
1396                 * decrease bb_counters and extend range to
1397                 * [0; 6].
1398                 * Right neighbour [7] is busy. It can't be coalesced with [6], so
1399                 * mark [6] free, increase bb_counters and shrink range to
1400                 * [0; 5].
1401                 * Then shift range to [0; 2], go up and do the same.
1402                 */
1403
1404
1405                if (first & 1)
1406                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1);
1407                if (!(last & 1))
1408                        e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1);
1409                if (first > last)
1410                        break;
1411                order++;
1412
1413                if (first == last || !(buddy2 = mb_find_buddy(e4b, order, &max))) {
1414                        mb_clear_bits(buddy, first, last - first + 1);
1415                        e4b->bd_info->bb_counters[order - 1] += last - first + 1;
1416                        break;
1417                }
1418                first >>= 1;
1419                last >>= 1;
1420                buddy = buddy2;
1421        }
1422}
1423
1424static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
1425                           int first, int count)
1426{
1427        int left_is_free = 0;
1428        int right_is_free = 0;
1429        int block;
1430        int last = first + count - 1;
1431        struct super_block *sb = e4b->bd_sb;
1432
1433        if (WARN_ON(count == 0))
1434                return;
1435        BUG_ON(last >= (sb->s_blocksize << 3));
1436        assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
1437        /* Don't bother if the block group is corrupt. */
1438        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
1439                return;
1440
1441        mb_check_buddy(e4b);
1442        mb_free_blocks_double(inode, e4b, first, count);
1443
1444        e4b->bd_info->bb_free += count;
1445        if (first < e4b->bd_info->bb_first_free)
1446                e4b->bd_info->bb_first_free = first;
1447
1448        /* access memory sequentially: check left neighbour,
1449         * clear range and then check right neighbour
1450         */
1451        if (first != 0)
1452                left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap);
1453        block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count);
1454        if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0])
1455                right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap);
1456
1457        if (unlikely(block != -1)) {
1458                struct ext4_sb_info *sbi = EXT4_SB(sb);
1459                ext4_fsblk_t blocknr;
1460
1461                blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
1462                blocknr += EXT4_C2B(EXT4_SB(sb), block);
1463                ext4_grp_locked_error(sb, e4b->bd_group,
1464                                      inode ? inode->i_ino : 0,
1465                                      blocknr,
1466                                      "freeing already freed block "
1467                                      "(bit %u); block bitmap corrupt.",
1468                                      block);
1469                if (!EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))
1470                        percpu_counter_sub(&sbi->s_freeclusters_counter,
1471                                           e4b->bd_info->bb_free);
1472                /* Mark the block group as corrupt. */
1473                set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
1474                        &e4b->bd_info->bb_state);
1475                mb_regenerate_buddy(e4b);
1476                goto done;
1477        }
1478
1479        /* let's maintain fragments counter */
1480        if (left_is_free && right_is_free)
1481                e4b->bd_info->bb_fragments--;
1482        else if (!left_is_free && !right_is_free)
1483                e4b->bd_info->bb_fragments++;
1484
1485        /* buddy[0] == bd_bitmap is a special case, so handle
1486         * it right away and let mb_buddy_mark_free stay free of
1487         * zero order checks.
1488         * Check if neighbours are to be coalesced,
1489         * adjust bitmap bb_counters and borders appropriately.
1490         */
1491        if (first & 1) {
1492                first += !left_is_free;
1493                e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1;
1494        }
1495        if (!(last & 1)) {
1496                last -= !right_is_free;
1497                e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1;
1498        }
1499
1500        if (first <= last)
1501                mb_buddy_mark_free(e4b, first >> 1, last >> 1);
1502
1503done:
1504        mb_set_largest_free_order(sb, e4b->bd_info);
1505        mb_check_buddy(e4b);
1506}
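
/*
 * Editor's note: illustrative userspace sketch, not part of mballoc.c.
 * It isolates the bb_fragments bookkeeping done by mb_free_blocks()
 * above: freeing a range with both neighbours free joins two existing
 * free fragments into one, freeing with neither neighbour free creates
 * a brand new fragment, and freeing next to exactly one free neighbour
 * merely extends it.
 */
#include <stdio.h>

static int fragment_delta_on_free(int left_is_free, int right_is_free)
{
        if (left_is_free && right_is_free)
                return -1;      /* two fragments merge into one */
        if (!left_is_free && !right_is_free)
                return 1;       /* a new free fragment appears */
        return 0;               /* an existing fragment is extended */
}

int main(void)
{
        printf("both neighbours free : %+d\n", fragment_delta_on_free(1, 1));
        printf("no neighbour free    : %+d\n", fragment_delta_on_free(0, 0));
        printf("one neighbour free   : %+d\n", fragment_delta_on_free(1, 0));
        return 0;
}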
1507
1508static int mb_find_extent(struct ext4_buddy *e4b, int block,
1509                                int needed, struct ext4_free_extent *ex)
1510{
1511        int next = block;
1512        int max, order;
1513        void *buddy;
1514
1515        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1516        BUG_ON(ex == NULL);
1517
1518        buddy = mb_find_buddy(e4b, 0, &max);
1519        BUG_ON(buddy == NULL);
1520        BUG_ON(block >= max);
1521        if (mb_test_bit(block, buddy)) {
1522                ex->fe_len = 0;
1523                ex->fe_start = 0;
1524                ex->fe_group = 0;
1525                return 0;
1526        }
1527
1528        /* find actual order */
1529        order = mb_find_order_for_block(e4b, block);
1530        block = block >> order;
1531
1532        ex->fe_len = 1 << order;
1533        ex->fe_start = block << order;
1534        ex->fe_group = e4b->bd_group;
1535
1536        /* calc difference from given start */
1537        next = next - ex->fe_start;
1538        ex->fe_len -= next;
1539        ex->fe_start += next;
1540
1541        while (needed > ex->fe_len &&
1542               mb_find_buddy(e4b, order, &max)) {
1543
1544                if (block + 1 >= max)
1545                        break;
1546
1547                next = (block + 1) * (1 << order);
1548                if (mb_test_bit(next, e4b->bd_bitmap))
1549                        break;
1550
1551                order = mb_find_order_for_block(e4b, next);
1552
1553                block = next >> order;
1554                ex->fe_len += 1 << order;
1555        }
1556
1557        if (ex->fe_start + ex->fe_len > (1 << (e4b->bd_blkbits + 3))) {
1558                /* Should never happen! (but apparently sometimes does?!?) */
1559                WARN_ON(1);
1560                ext4_error(e4b->bd_sb, "corruption or bug in mb_find_extent "
1561                           "block=%d, order=%d needed=%d ex=%u/%d/%d@%u",
1562                           block, order, needed, ex->fe_group, ex->fe_start,
1563                           ex->fe_len, ex->fe_logical);
1564                ex->fe_len = 0;
1565                ex->fe_start = 0;
1566                ex->fe_group = 0;
1567        }
1568        return ex->fe_len;
1569}
1570
1571static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
1572{
1573        int ord;
1574        int mlen = 0;
1575        int max = 0;
1576        int cur;
1577        int start = ex->fe_start;
1578        int len = ex->fe_len;
1579        unsigned ret = 0;
1580        int len0 = len;
1581        void *buddy;
1582
1583        BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
1584        BUG_ON(e4b->bd_group != ex->fe_group);
1585        assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
1586        mb_check_buddy(e4b);
1587        mb_mark_used_double(e4b, start, len);
1588
1589        e4b->bd_info->bb_free -= len;
1590        if (e4b->bd_info->bb_first_free == start)
1591                e4b->bd_info->bb_first_free += len;
1592
1593        /* let's maintain fragments counter */
1594        if (start != 0)
1595                mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
1596        if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
1597                max = !mb_test_bit(start + len, e4b->bd_bitmap);
1598        if (mlen && max)
1599                e4b->bd_info->bb_fragments++;
1600        else if (!mlen && !max)
1601                e4b->bd_info->bb_fragments--;
1602
1603        /* let's maintain buddy itself */
1604        while (len) {
1605                ord = mb_find_order_for_block(e4b, start);
1606
1607                if (((start >> ord) << ord) == start && len >= (1 << ord)) {
1608                        /* the whole chunk may be allocated at once! */
1609                        mlen = 1 << ord;
1610                        buddy = mb_find_buddy(e4b, ord, &max);
1611                        BUG_ON((start >> ord) >= max);
1612                        mb_set_bit(start >> ord, buddy);
1613                        e4b->bd_info->bb_counters[ord]--;
1614                        start += mlen;
1615                        len -= mlen;
1616                        BUG_ON(len < 0);
1617                        continue;
1618                }
1619
1620                /* store for history */
1621                if (ret == 0)
1622                        ret = len | (ord << 16);
1623
1624                /* we have to split large buddy */
1625                BUG_ON(ord <= 0);
1626                buddy = mb_find_buddy(e4b, ord, &max);
1627                mb_set_bit(start >> ord, buddy);
1628                e4b->bd_info->bb_counters[ord]--;
1629
1630                ord--;
1631                cur = (start >> ord) & ~1U;
1632                buddy = mb_find_buddy(e4b, ord, &max);
1633                mb_clear_bit(cur, buddy);
1634                mb_clear_bit(cur + 1, buddy);
1635                e4b->bd_info->bb_counters[ord]++;
1636                e4b->bd_info->bb_counters[ord]++;
1637        }
1638        mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
1639
1640        ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
1641        mb_check_buddy(e4b);
1642
1643        return ret;
1644}
1645
1646/*
1647 * Must be called under group lock!
1648 */
1649static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
1650                                        struct ext4_buddy *e4b)
1651{
1652        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1653        int ret;
1654
1655        BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group);
1656        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
1657
1658        ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len);
1659        ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical;
1660        ret = mb_mark_used(e4b, &ac->ac_b_ex);
1661
1662        /* preallocation can change ac_b_ex, thus we store actually
1663         * allocated blocks for history */
1664        ac->ac_f_ex = ac->ac_b_ex;
1665
1666        ac->ac_status = AC_STATUS_FOUND;
1667        ac->ac_tail = ret & 0xffff;
1668        ac->ac_buddy = ret >> 16;
1669
1670        /*
1671         * take the page reference. We want the page to be pinned
1672         * so that we don't get an ext4_mb_init_cache() call for this
1673         * group until we update the bitmap. Otherwise we could
1674         * double-allocate blocks. The reference is dropped
1675         * in ext4_mb_release_context
1676         */
1677        ac->ac_bitmap_page = e4b->bd_bitmap_page;
1678        get_page(ac->ac_bitmap_page);
1679        ac->ac_buddy_page = e4b->bd_buddy_page;
1680        get_page(ac->ac_buddy_page);
1681        /* store last allocated for subsequent stream allocation */
1682        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
1683                spin_lock(&sbi->s_md_lock);
1684                sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
1685                sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
1686                spin_unlock(&sbi->s_md_lock);
1687        }
1688}
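
/*
 * Editor's note: illustrative userspace sketch, not part of mballoc.c.
 * mb_mark_used() packs two pieces of history into one unsigned value:
 * the low 16 bits carry the tail length at the moment of the first
 * buddy split and the upper bits carry the buddy order involved.
 * ext4_mb_use_best_found() above unpacks them into ac_tail and
 * ac_buddy.  The values chosen below are arbitrary examples.
 */
#include <stdio.h>

static unsigned pack_tail_and_order(unsigned tail_len, unsigned order)
{
        return tail_len | (order << 16);
}

int main(void)
{
        unsigned ret = pack_tail_and_order(37, 5);

        printf("ac_tail  = %u\n", ret & 0xffff);        /* 37 */
        printf("ac_buddy = %u\n", ret >> 16);           /* 5  */
        return 0;
}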
1689
1690/*
1691 * regular allocator, for general purposes allocation
1692 */
1693
1694static void ext4_mb_check_limits(struct ext4_allocation_context *ac,
1695                                        struct ext4_buddy *e4b,
1696                                        int finish_group)
1697{
1698        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1699        struct ext4_free_extent *bex = &ac->ac_b_ex;
1700        struct ext4_free_extent *gex = &ac->ac_g_ex;
1701        struct ext4_free_extent ex;
1702        int max;
1703
1704        if (ac->ac_status == AC_STATUS_FOUND)
1705                return;
1706        /*
1707         * We don't want to scan for a whole year
1708         */
1709        if (ac->ac_found > sbi->s_mb_max_to_scan &&
1710                        !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1711                ac->ac_status = AC_STATUS_BREAK;
1712                return;
1713        }
1714
1715        /*
1716         * Haven't found good chunk so far, let's continue
1717         */
1718        if (bex->fe_len < gex->fe_len)
1719                return;
1720
1721        if ((finish_group || ac->ac_found > sbi->s_mb_min_to_scan)
1722                        && bex->fe_group == e4b->bd_group) {
1723                /* recheck chunk's availability - we don't know
1724                 * when it was found (within this lock-unlock
1725                 * period or not) */
1726                max = mb_find_extent(e4b, bex->fe_start, gex->fe_len, &ex);
1727                if (max >= gex->fe_len) {
1728                        ext4_mb_use_best_found(ac, e4b);
1729                        return;
1730                }
1731        }
1732}
1733
1734/*
1735 * The routine checks whether the found extent is good enough. If it is,
1736 * the extent gets marked used and a flag is set in the context to stop
1737 * scanning. Otherwise, the extent is compared with the previously found
1738 * extent and, if the new one is better, it is stored in the context.
1739 * Later, the best found extent will be used if mballoc can't find a
1740 * good enough extent.
1741 *
1742 * FIXME: real allocation policy is to be designed yet!
1743 */
1744static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
1745                                        struct ext4_free_extent *ex,
1746                                        struct ext4_buddy *e4b)
1747{
1748        struct ext4_free_extent *bex = &ac->ac_b_ex;
1749        struct ext4_free_extent *gex = &ac->ac_g_ex;
1750
1751        BUG_ON(ex->fe_len <= 0);
1752        BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1753        BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb));
1754        BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
1755
1756        ac->ac_found++;
1757
1758        /*
1759         * The special case - take what you catch first
1760         */
1761        if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
1762                *bex = *ex;
1763                ext4_mb_use_best_found(ac, e4b);
1764                return;
1765        }
1766
1767        /*
1768         * Let's check whether the chunk is good enough
1769         */
1770        if (ex->fe_len == gex->fe_len) {
1771                *bex = *ex;
1772                ext4_mb_use_best_found(ac, e4b);
1773                return;
1774        }
1775
1776        /*
1777         * If this is first found extent, just store it in the context
1778         */
1779        if (bex->fe_len == 0) {
1780                *bex = *ex;
1781                return;
1782        }
1783
1784        /*
1785         * If new found extent is better, store it in the context
1786         */
1787        if (bex->fe_len < gex->fe_len) {
1788                /* if the request isn't satisfied, any found extent
1789                 * larger than previous best one is better */
1790                if (ex->fe_len > bex->fe_len)
1791                        *bex = *ex;
1792        } else if (ex->fe_len > gex->fe_len) {
1793                /* if the request is satisfied, then we try to find
1794                 * an extent that still satisfy the request, but is
1795                 * smaller than previous one */
1796                if (ex->fe_len < bex->fe_len)
1797                        *bex = *ex;
1798        }
1799
1800        ext4_mb_check_limits(ac, e4b, 0);
1801}
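
/*
 * Editor's note: illustrative userspace sketch, not part of mballoc.c.
 * It condenses the selection policy of ext4_mb_measure_extent() into a
 * single predicate: while the best extent so far is still shorter than
 * the goal, any longer extent is an improvement; once the goal length
 * has been exceeded, a shorter extent that still exceeds the goal is
 * preferred.  Exact matches are taken immediately and never reach this
 * comparison.
 */
#include <stdio.h>

static int new_extent_is_better(int best_len, int new_len, int goal_len)
{
        if (best_len == 0)
                return 1;                       /* nothing stored yet */
        if (best_len < goal_len)
                return new_len > best_len;      /* goal not met: grow */
        if (new_len > goal_len)
                return new_len < best_len;      /* goal exceeded: shrink */
        return 0;
}

int main(void)
{
        /* goal is 16 clusters in all three cases */
        printf("%d\n", new_extent_is_better(8, 12, 16));        /* 1 */
        printf("%d\n", new_extent_is_better(24, 20, 16));       /* 1 */
        printf("%d\n", new_extent_is_better(24, 12, 16));       /* 0 */
        return 0;
}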
1802
1803static noinline_for_stack
1804int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
1805                                        struct ext4_buddy *e4b)
1806{
1807        struct ext4_free_extent ex = ac->ac_b_ex;
1808        ext4_group_t group = ex.fe_group;
1809        int max;
1810        int err;
1811
1812        BUG_ON(ex.fe_len <= 0);
1813        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1814        if (err)
1815                return err;
1816
1817        ext4_lock_group(ac->ac_sb, group);
1818        max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
1819
1820        if (max > 0) {
1821                ac->ac_b_ex = ex;
1822                ext4_mb_use_best_found(ac, e4b);
1823        }
1824
1825        ext4_unlock_group(ac->ac_sb, group);
1826        ext4_mb_unload_buddy(e4b);
1827
1828        return 0;
1829}
1830
1831static noinline_for_stack
1832int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
1833                                struct ext4_buddy *e4b)
1834{
1835        ext4_group_t group = ac->ac_g_ex.fe_group;
1836        int max;
1837        int err;
1838        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
1839        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
1840        struct ext4_free_extent ex;
1841
1842        if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL))
1843                return 0;
1844        if (grp->bb_free == 0)
1845                return 0;
1846
1847        err = ext4_mb_load_buddy(ac->ac_sb, group, e4b);
1848        if (err)
1849                return err;
1850
1851        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
1852                ext4_mb_unload_buddy(e4b);
1853                return 0;
1854        }
1855
1856        ext4_lock_group(ac->ac_sb, group);
1857        max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
1858                             ac->ac_g_ex.fe_len, &ex);
1859        ex.fe_logical = 0xDEADFA11; /* debug value */
1860
1861        if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
1862                ext4_fsblk_t start;
1863
1864                start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
1865                        ex.fe_start;
1866                /* use do_div to get remainder (would be 64-bit modulo) */
1867                if (do_div(start, sbi->s_stripe) == 0) {
1868                        ac->ac_found++;
1869                        ac->ac_b_ex = ex;
1870                        ext4_mb_use_best_found(ac, e4b);
1871                }
1872        } else if (max >= ac->ac_g_ex.fe_len) {
1873                BUG_ON(ex.fe_len <= 0);
1874                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1875                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1876                ac->ac_found++;
1877                ac->ac_b_ex = ex;
1878                ext4_mb_use_best_found(ac, e4b);
1879        } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) {
1880                /* Sometimes, the caller may want to merge even a small
1881                 * number of blocks with an existing extent */
1882                BUG_ON(ex.fe_len <= 0);
1883                BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group);
1884                BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start);
1885                ac->ac_found++;
1886                ac->ac_b_ex = ex;
1887                ext4_mb_use_best_found(ac, e4b);
1888        }
1889        ext4_unlock_group(ac->ac_sb, group);
1890        ext4_mb_unload_buddy(e4b);
1891
1892        return 0;
1893}
1894
1895/*
1896 * The routine scans buddy structures (not the bitmap!) from the given order
1897 * up to the max order and tries to find a big enough chunk to satisfy the request
1898 */
1899static noinline_for_stack
1900void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
1901                                        struct ext4_buddy *e4b)
1902{
1903        struct super_block *sb = ac->ac_sb;
1904        struct ext4_group_info *grp = e4b->bd_info;
1905        void *buddy;
1906        int i;
1907        int k;
1908        int max;
1909
1910        BUG_ON(ac->ac_2order <= 0);
1911        for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) {
1912                if (grp->bb_counters[i] == 0)
1913                        continue;
1914
1915                buddy = mb_find_buddy(e4b, i, &max);
1916                BUG_ON(buddy == NULL);
1917
1918                k = mb_find_next_zero_bit(buddy, max, 0);
1919                BUG_ON(k >= max);
1920
1921                ac->ac_found++;
1922
1923                ac->ac_b_ex.fe_len = 1 << i;
1924                ac->ac_b_ex.fe_start = k << i;
1925                ac->ac_b_ex.fe_group = e4b->bd_group;
1926
1927                ext4_mb_use_best_found(ac, e4b);
1928
1929                BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
1930
1931                if (EXT4_SB(sb)->s_mb_stats)
1932                        atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
1933
1934                break;
1935        }
1936}
1937
1938/*
1939 * The routine scans the group and measures all found extents.
1940 * In order to optimize scanning, the caller must pass the number of
1941 * free blocks in the group, so the routine knows the upper limit.
1942 */
1943static noinline_for_stack
1944void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
1945                                        struct ext4_buddy *e4b)
1946{
1947        struct super_block *sb = ac->ac_sb;
1948        void *bitmap = e4b->bd_bitmap;
1949        struct ext4_free_extent ex;
1950        int i;
1951        int free;
1952
1953        free = e4b->bd_info->bb_free;
1954        BUG_ON(free <= 0);
1955
1956        i = e4b->bd_info->bb_first_free;
1957
1958        while (free && ac->ac_status == AC_STATUS_CONTINUE) {
1959                i = mb_find_next_zero_bit(bitmap,
1960                                                EXT4_CLUSTERS_PER_GROUP(sb), i);
1961                if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) {
1962                        /*
1963                         * If we have a corrupt bitmap, we won't find any
1964                         * free blocks even though the group info says we
1965                         * have free blocks
1966                         */
1967                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1968                                        "%d free clusters as per "
1969                                        "group info. But bitmap says 0",
1970                                        free);
1971                        break;
1972                }
1973
1974                mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
1975                BUG_ON(ex.fe_len <= 0);
1976                if (free < ex.fe_len) {
1977                        ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
1978                                        "%d free clusters as per "
1979                                        "group info. But got %d blocks",
1980                                        free, ex.fe_len);
1981                        /*
1982                         * The number of free blocks differs. This mostly
1983                         * indicates that the bitmap is corrupt. So exit
1984                         * without claiming the space.
1985                         */
1986                        break;
1987                }
1988                ex.fe_logical = 0xDEADC0DE; /* debug value */
1989                ext4_mb_measure_extent(ac, &ex, e4b);
1990
1991                i += ex.fe_len;
1992                free -= ex.fe_len;
1993        }
1994
1995        ext4_mb_check_limits(ac, e4b, 1);
1996}
1997
1998/*
1999 * This is a special case for storage like RAID5:
2000 * we try to find stripe-aligned chunks for stripe-size-multiple requests
2001 */
2002static noinline_for_stack
2003void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
2004                                 struct ext4_buddy *e4b)
2005{
2006        struct super_block *sb = ac->ac_sb;
2007        struct ext4_sb_info *sbi = EXT4_SB(sb);
2008        void *bitmap = e4b->bd_bitmap;
2009        struct ext4_free_extent ex;
2010        ext4_fsblk_t first_group_block;
2011        ext4_fsblk_t a;
2012        ext4_grpblk_t i;
2013        int max;
2014
2015        BUG_ON(sbi->s_stripe == 0);
2016
2017        /* find first stripe-aligned block in group */
2018        first_group_block = ext4_group_first_block_no(sb, e4b->bd_group);
2019
2020        a = first_group_block + sbi->s_stripe - 1;
2021        do_div(a, sbi->s_stripe);
2022        i = (a * sbi->s_stripe) - first_group_block;
2023
2024        while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
2025                if (!mb_test_bit(i, bitmap)) {
2026                        max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
2027                        if (max >= sbi->s_stripe) {
2028                                ac->ac_found++;
2029                                ex.fe_logical = 0xDEADF00D; /* debug value */
2030                                ac->ac_b_ex = ex;
2031                                ext4_mb_use_best_found(ac, e4b);
2032                                break;
2033                        }
2034                }
2035                i += sbi->s_stripe;
2036        }
2037}
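
/*
 * Editor's note: illustrative userspace sketch, not part of mballoc.c.
 * It reproduces the alignment math at the top of ext4_mb_scan_aligned():
 * the group's first filesystem block is rounded up to the next multiple
 * of the stripe size and converted back into a group-relative index,
 * which is where the stripe-aligned scan starts.  The numbers in main()
 * are arbitrary examples.
 */
#include <stdio.h>

static unsigned long long first_aligned_offset(unsigned long long first_group_block,
                                               unsigned stripe)
{
        unsigned long long a = (first_group_block + stripe - 1) / stripe;

        return a * stripe - first_group_block;  /* group-relative offset */
}

int main(void)
{
        /* a group starting at block 32769 with a 16-block stripe */
        printf("%llu\n", first_aligned_offset(32769, 16));      /* 15 */
        return 0;
}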
2038
2039/*
2040 * This is now called BEFORE we load the buddy bitmap.
2041 * Returns 1 if the group is suitable for the allocation and 0 if it is
2042 * not. In addition it can also return a negative error code when
2043 * something goes wrong.
2044 */
2045static int ext4_mb_good_group(struct ext4_allocation_context *ac,
2046                                ext4_group_t group, int cr)
2047{
2048        unsigned free, fragments;
2049        int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
2050        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
2051
2052        BUG_ON(cr < 0 || cr >= 4);
2053
2054        free = grp->bb_free;
2055        if (free == 0)
2056                return 0;
2057        if (cr <= 2 && free < ac->ac_g_ex.fe_len)
2058                return 0;
2059
2060        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
2061                return 0;
2062
2063        /* We only do this if the grp has never been initialized */
2064        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
2065                int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS);
2066                if (ret)
2067                        return ret;
2068        }
2069
2070        fragments = grp->bb_fragments;
2071        if (fragments == 0)
2072                return 0;
2073
2074        switch (cr) {
2075        case 0:
2076                BUG_ON(ac->ac_2order == 0);
2077
2078                /* Avoid using the first bg of a flexgroup for data files */
2079                if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
2080                    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
2081                    ((group % flex_size) == 0))
2082                        return 0;
2083
2084                if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) ||
2085                    (free / fragments) >= ac->ac_g_ex.fe_len)
2086                        return 1;
2087
2088                if (grp->bb_largest_free_order < ac->ac_2order)
2089                        return 0;
2090
2091                return 1;
2092        case 1:
2093                if ((free / fragments) >= ac->ac_g_ex.fe_len)
2094                        return 1;
2095                break;
2096        case 2:
2097                if (free >= ac->ac_g_ex.fe_len)
2098                        return 1;
2099                break;
2100        case 3:
2101                return 1;
2102        default:
2103                BUG();
2104        }
2105
2106        return 0;
2107}
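
/*
 * Editor's note: illustrative userspace sketch, not part of mballoc.c.
 * A condensed view of how the scan criteria relax as cr grows: cr 0
 * wants either a large enough average free extent or a buddy of at
 * least the requested order, cr 1 wants the average free-extent size
 * (free / fragments) to cover the request, cr 2 only needs enough free
 * clusters in total, and cr 3 takes any group with free space.  The
 * flex_bg, uninitialised-group and order-overflow checks of the real
 * ext4_mb_good_group() are left out of this sketch.
 */
#include <stdio.h>

static int group_looks_good(int cr, unsigned free, unsigned fragments,
                            unsigned goal_len, int largest_free_order,
                            int req_order)
{
        if (free == 0 || fragments == 0)
                return 0;
        if (cr <= 2 && free < goal_len)
                return 0;

        switch (cr) {
        case 0:
                return (free / fragments) >= goal_len ||
                       largest_free_order >= req_order;
        case 1:
                return (free / fragments) >= goal_len;
        case 2:
                return free >= goal_len;
        default:
                return 1;       /* cr 3: anything with free space */
        }
}

int main(void)
{
        /* 100 free clusters in 10 fragments, goal of 16, request order 4 */
        printf("cr 0: %d\n", group_looks_good(0, 100, 10, 16, 3, 4));   /* 0 */
        printf("cr 1: %d\n", group_looks_good(1, 100, 10, 16, 3, 4));   /* 0 */
        printf("cr 2: %d\n", group_looks_good(2, 100, 10, 16, 3, 4));   /* 1 */
        return 0;
}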
2108
2109static noinline_for_stack int
2110ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
2111{
2112        ext4_group_t ngroups, group, i;
2113        int cr;
2114        int err = 0, first_err = 0;
2115        struct ext4_sb_info *sbi;
2116        struct super_block *sb;
2117        struct ext4_buddy e4b;
2118
2119        sb = ac->ac_sb;
2120        sbi = EXT4_SB(sb);
2121        ngroups = ext4_get_groups_count(sb);
2122        /* non-extent files are limited to low blocks/groups */
2123        if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
2124                ngroups = sbi->s_blockfile_groups;
2125
2126        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
2127
2128        /* first, try the goal */
2129        err = ext4_mb_find_by_goal(ac, &e4b);
2130        if (err || ac->ac_status == AC_STATUS_FOUND)
2131                goto out;
2132
2133        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
2134                goto out;
2135
2136        /*
2137         * ac->ac_2order is set only if the fe_len is a power of 2;
2138         * if ac_2order is set we also set the criteria to 0 so that we
2139         * try exact allocation using buddy.
2140         */
2141        i = fls(ac->ac_g_ex.fe_len);
2142        ac->ac_2order = 0;
2143        /*
2144         * We search using buddy data only if the order of the request
2145         * is greater than or equal to sbi->s_mb_order2_reqs.
2146         * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req.
2147         * We also support searching for power-of-two requests only for
2148         * requests up to the maximum buddy size we have constructed.
2149         */
2150        if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) {
2151                /*
2152                 * This should tell if fe_len is exactly power of 2
2153                 */
2154                if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
2155                        ac->ac_2order = i - 1;
2156        }
2157
2158        /* if stream allocation is enabled, use global goal */
2159        if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) {
2160                /* TBD: may be a hot spot */
2161                spin_lock(&sbi->s_md_lock);
2162                ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
2163                ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
2164                spin_unlock(&sbi->s_md_lock);
2165        }
2166
2167        /* Let's just scan groups to find more or less suitable blocks */
2168        cr = ac->ac_2order ? 0 : 1;
2169        /*
2170         * cr == 0 try to get exact allocation,
2171         * cr == 3  try to get anything
2172         */
2173repeat:
2174        for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
2175                ac->ac_criteria = cr;
2176                /*
2177                 * searching for the right group start
2178                 * from the goal value specified
2179                 */
2180                group = ac->ac_g_ex.fe_group;
2181
2182                for (i = 0; i < ngroups; group++, i++) {
2183                        int ret = 0;
2184                        cond_resched();
2185                        /*
2186                         * Artificially restricted ngroups for non-extent
2187                         * files makes group > ngroups possible on first loop.
2188                         */
2189                        if (group >= ngroups)
2190                                group = 0;
2191
2192                        /* This now checks without needing the buddy page */
2193                        ret = ext4_mb_good_group(ac, group, cr);
2194                        if (ret <= 0) {
2195                                if (!first_err)
2196                                        first_err = ret;
2197                                continue;
2198                        }
2199
2200                        err = ext4_mb_load_buddy(sb, group, &e4b);
2201                        if (err)
2202                                goto out;
2203
2204                        ext4_lock_group(sb, group);
2205
2206                        /*
2207                         * We need to check again after locking the
2208                         * block group
2209                         */
2210                        ret = ext4_mb_good_group(ac, group, cr);
2211                        if (ret <= 0) {
2212                                ext4_unlock_group(sb, group);
2213                                ext4_mb_unload_buddy(&e4b);
2214                                if (!first_err)
2215                                        first_err = ret;
2216                                continue;
2217                        }
2218
2219                        ac->ac_groups_scanned++;
2220                        if (cr == 0)
2221                                ext4_mb_simple_scan_group(ac, &e4b);
2222                        else if (cr == 1 && sbi->s_stripe &&
2223                                        !(ac->ac_g_ex.fe_len % sbi->s_stripe))
2224                                ext4_mb_scan_aligned(ac, &e4b);
2225                        else
2226                                ext4_mb_complex_scan_group(ac, &e4b);
2227
2228                        ext4_unlock_group(sb, group);
2229                        ext4_mb_unload_buddy(&e4b);
2230
2231                        if (ac->ac_status != AC_STATUS_CONTINUE)
2232                                break;
2233                }
2234        }
2235
2236        if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
2237            !(ac->ac_flags & EXT4_MB_HINT_FIRST)) {
2238                /*
2239                 * We've been searching too long. Let's try to allocate
2240                 * the best chunk we've found so far
2241                 */
2242
2243                ext4_mb_try_best_found(ac, &e4b);
2244                if (ac->ac_status != AC_STATUS_FOUND) {
2245                        /*
2246                         * Someone more lucky has already allocated it.
2247                         * The only thing we can do is just take first
2248                         * found block(s)
2249                        printk(KERN_DEBUG "EXT4-fs: someone won our chunk\n");
2250                         */
2251                        ac->ac_b_ex.fe_group = 0;
2252                        ac->ac_b_ex.fe_start = 0;
2253                        ac->ac_b_ex.fe_len = 0;
2254                        ac->ac_status = AC_STATUS_CONTINUE;
2255                        ac->ac_flags |= EXT4_MB_HINT_FIRST;
2256                        cr = 3;
2257                        atomic_inc(&sbi->s_mb_lost_chunks);
2258                        goto repeat;
2259                }
2260        }
2261out:
2262        if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
2263                err = first_err;
2264        return err;
2265}
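
/*
 * Editor's note: illustrative userspace sketch, not part of mballoc.c.
 * It shows the power-of-two test used above to decide ac_2order:
 * fls() gives the position of the highest set bit, and clearing that
 * bit must leave zero for the length to be an exact power of two.
 * __builtin_clz() stands in for the kernel's fls(); the min/max order
 * bounds (s_mb_order2_reqs, blocksize_bits + 2) are omitted here.
 */
#include <stdio.h>

static int request_order(unsigned len)
{
        int i;

        if (len == 0)
                return 0;
        i = 32 - __builtin_clz(len);            /* fls(len) */
        if ((len & ~(1U << (i - 1))) == 0)
                return i - 1;                   /* len == 2^(i-1) */
        return 0;                               /* not a power of two */
}

int main(void)
{
        printf("len 16 -> order %d\n", request_order(16));      /* 4 */
        printf("len 24 -> order %d\n", request_order(24));      /* 0 */
        return 0;
}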
2266
2267static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
2268{
2269        struct super_block *sb = seq->private;
2270        ext4_group_t group;
2271
2272        if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2273                return NULL;
2274        group = *pos + 1;
2275        return (void *) ((unsigned long) group);
2276}
2277
2278static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
2279{
2280        struct super_block *sb = seq->private;
2281        ext4_group_t group;
2282
2283        ++*pos;
2284        if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
2285                return NULL;
2286        group = *pos + 1;
2287        return (void *) ((unsigned long) group);
2288}
2289
2290static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
2291{
2292        struct super_block *sb = seq->private;
2293        ext4_group_t group = (ext4_group_t) ((unsigned long) v);
2294        int i;
2295        int err, buddy_loaded = 0;
2296        struct ext4_buddy e4b;
2297        struct ext4_group_info *grinfo;
2298        unsigned char blocksize_bits = min_t(unsigned char,
2299                                             sb->s_blocksize_bits,
2300                                             EXT4_MAX_BLOCK_LOG_SIZE);
2301        struct sg {
2302                struct ext4_group_info info;
2303                ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2];
2304        } sg;
2305
2306        group--;
2307        if (group == 0)
2308                seq_puts(seq, "#group: free  frags first ["
2309                              " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
2310                              " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");
2311
2312        i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
2313                sizeof(struct ext4_group_info);
2314
2315        grinfo = ext4_get_group_info(sb, group);
2316        /* Load the group info in memory only if not already loaded. */
2317        if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) {
2318                err = ext4_mb_load_buddy(sb, group, &e4b);
2319                if (err) {
2320                        seq_printf(seq, "#%-5u: I/O error\n", group);
2321                        return 0;
2322                }
2323                buddy_loaded = 1;
2324        }
2325
2326        memcpy(&sg, ext4_get_group_info(sb, group), i);
2327
2328        if (buddy_loaded)
2329                ext4_mb_unload_buddy(&e4b);
2330
2331        seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
2332                        sg.info.bb_fragments, sg.info.bb_first_free);
2333        for (i = 0; i <= 13; i++)
2334                seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ?
2335                                sg.info.bb_counters[i] : 0);
2336        seq_printf(seq, " ]\n");
2337
2338        return 0;
2339}
2340
2341static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v)
2342{
2343}
2344
2345static const struct seq_operations ext4_mb_seq_groups_ops = {
2346        .start  = ext4_mb_seq_groups_start,
2347        .next   = ext4_mb_seq_groups_next,
2348        .stop   = ext4_mb_seq_groups_stop,
2349        .show   = ext4_mb_seq_groups_show,
2350};
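
/*
 * Editor's note: illustrative userspace sketch, not part of mballoc.c.
 * The iterator above hands the group number plus one to seq_file, cast
 * to a pointer, because returning NULL ends the iteration and group 0
 * would otherwise be indistinguishable from "stop"; ->show subtracts
 * the one again before using it.
 */
#include <stdio.h>

static void *encode_group(unsigned long group)
{
        return (void *)(group + 1);     /* never NULL, even for group 0 */
}

static unsigned long decode_group(void *v)
{
        return (unsigned long)v - 1;
}

int main(void)
{
        void *cookie = encode_group(0);

        printf("group 0 encodes to %p, decodes back to %lu\n",
               cookie, decode_group(cookie));
        return 0;
}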
2351
2352static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file)
2353{
2354        struct super_block *sb = PDE_DATA(inode);
2355        int rc;
2356
2357        rc = seq_open(file, &ext4_mb_seq_groups_ops);
2358        if (rc == 0) {
2359                struct seq_file *m = file->private_data;
2360                m->private = sb;
2361        }
2362        return rc;
2363
2364}
2365
2366const struct file_operations ext4_seq_mb_groups_fops = {
2367        .open           = ext4_mb_seq_groups_open,
2368        .read           = seq_read,
2369        .llseek         = seq_lseek,
2370        .release        = seq_release,
2371};
2372
2373static struct kmem_cache *get_groupinfo_cache(int blocksize_bits)
2374{
2375        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2376        struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index];
2377
2378        BUG_ON(!cachep);
2379        return cachep;
2380}
2381
2382/*
2383 * Allocate the top-level s_group_info array for the specified number
2384 * of groups
2385 */
2386int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
2387{
2388        struct ext4_sb_info *sbi = EXT4_SB(sb);
2389        unsigned size;
2390        struct ext4_group_info ***new_groupinfo;
2391
2392        size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
2393                EXT4_DESC_PER_BLOCK_BITS(sb);
2394        if (size <= sbi->s_group_info_size)
2395                return 0;
2396
2397        size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size);
2398        new_groupinfo = kvzalloc(size, GFP_KERNEL);
2399        if (!new_groupinfo) {
2400                ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
2401                return -ENOMEM;
2402        }
2403        if (sbi->s_group_info) {
2404                memcpy(new_groupinfo, sbi->s_group_info,
2405                       sbi->s_group_info_size * sizeof(*sbi->s_group_info));
2406                kvfree(sbi->s_group_info);
2407        }
2408        sbi->s_group_info = new_groupinfo;
2409        sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
2410        ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", 
2411                   sbi->s_group_info_size);
2412        return 0;
2413}
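
/*
 * Editor's note: illustrative userspace sketch, not part of mballoc.c.
 * The top-level s_group_info array holds one pointer per descriptor
 * block worth of groups, i.e. ceil(ngroups / descs_per_block) entries,
 * and the byte size of the allocation is rounded up to a power of two
 * so the array rarely needs to be reallocated when the filesystem is
 * resized.  128 descriptors per block assumes 4K blocks with 32-byte
 * descriptors.
 */
#include <stddef.h>
#include <stdio.h>

static unsigned top_level_entries(unsigned ngroups, unsigned descs_per_block)
{
        return (ngroups + descs_per_block - 1) / descs_per_block;
}

static size_t round_up_pow2(size_t x)
{
        size_t p = 1;

        while (p < x)
                p <<= 1;
        return p;
}

int main(void)
{
        unsigned entries = top_level_entries(50000, 128);

        printf("50000 groups -> %u entries, %zu bytes allocated\n",
               entries, round_up_pow2(entries * sizeof(void *)));
        return 0;
}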
2414
2415/* Create and initialize ext4_group_info data for the given group. */
2416int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
2417                          struct ext4_group_desc *desc)
2418{
2419        int i;
2420        int metalen = 0;
2421        struct ext4_sb_info *sbi = EXT4_SB(sb);
2422        struct ext4_group_info **meta_group_info;
2423        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2424
2425        /*
2426         * First check if this group is the first group of a descriptor block.
2427         * If so, we have to allocate a new table of pointers
2428         * to ext4_group_info structures
2429         */
2430        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2431                metalen = sizeof(*meta_group_info) <<
2432                        EXT4_DESC_PER_BLOCK_BITS(sb);
2433                meta_group_info = kmalloc(metalen, GFP_NOFS);
2434                if (meta_group_info == NULL) {
2435                        ext4_msg(sb, KERN_ERR, "can't allocate mem "
2436                                 "for a buddy group");
2437                        goto exit_meta_group_info;
2438                }
2439                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
2440                        meta_group_info;
2441        }
2442
2443        meta_group_info =
2444                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
2445        i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
2446
2447        meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
2448        if (meta_group_info[i] == NULL) {
2449                ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
2450                goto exit_group_info;
2451        }
2452        set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT,
2453                &(meta_group_info[i]->bb_state));
2454
2455        /*
2456         * initialize bb_free to be able to skip
2457         * empty groups without initialization
2458         */
2459        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
2460                meta_group_info[i]->bb_free =
2461                        ext4_free_clusters_after_init(sb, group, desc);
2462        } else {
2463                meta_group_info[i]->bb_free =
2464                        ext4_free_group_clusters(sb, desc);
2465        }
2466
2467        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
2468        init_rwsem(&meta_group_info[i]->alloc_sem);
2469        meta_group_info[i]->bb_free_root = RB_ROOT;
2470        meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
2471
2472#ifdef DOUBLE_CHECK
2473        {
2474                struct buffer_head *bh;
2475                meta_group_info[i]->bb_bitmap =
2476                        kmalloc(sb->s_blocksize, GFP_NOFS);
2477                BUG_ON(meta_group_info[i]->bb_bitmap == NULL);
2478                bh = ext4_read_block_bitmap(sb, group);
2479                BUG_ON(IS_ERR_OR_NULL(bh));
2480                memcpy(meta_group_info[i]->bb_bitmap, bh->b_data,
2481                        sb->s_blocksize);
2482                put_bh(bh);
2483        }
2484#endif
2485
2486        return 0;
2487
2488exit_group_info:
2489        /* If a meta_group_info table has been allocated, release it now */
2490        if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
2491                kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
2492                sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
2493        }
2494exit_meta_group_info:
2495        return -ENOMEM;
2496} /* ext4_mb_add_groupinfo */
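
/*
 * Editor's note: illustrative userspace sketch, not part of mballoc.c.
 * Group info structures are reached through two indices: the group
 * number shifted right by EXT4_DESC_PER_BLOCK_BITS picks the
 * per-descriptor-block pointer table, and the low bits pick the slot
 * inside it.  That is also why ext4_mb_add_groupinfo() only allocates
 * a fresh table when group % EXT4_DESC_PER_BLOCK(sb) == 0.  The value
 * 7 below assumes 128 descriptors per block.
 */
#include <stdio.h>

static void split_group_index(unsigned group, unsigned desc_per_block_bits,
                              unsigned *outer, unsigned *inner)
{
        *outer = group >> desc_per_block_bits;
        *inner = group & ((1U << desc_per_block_bits) - 1);
}

int main(void)
{
        unsigned outer, inner;

        split_group_index(1000, 7, &outer, &inner);
        printf("group 1000 -> table %u, slot %u\n", outer, inner);  /* 7, 104 */
        return 0;
}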
2497
2498static int ext4_mb_init_backend(struct super_block *sb)
2499{
2500        ext4_group_t ngroups = ext4_get_groups_count(sb);
2501        ext4_group_t i;
2502        struct ext4_sb_info *sbi = EXT4_SB(sb);
2503        int err;
2504        struct ext4_group_desc *desc;
2505        struct kmem_cache *cachep;
2506
2507        err = ext4_mb_alloc_groupinfo(sb, ngroups);
2508        if (err)
2509                return err;
2510
2511        sbi->s_buddy_cache = new_inode(sb);
2512        if (sbi->s_buddy_cache == NULL) {
2513                ext4_msg(sb, KERN_ERR, "can't get new inode");
2514                goto err_freesgi;
2515        }
2516        /* To avoid potentially colliding with a valid on-disk inode number,
2517         * use EXT4_BAD_INO for the buddy cache inode number.  This inode is
2518         * not in the inode hash, so it should never be found by iget(), but
2519         * this will avoid confusion if it ever shows up during debugging. */
2520        sbi->s_buddy_cache->i_ino = EXT4_BAD_INO;
2521        EXT4_I(sbi->s_buddy_cache)->i_disksize = 0;
2522        for (i = 0; i < ngroups; i++) {
2523                desc = ext4_get_group_desc(sb, i, NULL);
2524                if (desc == NULL) {
2525                        ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i);
2526                        goto err_freebuddy;
2527                }
2528                if (ext4_mb_add_groupinfo(sb, i, desc) != 0)
2529                        goto err_freebuddy;
2530        }
2531
2532        return 0;
2533
2534err_freebuddy:
2535        cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2536        while (i-- > 0)
2537                kmem_cache_free(cachep, ext4_get_group_info(sb, i));
2538        i = sbi->s_group_info_size;
2539        while (i-- > 0)
2540                kfree(sbi->s_group_info[i]);
2541        iput(sbi->s_buddy_cache);
2542err_freesgi:
2543        kvfree(sbi->s_group_info);
2544        return -ENOMEM;
2545}
2546
2547static void ext4_groupinfo_destroy_slabs(void)
2548{
2549        int i;
2550
2551        for (i = 0; i < NR_GRPINFO_CACHES; i++) {
2552                if (ext4_groupinfo_caches[i])
2553                        kmem_cache_destroy(ext4_groupinfo_caches[i]);
2554                ext4_groupinfo_caches[i] = NULL;
2555        }
2556}
2557
2558static int ext4_groupinfo_create_slab(size_t size)
2559{
2560        static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex);
2561        int slab_size;
2562        int blocksize_bits = order_base_2(size);
2563        int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE;
2564        struct kmem_cache *cachep;
2565
2566        if (cache_index >= NR_GRPINFO_CACHES)
2567                return -EINVAL;
2568
2569        if (unlikely(cache_index < 0))
2570                cache_index = 0;
2571
2572        mutex_lock(&ext4_grpinfo_slab_create_mutex);
2573        if (ext4_groupinfo_caches[cache_index]) {
2574                mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2575                return 0;       /* Already created */
2576        }
2577
2578        slab_size = offsetof(struct ext4_group_info,
2579                                bb_counters[blocksize_bits + 2]);
2580
2581        cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index],
2582                                        slab_size, 0, SLAB_RECLAIM_ACCOUNT,
2583                                        NULL);
2584
2585        ext4_groupinfo_caches[cache_index] = cachep;
2586
2587        mutex_unlock(&ext4_grpinfo_slab_create_mutex);
2588        if (!cachep) {
2589                printk(KERN_EMERG
2590                       "EXT4-fs: no memory for groupinfo slab cache\n");
2591                return -ENOMEM;
2592        }
2593
2594        return 0;
2595}
2596
2597int ext4_mb_init(struct super_block *sb)
2598{
2599        struct ext4_sb_info *sbi = EXT4_SB(sb);
2600        unsigned i, j;
2601        unsigned offset, offset_incr;
2602        unsigned max;
2603        int ret;
2604
2605        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets);
2606
2607        sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL);
2608        if (sbi->s_mb_offsets == NULL) {
2609                ret = -ENOMEM;
2610                goto out;
2611        }
2612
2613        i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs);
2614        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
2615        if (sbi->s_mb_maxs == NULL) {
2616                ret = -ENOMEM;
2617                goto out;
2618        }
2619
2620        ret = ext4_groupinfo_create_slab(sb->s_blocksize);
2621        if (ret < 0)
2622                goto out;
2623
2624        /* order 0 is regular bitmap */
2625        sbi->s_mb_maxs[0] = sb->s_blocksize << 3;
2626        sbi->s_mb_offsets[0] = 0;
2627
2628        i = 1;
2629        offset = 0;
2630        offset_incr = 1 << (sb->s_blocksize_bits - 1);
2631        max = sb->s_blocksize << 2;
2632        do {
2633                sbi->s_mb_offsets[i] = offset;
2634                sbi->s_mb_maxs[i] = max;
2635                offset += offset_incr;
2636                offset_incr = offset_incr >> 1;
2637                max = max >> 1;
2638                i++;
2639        } while (i <= sb->s_blocksize_bits + 1);
2640
2641        spin_lock_init(&sbi->s_md_lock);
2642        spin_lock_init(&sbi->s_bal_lock);
2643        sbi->s_mb_free_pending = 0;
2644        INIT_LIST_HEAD(&sbi->s_freed_data_list);
2645
2646        sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN;
2647        sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN;
2648        sbi->s_mb_stats = MB_DEFAULT_STATS;
2649        sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
2650        sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
2651        /*
2652         * The default group preallocation is 512, which for 4k block
2653         * sizes translates to 2 megabytes.  However for bigalloc file
2654         * systems, this is probably too big (i.e., if the cluster size
2655         * is 1 megabyte, then the group preallocation size becomes half a
2656         * gigabyte!).  As a default, we will keep a two megabyte
2657         * group prealloc size for cluster sizes up to 64k, and after
2658         * that, we will force a minimum group preallocation size of
2659         * 32 clusters.  This translates to 8 megs when the cluster
2660         * size is 256k, and 32 megs when the cluster size is 1 meg,
2661         * which seems reasonable as a default.
2662         */
2663        sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >>
2664                                       sbi->s_cluster_bits, 32);
2665        /*
2666         * If there is a s_stripe > 1, then we set the s_mb_group_prealloc
2667         * to the lowest multiple of s_stripe which is bigger than
2668         * the s_mb_group_prealloc as determined above. We want
2669         * the preallocation size to be an exact multiple of the
2670         * RAID stripe size so that preallocations don't fragment
2671         * the stripes.
2672         */
2673        if (sbi->s_stripe > 1) {
2674                sbi->s_mb_group_prealloc = roundup(
2675                        sbi->s_mb_group_prealloc, sbi->s_stripe);
2676        }
2677
2678        sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
2679        if (sbi->s_locality_groups == NULL) {
2680                ret = -ENOMEM;
2681                goto out;
2682        }
2683        for_each_possible_cpu(i) {
2684                struct ext4_locality_group *lg;
2685                lg = per_cpu_ptr(sbi->s_locality_groups, i);
2686                mutex_init(&lg->lg_mutex);
2687                for (j = 0; j < PREALLOC_TB_SIZE; j++)
2688                        INIT_LIST_HEAD(&lg->lg_prealloc_list[j]);
2689                spin_lock_init(&lg->lg_prealloc_lock);
2690        }
2691
2692        /* init file for buddy data */
2693        ret = ext4_mb_init_backend(sb);
2694        if (ret != 0)
2695                goto out_free_locality_groups;
2696
2697        return 0;
2698
2699out_free_locality_groups:
2700        free_percpu(sbi->s_locality_groups);
2701        sbi->s_locality_groups = NULL;
2702out:
2703        kfree(sbi->s_mb_offsets);
2704        sbi->s_mb_offsets = NULL;
2705        kfree(sbi->s_mb_maxs);
2706        sbi->s_mb_maxs = NULL;
2707        return ret;
2708}
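
/*
 * Editor's note: illustrative userspace sketch, not part of mballoc.c.
 * It reproduces the s_mb_offsets/s_mb_maxs computation from
 * ext4_mb_init() for an assumed 4K block size: order 0 is the block
 * bitmap itself (32768 bits), while orders 1..13 are packed one after
 * another into the buddy block, each holding half as many bits as the
 * order below it, so each byte offset advances by the size of the
 * previous order's bitmap.
 */
#include <stdio.h>

int main(void)
{
        unsigned blocksize_bits = 12;                   /* 4K blocks (assumption) */
        unsigned blocksize = 1U << blocksize_bits;
        unsigned offset = 0;
        unsigned offset_incr = 1U << (blocksize_bits - 1);
        unsigned max = blocksize << 2;
        unsigned i;

        printf("order  0: offset     0, %6u bits (block bitmap)\n",
               blocksize << 3);
        for (i = 1; i <= blocksize_bits + 1; i++) {
                printf("order %2u: offset %5u, %6u bits\n", i, offset, max);
                offset += offset_incr;
                offset_incr >>= 1;
                max >>= 1;
        }
        return 0;
}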
2709
2710/* needs to be called with the ext4 group lock held */
2711static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
2712{
2713        struct ext4_prealloc_space *pa;
2714        struct list_head *cur, *tmp;
2715        int count = 0;
2716
2717        list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) {
2718                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
2719                list_del(&pa->pa_group_list);
2720                count++;
2721                kmem_cache_free(ext4_pspace_cachep, pa);
2722        }
2723        if (count)
2724                mb_debug(1, "mballoc: %u PAs left\n", count);
2725
2726}
2727
2728int ext4_mb_release(struct super_block *sb)
2729{
2730        ext4_group_t ngroups = ext4_get_groups_count(sb);
2731        ext4_group_t i;
2732        int num_meta_group_infos;
2733        struct ext4_group_info *grinfo;
2734        struct ext4_sb_info *sbi = EXT4_SB(sb);
2735        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
2736
2737        if (sbi->s_group_info) {
2738                for (i = 0; i < ngroups; i++) {
2739                        grinfo = ext4_get_group_info(sb, i);
2740#ifdef DOUBLE_CHECK
2741                        kfree(grinfo->bb_bitmap);
2742#endif
2743                        ext4_lock_group(sb, i);
2744                        ext4_mb_cleanup_pa(grinfo);
2745                        ext4_unlock_group(sb, i);
2746                        kmem_cache_free(cachep, grinfo);
2747                }
2748                num_meta_group_infos = (ngroups +
2749                                EXT4_DESC_PER_BLOCK(sb) - 1) >>
2750                        EXT4_DESC_PER_BLOCK_BITS(sb);
2751                for (i = 0; i < num_meta_group_infos; i++)
2752                        kfree(sbi->s_group_info[i]);
2753                kvfree(sbi->s_group_info);
2754        }
2755        kfree(sbi->s_mb_offsets);
2756        kfree(sbi->s_mb_maxs);
2757        iput(sbi->s_buddy_cache);
2758        if (sbi->s_mb_stats) {
2759                ext4_msg(sb, KERN_INFO,
2760                       "mballoc: %u blocks %u reqs (%u success)",
2761                                atomic_read(&sbi->s_bal_allocated),
2762                                atomic_read(&sbi->s_bal_reqs),
2763                                atomic_read(&sbi->s_bal_success));
2764                ext4_msg(sb, KERN_INFO,
2765                      "mballoc: %u extents scanned, %u goal hits, "
2766                                "%u 2^N hits, %u breaks, %u lost",
2767                                atomic_read(&sbi->s_bal_ex_scanned),
2768                                atomic_read(&sbi->s_bal_goals),
2769                                atomic_read(&sbi->s_bal_2orders),
2770                                atomic_read(&sbi->s_bal_breaks),
2771                                atomic_read(&sbi->s_mb_lost_chunks));
2772                ext4_msg(sb, KERN_INFO,
2773                       "mballoc: %lu generated and it took %Lu",
2774                                sbi->s_mb_buddies_generated,
2775                                sbi->s_mb_generation_time);
2776                ext4_msg(sb, KERN_INFO,
2777                       "mballoc: %u preallocated, %u discarded",
2778                                atomic_read(&sbi->s_mb_preallocated),
2779                                atomic_read(&sbi->s_mb_discarded));
2780        }
2781
2782        free_percpu(sbi->s_locality_groups);
2783
2784        return 0;
2785}
2786
2787static inline int ext4_issue_discard(struct super_block *sb,
2788                ext4_group_t block_group, ext4_grpblk_t cluster, int count,
2789                struct bio **biop)
2790{
2791        ext4_fsblk_t discard_block;
2792
2793        discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) +
2794                         ext4_group_first_block_no(sb, block_group));
2795        count = EXT4_C2B(EXT4_SB(sb), count);
2796        trace_ext4_discard_blocks(sb,
2797                        (unsigned long long) discard_block, count);
2798        if (biop) {
2799                return __blkdev_issue_discard(sb->s_bdev,
2800                        (sector_t)discard_block << (sb->s_blocksize_bits - 9),
2801                        (sector_t)count << (sb->s_blocksize_bits - 9),
2802                        GFP_NOFS, 0, biop);
2803        } else
2804                return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0);
2805}
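
/*
 * Editor's note: illustrative userspace sketch, not part of mballoc.c.
 * Discards are issued in 512-byte sectors, so ext4_issue_discard()
 * shifts the filesystem block number and count left by
 * (blocksize_bits - 9), i.e. multiplies them by blocksize / 512.
 * The block number below is an arbitrary example.
 */
#include <stdio.h>

int main(void)
{
        unsigned long long block = 123456;
        unsigned blocksize_bits = 12;           /* 4K blocks (assumption) */

        printf("block %llu -> sector %llu\n",
               block, block << (blocksize_bits - 9));   /* x8 for 4K blocks */
        return 0;
}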
2806
2807static void ext4_free_data_in_buddy(struct super_block *sb,
2808                                    struct ext4_free_data *entry)
2809{
2810        struct ext4_buddy e4b;
2811        struct ext4_group_info *db;
2812        int err, count = 0, count2 = 0;
2813
2814        mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
2815                 entry->efd_count, entry->efd_group, entry);
2816
2817        err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
2818        /* we expect to find an existing buddy because it's pinned */
2819        BUG_ON(err != 0);
2820
2821        spin_lock(&EXT4_SB(sb)->s_md_lock);
2822        EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count;
2823        spin_unlock(&EXT4_SB(sb)->s_md_lock);
2824
2825        db = e4b.bd_info;
2826        /* there are blocks to put in buddy to make them really free */
2827        count += entry->efd_count;
2828        count2++;
2829        ext4_lock_group(sb, entry->efd_group);
2830        /* Take it out of per group rb tree */
2831        rb_erase(&entry->efd_node, &(db->bb_free_root));
2832        mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
2833
2834        /*
2835         * Clear the trimmed flag for the group so that the next
2836         * ext4_trim_fs can trim it.
2837         * If the volume is mounted with -o discard, online discard
2838         * is supported and the free blocks will be trimmed online.
2839         */
2840        if (!test_opt(sb, DISCARD))
2841                EXT4_MB_GRP_CLEAR_TRIMMED(db);
2842
2843        if (!db->bb_free_root.rb_node) {
2844                /* No more items in the per-group rb tree;
2845                 * balance the page refcounts taken in ext4_mb_free_metadata()
2846                 */
2847                put_page(e4b.bd_buddy_page);
2848                put_page(e4b.bd_bitmap_page);
2849        }
2850        ext4_unlock_group(sb, entry->efd_group);
2851        kmem_cache_free(ext4_free_data_cachep, entry);
2852        ext4_mb_unload_buddy(&e4b);
2853
2854        mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
2855}
2856
2857/*
2858 * This function is called by the jbd2 layer once the commit has finished,
2859 * so we know we can free the blocks that were released with that commit.
2860 */
2861void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid)
2862{
2863        struct ext4_sb_info *sbi = EXT4_SB(sb);
2864        struct ext4_free_data *entry, *tmp;
2865        struct bio *discard_bio = NULL;
2866        struct list_head freed_data_list;
2867        struct list_head *cut_pos = NULL;
2868        int err;
2869
2870        INIT_LIST_HEAD(&freed_data_list);
2871
2872        spin_lock(&sbi->s_md_lock);
2873        list_for_each_entry(entry, &sbi->s_freed_data_list, efd_list) {
2874                if (entry->efd_tid != commit_tid)
2875                        break;
2876                cut_pos = &entry->efd_list;
2877        }
2878        if (cut_pos)
2879                list_cut_position(&freed_data_list, &sbi->s_freed_data_list,
2880                                  cut_pos);
2881        spin_unlock(&sbi->s_md_lock);
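        /*
         * Entries on s_freed_data_list are queued in commit order, so
         * the loop above stops at the first entry belonging to a later
         * transaction; list_cut_position() then moves every entry up to
         * and including cut_pos onto the local freed_data_list, leaving
         * later transactions' entries queued for their own commits.
         */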
2882
2883        if (test_opt(sb, DISCARD)) {
2884                list_for_each_entry(entry, &freed_data_list, efd_list) {
2885                        err = ext4_issue_discard(sb, entry->efd_group,
2886                                                 entry->efd_start_cluster,
2887                                                 entry->efd_count,
2888                                                 &discard_bio);
2889                        if (err && err != -EOPNOTSUPP) {
2890                                ext4_msg(sb, KERN_WARNING, "discard request in"
2891                                         " group:%d block:%d count:%d failed"
2892                                         " with %d", entry->efd_group,
2893                                         entry->efd_start_cluster,
2894                                         entry->efd_count, err);
2895                        } else if (err == -EOPNOTSUPP)
2896                                break;
2897                }
2898
2899                if (discard_bio) {
2900                        submit_bio_wait(discard_bio);
2901                        bio_put(discard_bio);
2902                }
2903        }
2904
2905        list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list)
2906                ext4_free_data_in_buddy(sb, entry);
2907}
2908
2909int __init ext4_init_mballoc(void)
2910{
2911        ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space,
2912                                        SLAB_RECLAIM_ACCOUNT);
2913        if (ext4_pspace_cachep == NULL)
2914                return -ENOMEM;
2915
2916        ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context,
2917                                    SLAB_RECLAIM_ACCOUNT);
2918        if (ext4_ac_cachep == NULL) {
2919                kmem_cache_destroy(ext4_pspace_cachep);
2920                return -ENOMEM;
2921        }
2922
2923        ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
2924                                           SLAB_RECLAIM_ACCOUNT);
2925        if (ext4_free_data_cachep == NULL) {
2926                kmem_cache_destroy(ext4_pspace_cachep);
2927                kmem_cache_destroy(ext4_ac_cachep);
2928                return -ENOMEM;
2929        }
2930        return 0;
2931}
2932
2933void ext4_exit_mballoc(void)
2934{
2935        /*
2936         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
2937         * before destroying the slab cache.
2938         */
2939        rcu_barrier();
2940        kmem_cache_destroy(ext4_pspace_cachep);
2941        kmem_cache_destroy(ext4_ac_cachep);
2942        kmem_cache_destroy(ext4_free_data_cachep);
2943        ext4_groupinfo_destroy_slabs();
2944}
2945
2946
2947/*
2948 * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps
2949 * Returns 0 on success or an error code on failure
2950 */
2951static noinline_for_stack int
2952ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
2953                                handle_t *handle, unsigned int reserv_clstrs)
2954{
2955        struct buffer_head *bitmap_bh = NULL;
2956        struct ext4_group_desc *gdp;
2957        struct buffer_head *gdp_bh;
2958        struct ext4_sb_info *sbi;
2959        struct super_block *sb;
2960        ext4_fsblk_t block;
2961        int err, len;
2962
2963        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
2964        BUG_ON(ac->ac_b_ex.fe_len <= 0);
2965
2966        sb = ac->ac_sb;
2967        sbi = EXT4_SB(sb);
2968
2969        bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group);
2970        if (IS_ERR(bitmap_bh)) {
2971                err = PTR_ERR(bitmap_bh);
2972                bitmap_bh = NULL;
2973                goto out_err;
2974        }
2975
2976        BUFFER_TRACE(bitmap_bh, "getting write access");
2977        err = ext4_journal_get_write_access(handle, bitmap_bh);
2978        if (err)
2979                goto out_err;
2980
2981        err = -EIO;
2982        gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, &gdp_bh);
2983        if (!gdp)
2984                goto out_err;
2985
2986        ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group,
2987                        ext4_free_group_clusters(sb, gdp));
2988
2989        BUFFER_TRACE(gdp_bh, "get_write_access");
2990        err = ext4_journal_get_write_access(handle, gdp_bh);
2991        if (err)
2992                goto out_err;
2993
2994        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
2995
2996        len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
2997        if (!ext4_data_block_valid(sbi, block, len)) {
2998                ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
2999                           "fs metadata", block, block+len);
3000                /* The filesystem is mounted not to panic on error,
3001                 * so fix the bitmap and return EFSCORRUPTED.
3002                 * We leak some of the blocks here.
3003                 */
3004                ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3005                ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
3006                              ac->ac_b_ex.fe_len);
3007                ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3008                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
3009                if (!err)
3010                        err = -EFSCORRUPTED;
3011                goto out_err;
3012        }
3013
3014        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3015#ifdef AGGRESSIVE_CHECK
3016        {
3017                int i;
3018                for (i = 0; i < ac->ac_b_ex.fe_len; i++) {
3019                        BUG_ON(mb_test_bit(ac->ac_b_ex.fe_start + i,
3020                                                bitmap_bh->b_data));
3021                }
3022        }
3023#endif
3024        ext4_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
3025                      ac->ac_b_ex.fe_len);
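        /*
         * If this is the first allocation from a group whose on-disk
         * bitmap was never initialized (BLOCK_UNINIT), drop the flag and
         * seed the free cluster count from ext4_free_clusters_after_init(),
         * which accounts for the group's metadata blocks.
         */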
3026        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
3027                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
3028                ext4_free_group_clusters_set(sb, gdp,
3029                                             ext4_free_clusters_after_init(sb,
3030                                                ac->ac_b_ex.fe_group, gdp));
3031        }
3032        len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
3033        ext4_free_group_clusters_set(sb, gdp, len);
3034        ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh);
3035        ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
3036
3037        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3038        percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
3039        /*
3040         * Now reduce the dirty block count also. Should not go negative
3041         */
3042        if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED))
3043                /* release all the reserved blocks if non delalloc */
3044                percpu_counter_sub(&sbi->s_dirtyclusters_counter,
3045                                   reserv_clstrs);
3046
3047        if (sbi->s_log_groups_per_flex) {
3048                ext4_group_t flex_group = ext4_flex_group(sbi,
3049                                                          ac->ac_b_ex.fe_group);
3050                atomic64_sub(ac->ac_b_ex.fe_len,
3051                             &sbi->s_flex_groups[flex_group].free_clusters);
3052        }
3053
3054        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
3055        if (err)
3056                goto out_err;
3057        err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh);
3058
3059out_err:
3060        brelse(bitmap_bh);
3061        return err;
3062}
3063
3064/*
3065 * Here we normalize the request for a locality group.
3066 * Group requests are normalized to s_mb_group_prealloc, which is based
3067 * on the stripe size (s_stripe) when one is set via mount option.
3068 * s_mb_group_prealloc can be configured via
3069 * /sys/fs/ext4/<partition>/mb_group_prealloc
3070 *
3071 * XXX: should we try to preallocate more than the group has now?
3072 */
3073static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac)
3074{
3075        struct super_block *sb = ac->ac_sb;
3076        struct ext4_locality_group *lg = ac->ac_lg;
3077
3078        BUG_ON(lg == NULL);
3079        ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc;
3080        mb_debug(1, "#%u: goal %u blocks for locality group\n",
3081                current->pid, ac->ac_g_ex.fe_len);
3082}
3083
3084/*
3085 * Normalization means making the request better in terms of
3086 * size and alignment.
3087 */
3088static noinline_for_stack void
3089ext4_mb_normalize_request(struct ext4_allocation_context *ac,
3090                                struct ext4_allocation_request *ar)
3091{
3092        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3093        int bsbits, max;
3094        ext4_lblk_t end;
3095        loff_t size, start_off;
3096        loff_t orig_size __maybe_unused;
3097        ext4_lblk_t start;
3098        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3099        struct ext4_prealloc_space *pa;
3100
3101        /* only normalize data requests; metadata requests
3102           do not need preallocation */
3103        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3104                return;
3105
3106        /* sometimes the caller may want exact blocks */
3107        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
3108                return;
3109
3110        /* caller may indicate that preallocation isn't
3111         * required (it's a tail, for example) */
3112        if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC)
3113                return;
3114
3115        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) {
3116                ext4_mb_normalize_group_request(ac);
3117                return ;
3118        }
3119
3120        bsbits = ac->ac_sb->s_blocksize_bits;
3121
3122        /* first, let's learn the actual file size
3123         * assuming the current request is allocated */
3124        size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
3125        size = size << bsbits;
3126        if (size < i_size_read(ac->ac_inode))
3127                size = i_size_read(ac->ac_inode);
3128        orig_size = size;
3129
3130        /* max size of free chunks */
3131        max = 2 << bsbits;
3132
3133#define NRL_CHECK_SIZE(req, size, max, chunk_size)      \
3134                (req <= (size) || max <= (chunk_size))
3135
3136        /* first, try to predict filesize */
3137        /* XXX: should this table be tunable? */
3138        start_off = 0;
3139        if (size <= 16 * 1024) {
3140                size = 16 * 1024;
3141        } else if (size <= 32 * 1024) {
3142                size = 32 * 1024;
3143        } else if (size <= 64 * 1024) {
3144                size = 64 * 1024;
3145        } else if (size <= 128 * 1024) {
3146                size = 128 * 1024;
3147        } else if (size <= 256 * 1024) {
3148                size = 256 * 1024;
3149        } else if (size <= 512 * 1024) {
3150                size = 512 * 1024;
3151        } else if (size <= 1024 * 1024) {
3152                size = 1024 * 1024;
3153        } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
3154                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3155                                                (21 - bsbits)) << 21;
3156                size = 2 * 1024 * 1024;
3157        } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
3158                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3159                                                        (22 - bsbits)) << 22;
3160                size = 4 * 1024 * 1024;
3161        } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
3162                                        (8<<20)>>bsbits, max, 8 * 1024)) {
3163                start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
3164                                                        (23 - bsbits)) << 23;
3165                size = 8 * 1024 * 1024;
3166        } else {
3167                start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
3168                size      = (loff_t) EXT4_C2B(EXT4_SB(ac->ac_sb),
3169                                              ac->ac_o_ex.fe_len) << bsbits;
3170        }
3171        size = size >> bsbits;
3172        start = start_off >> bsbits;
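        /*
         * Illustration (hypothetical numbers, assuming 4 KiB blocks): a
         * request that would make the file 100 KiB falls into the
         * "<= 128 * 1024" bucket above, so size becomes 128 KiB with
         * start_off left at 0; after the shifts that is a goal of 32
         * blocks starting at logical block 0.
         */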
3173
3174        /* don't cover already allocated blocks in selected range */
3175        if (ar->pleft && start <= ar->lleft) {
3176                size -= ar->lleft + 1 - start;
3177                start = ar->lleft + 1;
3178        }
3179        if (ar->pright && start + size - 1 >= ar->lright)
3180                size -= start + size - ar->lright;
3181
3182        /*
3183         * Trim allocation request for filesystems with artificially small
3184         * groups.
3185         */
3186        if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb))
3187                size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb);
3188
3189        end = start + size;
3190
3191        /* check we don't cross already preallocated blocks */
3192        rcu_read_lock();
3193        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3194                ext4_lblk_t pa_end;
3195
3196                if (pa->pa_deleted)
3197                        continue;
3198                spin_lock(&pa->pa_lock);
3199                if (pa->pa_deleted) {
3200                        spin_unlock(&pa->pa_lock);
3201                        continue;
3202                }
3203
3204                pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3205                                                  pa->pa_len);
3206
3207                /* PA must not overlap original request */
3208                BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
3209                        ac->ac_o_ex.fe_logical < pa->pa_lstart));
3210
3211                /* skip PAs this normalized request doesn't overlap with */
3212                if (pa->pa_lstart >= end || pa_end <= start) {
3213                        spin_unlock(&pa->pa_lock);
3214                        continue;
3215                }
3216                BUG_ON(pa->pa_lstart <= start && pa_end >= end);
3217
3218                /* adjust start or end to be adjacent to this pa */
3219                if (pa_end <= ac->ac_o_ex.fe_logical) {
3220                        BUG_ON(pa_end < start);
3221                        start = pa_end;
3222                } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) {
3223                        BUG_ON(pa->pa_lstart > end);
3224                        end = pa->pa_lstart;
3225                }
3226                spin_unlock(&pa->pa_lock);
3227        }
3228        rcu_read_unlock();
3229        size = end - start;
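        /*
         * Illustration (hypothetical numbers): with a normalized range
         * of [0, 32), an original request at logical block 20 and an
         * existing PA covering logical blocks [0, 8), the loop above
         * bumps start to 8 (pa_end), so the goal shrinks to 24 blocks.
         */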
3230
3231        /* XXX: extra loop to check we really don't overlap preallocations */
3232        rcu_read_lock();
3233        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3234                ext4_lblk_t pa_end;
3235
3236                spin_lock(&pa->pa_lock);
3237                if (pa->pa_deleted == 0) {
3238                        pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
3239                                                          pa->pa_len);
3240                        BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
3241                }
3242                spin_unlock(&pa->pa_lock);
3243        }
3244        rcu_read_unlock();
3245
3246        if (start + size <= ac->ac_o_ex.fe_logical &&
3247                        start > ac->ac_o_ex.fe_logical) {
3248                ext4_msg(ac->ac_sb, KERN_ERR,
3249                         "start %lu, size %lu, fe_logical %lu",
3250                         (unsigned long) start, (unsigned long) size,
3251                         (unsigned long) ac->ac_o_ex.fe_logical);
3252                BUG();
3253        }
3254        BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb));
3255
3256        /* now prepare goal request */
3257
3258        /* XXX: is it better to align blocks with respect to logical
3259         * placement or to satisfy a big request as is */
3260        ac->ac_g_ex.fe_logical = start;
3261        ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
3262
3263        /* define goal start in order to merge */
3264        if (ar->pright && (ar->lright == (start + size))) {
3265                /* merge to the right */
3266                ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
3267                                                &ac->ac_f_ex.fe_group,
3268                                                &ac->ac_f_ex.fe_start);
3269                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3270        }
3271        if (ar->pleft && (ar->lleft + 1 == start)) {
3272                /* merge to the left */
3273                ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
3274                                                &ac->ac_f_ex.fe_group,
3275                                                &ac->ac_f_ex.fe_start);
3276                ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
3277        }
3278
3279        mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size,
3280                (unsigned) orig_size, (unsigned) start);
3281}
3282
3283static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3284{
3285        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3286
3287        if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
3288                atomic_inc(&sbi->s_bal_reqs);
3289                atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
3290                if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
3291                        atomic_inc(&sbi->s_bal_success);
3292                atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
3293                if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
3294                                ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
3295                        atomic_inc(&sbi->s_bal_goals);
3296                if (ac->ac_found > sbi->s_mb_max_to_scan)
3297                        atomic_inc(&sbi->s_bal_breaks);
3298        }
3299
3300        if (ac->ac_op == EXT4_MB_HISTORY_ALLOC)
3301                trace_ext4_mballoc_alloc(ac);
3302        else
3303                trace_ext4_mballoc_prealloc(ac);
3304}
3305
3306/*
3307 * Called on failure; free up any blocks from the inode PA for this
3308 * context.  We don't need this for MB_GROUP_PA because we only change
3309 * pa_free in ext4_mb_release_context(), but on failure, we've already
3310 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
3311 */
3312static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3313{
3314        struct ext4_prealloc_space *pa = ac->ac_pa;
3315        struct ext4_buddy e4b;
3316        int err;
3317
3318        if (pa == NULL) {
3319                if (ac->ac_f_ex.fe_len == 0)
3320                        return;
3321                err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b);
3322                if (err) {
3323                        /*
3324                         * This should never happen since we pin the
3325                         * pages in the ext4_allocation_context so
3326                         * ext4_mb_load_buddy() should never fail.
3327                         */
3328                        WARN(1, "mb_load_buddy failed (%d)", err);
3329                        return;
3330                }
3331                ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3332                mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start,
3333                               ac->ac_f_ex.fe_len);
3334                ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group);
3335                ext4_mb_unload_buddy(&e4b);
3336                return;
3337        }
3338        if (pa->pa_type == MB_INODE_PA)
3339                pa->pa_free += ac->ac_b_ex.fe_len;
3340}
3341
3342/*
3343 * use blocks preallocated to inode
3344 */
3345static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
3346                                struct ext4_prealloc_space *pa)
3347{
3348        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3349        ext4_fsblk_t start;
3350        ext4_fsblk_t end;
3351        int len;
3352
3353        /* found preallocated blocks, use them */
3354        start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart);
3355        end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len),
3356                  start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len));
3357        len = EXT4_NUM_B2C(sbi, end - start);
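        /*
         * Illustration (hypothetical numbers, one block per cluster):
         * with pa_lstart = 100, pa_pstart = 5000, pa_len = 16 and a
         * request for 4 blocks at logical 110, start = 5010 and
         * end = min(5016, 5014) = 5014, so len = 4.
         */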
3358        ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group,
3359                                        &ac->ac_b_ex.fe_start);
3360        ac->ac_b_ex.fe_len = len;
3361        ac->ac_status = AC_STATUS_FOUND;
3362        ac->ac_pa = pa;
3363
3364        BUG_ON(start < pa->pa_pstart);
3365        BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
3366        BUG_ON(pa->pa_free < len);
3367        pa->pa_free -= len;
3368
3369        mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
3370}
3371
3372/*
3373 * use blocks preallocated to locality group
3374 */
3375static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac,
3376                                struct ext4_prealloc_space *pa)
3377{
3378        unsigned int len = ac->ac_o_ex.fe_len;
3379
3380        ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart,
3381                                        &ac->ac_b_ex.fe_group,
3382                                        &ac->ac_b_ex.fe_start);
3383        ac->ac_b_ex.fe_len = len;
3384        ac->ac_status = AC_STATUS_FOUND;
3385        ac->ac_pa = pa;
3386
3387        /* we don't correct pa_pstart or pa_len here to avoid a
3388         * possible race when the group is being loaded concurrently;
3389         * instead we correct the pa later, after blocks are marked
3390         * in the on-disk bitmap -- see ext4_mb_release_context().
3391         * Other CPUs are prevented from allocating from this pa by lg_mutex
3392         */
3393        mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa);
3394}
3395
3396/*
3397 * Return the prealloc space that has the minimal distance
3398 * from the goal block. @cpa is the prealloc
3399 * space with the currently known minimal distance
3400 * from the goal block.
3401 */
3402static struct ext4_prealloc_space *
3403ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
3404                        struct ext4_prealloc_space *pa,
3405                        struct ext4_prealloc_space *cpa)
3406{
3407        ext4_fsblk_t cur_distance, new_distance;
3408
3409        if (cpa == NULL) {
3410                atomic_inc(&pa->pa_count);
3411                return pa;
3412        }
3413        cur_distance = abs(goal_block - cpa->pa_pstart);
3414        new_distance = abs(goal_block - pa->pa_pstart);
3415
3416        if (cur_distance <= new_distance)
3417                return cpa;
3418
3419        /* drop the previous reference */
3420        atomic_dec(&cpa->pa_count);
3421        atomic_inc(&pa->pa_count);
3422        return pa;
3423}
3424
3425/*
3426 * search goal blocks in preallocated space
3427 */
3428static noinline_for_stack int
3429ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
3430{
3431        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
3432        int order, i;
3433        struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
3434        struct ext4_locality_group *lg;
3435        struct ext4_prealloc_space *pa, *cpa = NULL;
3436        ext4_fsblk_t goal_block;
3437
3438        /* only data can be preallocated */
3439        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
3440                return 0;
3441
3442        /* first, try per-file preallocation */
3443        rcu_read_lock();
3444        list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
3445
3446                /* none of the fields in this condition change,
3447                 * so we can skip locking for them */
3448                if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
3449                    ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
3450                                               EXT4_C2B(sbi, pa->pa_len)))
3451                        continue;
3452
3453                /* non-extent files can't have physical blocks past 2^32 */
3454                if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
3455                    (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
3456                     EXT4_MAX_BLOCK_FILE_PHYS))
3457                        continue;
3458
3459                /* found preallocated blocks, use them */
3460                spin_lock(&pa->pa_lock);
3461                if (pa->pa_deleted == 0 && pa->pa_free) {
3462                        atomic_inc(&pa->pa_count);
3463                        ext4_mb_use_inode_pa(ac, pa);
3464                        spin_unlock(&pa->pa_lock);
3465                        ac->ac_criteria = 10;
3466                        rcu_read_unlock();
3467                        return 1;
3468                }
3469                spin_unlock(&pa->pa_lock);
3470        }
3471        rcu_read_unlock();
3472
3473        /* can we use group allocation? */
3474        if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
3475                return 0;
3476
3477        /* inode may have no locality group for some reason */
3478        lg = ac->ac_lg;
3479        if (lg == NULL)
3480                return 0;
3481        order  = fls(ac->ac_o_ex.fe_len) - 1;
3482        if (order > PREALLOC_TB_SIZE - 1)
3483                /* The max size of hash table is PREALLOC_TB_SIZE */
3484                order = PREALLOC_TB_SIZE - 1;
3485
3486        goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
3487        /*
3488         * search for the prealloc space that has the
3489         * minimal distance from the goal block.
3490         */
3491        for (i = order; i < PREALLOC_TB_SIZE; i++) {
3492                rcu_read_lock();
3493                list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
3494                                        pa_inode_list) {
3495                        spin_lock(&pa->pa_lock);
3496                        if (pa->pa_deleted == 0 &&
3497                                        pa->pa_free >= ac->ac_o_ex.fe_len) {
3498
3499                                cpa = ext4_mb_check_group_pa(goal_block,
3500                                                                pa, cpa);
3501                        }
3502                        spin_unlock(&pa->pa_lock);
3503                }
3504                rcu_read_unlock();
3505        }
3506        if (cpa) {
3507                ext4_mb_use_group_pa(ac, cpa);
3508                ac->ac_criteria = 20;
3509                return 1;
3510        }
3511        return 0;
3512}
3513
3514/*
3515 * The function goes through all blocks freed in the group
3516 * but not yet committed and marks them used in the in-core bitmap.
3517 * The buddy must be generated from this bitmap.
3518 * Must be called with the ext4 group lock held.
3519 */
3520static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
3521                                                ext4_group_t group)
3522{
3523        struct rb_node *n;
3524        struct ext4_group_info *grp;
3525        struct ext4_free_data *entry;
3526
3527        grp = ext4_get_group_info(sb, group);
3528        n = rb_first(&(grp->bb_free_root));
3529
3530        while (n) {
3531                entry = rb_entry(n, struct ext4_free_data, efd_node);
3532                ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
3533                n = rb_next(n);
3534        }
3535        return;
3536}
3537
3538/*
3539 * The function goes through all preallocations in this group and marks them
3540 * used in the in-core bitmap. The buddy must be generated from this bitmap.
3541 * Must be called with the ext4 group lock held.
3542 */
3543static noinline_for_stack
3544void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
3545                                        ext4_group_t group)
3546{
3547        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3548        struct ext4_prealloc_space *pa;
3549        struct list_head *cur;
3550        ext4_group_t groupnr;
3551        ext4_grpblk_t start;
3552        int preallocated = 0;
3553        int len;
3554
3555        /* all forms of preallocation discard load the group first,
3556         * so the only competing code is preallocation use.
3557         * we don't need any locking here.
3558         * notice we do NOT ignore preallocations with pa_deleted set;
3559         * otherwise we could leave used blocks available for
3560         * allocation in the buddy when a concurrent ext4_mb_put_pa()
3561         * is dropping the preallocation
3562         */
3563        list_for_each(cur, &grp->bb_prealloc_list) {
3564                pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list);
3565                spin_lock(&pa->pa_lock);
3566                ext4_get_group_no_and_offset(sb, pa->pa_pstart,
3567                                             &groupnr, &start);
3568                len = pa->pa_len;
3569                spin_unlock(&pa->pa_lock);
3570                if (unlikely(len == 0))
3571                        continue;
3572                BUG_ON(groupnr != group);
3573                ext4_set_bits(bitmap, start, len);
3574                preallocated += len;
3575        }
3576        mb_debug(1, "preallocated %u for group %u\n", preallocated, group);
3577}
3578
3579static void ext4_mb_pa_callback(struct rcu_head *head)
3580{
3581        struct ext4_prealloc_space *pa;
3582        pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu);
3583
3584        BUG_ON(atomic_read(&pa->pa_count));
3585        BUG_ON(pa->pa_deleted == 0);
3586        kmem_cache_free(ext4_pspace_cachep, pa);
3587}
3588
3589/*
3590 * drops a reference to the preallocated space descriptor and frees it
3591 * if this was the last reference and the space is fully consumed
3592 */
3593static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
3594                        struct super_block *sb, struct ext4_prealloc_space *pa)
3595{
3596        ext4_group_t grp;
3597        ext4_fsblk_t grp_blk;
3598
3599        /* in this short window concurrent discard can set pa_deleted */
3600        spin_lock(&pa->pa_lock);
3601        if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) {
3602                spin_unlock(&pa->pa_lock);
3603                return;
3604        }
3605
3606        if (pa->pa_deleted == 1) {
3607                spin_unlock(&pa->pa_lock);
3608                return;
3609        }
3610
3611        pa->pa_deleted = 1;
3612        spin_unlock(&pa->pa_lock);
3613
3614        grp_blk = pa->pa_pstart;
3615        /*
3616         * If doing group-based preallocation, pa_pstart may be in the
3617         * next group when pa is used up
3618         */
3619        if (pa->pa_type == MB_GROUP_PA)
3620                grp_blk--;
3621
3622        grp = ext4_get_group_number(sb, grp_blk);
3623
3624        /*
3625         * possible race:
3626         *
3627         *  P1 (buddy init)                     P2 (regular allocation)
3628         *                                      find block B in PA
3629         *  copy on-disk bitmap to buddy
3630         *                                      mark B in on-disk bitmap
3631         *                                      drop PA from group
3632         *  mark all PAs in buddy
3633         *
3634         * thus, P1 initializes buddy with B available. to prevent this
3635         * we make "copy" and "mark all PAs" atomic and serialize "drop PA"
3636         * against that pair
3637         */
3638        ext4_lock_group(sb, grp);
3639        list_del(&pa->pa_group_list);
3640        ext4_unlock_group(sb, grp);
3641
3642        spin_lock(pa->pa_obj_lock);
3643        list_del_rcu(&pa->pa_inode_list);
3644        spin_unlock(pa->pa_obj_lock);
3645
3646        call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
3647}
3648
3649/*
3650 * creates new preallocated space for the given inode
3651 */
3652static noinline_for_stack int
3653ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
3654{
3655        struct super_block *sb = ac->ac_sb;
3656        struct ext4_sb_info *sbi = EXT4_SB(sb);
3657        struct ext4_prealloc_space *pa;
3658        struct ext4_group_info *grp;
3659        struct ext4_inode_info *ei;
3660
3661        /* preallocate only when the found space is larger than requested */
3662        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3663        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3664        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3665
3666        pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3667        if (pa == NULL)
3668                return -ENOMEM;
3669
3670        if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
3671                int winl;
3672                int wins;
3673                int win;
3674                int offs;
3675
3676                /* we can't allocate as much as the normalizer wants,
3677                 * so the found space must get a proper lstart
3678                 * to cover the original request */
3679                BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
3680                BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
3681
3682                /* we're limited by the original request in that its
3683                 * logical block must be covered in any case;
3684                 * winl is the window we can move our chunk within */
3685                winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
3686
3687                /* also, we should cover whole original request */
3688                wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
3689
3690                /* the smallest one defines real window */
3691                win = min(winl, wins);
3692
3693                offs = ac->ac_o_ex.fe_logical %
3694                        EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
3695                if (offs && offs < win)
3696                        win = offs;
3697
3698                ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
3699                        EXT4_NUM_B2C(sbi, win);
3700                BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
3701                BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
3702        }
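        /*
         * Illustration of the window logic above (hypothetical numbers,
         * one block per cluster): original request of 3 blocks at
         * logical 25, normalized goal of 16 blocks at logical 16, but
         * only 8 blocks were found.  Then winl = 25 - 16 = 9,
         * wins = 8 - 3 = 5 and offs = 25 % 8 = 1, so win = 1 and the
         * PA's logical start becomes 25 - 1 = 24, which still covers
         * the original request.
         */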
3703
3704        /* preallocation can change ac_b_ex, thus we store actually
3705         * allocated blocks for history */
3706        ac->ac_f_ex = ac->ac_b_ex;
3707
3708        pa->pa_lstart = ac->ac_b_ex.fe_logical;
3709        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3710        pa->pa_len = ac->ac_b_ex.fe_len;
3711        pa->pa_free = pa->pa_len;
3712        atomic_set(&pa->pa_count, 1);
3713        spin_lock_init(&pa->pa_lock);
3714        INIT_LIST_HEAD(&pa->pa_inode_list);
3715        INIT_LIST_HEAD(&pa->pa_group_list);
3716        pa->pa_deleted = 0;
3717        pa->pa_type = MB_INODE_PA;
3718
3719        mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa,
3720                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3721        trace_ext4_mb_new_inode_pa(ac, pa);
3722
3723        ext4_mb_use_inode_pa(ac, pa);
3724        atomic_add(pa->pa_free, &sbi->s_mb_preallocated);
3725
3726        ei = EXT4_I(ac->ac_inode);
3727        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3728
3729        pa->pa_obj_lock = &ei->i_prealloc_lock;
3730        pa->pa_inode = ac->ac_inode;
3731
3732        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3733        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3734        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3735
3736        spin_lock(pa->pa_obj_lock);
3737        list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
3738        spin_unlock(pa->pa_obj_lock);
3739
3740        return 0;
3741}
3742
3743/*
3744 * creates new preallocated space for the locality group this inode belongs to
3745 */
3746static noinline_for_stack int
3747ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
3748{
3749        struct super_block *sb = ac->ac_sb;
3750        struct ext4_locality_group *lg;
3751        struct ext4_prealloc_space *pa;
3752        struct ext4_group_info *grp;
3753
3754        /* preallocate only when the found space is larger than requested */
3755        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
3756        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
3757        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
3758
3759        BUG_ON(ext4_pspace_cachep == NULL);
3760        pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
3761        if (pa == NULL)
3762                return -ENOMEM;
3763
3764        /* preallocation can change ac_b_ex, thus we store actually
3765         * allocated blocks for history */
3766        ac->ac_f_ex = ac->ac_b_ex;
3767
3768        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
3769        pa->pa_lstart = pa->pa_pstart;
3770        pa->pa_len = ac->ac_b_ex.fe_len;
3771        pa->pa_free = pa->pa_len;
3772        atomic_set(&pa->pa_count, 1);
3773        spin_lock_init(&pa->pa_lock);
3774        INIT_LIST_HEAD(&pa->pa_inode_list);
3775        INIT_LIST_HEAD(&pa->pa_group_list);
3776        pa->pa_deleted = 0;
3777        pa->pa_type = MB_GROUP_PA;
3778
3779        mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa,
3780                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
3781        trace_ext4_mb_new_group_pa(ac, pa);
3782
3783        ext4_mb_use_group_pa(ac, pa);
3784        atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated);
3785
3786        grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group);
3787        lg = ac->ac_lg;
3788        BUG_ON(lg == NULL);
3789
3790        pa->pa_obj_lock = &lg->lg_prealloc_lock;
3791        pa->pa_inode = NULL;
3792
3793        ext4_lock_group(sb, ac->ac_b_ex.fe_group);
3794        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
3795        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
3796
3797        /*
3798         * We will later add the new pa to the right bucket
3799         * after updating the pa_free in ext4_mb_release_context
3800         */
3801        return 0;
3802}
3803
3804static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
3805{
3806        int err;
3807
3808        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
3809                err = ext4_mb_new_group_pa(ac);
3810        else
3811                err = ext4_mb_new_inode_pa(ac);
3812        return err;
3813}
3814
3815/*
3816 * finds all unused blocks in the on-disk bitmap and frees them in the
3817 * in-core bitmap and buddy.
3818 * @pa must be unlinked from the inode and group lists, so that
3819 * nobody else can find/use it.
3820 * the caller MUST hold the group/inode locks.
3821 * TODO: optimize the case when there are no in-core structures yet
3822 */
3823static noinline_for_stack int
3824ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
3825                        struct ext4_prealloc_space *pa)
3826{
3827        struct super_block *sb = e4b->bd_sb;
3828        struct ext4_sb_info *sbi = EXT4_SB(sb);
3829        unsigned int end;
3830        unsigned int next;
3831        ext4_group_t group;
3832        ext4_grpblk_t bit;
3833        unsigned long long grp_blk_start;
3834        int err = 0;
3835        int free = 0;
3836
3837        BUG_ON(pa->pa_deleted == 0);
3838        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3839        grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit);
3840        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3841        end = bit + pa->pa_len;
3842
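        /*
         * Walk the on-disk bitmap across the PA's range: runs of zero
         * bits are clusters that were preallocated but never handed
         * out, so they are released back into the buddy; runs of set
         * bits were actually consumed from the PA and stay allocated.
         */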
3843        while (bit < end) {
3844                bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit);
3845                if (bit >= end)
3846                        break;
3847                next = mb_find_next_bit(bitmap_bh->b_data, end, bit);
3848                mb_debug(1, "    free preallocated %u/%u in group %u\n",
3849                         (unsigned) ext4_group_first_block_no(sb, group) + bit,
3850                         (unsigned) next - bit, (unsigned) group);
3851                free += next - bit;
3852
3853                trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit);
3854                trace_ext4_mb_release_inode_pa(pa, (grp_blk_start +
3855                                                    EXT4_C2B(sbi, bit)),
3856                                               next - bit);
3857                mb_free_blocks(pa->pa_inode, e4b, bit, next - bit);
3858                bit = next + 1;
3859        }
3860        if (free != pa->pa_free) {
3861                ext4_msg(e4b->bd_sb, KERN_CRIT,
3862                         "pa %p: logic %lu, phys. %lu, len %lu",
3863                         pa, (unsigned long) pa->pa_lstart,
3864                         (unsigned long) pa->pa_pstart,
3865                         (unsigned long) pa->pa_len);
3866                ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u",
3867                                        free, pa->pa_free);
3868                /*
3869                 * pa is already deleted so we use the value obtained
3870                 * from the bitmap and continue.
3871                 */
3872        }
3873        atomic_add(free, &sbi->s_mb_discarded);
3874
3875        return err;
3876}
3877
3878static noinline_for_stack int
3879ext4_mb_release_group_pa(struct ext4_buddy *e4b,
3880                                struct ext4_prealloc_space *pa)
3881{
3882        struct super_block *sb = e4b->bd_sb;
3883        ext4_group_t group;
3884        ext4_grpblk_t bit;
3885
3886        trace_ext4_mb_release_group_pa(sb, pa);
3887        BUG_ON(pa->pa_deleted == 0);
3888        ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
3889        BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
3890        mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
3891        atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
3892        trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
3893
3894        return 0;
3895}
3896
3897/*
3898 * releases all preallocations in the given group
3899 *
3900 * first, we need to decide the discard policy:
3901 * - when do we discard
3902 *   1) ENOSPC
3903 * - how many do we discard
3904 *   1) how many requested
3905 */
3906static noinline_for_stack int
3907ext4_mb_discard_group_preallocations(struct super_block *sb,
3908                                        ext4_group_t group, int needed)
3909{
3910        struct ext4_group_info *grp = ext4_get_group_info(sb, group);
3911        struct buffer_head *bitmap_bh = NULL;
3912        struct ext4_prealloc_space *pa, *tmp;
3913        struct list_head list;
3914        struct ext4_buddy e4b;
3915        int err;
3916        int busy = 0;
3917        int free = 0;
3918
3919        mb_debug(1, "discard preallocation for group %u\n", group);
3920
3921        if (list_empty(&grp->bb_prealloc_list))
3922                return 0;
3923
3924        bitmap_bh = ext4_read_block_bitmap(sb, group);
3925        if (IS_ERR(bitmap_bh)) {
3926                err = PTR_ERR(bitmap_bh);
3927                ext4_error(sb, "Error %d reading block bitmap for %u",
3928                           err, group);
3929                return 0;
3930        }
3931
3932        err = ext4_mb_load_buddy(sb, group, &e4b);
3933        if (err) {
3934                ext4_warning(sb, "Error %d loading buddy information for %u",
3935                             err, group);
3936                put_bh(bitmap_bh);
3937                return 0;
3938        }
3939
3940        if (needed == 0)
3941                needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1;
3942
3943        INIT_LIST_HEAD(&list);
3944repeat:
3945        ext4_lock_group(sb, group);
3946        list_for_each_entry_safe(pa, tmp,
3947                                &grp->bb_prealloc_list, pa_group_list) {
3948                spin_lock(&pa->pa_lock);
3949                if (atomic_read(&pa->pa_count)) {
3950                        spin_unlock(&pa->pa_lock);
3951                        busy = 1;
3952                        continue;
3953                }
3954                if (pa->pa_deleted) {
3955                        spin_unlock(&pa->pa_lock);
3956                        continue;
3957                }
3958
3959                /* seems this one can be freed ... */
3960                pa->pa_deleted = 1;
3961
3962                /* we can trust pa_free ... */
3963                free += pa->pa_free;
3964
3965                spin_unlock(&pa->pa_lock);
3966
3967                list_del(&pa->pa_group_list);
3968                list_add(&pa->u.pa_tmp_list, &list);
3969        }
3970
3971        /* if we still need more blocks and some PAs were used, try again */
3972        if (free < needed && busy) {
3973                busy = 0;
3974                ext4_unlock_group(sb, group);
3975                cond_resched();
3976                goto repeat;
3977        }
3978
3979        /* found anything to free? */
3980        if (list_empty(&list)) {
3981                BUG_ON(free != 0);
3982                goto out;
3983        }
3984
3985        /* now free all selected PAs */
3986        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
3987
3988                /* remove from object (inode or locality group) */
3989                spin_lock(pa->pa_obj_lock);
3990                list_del_rcu(&pa->pa_inode_list);
3991                spin_unlock(pa->pa_obj_lock);
3992
3993                if (pa->pa_type == MB_GROUP_PA)
3994                        ext4_mb_release_group_pa(&e4b, pa);
3995                else
3996                        ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
3997
3998                list_del(&pa->u.pa_tmp_list);
3999                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4000        }
4001
4002out:
4003        ext4_unlock_group(sb, group);
4004        ext4_mb_unload_buddy(&e4b);
4005        put_bh(bitmap_bh);
4006        return free;
4007}
4008
4009/*
4010 * releases all unused preallocated blocks for the given inode
4011 *
4012 * It's important to discard preallocations under i_data_sem:
4013 * We don't want another block to be served from the prealloc
4014 * space when we are discarding the inode prealloc space.
4015 *
4016 * FIXME!! Make sure it is valid at all the call sites
4017 */
4018void ext4_discard_preallocations(struct inode *inode)
4019{
4020        struct ext4_inode_info *ei = EXT4_I(inode);
4021        struct super_block *sb = inode->i_sb;
4022        struct buffer_head *bitmap_bh = NULL;
4023        struct ext4_prealloc_space *pa, *tmp;
4024        ext4_group_t group = 0;
4025        struct list_head list;
4026        struct ext4_buddy e4b;
4027        int err;
4028
4029        if (!S_ISREG(inode->i_mode)) {
4030                /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/
4031                return;
4032        }
4033
4034        mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino);
4035        trace_ext4_discard_preallocations(inode);
4036
4037        INIT_LIST_HEAD(&list);
4038
4039repeat:
4040        /* first, collect all pa's in the inode */
4041        spin_lock(&ei->i_prealloc_lock);
4042        while (!list_empty(&ei->i_prealloc_list)) {
4043                pa = list_entry(ei->i_prealloc_list.next,
4044                                struct ext4_prealloc_space, pa_inode_list);
4045                BUG_ON(pa->pa_obj_lock != &ei->i_prealloc_lock);
4046                spin_lock(&pa->pa_lock);
4047                if (atomic_read(&pa->pa_count)) {
4048                        /* this shouldn't happen often - nobody should
4049                         * use preallocation while we're discarding it */
4050                        spin_unlock(&pa->pa_lock);
4051                        spin_unlock(&ei->i_prealloc_lock);
4052                        ext4_msg(sb, KERN_ERR,
4053                                 "uh-oh! used pa while discarding");
4054                        WARN_ON(1);
4055                        schedule_timeout_uninterruptible(HZ);
4056                        goto repeat;
4057
4058                }
4059                if (pa->pa_deleted == 0) {
4060                        pa->pa_deleted = 1;
4061                        spin_unlock(&pa->pa_lock);
4062                        list_del_rcu(&pa->pa_inode_list);
4063                        list_add(&pa->u.pa_tmp_list, &list);
4064                        continue;
4065                }
4066
4067                /* someone is deleting pa right now */
4068                spin_unlock(&pa->pa_lock);
4069                spin_unlock(&ei->i_prealloc_lock);
4070
4071                /* we have to wait here because pa_deleted
4072                 * doesn't mean the pa is already unlinked from
4073                 * the list. As we might be called from
4074                 * ->clear_inode(), the inode will get freed
4075                 * and a concurrent thread which is unlinking
4076                 * the pa from the inode's list may access already
4077                 * freed memory, bad-bad-bad */
4078
4079                /* XXX: if this happens too often, we can
4080                 * add a flag to force wait only in case
4081                 * of ->clear_inode(), but not in case of
4082                 * regular truncate */
4083                schedule_timeout_uninterruptible(HZ);
4084                goto repeat;
4085        }
4086        spin_unlock(&ei->i_prealloc_lock);
4087
4088        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
4089                BUG_ON(pa->pa_type != MB_INODE_PA);
4090                group = ext4_get_group_number(sb, pa->pa_pstart);
4091
4092                err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
4093                                             GFP_NOFS|__GFP_NOFAIL);
4094                if (err) {
4095                        ext4_error(sb, "Error %d loading buddy information for %u",
4096                                   err, group);
4097                        continue;
4098                }
4099
4100                bitmap_bh = ext4_read_block_bitmap(sb, group);
4101                if (IS_ERR(bitmap_bh)) {
4102                        err = PTR_ERR(bitmap_bh);
4103                        ext4_error(sb, "Error %d reading block bitmap for %u",
4104                                        err, group);
4105                        ext4_mb_unload_buddy(&e4b);
4106                        continue;
4107                }
4108
4109                ext4_lock_group(sb, group);
4110                list_del(&pa->pa_group_list);
4111                ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa);
4112                ext4_unlock_group(sb, group);
4113
4114                ext4_mb_unload_buddy(&e4b);
4115                put_bh(bitmap_bh);
4116
4117                list_del(&pa->u.pa_tmp_list);
4118                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4119        }
4120}
4121
4122#ifdef CONFIG_EXT4_DEBUG
4123static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4124{
4125        struct super_block *sb = ac->ac_sb;
4126        ext4_group_t ngroups, i;
4127
4128        if (!ext4_mballoc_debug ||
4129            (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
4130                return;
4131
4132        ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
4133                        " Allocation context details:");
4134        ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
4135                        ac->ac_status, ac->ac_flags);
4136        ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
4137                        "goal %lu/%lu/%lu@%lu, "
4138                        "best %lu/%lu/%lu@%lu cr %d",
4139                        (unsigned long)ac->ac_o_ex.fe_group,
4140                        (unsigned long)ac->ac_o_ex.fe_start,
4141                        (unsigned long)ac->ac_o_ex.fe_len,
4142                        (unsigned long)ac->ac_o_ex.fe_logical,
4143                        (unsigned long)ac->ac_g_ex.fe_group,
4144                        (unsigned long)ac->ac_g_ex.fe_start,
4145                        (unsigned long)ac->ac_g_ex.fe_len,
4146                        (unsigned long)ac->ac_g_ex.fe_logical,
4147                        (unsigned long)ac->ac_b_ex.fe_group,
4148                        (unsigned long)ac->ac_b_ex.fe_start,
4149                        (unsigned long)ac->ac_b_ex.fe_len,
4150                        (unsigned long)ac->ac_b_ex.fe_logical,
4151                        (int)ac->ac_criteria);
4152        ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found);
4153        ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
4154        ngroups = ext4_get_groups_count(sb);
4155        for (i = 0; i < ngroups; i++) {
4156                struct ext4_group_info *grp = ext4_get_group_info(sb, i);
4157                struct ext4_prealloc_space *pa;
4158                ext4_grpblk_t start;
4159                struct list_head *cur;
4160                ext4_lock_group(sb, i);
4161                list_for_each(cur, &grp->bb_prealloc_list) {
4162                        pa = list_entry(cur, struct ext4_prealloc_space,
4163                                        pa_group_list);
4164                        spin_lock(&pa->pa_lock);
4165                        ext4_get_group_no_and_offset(sb, pa->pa_pstart,
4166                                                     NULL, &start);
4167                        spin_unlock(&pa->pa_lock);
4168                        printk(KERN_ERR "PA:%u:%d:%u \n", i,
4169                               start, pa->pa_len);
4170                }
4171                ext4_unlock_group(sb, i);
4172
4173                if (grp->bb_free == 0)
4174                        continue;
4175                printk(KERN_ERR "%u: %d/%d \n",
4176                       i, grp->bb_free, grp->bb_fragments);
4177        }
4178        printk(KERN_ERR "\n");
4179}
4180#else
4181static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
4182{
4183        return;
4184}
4185#endif
4186
4187/*
4188 * We use locality group preallocation for small files. The size of the
4189 * file is determined by the current size or the resulting size after
4190 * allocation, whichever is larger.
4191 *
4192 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
4193 */
4194static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
4195{
4196        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4197        int bsbits = ac->ac_sb->s_blocksize_bits;
4198        loff_t size, isize;
4199
4200        if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
4201                return;
4202
4203        if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
4204                return;
4205
4206        size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
4207        isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
4208                >> bsbits;
4209
4210        if ((size == isize) &&
4211            !ext4_fs_is_busy(sbi) &&
4212            (atomic_read(&ac->ac_inode->i_writecount) == 0)) {
4213                ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
4214                return;
4215        }
4216
4217        if (sbi->s_mb_group_prealloc <= 0) {
4218                ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4219                return;
4220        }
4221
4222        /* don't use group allocation for large files */
4223        size = max(size, isize);
4224        if (size > sbi->s_mb_stream_request) {
4225                ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
4226                return;
4227        }
4228
4229        BUG_ON(ac->ac_lg != NULL);
4230        /*
4231         * locality group prealloc spaces are per-CPU. The reason for having
4232         * per-CPU locality groups is to reduce the contention between block
4233         * requests from multiple CPUs.
4234         */
4235        ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
4236
4237        /* we're going to use group allocation */
4238        ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
4239
4240        /* serialize all allocations in the group */
4241        mutex_lock(&ac->ac_lg->lg_mutex);
4242}
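
/*
 * Example (not in the original source): with ext4_mb_group_or_file() above,
 * raising mb_stream_req to 64 (e.g. by writing 64 to
 * /sys/fs/ext4/<partition>/mb_stream_req) means that files whose
 * max(current, resulting) size stays at or below 64 blocks keep using
 * locality group preallocation, while larger files are switched to
 * per-inode stream allocation (EXT4_MB_STREAM_ALLOC).
 */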
4243
4244static noinline_for_stack int
4245ext4_mb_initialize_context(struct ext4_allocation_context *ac,
4246                                struct ext4_allocation_request *ar)
4247{
4248        struct super_block *sb = ar->inode->i_sb;
4249        struct ext4_sb_info *sbi = EXT4_SB(sb);
4250        struct ext4_super_block *es = sbi->s_es;
4251        ext4_group_t group;
4252        unsigned int len;
4253        ext4_fsblk_t goal;
4254        ext4_grpblk_t block;
4255
4256        /* we can't allocate > group size */
4257        len = ar->len;
4258
4259        /* just a dirty hack to filter too big requests  */
4260        if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
4261                len = EXT4_CLUSTERS_PER_GROUP(sb);
4262
4263        /* start searching from the goal */
4264        goal = ar->goal;
4265        if (goal < le32_to_cpu(es->s_first_data_block) ||
4266                        goal >= ext4_blocks_count(es))
4267                goal = le32_to_cpu(es->s_first_data_block);
4268        ext4_get_group_no_and_offset(sb, goal, &group, &block);
4269
4270        /* set up allocation goals */
4271        ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
4272        ac->ac_status = AC_STATUS_CONTINUE;
4273        ac->ac_sb = sb;
4274        ac->ac_inode = ar->inode;
4275        ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
4276        ac->ac_o_ex.fe_group = group;
4277        ac->ac_o_ex.fe_start = block;
4278        ac->ac_o_ex.fe_len = len;
4279        ac->ac_g_ex = ac->ac_o_ex;
4280        ac->ac_flags = ar->flags;
4281
4282        /* we have to define the context: will we work with a file or
4283         * a locality group? this is a policy, actually */
4284        ext4_mb_group_or_file(ac);
4285
4286        mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, "
4287                        "left: %u/%u, right %u/%u to %swritable\n",
4288                        (unsigned) ar->len, (unsigned) ar->logical,
4289                        (unsigned) ar->goal, ac->ac_flags, ac->ac_2order,
4290                        (unsigned) ar->lleft, (unsigned) ar->pleft,
4291                        (unsigned) ar->lright, (unsigned) ar->pright,
4292                        atomic_read(&ar->inode->i_writecount) ? "" : "non-");
4293        return 0;
4294
4295}
4296
4297static noinline_for_stack void
4298ext4_mb_discard_lg_preallocations(struct super_block *sb,
4299                                        struct ext4_locality_group *lg,
4300                                        int order, int total_entries)
4301{
4302        ext4_group_t group = 0;
4303        struct ext4_buddy e4b;
4304        struct list_head discard_list;
4305        struct ext4_prealloc_space *pa, *tmp;
4306
4307        mb_debug(1, "discard locality group preallocation\n");
4308
4309        INIT_LIST_HEAD(&discard_list);
4310
4311        spin_lock(&lg->lg_prealloc_lock);
4312        list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order],
4313                                                pa_inode_list) {
4314                spin_lock(&pa->pa_lock);
4315                if (atomic_read(&pa->pa_count)) {
4316                        /*
4317                         * This is the pa that we just used
4318                         * for block allocation. So don't
4319                         * free it.
4320                         */
4321                        spin_unlock(&pa->pa_lock);
4322                        continue;
4323                }
4324                if (pa->pa_deleted) {
4325                        spin_unlock(&pa->pa_lock);
4326                        continue;
4327                }
4328                /* only lg prealloc space */
4329                BUG_ON(pa->pa_type != MB_GROUP_PA);
4330
4331                /* seems this one can be freed ... */
4332                pa->pa_deleted = 1;
4333                spin_unlock(&pa->pa_lock);
4334
4335                list_del_rcu(&pa->pa_inode_list);
4336                list_add(&pa->u.pa_tmp_list, &discard_list);
4337
4338                total_entries--;
4339                if (total_entries <= 5) {
4340                        /*
4341                         * we want to keep only 5 entries
4342                         * allowing it to grow to 8. This
4343                         * makes sure we don't call discard
4344                         * too soon for this list.
4345                         */
4346                        break;
4347                }
4348        }
4349        spin_unlock(&lg->lg_prealloc_lock);
4350
4351        list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) {
4352                int err;
4353
4354                group = ext4_get_group_number(sb, pa->pa_pstart);
4355                err = ext4_mb_load_buddy_gfp(sb, group, &e4b,
4356                                             GFP_NOFS|__GFP_NOFAIL);
4357                if (err) {
4358                        ext4_error(sb, "Error %d loading buddy information for %u",
4359                                   err, group);
4360                        continue;
4361                }
4362                ext4_lock_group(sb, group);
4363                list_del(&pa->pa_group_list);
4364                ext4_mb_release_group_pa(&e4b, pa);
4365                ext4_unlock_group(sb, group);
4366
4367                ext4_mb_unload_buddy(&e4b);
4368                list_del(&pa->u.pa_tmp_list);
4369                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
4370        }
4371}
4372
4373/*
4374 * We have incremented pa_count. So it cannot be freed at this
4375 * point. Also we hold lg_mutex. So no parallel allocation is
4376 * possible from this lg. That means pa_free cannot be updated.
4377 *
4378 * A parallel ext4_mb_discard_group_preallocations is possible, however,
4379 * which can cause the lg_prealloc_list to be updated.
4380 */
4381
4382static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
4383{
4384        int order, added = 0, lg_prealloc_count = 1;
4385        struct super_block *sb = ac->ac_sb;
4386        struct ext4_locality_group *lg = ac->ac_lg;
4387        struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa;
4388
4389        order = fls(pa->pa_free) - 1;
4390        if (order > PREALLOC_TB_SIZE - 1)
4391                /* The max size of hash table is PREALLOC_TB_SIZE */
4392                order = PREALLOC_TB_SIZE - 1;
4393        /* Add the prealloc space to lg */
4394        spin_lock(&lg->lg_prealloc_lock);
4395        list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order],
4396                                                pa_inode_list) {
4397                spin_lock(&tmp_pa->pa_lock);
4398                if (tmp_pa->pa_deleted) {
4399                        spin_unlock(&tmp_pa->pa_lock);
4400                        continue;
4401                }
4402                if (!added && pa->pa_free < tmp_pa->pa_free) {
4403                        /* Add to the tail of the previous entry */
4404                        list_add_tail_rcu(&pa->pa_inode_list,
4405                                                &tmp_pa->pa_inode_list);
4406                        added = 1;
4407                        /*
4408                         * we want to count the total
4409                         * number of entries in the list
4410                         */
4411                }
4412                spin_unlock(&tmp_pa->pa_lock);
4413                lg_prealloc_count++;
4414        }
4415        if (!added)
4416                list_add_tail_rcu(&pa->pa_inode_list,
4417                                        &lg->lg_prealloc_list[order]);
4418        spin_unlock(&lg->lg_prealloc_lock);
4419
4420        /* Now trim the list to be not more than 8 elements */
4421        if (lg_prealloc_count > 8) {
4422                ext4_mb_discard_lg_preallocations(sb, lg,
4423                                                  order, lg_prealloc_count);
4424                return;
4425        }
4426        return;
4427}
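
/*
 * Illustrative sketch only, not part of mballoc.c: how the bucket index
 * used for lg_prealloc_list[] in ext4_mb_add_n_trim() above is derived.
 * fls() returns the 1-based index of the highest set bit, so a PA with
 * pa_free == 12 free clusters lands in bucket fls(12) - 1 = 3, clamped
 * to PREALLOC_TB_SIZE - 1.  The helper name is hypothetical.
 */
static inline int example_lg_bucket(unsigned int pa_free)
{
        int order = fls(pa_free) - 1;   /* 0-based order of highest set bit */

        if (order > PREALLOC_TB_SIZE - 1)
                order = PREALLOC_TB_SIZE - 1;
        return order;
}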
4428
4429/*
4430 * release all resources used in the allocation
4431 */
4432static int ext4_mb_release_context(struct ext4_allocation_context *ac)
4433{
4434        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
4435        struct ext4_prealloc_space *pa = ac->ac_pa;
4436        if (pa) {
4437                if (pa->pa_type == MB_GROUP_PA) {
4438                        /* see comment in ext4_mb_use_group_pa() */
4439                        spin_lock(&pa->pa_lock);
4440                        pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4441                        pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
4442                        pa->pa_free -= ac->ac_b_ex.fe_len;
4443                        pa->pa_len -= ac->ac_b_ex.fe_len;
4444                        spin_unlock(&pa->pa_lock);
4445                }
4446        }
4447        if (pa) {
4448                /*
4449                 * We want to add the pa to the right bucket.
4450                 * Remove it from the list and while adding
4451                 * make sure the list to which we are adding
4452                 * doesn't grow too big.
4453                 */
4454                if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
4455                        spin_lock(pa->pa_obj_lock);
4456                        list_del_rcu(&pa->pa_inode_list);
4457                        spin_unlock(pa->pa_obj_lock);
4458                        ext4_mb_add_n_trim(ac);
4459                }
4460                ext4_mb_put_pa(ac, ac->ac_sb, pa);
4461        }
4462        if (ac->ac_bitmap_page)
4463                put_page(ac->ac_bitmap_page);
4464        if (ac->ac_buddy_page)
4465                put_page(ac->ac_buddy_page);
4466        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
4467                mutex_unlock(&ac->ac_lg->lg_mutex);
4468        ext4_mb_collect_stats(ac);
4469        return 0;
4470}
4471
4472static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
4473{
4474        ext4_group_t i, ngroups = ext4_get_groups_count(sb);
4475        int ret;
4476        int freed = 0;
4477
4478        trace_ext4_mb_discard_preallocations(sb, needed);
4479        for (i = 0; i < ngroups && needed > 0; i++) {
4480                ret = ext4_mb_discard_group_preallocations(sb, i, needed);
4481                freed += ret;
4482                needed -= ret;
4483        }
4484
4485        return freed;
4486}
4487
4488/*
4489 * Main entry point into mballoc to allocate blocks.
4490 * It tries to use preallocation first, then falls back
4491 * to the usual allocation path.
4492 */
4493ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4494                                struct ext4_allocation_request *ar, int *errp)
4495{
4496        int freed;
4497        struct ext4_allocation_context *ac = NULL;
4498        struct ext4_sb_info *sbi;
4499        struct super_block *sb;
4500        ext4_fsblk_t block = 0;
4501        unsigned int inquota = 0;
4502        unsigned int reserv_clstrs = 0;
4503
4504        might_sleep();
4505        sb = ar->inode->i_sb;
4506        sbi = EXT4_SB(sb);
4507
4508        trace_ext4_request_blocks(ar);
4509
4510        /* Allow to use superuser reservation for quota file */
4511        if (ext4_is_quota_file(ar->inode))
4512                ar->flags |= EXT4_MB_USE_ROOT_BLOCKS;
4513
4514        if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) {
4515                /* Without delayed allocation we need to verify
4516                 * there are enough free blocks to do block allocation
4517                 * and verify allocation doesn't exceed the quota limits.
4518                 */
4519                while (ar->len &&
4520                        ext4_claim_free_clusters(sbi, ar->len, ar->flags)) {
4521
4522                        /* let others free the space */
4523                        cond_resched();
4524                        ar->len = ar->len >> 1;
4525                }
4526                if (!ar->len) {
4527                        *errp = -ENOSPC;
4528                        return 0;
4529                }
4530                reserv_clstrs = ar->len;
4531                if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) {
4532                        dquot_alloc_block_nofail(ar->inode,
4533                                                 EXT4_C2B(sbi, ar->len));
4534                } else {
4535                        while (ar->len &&
4536                                dquot_alloc_block(ar->inode,
4537                                                  EXT4_C2B(sbi, ar->len))) {
4538
4539                                ar->flags |= EXT4_MB_HINT_NOPREALLOC;
4540                                ar->len--;
4541                        }
4542                }
4543                inquota = ar->len;
4544                if (ar->len == 0) {
4545                        *errp = -EDQUOT;
4546                        goto out;
4547                }
4548        }
4549
4550        ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
4551        if (!ac) {
4552                ar->len = 0;
4553                *errp = -ENOMEM;
4554                goto out;
4555        }
4556
4557        *errp = ext4_mb_initialize_context(ac, ar);
4558        if (*errp) {
4559                ar->len = 0;
4560                goto out;
4561        }
4562
4563        ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
4564        if (!ext4_mb_use_preallocated(ac)) {
4565                ac->ac_op = EXT4_MB_HISTORY_ALLOC;
4566                ext4_mb_normalize_request(ac, ar);
4567repeat:
4568                /* allocate space in core */
4569                *errp = ext4_mb_regular_allocator(ac);
4570                if (*errp)
4571                        goto discard_and_exit;
4572
4573                /* as we've just preallocated more space than
4574                 * the user originally requested, we store the allocated
4575                 * space in a special descriptor */
4576                if (ac->ac_status == AC_STATUS_FOUND &&
4577                    ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4578                        *errp = ext4_mb_new_preallocation(ac);
4579                if (*errp) {
4580                discard_and_exit:
4581                        ext4_discard_allocated_blocks(ac);
4582                        goto errout;
4583                }
4584        }
4585        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4586                *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
4587                if (*errp) {
4588                        ext4_discard_allocated_blocks(ac);
4589                        goto errout;
4590                } else {
4591                        block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
4592                        ar->len = ac->ac_b_ex.fe_len;
4593                }
4594        } else {
4595                freed  = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
4596                if (freed)
4597                        goto repeat;
4598                *errp = -ENOSPC;
4599        }
4600
4601errout:
4602        if (*errp) {
4603                ac->ac_b_ex.fe_len = 0;
4604                ar->len = 0;
4605                ext4_mb_show_ac(ac);
4606        }
4607        ext4_mb_release_context(ac);
4608out:
4609        if (ac)
4610                kmem_cache_free(ext4_ac_cachep, ac);
4611        if (inquota && ar->len < inquota)
4612                dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len));
4613        if (!ar->len) {
4614                if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0)
4615                        /* release all the reserved blocks if non delalloc */
4616                        percpu_counter_sub(&sbi->s_dirtyclusters_counter,
4617                                                reserv_clstrs);
4618        }
4619
4620        trace_ext4_allocate_blocks(ar, (unsigned long long)block);
4621
4622        return block;
4623}
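
/*
 * Illustrative sketch only, not part of mballoc.c: the rough shape of a
 * call into ext4_mb_new_blocks() from elsewhere in ext4.  Only the
 * ext4_allocation_request fields consumed above are filled in; the helper
 * name and the choice of flags are hypothetical.
 */
static ext4_fsblk_t example_allocate(handle_t *handle, struct inode *inode,
                                     ext4_lblk_t logical, ext4_fsblk_t goal,
                                     unsigned int len, int *errp)
{
        struct ext4_allocation_request ar;

        memset(&ar, 0, sizeof(ar));
        ar.inode = inode;
        ar.logical = logical;           /* logical block within the file */
        ar.goal = goal;                 /* preferred physical block */
        ar.len = len;                   /* number of blocks wanted */
        ar.flags = EXT4_MB_HINT_DATA;   /* data allocation, preallocation applies */

        /* on success ar.len is updated to the number of blocks allocated */
        return ext4_mb_new_blocks(handle, &ar, errp);
}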
4624
4625/*
4626 * We can merge two free data extents only if the physical blocks
4627 * are contiguous, AND the extents were freed by the same transaction,
4628 * AND the blocks are associated with the same group.
4629 */
4630static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi,
4631                                        struct ext4_free_data *entry,
4632                                        struct ext4_free_data *new_entry,
4633                                        struct rb_root *entry_rb_root)
4634{
4635        if ((entry->efd_tid != new_entry->efd_tid) ||
4636            (entry->efd_group != new_entry->efd_group))
4637                return;
4638        if (entry->efd_start_cluster + entry->efd_count ==
4639            new_entry->efd_start_cluster) {
4640                new_entry->efd_start_cluster = entry->efd_start_cluster;
4641                new_entry->efd_count += entry->efd_count;
4642        } else if (new_entry->efd_start_cluster + new_entry->efd_count ==
4643                   entry->efd_start_cluster) {
4644                new_entry->efd_count += entry->efd_count;
4645        } else
4646                return;
4647        spin_lock(&sbi->s_md_lock);
4648        list_del(&entry->efd_list);
4649        spin_unlock(&sbi->s_md_lock);
4650        rb_erase(&entry->efd_node, entry_rb_root);
4651        kmem_cache_free(ext4_free_data_cachep, entry);
4652}
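
/*
 * Illustrative sketch only, not part of mballoc.c: the merge rule above in
 * isolation.  Two freed ranges coalesce when one ends exactly where the
 * other begins; the surviving range keeps the lower start and the combined
 * length, e.g. {start 100, count 8} and {start 108, count 4} become
 * {start 100, count 12}.  The helper name is hypothetical.
 */
static bool example_try_merge(ext4_grpblk_t *start, ext4_grpblk_t *count,
                              ext4_grpblk_t other_start,
                              ext4_grpblk_t other_count)
{
        if (*start + *count == other_start) {           /* other lies to our right */
                *count += other_count;
                return true;
        }
        if (other_start + other_count == *start) {      /* other lies to our left */
                *start = other_start;
                *count += other_count;
                return true;
        }
        return false;
}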
4653
4654static noinline_for_stack int
4655ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4656                      struct ext4_free_data *new_entry)
4657{
4658        ext4_group_t group = e4b->bd_group;
4659        ext4_grpblk_t cluster;
4660        ext4_grpblk_t clusters = new_entry->efd_count;
4661        struct ext4_free_data *entry;
4662        struct ext4_group_info *db = e4b->bd_info;
4663        struct super_block *sb = e4b->bd_sb;
4664        struct ext4_sb_info *sbi = EXT4_SB(sb);
4665        struct rb_node **n = &db->bb_free_root.rb_node, *node;
4666        struct rb_node *parent = NULL, *new_node;
4667
4668        BUG_ON(!ext4_handle_valid(handle));
4669        BUG_ON(e4b->bd_bitmap_page == NULL);
4670        BUG_ON(e4b->bd_buddy_page == NULL);
4671
4672        new_node = &new_entry->efd_node;
4673        cluster = new_entry->efd_start_cluster;
4674
4675        if (!*n) {
4676                /* first free block extent. We need to
4677                 * protect buddy cache from being freed,
4678                 * otherwise we'll refresh it from
4679                 * on-disk bitmap and lose not-yet-available
4680                 * blocks */
4681                get_page(e4b->bd_buddy_page);
4682                get_page(e4b->bd_bitmap_page);
4683        }
4684        while (*n) {
4685                parent = *n;
4686                entry = rb_entry(parent, struct ext4_free_data, efd_node);
4687                if (cluster < entry->efd_start_cluster)
4688                        n = &(*n)->rb_left;
4689                else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
4690                        n = &(*n)->rb_right;
4691                else {
4692                        ext4_grp_locked_error(sb, group, 0,
4693                                ext4_group_first_block_no(sb, group) +
4694                                EXT4_C2B(sbi, cluster),
4695                                "Block already on to-be-freed list");
4696                        return 0;
4697                }
4698        }
4699
4700        rb_link_node(new_node, parent, n);
4701        rb_insert_color(new_node, &db->bb_free_root);
4702
4703        /* Now try to see if the extent can be merged to the left and right */
4704        node = rb_prev(new_node);
4705        if (node) {
4706                entry = rb_entry(node, struct ext4_free_data, efd_node);
4707                ext4_try_merge_freed_extent(sbi, entry, new_entry,
4708                                            &(db->bb_free_root));
4709        }
4710
4711        node = rb_next(new_node);
4712        if (node) {
4713                entry = rb_entry(node, struct ext4_free_data, efd_node);
4714                ext4_try_merge_freed_extent(sbi, entry, new_entry,
4715                                            &(db->bb_free_root));
4716        }
4717
4718        spin_lock(&sbi->s_md_lock);
4719        list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list);
4720        sbi->s_mb_free_pending += clusters;
4721        spin_unlock(&sbi->s_md_lock);
4722        return 0;
4723}
4724
4725/**
4726 * ext4_free_blocks() -- Free given blocks and update quota
4727 * @handle:             handle for this transaction
4728 * @inode:              inode
4729 * @block:              start physical block to free
4730 * @count:              number of blocks to free
4731 * @flags:              flags used by ext4_free_blocks
4732 */
4733void ext4_free_blocks(handle_t *handle, struct inode *inode,
4734                      struct buffer_head *bh, ext4_fsblk_t block,
4735                      unsigned long count, int flags)
4736{
4737        struct buffer_head *bitmap_bh = NULL;
4738        struct super_block *sb = inode->i_sb;
4739        struct ext4_group_desc *gdp;
4740        unsigned int overflow;
4741        ext4_grpblk_t bit;
4742        struct buffer_head *gd_bh;
4743        ext4_group_t block_group;
4744        struct ext4_sb_info *sbi;
4745        struct ext4_buddy e4b;
4746        unsigned int count_clusters;
4747        int err = 0;
4748        int ret;
4749
4750        might_sleep();
4751        if (bh) {
4752                if (block)
4753                        BUG_ON(block != bh->b_blocknr);
4754                else
4755                        block = bh->b_blocknr;
4756        }
4757
4758        sbi = EXT4_SB(sb);
4759        if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
4760            !ext4_data_block_valid(sbi, block, count)) {
4761                ext4_error(sb, "Freeing blocks not in datazone - "
4762                           "block = %llu, count = %lu", block, count);
4763                goto error_return;
4764        }
4765
4766        ext4_debug("freeing block %llu\n", block);
4767        trace_ext4_free_blocks(inode, block, count, flags);
4768
4769        if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
4770                BUG_ON(count > 1);
4771
4772                ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4773                            inode, bh, block);
4774        }
4775
4776        /*
4777         * If the extent to be freed does not begin on a cluster
4778         * boundary, we need to deal with partial clusters at the
4779         * beginning and end of the extent.  Normally we will free
4780         * blocks at the beginning or the end unless we are explicitly
4781         * requested to avoid doing so.
4782         */
4783        overflow = EXT4_PBLK_COFF(sbi, block);
4784        if (overflow) {
4785                if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) {
4786                        overflow = sbi->s_cluster_ratio - overflow;
4787                        block += overflow;
4788                        if (count > overflow)
4789                                count -= overflow;
4790                        else
4791                                return;
4792                } else {
4793                        block -= overflow;
4794                        count += overflow;
4795                }
4796        }
4797        overflow = EXT4_LBLK_COFF(sbi, count);
4798        if (overflow) {
4799                if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) {
4800                        if (count > overflow)
4801                                count -= overflow;
4802                        else
4803                                return;
4804                } else
4805                        count += sbi->s_cluster_ratio - overflow;
4806        }
4807
4808        if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) {
4809                int i;
4810                int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA;
4811
4812                for (i = 0; i < count; i++) {
4813                        cond_resched();
4814                        if (is_metadata)
4815                                bh = sb_find_get_block(inode->i_sb, block + i);
4816                        ext4_forget(handle, is_metadata, inode, bh, block + i);
4817                }
4818        }
4819
4820do_more:
4821        overflow = 0;
4822        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
4823
4824        if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(
4825                        ext4_get_group_info(sb, block_group))))
4826                return;
4827
4828        /*
4829         * Check to see if we are freeing blocks across a group
4830         * boundary.
4831         */
4832        if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) {
4833                overflow = EXT4_C2B(sbi, bit) + count -
4834                        EXT4_BLOCKS_PER_GROUP(sb);
4835                count -= overflow;
4836        }
4837        count_clusters = EXT4_NUM_B2C(sbi, count);
4838        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
4839        if (IS_ERR(bitmap_bh)) {
4840                err = PTR_ERR(bitmap_bh);
4841                bitmap_bh = NULL;
4842                goto error_return;
4843        }
4844        gdp = ext4_get_group_desc(sb, block_group, &gd_bh);
4845        if (!gdp) {
4846                err = -EIO;
4847                goto error_return;
4848        }
4849
4850        if (in_range(ext4_block_bitmap(sb, gdp), block, count) ||
4851            in_range(ext4_inode_bitmap(sb, gdp), block, count) ||
4852            in_range(block, ext4_inode_table(sb, gdp),
4853                     EXT4_SB(sb)->s_itb_per_group) ||
4854            in_range(block + count - 1, ext4_inode_table(sb, gdp),
4855                     EXT4_SB(sb)->s_itb_per_group)) {
4856
4857                ext4_error(sb, "Freeing blocks in system zone - "
4858                           "Block = %llu, count = %lu", block, count);
4859                /* err = 0. ext4_std_error should be a no op */
4860                goto error_return;
4861        }
4862
4863        BUFFER_TRACE(bitmap_bh, "getting write access");
4864        err = ext4_journal_get_write_access(handle, bitmap_bh);
4865        if (err)
4866                goto error_return;
4867
4868        /*
4869         * We are about to modify some metadata.  Call the journal APIs
4870         * to unshare ->b_data if a currently-committing transaction is
4871         * using it
4872         */
4873        BUFFER_TRACE(gd_bh, "get_write_access");
4874        err = ext4_journal_get_write_access(handle, gd_bh);
4875        if (err)
4876                goto error_return;
4877#ifdef AGGRESSIVE_CHECK
4878        {
4879                int i;
4880                for (i = 0; i < count_clusters; i++)
4881                        BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data));
4882        }
4883#endif
4884        trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters);
4885
4886        /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */
4887        err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b,
4888                                     GFP_NOFS|__GFP_NOFAIL);
4889        if (err)
4890                goto error_return;
4891
4892        /*
4893         * We need to make sure we don't reuse the freed block until after the
4894         * transaction is committed. We make an exception if the inode is to be
4895         * written in writeback mode since writeback mode has weak data
4896         * consistency guarantees.
4897         */
4898        if (ext4_handle_valid(handle) &&
4899            ((flags & EXT4_FREE_BLOCKS_METADATA) ||
4900             !ext4_should_writeback_data(inode))) {
4901                struct ext4_free_data *new_entry;
4902                /*
4903                 * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed
4904                 * to fail.
4905                 */
4906                new_entry = kmem_cache_alloc(ext4_free_data_cachep,
4907                                GFP_NOFS|__GFP_NOFAIL);
4908                new_entry->efd_start_cluster = bit;
4909                new_entry->efd_group = block_group;
4910                new_entry->efd_count = count_clusters;
4911                new_entry->efd_tid = handle->h_transaction->t_tid;
4912
4913                ext4_lock_group(sb, block_group);
4914                mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4915                ext4_mb_free_metadata(handle, &e4b, new_entry);
4916        } else {
4917                /* need to update group_info->bb_free and bitmap
4918                 * with group lock held. generate_buddy looks at
4919                 * them with the group lock held
4920                 */
4921                if (test_opt(sb, DISCARD)) {
4922                        err = ext4_issue_discard(sb, block_group, bit, count,
4923                                                 NULL);
4924                        if (err && err != -EOPNOTSUPP)
4925                                ext4_msg(sb, KERN_WARNING, "discard request in"
4926                                         " group:%d block:%d count:%lu failed"
4927                                         " with %d", block_group, bit, count,
4928                                         err);
4929                } else
4930                        EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info);
4931
4932                ext4_lock_group(sb, block_group);
4933                mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
4934                mb_free_blocks(inode, &e4b, bit, count_clusters);
4935        }
4936
4937        ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
4938        ext4_free_group_clusters_set(sb, gdp, ret);
4939        ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh);
4940        ext4_group_desc_csum_set(sb, block_group, gdp);
4941        ext4_unlock_group(sb, block_group);
4942
4943        if (sbi->s_log_groups_per_flex) {
4944                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
4945                atomic64_add(count_clusters,
4946                             &sbi->s_flex_groups[flex_group].free_clusters);
4947        }
4948
4949        if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE))
4950                dquot_free_block(inode, EXT4_C2B(sbi, count_clusters));
4951        percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
4952
4953        ext4_mb_unload_buddy(&e4b);
4954
4955        /* We dirtied the bitmap block */
4956        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
4957        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
4958
4959        /* And the group descriptor block */
4960        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
4961        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
4962        if (!err)
4963                err = ret;
4964
4965        if (overflow && !err) {
4966                block += count;
4967                count = overflow;
4968                put_bh(bitmap_bh);
4969                goto do_more;
4970        }
4971error_return:
4972        brelse(bitmap_bh);
4973        ext4_std_error(sb, err);
4974        return;
4975}
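
/*
 * Worked example, not in the original source: the partial-cluster rounding
 * at the top of ext4_free_blocks(), assuming a bigalloc ratio of 16 blocks
 * per cluster.  Freeing blocks 100..131 (count 32) gives
 * EXT4_PBLK_COFF(sbi, 100) = 4, so without NOFREE_FIRST_CLUSTER the start
 * is rounded down to block 96 and count grows to 36; the tail is then
 * rounded up to count 48, so the whole of clusters 6..8 (blocks 96..143)
 * is released.
 */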
4976
4977/**
4978 * ext4_group_add_blocks() -- Add given blocks to an existing group
4979 * @handle:                     handle to this transaction
4980 * @sb:                         super block
4981 * @block:                      start physical block to add to the block group
4982 * @count:                      number of blocks to add
4983 *
4984 * This marks the blocks as free in the bitmap and buddy.
4985 */
4986int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
4987                         ext4_fsblk_t block, unsigned long count)
4988{
4989        struct buffer_head *bitmap_bh = NULL;
4990        struct buffer_head *gd_bh;
4991        ext4_group_t block_group;
4992        ext4_grpblk_t bit;
4993        unsigned int i;
4994        struct ext4_group_desc *desc;
4995        struct ext4_sb_info *sbi = EXT4_SB(sb);
4996        struct ext4_buddy e4b;
4997        int err = 0, ret, blk_free_count;
4998        ext4_grpblk_t blocks_freed;
4999
5000        ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1);
5001
5002        if (count == 0)
5003                return 0;
5004
5005        ext4_get_group_no_and_offset(sb, block, &block_group, &bit);
5006        /*
5007         * Check to see if we are freeing blocks across a group
5008         * boundary.
5009         */
5010        if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) {
5011                ext4_warning(sb, "too many blocks added to group %u",
5012                             block_group);
5013                err = -EINVAL;
5014                goto error_return;
5015        }
5016
5017        bitmap_bh = ext4_read_block_bitmap(sb, block_group);
5018        if (IS_ERR(bitmap_bh)) {
5019                err = PTR_ERR(bitmap_bh);
5020                bitmap_bh = NULL;
5021                goto error_return;
5022        }
5023
5024        desc = ext4_get_group_desc(sb, block_group, &gd_bh);
5025        if (!desc) {
5026                err = -EIO;
5027                goto error_return;
5028        }
5029
5030        if (in_range(ext4_block_bitmap(sb, desc), block, count) ||
5031            in_range(ext4_inode_bitmap(sb, desc), block, count) ||
5032            in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) ||
5033            in_range(block + count - 1, ext4_inode_table(sb, desc),
5034                     sbi->s_itb_per_group)) {
5035                ext4_error(sb, "Adding blocks in system zones - "
5036                           "Block = %llu, count = %lu",
5037                           block, count);
5038                err = -EINVAL;
5039                goto error_return;
5040        }
5041
5042        BUFFER_TRACE(bitmap_bh, "getting write access");
5043        err = ext4_journal_get_write_access(handle, bitmap_bh);
5044        if (err)
5045                goto error_return;
5046
5047        /*
5048         * We are about to modify some metadata.  Call the journal APIs
5049         * to unshare ->b_data if a currently-committing transaction is
5050         * using it
5051         */
5052        BUFFER_TRACE(gd_bh, "get_write_access");
5053        err = ext4_journal_get_write_access(handle, gd_bh);
5054        if (err)
5055                goto error_return;
5056
5057        for (i = 0, blocks_freed = 0; i < count; i++) {
5058                BUFFER_TRACE(bitmap_bh, "clear bit");
5059                if (!mb_test_bit(bit + i, bitmap_bh->b_data)) {
5060                        ext4_error(sb, "bit already cleared for block %llu",
5061                                   (ext4_fsblk_t)(block + i));
5062                        BUFFER_TRACE(bitmap_bh, "bit already cleared");
5063                } else {
5064                        blocks_freed++;
5065                }
5066        }
5067
5068        err = ext4_mb_load_buddy(sb, block_group, &e4b);
5069        if (err)
5070                goto error_return;
5071
5072        /*
5073         * need to update group_info->bb_free and bitmap
5074         * with group lock held. generate_buddy looks at
5075         * them with the group lock held
5076         */
5077        ext4_lock_group(sb, block_group);
5078        mb_clear_bits(bitmap_bh->b_data, bit, count);
5079        mb_free_blocks(NULL, &e4b, bit, count);
5080        blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
5081        ext4_free_group_clusters_set(sb, desc, blk_free_count);
5082        ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh);
5083        ext4_group_desc_csum_set(sb, block_group, desc);
5084        ext4_unlock_group(sb, block_group);
5085        percpu_counter_add(&sbi->s_freeclusters_counter,
5086                           EXT4_NUM_B2C(sbi, blocks_freed));
5087
5088        if (sbi->s_log_groups_per_flex) {
5089                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
5090                atomic64_add(EXT4_NUM_B2C(sbi, blocks_freed),
5091                             &sbi->s_flex_groups[flex_group].free_clusters);
5092        }
5093
5094        ext4_mb_unload_buddy(&e4b);
5095
5096        /* We dirtied the bitmap block */
5097        BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
5098        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
5099
5100        /* And the group descriptor block */
5101        BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
5102        ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh);
5103        if (!err)
5104                err = ret;
5105
5106error_return:
5107        brelse(bitmap_bh);
5108        ext4_std_error(sb, err);
5109        return err;
5110}
5111
5112/**
5113 * ext4_trim_extent -- function to TRIM one single free extent in the group
5114 * @sb:         super block for the file system
5115 * @start:      starting block of the free extent in the alloc. group
5116 * @count:      number of blocks to TRIM
5117 * @group:      alloc. group we are working with
5118 * @e4b:        ext4 buddy for the group
5119 *
5120 * Trim "count" blocks starting at "start" in the "group". To assure that no
5121 * one will allocate those blocks, mark them as used in the buddy bitmap. This
5122 * must be called under the group lock.
5123 */
5124static int ext4_trim_extent(struct super_block *sb, int start, int count,
5125                             ext4_group_t group, struct ext4_buddy *e4b)
5126__releases(bitlock)
5127__acquires(bitlock)
5128{
5129        struct ext4_free_extent ex;
5130        int ret = 0;
5131
5132        trace_ext4_trim_extent(sb, group, start, count);
5133
5134        assert_spin_locked(ext4_group_lock_ptr(sb, group));
5135
5136        ex.fe_start = start;
5137        ex.fe_group = group;
5138        ex.fe_len = count;
5139
5140        /*
5141         * Mark blocks used, so no one can reuse them while
5142         * being trimmed.
5143         */
5144        mb_mark_used(e4b, &ex);
5145        ext4_unlock_group(sb, group);
5146        ret = ext4_issue_discard(sb, group, start, count, NULL);
5147        ext4_lock_group(sb, group);
5148        mb_free_blocks(NULL, e4b, start, ex.fe_len);
5149        return ret;
5150}
5151
5152/**
5153 * ext4_trim_all_free -- function to trim all free space in alloc. group
5154 * @sb:                 super block for file system
5155 * @group:              group to be trimmed
5156 * @start:              first group block to examine
5157 * @max:                last group block to examine
5158 * @minblocks:          minimum extent block count
5159 *
5160 * ext4_trim_all_free walks through the group's buddy bitmap searching for
5161 * free extents. When a free extent is found, ext4_trim_extent is called to
5162 * TRIM it.
5163 *
5164 * For each extent, ext4_trim_extent first marks it as used in the group
5165 * buddy bitmap so that nobody can allocate it, then issues a TRIM command
5166 * on the extent, and finally frees the extent again in the group buddy
5167 * bitmap. This is done until the whole group has been scanned.
5169 */
5170static ext4_grpblk_t
5171ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
5172                   ext4_grpblk_t start, ext4_grpblk_t max,
5173                   ext4_grpblk_t minblocks)
5174{
5175        void *bitmap;
5176        ext4_grpblk_t next, count = 0, free_count = 0;
5177        struct ext4_buddy e4b;
5178        int ret = 0;
5179
5180        trace_ext4_trim_all_free(sb, group, start, max);
5181
5182        ret = ext4_mb_load_buddy(sb, group, &e4b);
5183        if (ret) {
5184                ext4_warning(sb, "Error %d loading buddy information for %u",
5185                             ret, group);
5186                return ret;
5187        }
5188        bitmap = e4b.bd_bitmap;
5189
5190        ext4_lock_group(sb, group);
5191        if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
5192            minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
5193                goto out;
5194
5195        start = (e4b.bd_info->bb_first_free > start) ?
5196                e4b.bd_info->bb_first_free : start;
5197
5198        while (start <= max) {
5199                start = mb_find_next_zero_bit(bitmap, max + 1, start);
5200                if (start > max)
5201                        break;
5202                next = mb_find_next_bit(bitmap, max + 1, start);
5203
5204                if ((next - start) >= minblocks) {
5205                        ret = ext4_trim_extent(sb, start,
5206                                               next - start, group, &e4b);
5207                        if (ret && ret != -EOPNOTSUPP)
5208                                break;
5209                        ret = 0;
5210                        count += next - start;
5211                }
5212                free_count += next - start;
5213                start = next + 1;
5214
5215                if (fatal_signal_pending(current)) {
5216                        count = -ERESTARTSYS;
5217                        break;
5218                }
5219
5220                if (need_resched()) {
5221                        ext4_unlock_group(sb, group);
5222                        cond_resched();
5223                        ext4_lock_group(sb, group);
5224                }
5225
5226                if ((e4b.bd_info->bb_free - free_count) < minblocks)
5227                        break;
5228        }
5229
5230        if (!ret) {
5231                ret = count;
5232                EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
5233        }
5234out:
5235        ext4_unlock_group(sb, group);
5236        ext4_mb_unload_buddy(&e4b);
5237
5238        ext4_debug("trimmed %d blocks in the group %d\n",
5239                count, group);
5240
5241        return ret;
5242}
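
/*
 * Illustrative sketch only, not part of mballoc.c: the scan pattern used by
 * ext4_trim_all_free() above, reduced to plain bitops.  Free runs are
 * located by alternating find_next_zero_bit()/find_next_bit(), and only
 * runs of at least minlen bits are reported.  The helper and callback
 * names are hypothetical.
 */
static void example_for_each_free_run(const unsigned long *bitmap,
                                      unsigned int nbits, unsigned int minlen,
                                      void (*cb)(unsigned int start,
                                                 unsigned int len))
{
        unsigned int start = 0, next;

        while (start < nbits) {
                start = find_next_zero_bit(bitmap, nbits, start);
                if (start >= nbits)
                        break;
                next = find_next_bit(bitmap, nbits, start);
                if (next - start >= minlen)
                        cb(start, next - start);
                start = next + 1;
        }
}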
5243
5244/**
5245 * ext4_trim_fs() -- trim ioctl handle function
5246 * @sb:                 superblock for filesystem
5247 * @range:              fstrim_range structure
5248 *
5249 * start:       first byte to trim
5250 * len:         number of bytes to trim from start
5251 * minlen:      minimum extent length in bytes
5252 * ext4_trim_fs goes through all allocation groups containing bytes from
5253 * start to start+len. For each such group, the ext4_trim_all_free function
5254 * is invoked to trim all free space.
5255 */
5256int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
5257{
5258        struct ext4_group_info *grp;
5259        ext4_group_t group, first_group, last_group;
5260        ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
5261        uint64_t start, end, minlen, trimmed = 0;
5262        ext4_fsblk_t first_data_blk =
5263                        le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
5264        ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
5265        int ret = 0;
5266
5267        start = range->start >> sb->s_blocksize_bits;
5268        end = start + (range->len >> sb->s_blocksize_bits) - 1;
5269        minlen = EXT4_NUM_B2C(EXT4_SB(sb),
5270                              range->minlen >> sb->s_blocksize_bits);
5271
5272        if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) ||
5273            start >= max_blks ||
5274            range->len < sb->s_blocksize)
5275                return -EINVAL;
5276        if (end >= max_blks)
5277                end = max_blks - 1;
5278        if (end <= first_data_blk)
5279                goto out;
5280        if (start < first_data_blk)
5281                start = first_data_blk;
5282
5283        /* Determine first and last group to examine based on start and end */
5284        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
5285                                     &first_group, &first_cluster);
5286        ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
5287                                     &last_group, &last_cluster);
5288
5289        /* end now represents the last cluster to discard in this group */
5290        end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
5291
5292        for (group = first_group; group <= last_group; group++) {
5293                grp = ext4_get_group_info(sb, group);
5294                /* We only do this if the grp has never been initialized */
5295                if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
5296                        ret = ext4_mb_init_group(sb, group, GFP_NOFS);
5297                        if (ret)
5298                                break;
5299                }
5300
5301                /*
5302                 * For all the groups except the last one, last cluster will
5303                 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
5304                 * change it for the last group, note that last_cluster is
5305                 * already computed earlier by ext4_get_group_no_and_offset()
5306                 */
5307                if (group == last_group)
5308                        end = last_cluster;
5309
5310                if (grp->bb_free >= minlen) {
5311                        cnt = ext4_trim_all_free(sb, group, first_cluster,
5312                                                end, minlen);
5313                        if (cnt < 0) {
5314                                ret = cnt;
5315                                break;
5316                        }
5317                        trimmed += cnt;
5318                }
5319
5320                /*
5321                 * For every group except the first one, we are sure
5322                 * that the first cluster to discard will be cluster #0.
5323                 */
5324                first_cluster = 0;
5325        }
5326
5327        if (!ret)
5328                atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
5329
5330out:
5331        range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
5332        return ret;
5333}
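
/*
 * Illustrative sketch only, not part of mballoc.c: how userspace normally
 * reaches ext4_trim_fs(), via the FITRIM ioctl with a struct fstrim_range
 * from <linux/fs.h>.  This is a separate userspace program, shown here only
 * for context; the minlen value is an arbitrary example.
 */
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int example_fstrim(const char *mntpoint)
{
        struct fstrim_range range;
        int fd = open(mntpoint, O_RDONLY);

        if (fd < 0)
                return -1;
        memset(&range, 0, sizeof(range));
        range.start = 0;
        range.len = ULLONG_MAX;         /* trim the whole filesystem */
        range.minlen = 64 * 1024;       /* skip free extents smaller than 64 KiB */
        if (ioctl(fd, FITRIM, &range) < 0) {
                close(fd);
                return -1;
        }
        /* on return, range.len holds the number of bytes actually trimmed */
        printf("trimmed %llu bytes\n", (unsigned long long)range.len);
        close(fd);
        return 0;
}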
5334
5335/* Iterate all the free extents in the group. */
5336int
5337ext4_mballoc_query_range(
5338        struct super_block              *sb,
5339        ext4_group_t                    group,
5340        ext4_grpblk_t                   start,
5341        ext4_grpblk_t                   end,
5342        ext4_mballoc_query_range_fn     formatter,
5343        void                            *priv)
5344{
5345        void                            *bitmap;
5346        ext4_grpblk_t                   next;
5347        struct ext4_buddy               e4b;
5348        int                             error;
5349
5350        error = ext4_mb_load_buddy(sb, group, &e4b);
5351        if (error)
5352                return error;
5353        bitmap = e4b.bd_bitmap;
5354
5355        ext4_lock_group(sb, group);
5356
5357        start = (e4b.bd_info->bb_first_free > start) ?
5358                e4b.bd_info->bb_first_free : start;
5359        if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
5360                end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
5361
5362        while (start <= end) {
5363                start = mb_find_next_zero_bit(bitmap, end + 1, start);
5364                if (start > end)
5365                        break;
5366                next = mb_find_next_bit(bitmap, end + 1, start);
5367
5368                ext4_unlock_group(sb, group);
5369                error = formatter(sb, group, start, next - start, priv);
5370                if (error)
5371                        goto out_unload;
5372                ext4_lock_group(sb, group);
5373
5374                start = next + 1;
5375        }
5376
5377        ext4_unlock_group(sb, group);
5378out_unload:
5379        ext4_mb_unload_buddy(&e4b);
5380
5381        return error;
5382}
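
/*
 * Illustrative sketch only, not part of mballoc.c: a minimal callback
 * matching the formatter call in ext4_mballoc_query_range() above, which
 * simply totals the free clusters it is shown.  The struct and function
 * names are hypothetical.
 */
struct example_free_total {
        ext4_grpblk_t   clusters;
};

static int example_count_free(struct super_block *sb, ext4_group_t group,
                              ext4_grpblk_t start, ext4_grpblk_t len,
                              void *priv)
{
        struct example_free_total *t = priv;

        t->clusters += len;     /* called once per free extent in the group */
        return 0;               /* a non-zero return stops the walk */
}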
5383