linux/fs/xfs/libxfs/xfs_alloc.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   4 * All Rights Reserved.
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_format.h"
   9#include "xfs_log_format.h"
  10#include "xfs_shared.h"
  11#include "xfs_trans_resv.h"
  12#include "xfs_bit.h"
  13#include "xfs_sb.h"
  14#include "xfs_mount.h"
  15#include "xfs_defer.h"
  16#include "xfs_btree.h"
  17#include "xfs_rmap.h"
  18#include "xfs_alloc_btree.h"
  19#include "xfs_alloc.h"
  20#include "xfs_extent_busy.h"
  21#include "xfs_errortag.h"
  22#include "xfs_error.h"
  23#include "xfs_trace.h"
  24#include "xfs_trans.h"
  25#include "xfs_buf_item.h"
  26#include "xfs_log.h"
  27#include "xfs_ag_resv.h"
  28#include "xfs_bmap.h"
  29
  30extern kmem_zone_t      *xfs_bmap_free_item_zone;
  31
  32struct workqueue_struct *xfs_alloc_wq;
  33
  34#define XFS_ABSDIFF(a,b)        (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
  35
  36#define XFSA_FIXUP_BNO_OK       1
  37#define XFSA_FIXUP_CNT_OK       2
  38
  39STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
  40STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
  41STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
  42
  43/*
  44 * Size of the AGFL.  For CRC-enabled filesystes we steal a couple of slots in
  45 * the beginning of the block for a proper header with the location information
  46 * and CRC.
  47 */
  48unsigned int
  49xfs_agfl_size(
  50        struct xfs_mount        *mp)
  51{
  52        unsigned int            size = mp->m_sb.sb_sectsize;
  53
  54        if (xfs_sb_version_hascrc(&mp->m_sb))
  55                size -= sizeof(struct xfs_agfl);
  56
  57        return size / sizeof(xfs_agblock_t);
  58}
  59
  60unsigned int
  61xfs_refc_block(
  62        struct xfs_mount        *mp)
  63{
  64        if (xfs_sb_version_hasrmapbt(&mp->m_sb))
  65                return XFS_RMAP_BLOCK(mp) + 1;
  66        if (xfs_sb_version_hasfinobt(&mp->m_sb))
  67                return XFS_FIBT_BLOCK(mp) + 1;
  68        return XFS_IBT_BLOCK(mp) + 1;
  69}
  70
  71xfs_extlen_t
  72xfs_prealloc_blocks(
  73        struct xfs_mount        *mp)
  74{
  75        if (xfs_sb_version_hasreflink(&mp->m_sb))
  76                return xfs_refc_block(mp) + 1;
  77        if (xfs_sb_version_hasrmapbt(&mp->m_sb))
  78                return XFS_RMAP_BLOCK(mp) + 1;
  79        if (xfs_sb_version_hasfinobt(&mp->m_sb))
  80                return XFS_FIBT_BLOCK(mp) + 1;
  81        return XFS_IBT_BLOCK(mp) + 1;
  82}
  83
  84/*
  85 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
  86 * AGF buffer (PV 947395), we place constraints on the relationship among
  87 * actual allocations for data blocks, freelist blocks, and potential file data
  88 * bmap btree blocks. However, these restrictions may result in no actual space
  89 * allocated for a delayed extent, for example, a data block in a certain AG is
  90 * allocated but there is no additional block for the additional bmap btree
  91 * block due to a split of the bmap btree of the file. The result of this may
  92 * lead to an infinite loop when the file gets flushed to disk and all delayed
  93 * extents need to be actually allocated. To get around this, we explicitly set
  94 * aside a few blocks which will not be reserved in delayed allocation.
  95 *
  96 * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
  97 * potential split of the file's bmap btree.
  98 */
  99unsigned int
 100xfs_alloc_set_aside(
 101        struct xfs_mount        *mp)
 102{
 103        return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4);
 104}
 105
 106/*
 107 * When deciding how much space to allocate out of an AG, we limit the
 108 * allocation maximum size to the size the AG. However, we cannot use all the
 109 * blocks in the AG - some are permanently used by metadata. These
 110 * blocks are generally:
 111 *      - the AG superblock, AGF, AGI and AGFL
 112 *      - the AGF (bno and cnt) and AGI btree root blocks, and optionally
 113 *        the AGI free inode and rmap btree root blocks.
 114 *      - blocks on the AGFL according to xfs_alloc_set_aside() limits
 115 *      - the rmapbt root block
 116 *
 117 * The AG headers are sector sized, so the amount of space they take up is
 118 * dependent on filesystem geometry. The others are all single blocks.
 119 */
 120unsigned int
 121xfs_alloc_ag_max_usable(
 122        struct xfs_mount        *mp)
 123{
 124        unsigned int            blocks;
 125
 126        blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
 127        blocks += XFS_ALLOC_AGFL_RESERVE;
 128        blocks += 3;                    /* AGF, AGI btree root blocks */
 129        if (xfs_sb_version_hasfinobt(&mp->m_sb))
 130                blocks++;               /* finobt root block */
 131        if (xfs_sb_version_hasrmapbt(&mp->m_sb))
 132                blocks++;               /* rmap root block */
 133        if (xfs_sb_version_hasreflink(&mp->m_sb))
 134                blocks++;               /* refcount root block */
 135
 136        return mp->m_sb.sb_agblocks - blocks;
 137}
 138
 139/*
 140 * Lookup the record equal to [bno, len] in the btree given by cur.
 141 */
 142STATIC int                              /* error */
 143xfs_alloc_lookup_eq(
 144        struct xfs_btree_cur    *cur,   /* btree cursor */
 145        xfs_agblock_t           bno,    /* starting block of extent */
 146        xfs_extlen_t            len,    /* length of extent */
 147        int                     *stat)  /* success/failure */
 148{
 149        int                     error;
 150
 151        cur->bc_rec.a.ar_startblock = bno;
 152        cur->bc_rec.a.ar_blockcount = len;
 153        error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
 154        cur->bc_ag.abt.active = (*stat == 1);
 155        return error;
 156}
 157
 158/*
 159 * Lookup the first record greater than or equal to [bno, len]
 160 * in the btree given by cur.
 161 */
 162int                             /* error */
 163xfs_alloc_lookup_ge(
 164        struct xfs_btree_cur    *cur,   /* btree cursor */
 165        xfs_agblock_t           bno,    /* starting block of extent */
 166        xfs_extlen_t            len,    /* length of extent */
 167        int                     *stat)  /* success/failure */
 168{
 169        int                     error;
 170
 171        cur->bc_rec.a.ar_startblock = bno;
 172        cur->bc_rec.a.ar_blockcount = len;
 173        error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
 174        cur->bc_ag.abt.active = (*stat == 1);
 175        return error;
 176}
 177
 178/*
 179 * Lookup the first record less than or equal to [bno, len]
 180 * in the btree given by cur.
 181 */
 182int                                     /* error */
 183xfs_alloc_lookup_le(
 184        struct xfs_btree_cur    *cur,   /* btree cursor */
 185        xfs_agblock_t           bno,    /* starting block of extent */
 186        xfs_extlen_t            len,    /* length of extent */
 187        int                     *stat)  /* success/failure */
 188{
 189        int                     error;
 190        cur->bc_rec.a.ar_startblock = bno;
 191        cur->bc_rec.a.ar_blockcount = len;
 192        error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
 193        cur->bc_ag.abt.active = (*stat == 1);
 194        return error;
 195}
 196
 197static inline bool
 198xfs_alloc_cur_active(
 199        struct xfs_btree_cur    *cur)
 200{
 201        return cur && cur->bc_ag.abt.active;
 202}
 203
 204/*
 205 * Update the record referred to by cur to the value given
 206 * by [bno, len].
 207 * This either works (return 0) or gets an EFSCORRUPTED error.
 208 */
 209STATIC int                              /* error */
 210xfs_alloc_update(
 211        struct xfs_btree_cur    *cur,   /* btree cursor */
 212        xfs_agblock_t           bno,    /* starting block of extent */
 213        xfs_extlen_t            len)    /* length of extent */
 214{
 215        union xfs_btree_rec     rec;
 216
 217        rec.alloc.ar_startblock = cpu_to_be32(bno);
 218        rec.alloc.ar_blockcount = cpu_to_be32(len);
 219        return xfs_btree_update(cur, &rec);
 220}
 221
 222/*
 223 * Get the data from the pointed-to record.
 224 */
 225int                                     /* error */
 226xfs_alloc_get_rec(
 227        struct xfs_btree_cur    *cur,   /* btree cursor */
 228        xfs_agblock_t           *bno,   /* output: starting block of extent */
 229        xfs_extlen_t            *len,   /* output: length of extent */
 230        int                     *stat)  /* output: success/failure */
 231{
 232        struct xfs_mount        *mp = cur->bc_mp;
 233        xfs_agnumber_t          agno = cur->bc_ag.agno;
 234        union xfs_btree_rec     *rec;
 235        int                     error;
 236
 237        error = xfs_btree_get_rec(cur, &rec, stat);
 238        if (error || !(*stat))
 239                return error;
 240
 241        *bno = be32_to_cpu(rec->alloc.ar_startblock);
 242        *len = be32_to_cpu(rec->alloc.ar_blockcount);
 243
 244        if (*len == 0)
 245                goto out_bad_rec;
 246
 247        /* check for valid extent range, including overflow */
 248        if (!xfs_verify_agbno(mp, agno, *bno))
 249                goto out_bad_rec;
 250        if (*bno > *bno + *len)
 251                goto out_bad_rec;
 252        if (!xfs_verify_agbno(mp, agno, *bno + *len - 1))
 253                goto out_bad_rec;
 254
 255        return 0;
 256
 257out_bad_rec:
 258        xfs_warn(mp,
 259                "%s Freespace BTree record corruption in AG %d detected!",
 260                cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno);
 261        xfs_warn(mp,
 262                "start block 0x%x block count 0x%x", *bno, *len);
 263        return -EFSCORRUPTED;
 264}
 265
 266/*
 267 * Compute aligned version of the found extent.
 268 * Takes alignment and min length into account.
 269 */
 270STATIC bool
 271xfs_alloc_compute_aligned(
 272        xfs_alloc_arg_t *args,          /* allocation argument structure */
 273        xfs_agblock_t   foundbno,       /* starting block in found extent */
 274        xfs_extlen_t    foundlen,       /* length in found extent */
 275        xfs_agblock_t   *resbno,        /* result block number */
 276        xfs_extlen_t    *reslen,        /* result length */
 277        unsigned        *busy_gen)
 278{
 279        xfs_agblock_t   bno = foundbno;
 280        xfs_extlen_t    len = foundlen;
 281        xfs_extlen_t    diff;
 282        bool            busy;
 283
 284        /* Trim busy sections out of found extent */
 285        busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
 286
 287        /*
 288         * If we have a largish extent that happens to start before min_agbno,
 289         * see if we can shift it into range...
 290         */
 291        if (bno < args->min_agbno && bno + len > args->min_agbno) {
 292                diff = args->min_agbno - bno;
 293                if (len > diff) {
 294                        bno += diff;
 295                        len -= diff;
 296                }
 297        }
 298
 299        if (args->alignment > 1 && len >= args->minlen) {
 300                xfs_agblock_t   aligned_bno = roundup(bno, args->alignment);
 301
 302                diff = aligned_bno - bno;
 303
 304                *resbno = aligned_bno;
 305                *reslen = diff >= len ? 0 : len - diff;
 306        } else {
 307                *resbno = bno;
 308                *reslen = len;
 309        }
 310
 311        return busy;
 312}
 313
 314/*
 315 * Compute best start block and diff for "near" allocations.
 316 * freelen >= wantlen already checked by caller.
 317 */
 318STATIC xfs_extlen_t                     /* difference value (absolute) */
 319xfs_alloc_compute_diff(
 320        xfs_agblock_t   wantbno,        /* target starting block */
 321        xfs_extlen_t    wantlen,        /* target length */
 322        xfs_extlen_t    alignment,      /* target alignment */
 323        int             datatype,       /* are we allocating data? */
 324        xfs_agblock_t   freebno,        /* freespace's starting block */
 325        xfs_extlen_t    freelen,        /* freespace's length */
 326        xfs_agblock_t   *newbnop)       /* result: best start block from free */
 327{
 328        xfs_agblock_t   freeend;        /* end of freespace extent */
 329        xfs_agblock_t   newbno1;        /* return block number */
 330        xfs_agblock_t   newbno2;        /* other new block number */
 331        xfs_extlen_t    newlen1=0;      /* length with newbno1 */
 332        xfs_extlen_t    newlen2=0;      /* length with newbno2 */
 333        xfs_agblock_t   wantend;        /* end of target extent */
 334        bool            userdata = datatype & XFS_ALLOC_USERDATA;
 335
 336        ASSERT(freelen >= wantlen);
 337        freeend = freebno + freelen;
 338        wantend = wantbno + wantlen;
 339        /*
 340         * We want to allocate from the start of a free extent if it is past
 341         * the desired block or if we are allocating user data and the free
 342         * extent is before desired block. The second case is there to allow
 343         * for contiguous allocation from the remaining free space if the file
 344         * grows in the short term.
 345         */
 346        if (freebno >= wantbno || (userdata && freeend < wantend)) {
 347                if ((newbno1 = roundup(freebno, alignment)) >= freeend)
 348                        newbno1 = NULLAGBLOCK;
 349        } else if (freeend >= wantend && alignment > 1) {
 350                newbno1 = roundup(wantbno, alignment);
 351                newbno2 = newbno1 - alignment;
 352                if (newbno1 >= freeend)
 353                        newbno1 = NULLAGBLOCK;
 354                else
 355                        newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
 356                if (newbno2 < freebno)
 357                        newbno2 = NULLAGBLOCK;
 358                else
 359                        newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
 360                if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
 361                        if (newlen1 < newlen2 ||
 362                            (newlen1 == newlen2 &&
 363                             XFS_ABSDIFF(newbno1, wantbno) >
 364                             XFS_ABSDIFF(newbno2, wantbno)))
 365                                newbno1 = newbno2;
 366                } else if (newbno2 != NULLAGBLOCK)
 367                        newbno1 = newbno2;
 368        } else if (freeend >= wantend) {
 369                newbno1 = wantbno;
 370        } else if (alignment > 1) {
 371                newbno1 = roundup(freeend - wantlen, alignment);
 372                if (newbno1 > freeend - wantlen &&
 373                    newbno1 - alignment >= freebno)
 374                        newbno1 -= alignment;
 375                else if (newbno1 >= freeend)
 376                        newbno1 = NULLAGBLOCK;
 377        } else
 378                newbno1 = freeend - wantlen;
 379        *newbnop = newbno1;
 380        return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
 381}
 382
 383/*
 384 * Fix up the length, based on mod and prod.
 385 * len should be k * prod + mod for some k.
 386 * If len is too small it is returned unchanged.
 387 * If len hits maxlen it is left alone.
 388 */
 389STATIC void
 390xfs_alloc_fix_len(
 391        xfs_alloc_arg_t *args)          /* allocation argument structure */
 392{
 393        xfs_extlen_t    k;
 394        xfs_extlen_t    rlen;
 395
 396        ASSERT(args->mod < args->prod);
 397        rlen = args->len;
 398        ASSERT(rlen >= args->minlen);
 399        ASSERT(rlen <= args->maxlen);
 400        if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
 401            (args->mod == 0 && rlen < args->prod))
 402                return;
 403        k = rlen % args->prod;
 404        if (k == args->mod)
 405                return;
 406        if (k > args->mod)
 407                rlen = rlen - (k - args->mod);
 408        else
 409                rlen = rlen - args->prod + (args->mod - k);
 410        /* casts to (int) catch length underflows */
 411        if ((int)rlen < (int)args->minlen)
 412                return;
 413        ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
 414        ASSERT(rlen % args->prod == args->mod);
 415        ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
 416                rlen + args->minleft);
 417        args->len = rlen;
 418}
 419
 420/*
 421 * Update the two btrees, logically removing from freespace the extent
 422 * starting at rbno, rlen blocks.  The extent is contained within the
 423 * actual (current) free extent fbno for flen blocks.
 424 * Flags are passed in indicating whether the cursors are set to the
 425 * relevant records.
 426 */
 427STATIC int                              /* error code */
 428xfs_alloc_fixup_trees(
 429        xfs_btree_cur_t *cnt_cur,       /* cursor for by-size btree */
 430        xfs_btree_cur_t *bno_cur,       /* cursor for by-block btree */
 431        xfs_agblock_t   fbno,           /* starting block of free extent */
 432        xfs_extlen_t    flen,           /* length of free extent */
 433        xfs_agblock_t   rbno,           /* starting block of returned extent */
 434        xfs_extlen_t    rlen,           /* length of returned extent */
 435        int             flags)          /* flags, XFSA_FIXUP_... */
 436{
 437        int             error;          /* error code */
 438        int             i;              /* operation results */
 439        xfs_agblock_t   nfbno1;         /* first new free startblock */
 440        xfs_agblock_t   nfbno2;         /* second new free startblock */
 441        xfs_extlen_t    nflen1=0;       /* first new free length */
 442        xfs_extlen_t    nflen2=0;       /* second new free length */
 443        struct xfs_mount *mp;
 444
 445        mp = cnt_cur->bc_mp;
 446
 447        /*
 448         * Look up the record in the by-size tree if necessary.
 449         */
 450        if (flags & XFSA_FIXUP_CNT_OK) {
 451#ifdef DEBUG
 452                if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
 453                        return error;
 454                if (XFS_IS_CORRUPT(mp,
 455                                   i != 1 ||
 456                                   nfbno1 != fbno ||
 457                                   nflen1 != flen))
 458                        return -EFSCORRUPTED;
 459#endif
 460        } else {
 461                if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
 462                        return error;
 463                if (XFS_IS_CORRUPT(mp, i != 1))
 464                        return -EFSCORRUPTED;
 465        }
 466        /*
 467         * Look up the record in the by-block tree if necessary.
 468         */
 469        if (flags & XFSA_FIXUP_BNO_OK) {
 470#ifdef DEBUG
 471                if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
 472                        return error;
 473                if (XFS_IS_CORRUPT(mp,
 474                                   i != 1 ||
 475                                   nfbno1 != fbno ||
 476                                   nflen1 != flen))
 477                        return -EFSCORRUPTED;
 478#endif
 479        } else {
 480                if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
 481                        return error;
 482                if (XFS_IS_CORRUPT(mp, i != 1))
 483                        return -EFSCORRUPTED;
 484        }
 485
 486#ifdef DEBUG
 487        if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
 488                struct xfs_btree_block  *bnoblock;
 489                struct xfs_btree_block  *cntblock;
 490
 491                bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
 492                cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
 493
 494                if (XFS_IS_CORRUPT(mp,
 495                                   bnoblock->bb_numrecs !=
 496                                   cntblock->bb_numrecs))
 497                        return -EFSCORRUPTED;
 498        }
 499#endif
 500
 501        /*
 502         * Deal with all four cases: the allocated record is contained
 503         * within the freespace record, so we can have new freespace
 504         * at either (or both) end, or no freespace remaining.
 505         */
 506        if (rbno == fbno && rlen == flen)
 507                nfbno1 = nfbno2 = NULLAGBLOCK;
 508        else if (rbno == fbno) {
 509                nfbno1 = rbno + rlen;
 510                nflen1 = flen - rlen;
 511                nfbno2 = NULLAGBLOCK;
 512        } else if (rbno + rlen == fbno + flen) {
 513                nfbno1 = fbno;
 514                nflen1 = flen - rlen;
 515                nfbno2 = NULLAGBLOCK;
 516        } else {
 517                nfbno1 = fbno;
 518                nflen1 = rbno - fbno;
 519                nfbno2 = rbno + rlen;
 520                nflen2 = (fbno + flen) - nfbno2;
 521        }
 522        /*
 523         * Delete the entry from the by-size btree.
 524         */
 525        if ((error = xfs_btree_delete(cnt_cur, &i)))
 526                return error;
 527        if (XFS_IS_CORRUPT(mp, i != 1))
 528                return -EFSCORRUPTED;
 529        /*
 530         * Add new by-size btree entry(s).
 531         */
 532        if (nfbno1 != NULLAGBLOCK) {
 533                if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
 534                        return error;
 535                if (XFS_IS_CORRUPT(mp, i != 0))
 536                        return -EFSCORRUPTED;
 537                if ((error = xfs_btree_insert(cnt_cur, &i)))
 538                        return error;
 539                if (XFS_IS_CORRUPT(mp, i != 1))
 540                        return -EFSCORRUPTED;
 541        }
 542        if (nfbno2 != NULLAGBLOCK) {
 543                if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
 544                        return error;
 545                if (XFS_IS_CORRUPT(mp, i != 0))
 546                        return -EFSCORRUPTED;
 547                if ((error = xfs_btree_insert(cnt_cur, &i)))
 548                        return error;
 549                if (XFS_IS_CORRUPT(mp, i != 1))
 550                        return -EFSCORRUPTED;
 551        }
 552        /*
 553         * Fix up the by-block btree entry(s).
 554         */
 555        if (nfbno1 == NULLAGBLOCK) {
 556                /*
 557                 * No remaining freespace, just delete the by-block tree entry.
 558                 */
 559                if ((error = xfs_btree_delete(bno_cur, &i)))
 560                        return error;
 561                if (XFS_IS_CORRUPT(mp, i != 1))
 562                        return -EFSCORRUPTED;
 563        } else {
 564                /*
 565                 * Update the by-block entry to start later|be shorter.
 566                 */
 567                if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
 568                        return error;
 569        }
 570        if (nfbno2 != NULLAGBLOCK) {
 571                /*
 572                 * 2 resulting free entries, need to add one.
 573                 */
 574                if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
 575                        return error;
 576                if (XFS_IS_CORRUPT(mp, i != 0))
 577                        return -EFSCORRUPTED;
 578                if ((error = xfs_btree_insert(bno_cur, &i)))
 579                        return error;
 580                if (XFS_IS_CORRUPT(mp, i != 1))
 581                        return -EFSCORRUPTED;
 582        }
 583        return 0;
 584}
 585
 586static xfs_failaddr_t
 587xfs_agfl_verify(
 588        struct xfs_buf  *bp)
 589{
 590        struct xfs_mount *mp = bp->b_mount;
 591        struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
 592        __be32          *agfl_bno = xfs_buf_to_agfl_bno(bp);
 593        int             i;
 594
 595        /*
 596         * There is no verification of non-crc AGFLs because mkfs does not
 597         * initialise the AGFL to zero or NULL. Hence the only valid part of the
 598         * AGFL is what the AGF says is active. We can't get to the AGF, so we
 599         * can't verify just those entries are valid.
 600         */
 601        if (!xfs_sb_version_hascrc(&mp->m_sb))
 602                return NULL;
 603
 604        if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
 605                return __this_address;
 606        if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
 607                return __this_address;
 608        /*
 609         * during growfs operations, the perag is not fully initialised,
 610         * so we can't use it for any useful checking. growfs ensures we can't
 611         * use it by using uncached buffers that don't have the perag attached
 612         * so we can detect and avoid this problem.
 613         */
 614        if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
 615                return __this_address;
 616
 617        for (i = 0; i < xfs_agfl_size(mp); i++) {
 618                if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
 619                    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
 620                        return __this_address;
 621        }
 622
 623        if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
 624                return __this_address;
 625        return NULL;
 626}
 627
 628static void
 629xfs_agfl_read_verify(
 630        struct xfs_buf  *bp)
 631{
 632        struct xfs_mount *mp = bp->b_mount;
 633        xfs_failaddr_t  fa;
 634
 635        /*
 636         * There is no verification of non-crc AGFLs because mkfs does not
 637         * initialise the AGFL to zero or NULL. Hence the only valid part of the
 638         * AGFL is what the AGF says is active. We can't get to the AGF, so we
 639         * can't verify just those entries are valid.
 640         */
 641        if (!xfs_sb_version_hascrc(&mp->m_sb))
 642                return;
 643
 644        if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
 645                xfs_verifier_error(bp, -EFSBADCRC, __this_address);
 646        else {
 647                fa = xfs_agfl_verify(bp);
 648                if (fa)
 649                        xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 650        }
 651}
 652
 653static void
 654xfs_agfl_write_verify(
 655        struct xfs_buf  *bp)
 656{
 657        struct xfs_mount        *mp = bp->b_mount;
 658        struct xfs_buf_log_item *bip = bp->b_log_item;
 659        xfs_failaddr_t          fa;
 660
 661        /* no verification of non-crc AGFLs */
 662        if (!xfs_sb_version_hascrc(&mp->m_sb))
 663                return;
 664
 665        fa = xfs_agfl_verify(bp);
 666        if (fa) {
 667                xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 668                return;
 669        }
 670
 671        if (bip)
 672                XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
 673
 674        xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
 675}
 676
 677const struct xfs_buf_ops xfs_agfl_buf_ops = {
 678        .name = "xfs_agfl",
 679        .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
 680        .verify_read = xfs_agfl_read_verify,
 681        .verify_write = xfs_agfl_write_verify,
 682        .verify_struct = xfs_agfl_verify,
 683};
 684
 685/*
 686 * Read in the allocation group free block array.
 687 */
 688int                                     /* error */
 689xfs_alloc_read_agfl(
 690        xfs_mount_t     *mp,            /* mount point structure */
 691        xfs_trans_t     *tp,            /* transaction pointer */
 692        xfs_agnumber_t  agno,           /* allocation group number */
 693        xfs_buf_t       **bpp)          /* buffer for the ag free block array */
 694{
 695        xfs_buf_t       *bp;            /* return value */
 696        int             error;
 697
 698        ASSERT(agno != NULLAGNUMBER);
 699        error = xfs_trans_read_buf(
 700                        mp, tp, mp->m_ddev_targp,
 701                        XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
 702                        XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
 703        if (error)
 704                return error;
 705        xfs_buf_set_ref(bp, XFS_AGFL_REF);
 706        *bpp = bp;
 707        return 0;
 708}
 709
 710STATIC int
 711xfs_alloc_update_counters(
 712        struct xfs_trans        *tp,
 713        struct xfs_perag        *pag,
 714        struct xfs_buf          *agbp,
 715        long                    len)
 716{
 717        struct xfs_agf          *agf = agbp->b_addr;
 718
 719        pag->pagf_freeblks += len;
 720        be32_add_cpu(&agf->agf_freeblks, len);
 721
 722        xfs_trans_agblocks_delta(tp, len);
 723        if (unlikely(be32_to_cpu(agf->agf_freeblks) >
 724                     be32_to_cpu(agf->agf_length))) {
 725                xfs_buf_mark_corrupt(agbp);
 726                return -EFSCORRUPTED;
 727        }
 728
 729        xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
 730        return 0;
 731}
 732
 733/*
 734 * Block allocation algorithm and data structures.
 735 */
 736struct xfs_alloc_cur {
 737        struct xfs_btree_cur            *cnt;   /* btree cursors */
 738        struct xfs_btree_cur            *bnolt;
 739        struct xfs_btree_cur            *bnogt;
 740        xfs_extlen_t                    cur_len;/* current search length */
 741        xfs_agblock_t                   rec_bno;/* extent startblock */
 742        xfs_extlen_t                    rec_len;/* extent length */
 743        xfs_agblock_t                   bno;    /* alloc bno */
 744        xfs_extlen_t                    len;    /* alloc len */
 745        xfs_extlen_t                    diff;   /* diff from search bno */
 746        unsigned int                    busy_gen;/* busy state */
 747        bool                            busy;
 748};
 749
 750/*
 751 * Set up cursors, etc. in the extent allocation cursor. This function can be
 752 * called multiple times to reset an initialized structure without having to
 753 * reallocate cursors.
 754 */
 755static int
 756xfs_alloc_cur_setup(
 757        struct xfs_alloc_arg    *args,
 758        struct xfs_alloc_cur    *acur)
 759{
 760        int                     error;
 761        int                     i;
 762
 763        ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);
 764
 765        acur->cur_len = args->maxlen;
 766        acur->rec_bno = 0;
 767        acur->rec_len = 0;
 768        acur->bno = 0;
 769        acur->len = 0;
 770        acur->diff = -1;
 771        acur->busy = false;
 772        acur->busy_gen = 0;
 773
 774        /*
 775         * Perform an initial cntbt lookup to check for availability of maxlen
 776         * extents. If this fails, we'll return -ENOSPC to signal the caller to
 777         * attempt a small allocation.
 778         */
 779        if (!acur->cnt)
 780                acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
 781                                        args->agbp, args->agno, XFS_BTNUM_CNT);
 782        error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
 783        if (error)
 784                return error;
 785
 786        /*
 787         * Allocate the bnobt left and right search cursors.
 788         */
 789        if (!acur->bnolt)
 790                acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
 791                                        args->agbp, args->agno, XFS_BTNUM_BNO);
 792        if (!acur->bnogt)
 793                acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
 794                                        args->agbp, args->agno, XFS_BTNUM_BNO);
 795        return i == 1 ? 0 : -ENOSPC;
 796}
 797
 798static void
 799xfs_alloc_cur_close(
 800        struct xfs_alloc_cur    *acur,
 801        bool                    error)
 802{
 803        int                     cur_error = XFS_BTREE_NOERROR;
 804
 805        if (error)
 806                cur_error = XFS_BTREE_ERROR;
 807
 808        if (acur->cnt)
 809                xfs_btree_del_cursor(acur->cnt, cur_error);
 810        if (acur->bnolt)
 811                xfs_btree_del_cursor(acur->bnolt, cur_error);
 812        if (acur->bnogt)
 813                xfs_btree_del_cursor(acur->bnogt, cur_error);
 814        acur->cnt = acur->bnolt = acur->bnogt = NULL;
 815}
 816
 817/*
 818 * Check an extent for allocation and track the best available candidate in the
 819 * allocation structure. The cursor is deactivated if it has entered an out of
 820 * range state based on allocation arguments. Optionally return the extent
 821 * extent geometry and allocation status if requested by the caller.
 822 */
 823static int
 824xfs_alloc_cur_check(
 825        struct xfs_alloc_arg    *args,
 826        struct xfs_alloc_cur    *acur,
 827        struct xfs_btree_cur    *cur,
 828        int                     *new)
 829{
 830        int                     error, i;
 831        xfs_agblock_t           bno, bnoa, bnew;
 832        xfs_extlen_t            len, lena, diff = -1;
 833        bool                    busy;
 834        unsigned                busy_gen = 0;
 835        bool                    deactivate = false;
 836        bool                    isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
 837
 838        *new = 0;
 839
 840        error = xfs_alloc_get_rec(cur, &bno, &len, &i);
 841        if (error)
 842                return error;
 843        if (XFS_IS_CORRUPT(args->mp, i != 1))
 844                return -EFSCORRUPTED;
 845
 846        /*
 847         * Check minlen and deactivate a cntbt cursor if out of acceptable size
 848         * range (i.e., walking backwards looking for a minlen extent).
 849         */
 850        if (len < args->minlen) {
 851                deactivate = !isbnobt;
 852                goto out;
 853        }
 854
 855        busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
 856                                         &busy_gen);
 857        acur->busy |= busy;
 858        if (busy)
 859                acur->busy_gen = busy_gen;
 860        /* deactivate a bnobt cursor outside of locality range */
 861        if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
 862                deactivate = isbnobt;
 863                goto out;
 864        }
 865        if (lena < args->minlen)
 866                goto out;
 867
 868        args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
 869        xfs_alloc_fix_len(args);
 870        ASSERT(args->len >= args->minlen);
 871        if (args->len < acur->len)
 872                goto out;
 873
 874        /*
 875         * We have an aligned record that satisfies minlen and beats or matches
 876         * the candidate extent size. Compare locality for near allocation mode.
 877         */
 878        ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
 879        diff = xfs_alloc_compute_diff(args->agbno, args->len,
 880                                      args->alignment, args->datatype,
 881                                      bnoa, lena, &bnew);
 882        if (bnew == NULLAGBLOCK)
 883                goto out;
 884
 885        /*
 886         * Deactivate a bnobt cursor with worse locality than the current best.
 887         */
 888        if (diff > acur->diff) {
 889                deactivate = isbnobt;
 890                goto out;
 891        }
 892
 893        ASSERT(args->len > acur->len ||
 894               (args->len == acur->len && diff <= acur->diff));
 895        acur->rec_bno = bno;
 896        acur->rec_len = len;
 897        acur->bno = bnew;
 898        acur->len = args->len;
 899        acur->diff = diff;
 900        *new = 1;
 901
 902        /*
 903         * We're done if we found a perfect allocation. This only deactivates
 904         * the current cursor, but this is just an optimization to terminate a
 905         * cntbt search that otherwise runs to the edge of the tree.
 906         */
 907        if (acur->diff == 0 && acur->len == args->maxlen)
 908                deactivate = true;
 909out:
 910        if (deactivate)
 911                cur->bc_ag.abt.active = false;
 912        trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
 913                                  *new);
 914        return 0;
 915}
 916
 917/*
 918 * Complete an allocation of a candidate extent. Remove the extent from both
 919 * trees and update the args structure.
 920 */
 921STATIC int
 922xfs_alloc_cur_finish(
 923        struct xfs_alloc_arg    *args,
 924        struct xfs_alloc_cur    *acur)
 925{
 926        struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
 927        int                     error;
 928
 929        ASSERT(acur->cnt && acur->bnolt);
 930        ASSERT(acur->bno >= acur->rec_bno);
 931        ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
 932        ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
 933
 934        error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
 935                                      acur->rec_len, acur->bno, acur->len, 0);
 936        if (error)
 937                return error;
 938
 939        args->agbno = acur->bno;
 940        args->len = acur->len;
 941        args->wasfromfl = 0;
 942
 943        trace_xfs_alloc_cur(args);
 944        return 0;
 945}
 946
 947/*
 948 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
 949 * bno optimized lookup to search for extents with ideal size and locality.
 950 */
 951STATIC int
 952xfs_alloc_cntbt_iter(
 953        struct xfs_alloc_arg            *args,
 954        struct xfs_alloc_cur            *acur)
 955{
 956        struct xfs_btree_cur    *cur = acur->cnt;
 957        xfs_agblock_t           bno;
 958        xfs_extlen_t            len, cur_len;
 959        int                     error;
 960        int                     i;
 961
 962        if (!xfs_alloc_cur_active(cur))
 963                return 0;
 964
 965        /* locality optimized lookup */
 966        cur_len = acur->cur_len;
 967        error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
 968        if (error)
 969                return error;
 970        if (i == 0)
 971                return 0;
 972        error = xfs_alloc_get_rec(cur, &bno, &len, &i);
 973        if (error)
 974                return error;
 975
 976        /* check the current record and update search length from it */
 977        error = xfs_alloc_cur_check(args, acur, cur, &i);
 978        if (error)
 979                return error;
 980        ASSERT(len >= acur->cur_len);
 981        acur->cur_len = len;
 982
 983        /*
 984         * We looked up the first record >= [agbno, len] above. The agbno is a
 985         * secondary key and so the current record may lie just before or after
 986         * agbno. If it is past agbno, check the previous record too so long as
 987         * the length matches as it may be closer. Don't check a smaller record
 988         * because that could deactivate our cursor.
 989         */
 990        if (bno > args->agbno) {
 991                error = xfs_btree_decrement(cur, 0, &i);
 992                if (!error && i) {
 993                        error = xfs_alloc_get_rec(cur, &bno, &len, &i);
 994                        if (!error && i && len == acur->cur_len)
 995                                error = xfs_alloc_cur_check(args, acur, cur,
 996                                                            &i);
 997                }
 998                if (error)
 999                        return error;
1000        }
1001
1002        /*
1003         * Increment the search key until we find at least one allocation
1004         * candidate or if the extent we found was larger. Otherwise, double the
1005         * search key to optimize the search. Efficiency is more important here
1006         * than absolute best locality.
1007         */
1008        cur_len <<= 1;
1009        if (!acur->len || acur->cur_len >= cur_len)
1010                acur->cur_len++;
1011        else
1012                acur->cur_len = cur_len;
1013
1014        return error;
1015}
1016
1017/*
1018 * Deal with the case where only small freespaces remain. Either return the
1019 * contents of the last freespace record, or allocate space from the freelist if
1020 * there is nothing in the tree.
1021 */
1022STATIC int                      /* error */
1023xfs_alloc_ag_vextent_small(
1024        struct xfs_alloc_arg    *args,  /* allocation argument structure */
1025        struct xfs_btree_cur    *ccur,  /* optional by-size cursor */
1026        xfs_agblock_t           *fbnop, /* result block number */
1027        xfs_extlen_t            *flenp, /* result length */
1028        int                     *stat)  /* status: 0-freelist, 1-normal/none */
1029{
1030        struct xfs_agf          *agf = args->agbp->b_addr;
1031        int                     error = 0;
1032        xfs_agblock_t           fbno = NULLAGBLOCK;
1033        xfs_extlen_t            flen = 0;
1034        int                     i = 0;
1035
1036        /*
1037         * If a cntbt cursor is provided, try to allocate the largest record in
1038         * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
1039         * allocation. Make sure to respect minleft even when pulling from the
1040         * freelist.
1041         */
1042        if (ccur)
1043                error = xfs_btree_decrement(ccur, 0, &i);
1044        if (error)
1045                goto error;
1046        if (i) {
1047                error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
1048                if (error)
1049                        goto error;
1050                if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1051                        error = -EFSCORRUPTED;
1052                        goto error;
1053                }
1054                goto out;
1055        }
1056
1057        if (args->minlen != 1 || args->alignment != 1 ||
1058            args->resv == XFS_AG_RESV_AGFL ||
1059            be32_to_cpu(agf->agf_flcount) <= args->minleft)
1060                goto out;
1061
1062        error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
1063        if (error)
1064                goto error;
1065        if (fbno == NULLAGBLOCK)
1066                goto out;
1067
1068        xfs_extent_busy_reuse(args->mp, args->agno, fbno, 1,
1069                              (args->datatype & XFS_ALLOC_NOBUSY));
1070
1071        if (args->datatype & XFS_ALLOC_USERDATA) {
1072                struct xfs_buf  *bp;
1073
1074                error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
1075                                XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
1076                                args->mp->m_bsize, 0, &bp);
1077                if (error)
1078                        goto error;
1079                xfs_trans_binval(args->tp, bp);
1080        }
1081        *fbnop = args->agbno = fbno;
1082        *flenp = args->len = 1;
1083        if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
1084                error = -EFSCORRUPTED;
1085                goto error;
1086        }
1087        args->wasfromfl = 1;
1088        trace_xfs_alloc_small_freelist(args);
1089
1090        /*
1091         * If we're feeding an AGFL block to something that doesn't live in the
1092         * free space, we need to clear out the OWN_AG rmap.
1093         */
1094        error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1,
1095                              &XFS_RMAP_OINFO_AG);
1096        if (error)
1097                goto error;
1098
1099        *stat = 0;
1100        return 0;
1101
1102out:
1103        /*
1104         * Can't do the allocation, give up.
1105         */
1106        if (flen < args->minlen) {
1107                args->agbno = NULLAGBLOCK;
1108                trace_xfs_alloc_small_notenough(args);
1109                flen = 0;
1110        }
1111        *fbnop = fbno;
1112        *flenp = flen;
1113        *stat = 1;
1114        trace_xfs_alloc_small_done(args);
1115        return 0;
1116
1117error:
1118        trace_xfs_alloc_small_error(args);
1119        return error;
1120}
1121
1122/*
1123 * Allocate a variable extent in the allocation group agno.
1124 * Type and bno are used to determine where in the allocation group the
1125 * extent will start.
1126 * Extent's length (returned in *len) will be between minlen and maxlen,
1127 * and of the form k * prod + mod unless there's nothing that large.
1128 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1129 */
1130STATIC int                      /* error */
1131xfs_alloc_ag_vextent(
1132        xfs_alloc_arg_t *args)  /* argument structure for allocation */
1133{
1134        int             error=0;
1135
1136        ASSERT(args->minlen > 0);
1137        ASSERT(args->maxlen > 0);
1138        ASSERT(args->minlen <= args->maxlen);
1139        ASSERT(args->mod < args->prod);
1140        ASSERT(args->alignment > 0);
1141
1142        /*
1143         * Branch to correct routine based on the type.
1144         */
1145        args->wasfromfl = 0;
1146        switch (args->type) {
1147        case XFS_ALLOCTYPE_THIS_AG:
1148                error = xfs_alloc_ag_vextent_size(args);
1149                break;
1150        case XFS_ALLOCTYPE_NEAR_BNO:
1151                error = xfs_alloc_ag_vextent_near(args);
1152                break;
1153        case XFS_ALLOCTYPE_THIS_BNO:
1154                error = xfs_alloc_ag_vextent_exact(args);
1155                break;
1156        default:
1157                ASSERT(0);
1158                /* NOTREACHED */
1159        }
1160
1161        if (error || args->agbno == NULLAGBLOCK)
1162                return error;
1163
1164        ASSERT(args->len >= args->minlen);
1165        ASSERT(args->len <= args->maxlen);
1166        ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
1167        ASSERT(args->agbno % args->alignment == 0);
1168
1169        /* if not file data, insert new block into the reverse map btree */
1170        if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
1171                error = xfs_rmap_alloc(args->tp, args->agbp, args->agno,
1172                                       args->agbno, args->len, &args->oinfo);
1173                if (error)
1174                        return error;
1175        }
1176
1177        if (!args->wasfromfl) {
1178                error = xfs_alloc_update_counters(args->tp, args->pag,
1179                                                  args->agbp,
1180                                                  -((long)(args->len)));
1181                if (error)
1182                        return error;
1183
1184                ASSERT(!xfs_extent_busy_search(args->mp, args->agno,
1185                                              args->agbno, args->len));
1186        }
1187
1188        xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
1189
1190        XFS_STATS_INC(args->mp, xs_allocx);
1191        XFS_STATS_ADD(args->mp, xs_allocb, args->len);
1192        return error;
1193}
1194
1195/*
1196 * Allocate a variable extent at exactly agno/bno.
1197 * Extent's length (returned in *len) will be between minlen and maxlen,
1198 * and of the form k * prod + mod unless there's nothing that large.
1199 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
1200 */
1201STATIC int                      /* error */
1202xfs_alloc_ag_vextent_exact(
1203        xfs_alloc_arg_t *args)  /* allocation argument structure */
1204{
1205        struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
1206        xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
1207        xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
1208        int             error;
1209        xfs_agblock_t   fbno;   /* start block of found extent */
1210        xfs_extlen_t    flen;   /* length of found extent */
1211        xfs_agblock_t   tbno;   /* start block of busy extent */
1212        xfs_extlen_t    tlen;   /* length of busy extent */
1213        xfs_agblock_t   tend;   /* end block of busy extent */
1214        int             i;      /* success/failure of operation */
1215        unsigned        busy_gen;
1216
1217        ASSERT(args->alignment == 1);
1218
1219        /*
1220         * Allocate/initialize a cursor for the by-number freespace btree.
1221         */
1222        bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1223                                          args->agno, XFS_BTNUM_BNO);
1224
1225        /*
1226         * Lookup bno and minlen in the btree (minlen is irrelevant, really).
1227         * Look for the closest free block <= bno, it must contain bno
1228         * if any free block does.
1229         */
1230        error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
1231        if (error)
1232                goto error0;
1233        if (!i)
1234                goto not_found;
1235
1236        /*
1237         * Grab the freespace record.
1238         */
1239        error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
1240        if (error)
1241                goto error0;
1242        if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1243                error = -EFSCORRUPTED;
1244                goto error0;
1245        }
1246        ASSERT(fbno <= args->agbno);
1247
1248        /*
1249         * Check for overlapping busy extents.
1250         */
1251        tbno = fbno;
1252        tlen = flen;
1253        xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
1254
1255        /*
1256         * Give up if the start of the extent is busy, or the freespace isn't
1257         * long enough for the minimum request.
1258         */
1259        if (tbno > args->agbno)
1260                goto not_found;
1261        if (tlen < args->minlen)
1262                goto not_found;
1263        tend = tbno + tlen;
1264        if (tend < args->agbno + args->minlen)
1265                goto not_found;
1266
1267        /*
1268         * End of extent will be smaller of the freespace end and the
1269         * maximal requested end.
1270         *
1271         * Fix the length according to mod and prod if given.
1272         */
1273        args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
1274                                                - args->agbno;
1275        xfs_alloc_fix_len(args);
1276        ASSERT(args->agbno + args->len <= tend);
1277
1278        /*
1279         * We are allocating agbno for args->len
1280         * Allocate/initialize a cursor for the by-size btree.
1281         */
1282        cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1283                args->agno, XFS_BTNUM_CNT);
1284        ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
1285        error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
1286                                      args->len, XFSA_FIXUP_BNO_OK);
1287        if (error) {
1288                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1289                goto error0;
1290        }
1291
1292        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1293        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1294
1295        args->wasfromfl = 0;
1296        trace_xfs_alloc_exact_done(args);
1297        return 0;
1298
1299not_found:
1300        /* Didn't find it, return null. */
1301        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1302        args->agbno = NULLAGBLOCK;
1303        trace_xfs_alloc_exact_notfound(args);
1304        return 0;
1305
1306error0:
1307        xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1308        trace_xfs_alloc_exact_error(args);
1309        return error;
1310}
1311
1312/*
1313 * Search a given number of btree records in a given direction. Check each
1314 * record against the good extent we've already found.
1315 */
1316STATIC int
1317xfs_alloc_walk_iter(
1318        struct xfs_alloc_arg    *args,
1319        struct xfs_alloc_cur    *acur,
1320        struct xfs_btree_cur    *cur,
1321        bool                    increment,
1322        bool                    find_one, /* quit on first candidate */
1323        int                     count,    /* rec count (-1 for infinite) */
1324        int                     *stat)
1325{
1326        int                     error;
1327        int                     i;
1328
1329        *stat = 0;
1330
1331        /*
1332         * Search so long as the cursor is active or we find a better extent.
1333         * The cursor is deactivated if it extends beyond the range of the
1334         * current allocation candidate.
1335         */
1336        while (xfs_alloc_cur_active(cur) && count) {
1337                error = xfs_alloc_cur_check(args, acur, cur, &i);
1338                if (error)
1339                        return error;
1340                if (i == 1) {
1341                        *stat = 1;
1342                        if (find_one)
1343                                break;
1344                }
1345                if (!xfs_alloc_cur_active(cur))
1346                        break;
1347
1348                if (increment)
1349                        error = xfs_btree_increment(cur, 0, &i);
1350                else
1351                        error = xfs_btree_decrement(cur, 0, &i);
1352                if (error)
1353                        return error;
1354                if (i == 0)
1355                        cur->bc_ag.abt.active = false;
1356
1357                if (count > 0)
1358                        count--;
1359        }
1360
1361        return 0;
1362}
1363
1364/*
1365 * Search the by-bno and by-size btrees in parallel in search of an extent with
1366 * ideal locality based on the NEAR mode ->agbno locality hint.
1367 */
1368STATIC int
1369xfs_alloc_ag_vextent_locality(
1370        struct xfs_alloc_arg    *args,
1371        struct xfs_alloc_cur    *acur,
1372        int                     *stat)
1373{
1374        struct xfs_btree_cur    *fbcur = NULL;
1375        int                     error;
1376        int                     i;
1377        bool                    fbinc;
1378
1379        ASSERT(acur->len == 0);
1380        ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
1381
1382        *stat = 0;
1383
1384        error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
1385        if (error)
1386                return error;
1387        error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
1388        if (error)
1389                return error;
1390        error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
1391        if (error)
1392                return error;
1393
1394        /*
1395         * Search the bnobt and cntbt in parallel. Search the bnobt left and
1396         * right and lookup the closest extent to the locality hint for each
1397         * extent size key in the cntbt. The entire search terminates
1398         * immediately on a bnobt hit because that means we've found best case
1399         * locality. Otherwise the search continues until the cntbt cursor runs
1400         * off the end of the tree. If no allocation candidate is found at this
1401         * point, give up on locality, walk backwards from the end of the cntbt
1402         * and take the first available extent.
1403         *
1404         * The parallel tree searches balance each other out to provide fairly
1405         * consistent performance for various situations. The bnobt search can
1406         * have pathological behavior in the worst case scenario of larger
1407         * allocation requests and fragmented free space. On the other hand, the
1408         * bnobt is able to satisfy most smaller allocation requests much more
1409         * quickly than the cntbt. The cntbt search can sift through fragmented
1410         * free space and sets of free extents for larger allocation requests
1411         * more quickly than the bnobt. Since the locality hint is just a hint
1412         * and we don't want to scan the entire bnobt for perfect locality, the
1413         * cntbt search essentially bounds the bnobt search such that we can
1414         * find good enough locality at reasonable performance in most cases.
1415         */
1416        while (xfs_alloc_cur_active(acur->bnolt) ||
1417               xfs_alloc_cur_active(acur->bnogt) ||
1418               xfs_alloc_cur_active(acur->cnt)) {
1419
1420                trace_xfs_alloc_cur_lookup(args);
1421
1422                /*
1423                 * Search the bnobt left and right. In the case of a hit, finish
1424                 * the search in the opposite direction and we're done.
1425                 */
1426                error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
1427                                            true, 1, &i);
1428                if (error)
1429                        return error;
1430                if (i == 1) {
1431                        trace_xfs_alloc_cur_left(args);
1432                        fbcur = acur->bnogt;
1433                        fbinc = true;
1434                        break;
1435                }
1436                error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
1437                                            1, &i);
1438                if (error)
1439                        return error;
1440                if (i == 1) {
1441                        trace_xfs_alloc_cur_right(args);
1442                        fbcur = acur->bnolt;
1443                        fbinc = false;
1444                        break;
1445                }
1446
1447                /*
1448                 * Check the extent with best locality based on the current
1449                 * extent size search key and keep track of the best candidate.
1450                 */
1451                error = xfs_alloc_cntbt_iter(args, acur);
1452                if (error)
1453                        return error;
1454                if (!xfs_alloc_cur_active(acur->cnt)) {
1455                        trace_xfs_alloc_cur_lookup_done(args);
1456                        break;
1457                }
1458        }
1459
1460        /*
1461         * If we failed to find anything due to busy extents, return empty
1462         * handed so the caller can flush and retry. If no busy extents were
1463         * found, walk backwards from the end of the cntbt as a last resort.
1464         */
1465        if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
1466                error = xfs_btree_decrement(acur->cnt, 0, &i);
1467                if (error)
1468                        return error;
1469                if (i) {
1470                        acur->cnt->bc_ag.abt.active = true;
1471                        fbcur = acur->cnt;
1472                        fbinc = false;
1473                }
1474        }
1475
1476        /*
1477         * Search in the opposite direction for a better entry in the case of
1478         * a bnobt hit or walk backwards from the end of the cntbt.
1479         */
1480        if (fbcur) {
1481                error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
1482                                            &i);
1483                if (error)
1484                        return error;
1485        }
1486
1487        if (acur->len)
1488                *stat = 1;
1489
1490        return 0;
1491}
1492
1493/* Check the last block of the cnt btree for allocations. */
1494static int
1495xfs_alloc_ag_vextent_lastblock(
1496        struct xfs_alloc_arg    *args,
1497        struct xfs_alloc_cur    *acur,
1498        xfs_agblock_t           *bno,
1499        xfs_extlen_t            *len,
1500        bool                    *allocated)
1501{
1502        int                     error;
1503        int                     i;
1504
1505#ifdef DEBUG
1506        /* Randomly don't execute the first algorithm. */
1507        if (prandom_u32() & 1)
1508                return 0;
1509#endif
1510
1511        /*
1512         * Start from the entry that lookup found, sequence through all larger
1513         * free blocks.  If we're actually pointing at a record smaller than
1514         * maxlen, go to the start of this block, and skip all those smaller
1515         * than minlen.
1516         */
1517        if (*len || args->alignment > 1) {
1518                acur->cnt->bc_ptrs[0] = 1;
1519                do {
1520                        error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
1521                        if (error)
1522                                return error;
1523                        if (XFS_IS_CORRUPT(args->mp, i != 1))
1524                                return -EFSCORRUPTED;
1525                        if (*len >= args->minlen)
1526                                break;
1527                        error = xfs_btree_increment(acur->cnt, 0, &i);
1528                        if (error)
1529                                return error;
1530                } while (i);
1531                ASSERT(*len >= args->minlen);
1532                if (!i)
1533                        return 0;
1534        }
1535
1536        error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
1537        if (error)
1538                return error;
1539
1540        /*
1541         * It didn't work.  We COULD be in a case where there's a good record
1542         * somewhere, so try again.
1543         */
1544        if (acur->len == 0)
1545                return 0;
1546
1547        trace_xfs_alloc_near_first(args);
1548        *allocated = true;
1549        return 0;
1550}
1551
1552/*
1553 * Allocate a variable extent near bno in the allocation group agno.
1554 * Extent's length (returned in len) will be between minlen and maxlen,
1555 * and of the form k * prod + mod unless there's nothing that large.
1556 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1557 */
1558STATIC int
1559xfs_alloc_ag_vextent_near(
1560        struct xfs_alloc_arg    *args)
1561{
1562        struct xfs_alloc_cur    acur = {};
1563        int                     error;          /* error code */
1564        int                     i;              /* result code, temporary */
1565        xfs_agblock_t           bno;
1566        xfs_extlen_t            len;
1567
1568        /* handle uninitialized agbno range so caller doesn't have to */
1569        if (!args->min_agbno && !args->max_agbno)
1570                args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1571        ASSERT(args->min_agbno <= args->max_agbno);
1572
1573        /* clamp agbno to the range if it's outside */
1574        if (args->agbno < args->min_agbno)
1575                args->agbno = args->min_agbno;
1576        if (args->agbno > args->max_agbno)
1577                args->agbno = args->max_agbno;
1578
1579restart:
1580        len = 0;
1581
1582        /*
1583         * Set up cursors and see if there are any free extents as big as
1584         * maxlen. If not, pick the last entry in the tree unless the tree is
1585         * empty.
1586         */
1587        error = xfs_alloc_cur_setup(args, &acur);
1588        if (error == -ENOSPC) {
1589                error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1590                                &len, &i);
1591                if (error)
1592                        goto out;
1593                if (i == 0 || len == 0) {
1594                        trace_xfs_alloc_near_noentry(args);
1595                        goto out;
1596                }
1597                ASSERT(i == 1);
1598        } else if (error) {
1599                goto out;
1600        }
1601
1602        /*
1603         * First algorithm.
1604         * If the requested extent is large wrt the freespaces available
1605         * in this a.g., then the cursor will be pointing to a btree entry
1606         * near the right edge of the tree.  If it's in the last btree leaf
1607         * block, then we just examine all the entries in that block
1608         * that are big enough, and pick the best one.
1609         */
1610        if (xfs_btree_islastblock(acur.cnt, 0)) {
1611                bool            allocated = false;
1612
1613                error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
1614                                &allocated);
1615                if (error)
1616                        goto out;
1617                if (allocated)
1618                        goto alloc_finish;
1619        }
1620
1621        /*
1622         * Second algorithm. Combined cntbt and bnobt search to find ideal
1623         * locality.
1624         */
1625        error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
1626        if (error)
1627                goto out;
1628
1629        /*
1630         * If we couldn't get anything, give up.
1631         */
1632        if (!acur.len) {
1633                if (acur.busy) {
1634                        trace_xfs_alloc_near_busy(args);
1635                        xfs_extent_busy_flush(args->mp, args->pag,
1636                                              acur.busy_gen);
1637                        goto restart;
1638                }
1639                trace_xfs_alloc_size_neither(args);
1640                args->agbno = NULLAGBLOCK;
1641                goto out;
1642        }
1643
1644alloc_finish:
1645        /* fix up btrees on a successful allocation */
1646        error = xfs_alloc_cur_finish(args, &acur);
1647
1648out:
1649        xfs_alloc_cur_close(&acur, error);
1650        return error;
1651}
1652
1653/*
1654 * Allocate a variable extent anywhere in the allocation group agno.
1655 * Extent's length (returned in len) will be between minlen and maxlen,
1656 * and of the form k * prod + mod unless there's nothing that large.
1657 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1658 */
1659STATIC int                              /* error */
1660xfs_alloc_ag_vextent_size(
1661        xfs_alloc_arg_t *args)          /* allocation argument structure */
1662{
1663        struct xfs_agf  *agf = args->agbp->b_addr;
1664        xfs_btree_cur_t *bno_cur;       /* cursor for bno btree */
1665        xfs_btree_cur_t *cnt_cur;       /* cursor for cnt btree */
1666        int             error;          /* error result */
1667        xfs_agblock_t   fbno;           /* start of found freespace */
1668        xfs_extlen_t    flen;           /* length of found freespace */
1669        int             i;              /* temp status variable */
1670        xfs_agblock_t   rbno;           /* returned block number */
1671        xfs_extlen_t    rlen;           /* length of returned extent */
1672        bool            busy;
1673        unsigned        busy_gen;
1674
1675restart:
1676        /*
1677         * Allocate and initialize a cursor for the by-size btree.
1678         */
1679        cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1680                args->agno, XFS_BTNUM_CNT);
1681        bno_cur = NULL;
1682        busy = false;
1683
1684        /*
1685         * Look for an entry >= maxlen+alignment-1 blocks.
1686         */
1687        if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1688                        args->maxlen + args->alignment - 1, &i)))
1689                goto error0;
1690
1691        /*
1692         * If none then we have to settle for a smaller extent. In the case that
1693         * there are no large extents, this will return the last entry in the
1694         * tree unless the tree is empty. In the case that there are only busy
1695         * large extents, this will return the largest small extent unless there
1696         * are no smaller extents available.
1697         */
1698        if (!i) {
1699                error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1700                                                   &fbno, &flen, &i);
1701                if (error)
1702                        goto error0;
1703                if (i == 0 || flen == 0) {
1704                        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1705                        trace_xfs_alloc_size_noentry(args);
1706                        return 0;
1707                }
1708                ASSERT(i == 1);
1709                busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1710                                &rlen, &busy_gen);
1711        } else {
1712                /*
1713                 * Search for a non-busy extent that is large enough.
1714                 */
1715                for (;;) {
1716                        error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1717                        if (error)
1718                                goto error0;
1719                        if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1720                                error = -EFSCORRUPTED;
1721                                goto error0;
1722                        }
1723
1724                        busy = xfs_alloc_compute_aligned(args, fbno, flen,
1725                                        &rbno, &rlen, &busy_gen);
1726
1727                        if (rlen >= args->maxlen)
1728                                break;
1729
1730                        error = xfs_btree_increment(cnt_cur, 0, &i);
1731                        if (error)
1732                                goto error0;
1733                        if (i == 0) {
1734                                /*
1735                                 * Our only valid extents must have been busy.
1736                                 * Make it unbusy by forcing the log out and
1737                                 * retrying.
1738                                 */
1739                                xfs_btree_del_cursor(cnt_cur,
1740                                                     XFS_BTREE_NOERROR);
1741                                trace_xfs_alloc_size_busy(args);
1742                                xfs_extent_busy_flush(args->mp,
1743                                                        args->pag, busy_gen);
1744                                goto restart;
1745                        }
1746                }
1747        }
1748
1749        /*
1750         * In the first case above, we got the last entry in the
1751         * by-size btree.  Now we check to see if the space hits maxlen
1752         * once aligned; if not, we search left for something better.
1753         * This can't happen in the second case above.
1754         */
1755        rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1756        if (XFS_IS_CORRUPT(args->mp,
1757                           rlen != 0 &&
1758                           (rlen > flen ||
1759                            rbno + rlen > fbno + flen))) {
1760                error = -EFSCORRUPTED;
1761                goto error0;
1762        }
1763        if (rlen < args->maxlen) {
1764                xfs_agblock_t   bestfbno;
1765                xfs_extlen_t    bestflen;
1766                xfs_agblock_t   bestrbno;
1767                xfs_extlen_t    bestrlen;
1768
1769                bestrlen = rlen;
1770                bestrbno = rbno;
1771                bestflen = flen;
1772                bestfbno = fbno;
1773                for (;;) {
1774                        if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1775                                goto error0;
1776                        if (i == 0)
1777                                break;
1778                        if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1779                                        &i)))
1780                                goto error0;
1781                        if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1782                                error = -EFSCORRUPTED;
1783                                goto error0;
1784                        }
1785                        if (flen < bestrlen)
1786                                break;
1787                        busy = xfs_alloc_compute_aligned(args, fbno, flen,
1788                                        &rbno, &rlen, &busy_gen);
1789                        rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1790                        if (XFS_IS_CORRUPT(args->mp,
1791                                           rlen != 0 &&
1792                                           (rlen > flen ||
1793                                            rbno + rlen > fbno + flen))) {
1794                                error = -EFSCORRUPTED;
1795                                goto error0;
1796                        }
1797                        if (rlen > bestrlen) {
1798                                bestrlen = rlen;
1799                                bestrbno = rbno;
1800                                bestflen = flen;
1801                                bestfbno = fbno;
1802                                if (rlen == args->maxlen)
1803                                        break;
1804                        }
1805                }
1806                if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1807                                &i)))
1808                        goto error0;
1809                if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1810                        error = -EFSCORRUPTED;
1811                        goto error0;
1812                }
1813                rlen = bestrlen;
1814                rbno = bestrbno;
1815                flen = bestflen;
1816                fbno = bestfbno;
1817        }
1818        args->wasfromfl = 0;
1819        /*
1820         * Fix up the length.
1821         */
1822        args->len = rlen;
1823        if (rlen < args->minlen) {
1824                if (busy) {
1825                        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1826                        trace_xfs_alloc_size_busy(args);
1827                        xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
1828                        goto restart;
1829                }
1830                goto out_nominleft;
1831        }
1832        xfs_alloc_fix_len(args);
1833
1834        rlen = args->len;
1835        if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1836                error = -EFSCORRUPTED;
1837                goto error0;
1838        }
1839        /*
1840         * Allocate and initialize a cursor for the by-block tree.
1841         */
1842        bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1843                args->agno, XFS_BTNUM_BNO);
1844        if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1845                        rbno, rlen, XFSA_FIXUP_CNT_OK)))
1846                goto error0;
1847        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1848        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1849        cnt_cur = bno_cur = NULL;
1850        args->len = rlen;
1851        args->agbno = rbno;
1852        if (XFS_IS_CORRUPT(args->mp,
1853                           args->agbno + args->len >
1854                           be32_to_cpu(agf->agf_length))) {
1855                error = -EFSCORRUPTED;
1856                goto error0;
1857        }
1858        trace_xfs_alloc_size_done(args);
1859        return 0;
1860
1861error0:
1862        trace_xfs_alloc_size_error(args);
1863        if (cnt_cur)
1864                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1865        if (bno_cur)
1866                xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1867        return error;
1868
1869out_nominleft:
1870        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1871        trace_xfs_alloc_size_nominleft(args);
1872        args->agbno = NULLAGBLOCK;
1873        return 0;
1874}
1875
1876/*
1877 * Free the extent starting at agno/bno for length.
1878 */
1879STATIC int
1880xfs_free_ag_extent(
1881        struct xfs_trans                *tp,
1882        struct xfs_buf                  *agbp,
1883        xfs_agnumber_t                  agno,
1884        xfs_agblock_t                   bno,
1885        xfs_extlen_t                    len,
1886        const struct xfs_owner_info     *oinfo,
1887        enum xfs_ag_resv_type           type)
1888{
1889        struct xfs_mount                *mp;
1890        struct xfs_perag                *pag;
1891        struct xfs_btree_cur            *bno_cur;
1892        struct xfs_btree_cur            *cnt_cur;
1893        xfs_agblock_t                   gtbno; /* start of right neighbor */
1894        xfs_extlen_t                    gtlen; /* length of right neighbor */
1895        xfs_agblock_t                   ltbno; /* start of left neighbor */
1896        xfs_extlen_t                    ltlen; /* length of left neighbor */
1897        xfs_agblock_t                   nbno; /* new starting block of freesp */
1898        xfs_extlen_t                    nlen; /* new length of freespace */
1899        int                             haveleft; /* have a left neighbor */
1900        int                             haveright; /* have a right neighbor */
1901        int                             i;
1902        int                             error;
1903
1904        bno_cur = cnt_cur = NULL;
1905        mp = tp->t_mountp;
1906
1907        if (!xfs_rmap_should_skip_owner_update(oinfo)) {
1908                error = xfs_rmap_free(tp, agbp, agno, bno, len, oinfo);
1909                if (error)
1910                        goto error0;
1911        }
1912
1913        /*
1914         * Allocate and initialize a cursor for the by-block btree.
1915         */
1916        bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
1917        /*
1918         * Look for a neighboring block on the left (lower block numbers)
1919         * that is contiguous with this space.
1920         */
1921        if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1922                goto error0;
1923        if (haveleft) {
1924                /*
1925                 * There is a block to our left.
1926                 */
1927                if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1928                        goto error0;
1929                if (XFS_IS_CORRUPT(mp, i != 1)) {
1930                        error = -EFSCORRUPTED;
1931                        goto error0;
1932                }
1933                /*
1934                 * It's not contiguous, though.
1935                 */
1936                if (ltbno + ltlen < bno)
1937                        haveleft = 0;
1938                else {
1939                        /*
1940                         * If this failure happens the request to free this
1941                         * space was invalid, it's (partly) already free.
1942                         * Very bad.
1943                         */
1944                        if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
1945                                error = -EFSCORRUPTED;
1946                                goto error0;
1947                        }
1948                }
1949        }
1950        /*
1951         * Look for a neighboring block on the right (higher block numbers)
1952         * that is contiguous with this space.
1953         */
1954        if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1955                goto error0;
1956        if (haveright) {
1957                /*
1958                 * There is a block to our right.
1959                 */
1960                if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
1961                        goto error0;
1962                if (XFS_IS_CORRUPT(mp, i != 1)) {
1963                        error = -EFSCORRUPTED;
1964                        goto error0;
1965                }
1966                /*
1967                 * It's not contiguous, though.
1968                 */
1969                if (bno + len < gtbno)
1970                        haveright = 0;
1971                else {
1972                        /*
1973                         * If this failure happens the request to free this
1974                         * space was invalid, it's (partly) already free.
1975                         * Very bad.
1976                         */
1977                        if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
1978                                error = -EFSCORRUPTED;
1979                                goto error0;
1980                        }
1981                }
1982        }
1983        /*
1984         * Now allocate and initialize a cursor for the by-size tree.
1985         */
1986        cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
1987        /*
1988         * Have both left and right contiguous neighbors.
1989         * Merge all three into a single free block.
1990         */
1991        if (haveleft && haveright) {
1992                /*
1993                 * Delete the old by-size entry on the left.
1994                 */
1995                if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1996                        goto error0;
1997                if (XFS_IS_CORRUPT(mp, i != 1)) {
1998                        error = -EFSCORRUPTED;
1999                        goto error0;
2000                }
2001                if ((error = xfs_btree_delete(cnt_cur, &i)))
2002                        goto error0;
2003                if (XFS_IS_CORRUPT(mp, i != 1)) {
2004                        error = -EFSCORRUPTED;
2005                        goto error0;
2006                }
2007                /*
2008                 * Delete the old by-size entry on the right.
2009                 */
2010                if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2011                        goto error0;
2012                if (XFS_IS_CORRUPT(mp, i != 1)) {
2013                        error = -EFSCORRUPTED;
2014                        goto error0;
2015                }
2016                if ((error = xfs_btree_delete(cnt_cur, &i)))
2017                        goto error0;
2018                if (XFS_IS_CORRUPT(mp, i != 1)) {
2019                        error = -EFSCORRUPTED;
2020                        goto error0;
2021                }
2022                /*
2023                 * Delete the old by-block entry for the right block.
2024                 */
2025                if ((error = xfs_btree_delete(bno_cur, &i)))
2026                        goto error0;
2027                if (XFS_IS_CORRUPT(mp, i != 1)) {
2028                        error = -EFSCORRUPTED;
2029                        goto error0;
2030                }
2031                /*
2032                 * Move the by-block cursor back to the left neighbor.
2033                 */
2034                if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2035                        goto error0;
2036                if (XFS_IS_CORRUPT(mp, i != 1)) {
2037                        error = -EFSCORRUPTED;
2038                        goto error0;
2039                }
2040#ifdef DEBUG
2041                /*
2042                 * Check that this is the right record: delete didn't
2043                 * mangle the cursor.
2044                 */
2045                {
2046                        xfs_agblock_t   xxbno;
2047                        xfs_extlen_t    xxlen;
2048
2049                        if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2050                                        &i)))
2051                                goto error0;
2052                        if (XFS_IS_CORRUPT(mp,
2053                                           i != 1 ||
2054                                           xxbno != ltbno ||
2055                                           xxlen != ltlen)) {
2056                                error = -EFSCORRUPTED;
2057                                goto error0;
2058                        }
2059                }
2060#endif
2061                /*
2062                 * Update remaining by-block entry to the new, joined block.
2063                 */
2064                nbno = ltbno;
2065                nlen = len + ltlen + gtlen;
2066                if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2067                        goto error0;
2068        }
2069        /*
2070         * Have only a left contiguous neighbor.
2071         * Merge it together with the new freespace.
2072         */
2073        else if (haveleft) {
2074                /*
2075                 * Delete the old by-size entry on the left.
2076                 */
2077                if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2078                        goto error0;
2079                if (XFS_IS_CORRUPT(mp, i != 1)) {
2080                        error = -EFSCORRUPTED;
2081                        goto error0;
2082                }
2083                if ((error = xfs_btree_delete(cnt_cur, &i)))
2084                        goto error0;
2085                if (XFS_IS_CORRUPT(mp, i != 1)) {
2086                        error = -EFSCORRUPTED;
2087                        goto error0;
2088                }
2089                /*
2090                 * Back up the by-block cursor to the left neighbor, and
2091                 * update its length.
2092                 */
2093                if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2094                        goto error0;
2095                if (XFS_IS_CORRUPT(mp, i != 1)) {
2096                        error = -EFSCORRUPTED;
2097                        goto error0;
2098                }
2099                nbno = ltbno;
2100                nlen = len + ltlen;
2101                if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2102                        goto error0;
2103        }
2104        /*
2105         * Have only a right contiguous neighbor.
2106         * Merge it together with the new freespace.
2107         */
2108        else if (haveright) {
2109                /*
2110                 * Delete the old by-size entry on the right.
2111                 */
2112                if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2113                        goto error0;
2114                if (XFS_IS_CORRUPT(mp, i != 1)) {
2115                        error = -EFSCORRUPTED;
2116                        goto error0;
2117                }
2118                if ((error = xfs_btree_delete(cnt_cur, &i)))
2119                        goto error0;
2120                if (XFS_IS_CORRUPT(mp, i != 1)) {
2121                        error = -EFSCORRUPTED;
2122                        goto error0;
2123                }
2124                /*
2125                 * Update the starting block and length of the right
2126                 * neighbor in the by-block tree.
2127                 */
2128                nbno = bno;
2129                nlen = len + gtlen;
2130                if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2131                        goto error0;
2132        }
2133        /*
2134         * No contiguous neighbors.
2135         * Insert the new freespace into the by-block tree.
2136         */
2137        else {
2138                nbno = bno;
2139                nlen = len;
2140                if ((error = xfs_btree_insert(bno_cur, &i)))
2141                        goto error0;
2142                if (XFS_IS_CORRUPT(mp, i != 1)) {
2143                        error = -EFSCORRUPTED;
2144                        goto error0;
2145                }
2146        }
2147        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2148        bno_cur = NULL;
2149        /*
2150         * In all cases we need to insert the new freespace in the by-size tree.
2151         */
2152        if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2153                goto error0;
2154        if (XFS_IS_CORRUPT(mp, i != 0)) {
2155                error = -EFSCORRUPTED;
2156                goto error0;
2157        }
2158        if ((error = xfs_btree_insert(cnt_cur, &i)))
2159                goto error0;
2160        if (XFS_IS_CORRUPT(mp, i != 1)) {
2161                error = -EFSCORRUPTED;
2162                goto error0;
2163        }
2164        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2165        cnt_cur = NULL;
2166
2167        /*
2168         * Update the freespace totals in the ag and superblock.
2169         */
2170        pag = xfs_perag_get(mp, agno);
2171        error = xfs_alloc_update_counters(tp, pag, agbp, len);
2172        xfs_ag_resv_free_extent(pag, type, tp, len);
2173        xfs_perag_put(pag);
2174        if (error)
2175                goto error0;
2176
2177        XFS_STATS_INC(mp, xs_freex);
2178        XFS_STATS_ADD(mp, xs_freeb, len);
2179
2180        trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
2181
2182        return 0;
2183
2184 error0:
2185        trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2186        if (bno_cur)
2187                xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2188        if (cnt_cur)
2189                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2190        return error;
2191}
2192
2193/*
2194 * Visible (exported) allocation/free functions.
2195 * Some of these are used just by xfs_alloc_btree.c and this file.
2196 */
2197
2198/*
2199 * Compute and fill in value of m_ag_maxlevels.
2200 */
2201void
2202xfs_alloc_compute_maxlevels(
2203        xfs_mount_t     *mp)    /* file system mount structure */
2204{
2205        mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
2206                        (mp->m_sb.sb_agblocks + 1) / 2);
2207}
2208
2209/*
2210 * Find the length of the longest extent in an AG.  The 'need' parameter
2211 * specifies how much space we're going to need for the AGFL and the
2212 * 'reserved' parameter tells us how many blocks in this AG are reserved for
2213 * other callers.
2214 */
2215xfs_extlen_t
2216xfs_alloc_longest_free_extent(
2217        struct xfs_perag        *pag,
2218        xfs_extlen_t            need,
2219        xfs_extlen_t            reserved)
2220{
2221        xfs_extlen_t            delta = 0;
2222
2223        /*
2224         * If the AGFL needs a recharge, we'll have to subtract that from the
2225         * longest extent.
2226         */
2227        if (need > pag->pagf_flcount)
2228                delta = need - pag->pagf_flcount;
2229
2230        /*
2231         * If we cannot maintain others' reservations with space from the
2232         * not-longest freesp extents, we'll have to subtract /that/ from
2233         * the longest extent too.
2234         */
2235        if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2236                delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2237
2238        /*
2239         * If the longest extent is long enough to satisfy all the
2240         * reservations and AGFL rules in place, we can return this extent.
2241         */
2242        if (pag->pagf_longest > delta)
2243                return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2244                                pag->pagf_longest - delta);
2245
2246        /* Otherwise, let the caller try for 1 block if there's space. */
2247        return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
2248}
2249
2250/*
2251 * Compute the minimum length of the AGFL in the given AG.  If @pag is NULL,
2252 * return the largest possible minimum length.
2253 */
2254unsigned int
2255xfs_alloc_min_freelist(
2256        struct xfs_mount        *mp,
2257        struct xfs_perag        *pag)
2258{
2259        /* AG btrees have at least 1 level. */
2260        static const uint8_t    fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
2261        const uint8_t           *levels = pag ? pag->pagf_levels : fake_levels;
2262        unsigned int            min_free;
2263
2264        ASSERT(mp->m_ag_maxlevels > 0);
2265
2266        /* space needed by-bno freespace btree */
2267        min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
2268                                       mp->m_ag_maxlevels);
2269        /* space needed by-size freespace btree */
2270        min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
2271                                       mp->m_ag_maxlevels);
2272        /* space needed reverse mapping used space btree */
2273        if (xfs_sb_version_hasrmapbt(&mp->m_sb))
2274                min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
2275                                                mp->m_rmap_maxlevels);
2276
2277        return min_free;
2278}
2279
2280/*
2281 * Check if the operation we are fixing up the freelist for should go ahead or
2282 * not. If we are freeing blocks, we always allow it, otherwise the allocation
2283 * is dependent on whether the size and shape of free space available will
2284 * permit the requested allocation to take place.
2285 */
2286static bool
2287xfs_alloc_space_available(
2288        struct xfs_alloc_arg    *args,
2289        xfs_extlen_t            min_free,
2290        int                     flags)
2291{
2292        struct xfs_perag        *pag = args->pag;
2293        xfs_extlen_t            alloc_len, longest;
2294        xfs_extlen_t            reservation; /* blocks that are still reserved */
2295        int                     available;
2296        xfs_extlen_t            agflcount;
2297
2298        if (flags & XFS_ALLOC_FLAG_FREEING)
2299                return true;
2300
2301        reservation = xfs_ag_resv_needed(pag, args->resv);
2302
2303        /* do we have enough contiguous free space for the allocation? */
2304        alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
2305        longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
2306        if (longest < alloc_len)
2307                return false;
2308
2309        /*
2310         * Do we have enough free space remaining for the allocation? Don't
2311         * account extra agfl blocks because we are about to defer free them,
2312         * making them unavailable until the current transaction commits.
2313         */
2314        agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2315        available = (int)(pag->pagf_freeblks + agflcount -
2316                          reservation - min_free - args->minleft);
2317        if (available < (int)max(args->total, alloc_len))
2318                return false;
2319
2320        /*
2321         * Clamp maxlen to the amount of free space available for the actual
2322         * extent allocation.
2323         */
2324        if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2325                args->maxlen = available;
2326                ASSERT(args->maxlen > 0);
2327                ASSERT(args->maxlen >= args->minlen);
2328        }
2329
2330        return true;
2331}
2332
2333int
2334xfs_free_agfl_block(
2335        struct xfs_trans        *tp,
2336        xfs_agnumber_t          agno,
2337        xfs_agblock_t           agbno,
2338        struct xfs_buf          *agbp,
2339        struct xfs_owner_info   *oinfo)
2340{
2341        int                     error;
2342        struct xfs_buf          *bp;
2343
2344        error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2345                                   XFS_AG_RESV_AGFL);
2346        if (error)
2347                return error;
2348
2349        error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
2350                        XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
2351                        tp->t_mountp->m_bsize, 0, &bp);
2352        if (error)
2353                return error;
2354        xfs_trans_binval(tp, bp);
2355
2356        return 0;
2357}
2358
2359/*
2360 * Check the agfl fields of the agf for inconsistency or corruption. The purpose
2361 * is to detect an agfl header padding mismatch between current and early v5
2362 * kernels. This problem manifests as a 1-slot size difference between the
2363 * on-disk flcount and the active [first, last] range of a wrapped agfl. This
2364 * may also catch variants of agfl count corruption unrelated to padding. Either
2365 * way, we'll reset the agfl and warn the user.
2366 *
2367 * Return true if a reset is required before the agfl can be used, false
2368 * otherwise.
2369 */
2370static bool
2371xfs_agfl_needs_reset(
2372        struct xfs_mount        *mp,
2373        struct xfs_agf          *agf)
2374{
2375        uint32_t                f = be32_to_cpu(agf->agf_flfirst);
2376        uint32_t                l = be32_to_cpu(agf->agf_fllast);
2377        uint32_t                c = be32_to_cpu(agf->agf_flcount);
2378        int                     agfl_size = xfs_agfl_size(mp);
2379        int                     active;
2380
2381        /* no agfl header on v4 supers */
2382        if (!xfs_sb_version_hascrc(&mp->m_sb))
2383                return false;
2384
2385        /*
2386         * The agf read verifier catches severe corruption of these fields.
2387         * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2388         * the verifier allows it.
2389         */
2390        if (f >= agfl_size || l >= agfl_size)
2391                return true;
2392        if (c > agfl_size)
2393                return true;
2394
2395        /*
2396         * Check consistency between the on-disk count and the active range. An
2397         * agfl padding mismatch manifests as an inconsistent flcount.
2398         */
2399        if (c && l >= f)
2400                active = l - f + 1;
2401        else if (c)
2402                active = agfl_size - f + l + 1;
2403        else
2404                active = 0;
2405
2406        return active != c;
2407}
2408
2409/*
2410 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2411 * agfl content cannot be trusted. Warn the user that a repair is required to
2412 * recover leaked blocks.
2413 *
2414 * The purpose of this mechanism is to handle filesystems affected by the agfl
2415 * header padding mismatch problem. A reset keeps the filesystem online with a
2416 * relatively minor free space accounting inconsistency rather than suffer the
2417 * inevitable crash from use of an invalid agfl block.
2418 */
2419static void
2420xfs_agfl_reset(
2421        struct xfs_trans        *tp,
2422        struct xfs_buf          *agbp,
2423        struct xfs_perag        *pag)
2424{
2425        struct xfs_mount        *mp = tp->t_mountp;
2426        struct xfs_agf          *agf = agbp->b_addr;
2427
2428        ASSERT(pag->pagf_agflreset);
2429        trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2430
2431        xfs_warn(mp,
2432               "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2433               "Please unmount and run xfs_repair.",
2434                 pag->pag_agno, pag->pagf_flcount);
2435
2436        agf->agf_flfirst = 0;
2437        agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2438        agf->agf_flcount = 0;
2439        xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2440                                    XFS_AGF_FLCOUNT);
2441
2442        pag->pagf_flcount = 0;
2443        pag->pagf_agflreset = false;
2444}
2445
2446/*
2447 * Defer an AGFL block free. This is effectively equivalent to
2448 * xfs_bmap_add_free() with some special handling particular to AGFL blocks.
2449 *
2450 * Deferring AGFL frees helps prevent log reservation overruns due to too many
2451 * allocation operations in a transaction. AGFL frees are prone to this problem
2452 * because for one they are always freed one at a time. Further, an immediate
2453 * AGFL block free can cause a btree join and require another block free before
2454 * the real allocation can proceed. Deferring the free disconnects freeing up
2455 * the AGFL slot from freeing the block.
2456 */
2457STATIC void
2458xfs_defer_agfl_block(
2459        struct xfs_trans                *tp,
2460        xfs_agnumber_t                  agno,
2461        xfs_fsblock_t                   agbno,
2462        struct xfs_owner_info           *oinfo)
2463{
2464        struct xfs_mount                *mp = tp->t_mountp;
2465        struct xfs_extent_free_item     *new;           /* new element */
2466
2467        ASSERT(xfs_bmap_free_item_zone != NULL);
2468        ASSERT(oinfo != NULL);
2469
2470        new = kmem_zone_alloc(xfs_bmap_free_item_zone, 0);
2471        new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
2472        new->xefi_blockcount = 1;
2473        new->xefi_oinfo = *oinfo;
2474
2475        trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
2476
2477        xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
2478}
2479
2480/*
2481 * Decide whether to use this allocation group for this allocation.
2482 * If so, fix up the btree freelist's size.
2483 */
2484int                     /* error */
2485xfs_alloc_fix_freelist(
2486        struct xfs_alloc_arg    *args,  /* allocation argument structure */
2487        int                     flags)  /* XFS_ALLOC_FLAG_... */
2488{
2489        struct xfs_mount        *mp = args->mp;
2490        struct xfs_perag        *pag = args->pag;
2491        struct xfs_trans        *tp = args->tp;
2492        struct xfs_buf          *agbp = NULL;
2493        struct xfs_buf          *agflbp = NULL;
2494        struct xfs_alloc_arg    targs;  /* local allocation arguments */
2495        xfs_agblock_t           bno;    /* freelist block */
2496        xfs_extlen_t            need;   /* total blocks needed in freelist */
2497        int                     error = 0;
2498
2499        /* deferred ops (AGFL block frees) require permanent transactions */
2500        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2501
2502        if (!pag->pagf_init) {
2503                error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
2504                if (error) {
2505                        /* Couldn't lock the AGF so skip this AG. */
2506                        if (error == -EAGAIN)
2507                                error = 0;
2508                        goto out_no_agbp;
2509                }
2510        }
2511
2512        /*
2513         * If this is a metadata preferred pag and we are user data then try
2514         * somewhere else if we are not being asked to try harder at this
2515         * point
2516         */
2517        if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
2518            (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2519                ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
2520                goto out_agbp_relse;
2521        }
2522
2523        need = xfs_alloc_min_freelist(mp, pag);
2524        if (!xfs_alloc_space_available(args, need, flags |
2525                        XFS_ALLOC_FLAG_CHECK))
2526                goto out_agbp_relse;
2527
2528        /*
2529         * Get the a.g. freespace buffer.
2530         * Can fail if we're not blocking on locks, and it's held.
2531         */
2532        if (!agbp) {
2533                error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
2534                if (error) {
2535                        /* Couldn't lock the AGF so skip this AG. */
2536                        if (error == -EAGAIN)
2537                                error = 0;
2538                        goto out_no_agbp;
2539                }
2540        }
2541
2542        /* reset a padding mismatched agfl before final free space check */
2543        if (pag->pagf_agflreset)
2544                xfs_agfl_reset(tp, agbp, pag);
2545
2546        /* If there isn't enough total space or single-extent, reject it. */
2547        need = xfs_alloc_min_freelist(mp, pag);
2548        if (!xfs_alloc_space_available(args, need, flags))
2549                goto out_agbp_relse;
2550
2551        /*
2552         * Make the freelist shorter if it's too long.
2553         *
2554         * Note that from this point onwards, we will always release the agf and
2555         * agfl buffers on error. This handles the case where we error out and
2556         * the buffers are clean or may not have been joined to the transaction
2557         * and hence need to be released manually. If they have been joined to
2558         * the transaction, then xfs_trans_brelse() will handle them
2559         * appropriately based on the recursion count and dirty state of the
2560         * buffer.
2561         *
2562         * XXX (dgc): When we have lots of free space, does this buy us
2563         * anything other than extra overhead when we need to put more blocks
2564         * back on the free list? Maybe we should only do this when space is
2565         * getting low or the AGFL is more than half full?
2566         *
2567         * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2568         * big; the NORMAP flag prevents AGFL expand/shrink operations from
2569         * updating the rmapbt.  Both flags are used in xfs_repair while we're
2570         * rebuilding the rmapbt, and neither are used by the kernel.  They're
2571         * both required to ensure that rmaps are correctly recorded for the
2572         * regenerated AGFL, bnobt, and cntbt.  See repair/phase5.c and
2573         * repair/rmap.c in xfsprogs for details.
2574         */
2575        memset(&targs, 0, sizeof(targs));
2576        /* struct copy below */
2577        if (flags & XFS_ALLOC_FLAG_NORMAP)
2578                targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
2579        else
2580                targs.oinfo = XFS_RMAP_OINFO_AG;
2581        while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
2582                error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
2583                if (error)
2584                        goto out_agbp_relse;
2585
2586                /* defer agfl frees */
2587                xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2588        }
2589
2590        targs.tp = tp;
2591        targs.mp = mp;
2592        targs.agbp = agbp;
2593        targs.agno = args->agno;
2594        targs.alignment = targs.minlen = targs.prod = 1;
2595        targs.type = XFS_ALLOCTYPE_THIS_AG;
2596        targs.pag = pag;
2597        error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
2598        if (error)
2599                goto out_agbp_relse;
2600
2601        /* Make the freelist longer if it's too short. */
2602        while (pag->pagf_flcount < need) {
2603                targs.agbno = 0;
2604                targs.maxlen = need - pag->pagf_flcount;
2605                targs.resv = XFS_AG_RESV_AGFL;
2606
2607                /* Allocate as many blocks as possible at once. */
2608                error = xfs_alloc_ag_vextent(&targs);
2609                if (error)
2610                        goto out_agflbp_relse;
2611
2612                /*
2613                 * Stop if we run out.  Won't happen if callers are obeying
2614                 * the restrictions correctly.  Can happen for free calls
2615                 * on a completely full ag.
2616                 */
2617                if (targs.agbno == NULLAGBLOCK) {
2618                        if (flags & XFS_ALLOC_FLAG_FREEING)
2619                                break;
2620                        goto out_agflbp_relse;
2621                }
2622                /*
2623                 * Put each allocated block on the list.
2624                 */
2625                for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
2626                        error = xfs_alloc_put_freelist(tp, agbp,
2627                                                        agflbp, bno, 0);
2628                        if (error)
2629                                goto out_agflbp_relse;
2630                }
2631        }
2632        xfs_trans_brelse(tp, agflbp);
2633        args->agbp = agbp;
2634        return 0;
2635
2636out_agflbp_relse:
2637        xfs_trans_brelse(tp, agflbp);
2638out_agbp_relse:
2639        if (agbp)
2640                xfs_trans_brelse(tp, agbp);
2641out_no_agbp:
2642        args->agbp = NULL;
2643        return error;
2644}
2645
2646/*
2647 * Get a block from the freelist.
2648 * Returns with the buffer for the block gotten.
2649 */
2650int                             /* error */
2651xfs_alloc_get_freelist(
2652        xfs_trans_t     *tp,    /* transaction pointer */
2653        xfs_buf_t       *agbp,  /* buffer containing the agf structure */
2654        xfs_agblock_t   *bnop,  /* block address retrieved from freelist */
2655        int             btreeblk) /* destination is a AGF btree */
2656{
2657        struct xfs_agf  *agf = agbp->b_addr;
2658        xfs_buf_t       *agflbp;/* buffer for a.g. freelist structure */
2659        xfs_agblock_t   bno;    /* block number returned */
2660        __be32          *agfl_bno;
2661        int             error;
2662        int             logflags;
2663        xfs_mount_t     *mp = tp->t_mountp;
2664        xfs_perag_t     *pag;   /* per allocation group data */
2665
2666        /*
2667         * Freelist is empty, give up.
2668         */
2669        if (!agf->agf_flcount) {
2670                *bnop = NULLAGBLOCK;
2671                return 0;
2672        }
2673        /*
2674         * Read the array of free blocks.
2675         */
2676        error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
2677                                    &agflbp);
2678        if (error)
2679                return error;
2680
2681
2682        /*
2683         * Get the block number and update the data structures.
2684         */
2685        agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2686        bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
2687        be32_add_cpu(&agf->agf_flfirst, 1);
2688        xfs_trans_brelse(tp, agflbp);
2689        if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
2690                agf->agf_flfirst = 0;
2691
2692        pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
2693        ASSERT(!pag->pagf_agflreset);
2694        be32_add_cpu(&agf->agf_flcount, -1);
2695        xfs_trans_agflist_delta(tp, -1);
2696        pag->pagf_flcount--;
2697
2698        logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
2699        if (btreeblk) {
2700                be32_add_cpu(&agf->agf_btreeblks, 1);
2701                pag->pagf_btreeblks++;
2702                logflags |= XFS_AGF_BTREEBLKS;
2703        }
2704        xfs_perag_put(pag);
2705
2706        xfs_alloc_log_agf(tp, agbp, logflags);
2707        *bnop = bno;
2708
2709        return 0;
2710}
2711
2712/*
2713 * Log the given fields from the agf structure.
2714 */
2715void
2716xfs_alloc_log_agf(
2717        xfs_trans_t     *tp,    /* transaction pointer */
2718        xfs_buf_t       *bp,    /* buffer for a.g. freelist header */
2719        int             fields) /* mask of fields to be logged (XFS_AGF_...) */
2720{
2721        int     first;          /* first byte offset */
2722        int     last;           /* last byte offset */
2723        static const short      offsets[] = {
2724                offsetof(xfs_agf_t, agf_magicnum),
2725                offsetof(xfs_agf_t, agf_versionnum),
2726                offsetof(xfs_agf_t, agf_seqno),
2727                offsetof(xfs_agf_t, agf_length),
2728                offsetof(xfs_agf_t, agf_roots[0]),
2729                offsetof(xfs_agf_t, agf_levels[0]),
2730                offsetof(xfs_agf_t, agf_flfirst),
2731                offsetof(xfs_agf_t, agf_fllast),
2732                offsetof(xfs_agf_t, agf_flcount),
2733                offsetof(xfs_agf_t, agf_freeblks),
2734                offsetof(xfs_agf_t, agf_longest),
2735                offsetof(xfs_agf_t, agf_btreeblks),
2736                offsetof(xfs_agf_t, agf_uuid),
2737                offsetof(xfs_agf_t, agf_rmap_blocks),
2738                offsetof(xfs_agf_t, agf_refcount_blocks),
2739                offsetof(xfs_agf_t, agf_refcount_root),
2740                offsetof(xfs_agf_t, agf_refcount_level),
2741                /* needed so that we don't log the whole rest of the structure: */
2742                offsetof(xfs_agf_t, agf_spare64),
2743                sizeof(xfs_agf_t)
2744        };
2745
2746        trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
2747
2748        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
2749
2750        xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2751        xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2752}
2753
2754/*
2755 * Interface for inode allocation to force the pag data to be initialized.
2756 */
2757int                                     /* error */
2758xfs_alloc_pagf_init(
2759        xfs_mount_t             *mp,    /* file system mount structure */
2760        xfs_trans_t             *tp,    /* transaction pointer */
2761        xfs_agnumber_t          agno,   /* allocation group number */
2762        int                     flags)  /* XFS_ALLOC_FLAGS_... */
2763{
2764        xfs_buf_t               *bp;
2765        int                     error;
2766
2767        error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp);
2768        if (!error)
2769                xfs_trans_brelse(tp, bp);
2770        return error;
2771}
2772
2773/*
2774 * Put the block on the freelist for the allocation group.
2775 */
2776int                                     /* error */
2777xfs_alloc_put_freelist(
2778        xfs_trans_t             *tp,    /* transaction pointer */
2779        xfs_buf_t               *agbp,  /* buffer for a.g. freelist header */
2780        xfs_buf_t               *agflbp,/* buffer for a.g. free block array */
2781        xfs_agblock_t           bno,    /* block being freed */
2782        int                     btreeblk) /* block came from a AGF btree */
2783{
2784        struct xfs_mount        *mp = tp->t_mountp;
2785        struct xfs_agf          *agf = agbp->b_addr;
2786        __be32                  *blockp;/* pointer to array entry */
2787        int                     error;
2788        int                     logflags;
2789        xfs_perag_t             *pag;   /* per allocation group data */
2790        __be32                  *agfl_bno;
2791        int                     startoff;
2792
2793        if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
2794                        be32_to_cpu(agf->agf_seqno), &agflbp)))
2795                return error;
2796        be32_add_cpu(&agf->agf_fllast, 1);
2797        if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
2798                agf->agf_fllast = 0;
2799
2800        pag = xfs_perag_get(mp, be32_to_cpu(agf->agf_seqno));
2801        ASSERT(!pag->pagf_agflreset);
2802        be32_add_cpu(&agf->agf_flcount, 1);
2803        xfs_trans_agflist_delta(tp, 1);
2804        pag->pagf_flcount++;
2805
2806        logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
2807        if (btreeblk) {
2808                be32_add_cpu(&agf->agf_btreeblks, -1);
2809                pag->pagf_btreeblks--;
2810                logflags |= XFS_AGF_BTREEBLKS;
2811        }
2812        xfs_perag_put(pag);
2813
2814        xfs_alloc_log_agf(tp, agbp, logflags);
2815
2816        ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
2817
2818        agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2819        blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
2820        *blockp = cpu_to_be32(bno);
2821        startoff = (char *)blockp - (char *)agflbp->b_addr;
2822
2823        xfs_alloc_log_agf(tp, agbp, logflags);
2824
2825        xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
2826        xfs_trans_log_buf(tp, agflbp, startoff,
2827                          startoff + sizeof(xfs_agblock_t) - 1);
2828        return 0;
2829}
2830
2831static xfs_failaddr_t
2832xfs_agf_verify(
2833        struct xfs_buf          *bp)
2834{
2835        struct xfs_mount        *mp = bp->b_mount;
2836        struct xfs_agf          *agf = bp->b_addr;
2837
2838        if (xfs_sb_version_hascrc(&mp->m_sb)) {
2839                if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
2840                        return __this_address;
2841                if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
2842                        return __this_address;
2843        }
2844
2845        if (!xfs_verify_magic(bp, agf->agf_magicnum))
2846                return __this_address;
2847
2848        if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2849              be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2850              be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
2851              be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
2852              be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
2853                return __this_address;
2854
2855        if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
2856                return __this_address;
2857
2858        if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
2859            be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
2860                return __this_address;
2861
2862        if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
2863            be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
2864            be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
2865            be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS)
2866                return __this_address;
2867
2868        if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
2869            (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
2870             be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
2871                return __this_address;
2872
2873        if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
2874            be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
2875                return __this_address;
2876
2877        /*
2878         * during growfs operations, the perag is not fully initialised,
2879         * so we can't use it for any useful checking. growfs ensures we can't
2880         * use it by using uncached buffers that don't have the perag attached
2881         * so we can detect and avoid this problem.
2882         */
2883        if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
2884                return __this_address;
2885
2886        if (xfs_sb_version_haslazysbcount(&mp->m_sb) &&
2887            be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
2888                return __this_address;
2889
2890        if (xfs_sb_version_hasreflink(&mp->m_sb) &&
2891            be32_to_cpu(agf->agf_refcount_blocks) >
2892            be32_to_cpu(agf->agf_length))
2893                return __this_address;
2894
2895        if (xfs_sb_version_hasreflink(&mp->m_sb) &&
2896            (be32_to_cpu(agf->agf_refcount_level) < 1 ||
2897             be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
2898                return __this_address;
2899
2900        return NULL;
2901
2902}
2903
2904static void
2905xfs_agf_read_verify(
2906        struct xfs_buf  *bp)
2907{
2908        struct xfs_mount *mp = bp->b_mount;
2909        xfs_failaddr_t  fa;
2910
2911        if (xfs_sb_version_hascrc(&mp->m_sb) &&
2912            !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
2913                xfs_verifier_error(bp, -EFSBADCRC, __this_address);
2914        else {
2915                fa = xfs_agf_verify(bp);
2916                if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
2917                        xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2918        }
2919}
2920
2921static void
2922xfs_agf_write_verify(
2923        struct xfs_buf  *bp)
2924{
2925        struct xfs_mount        *mp = bp->b_mount;
2926        struct xfs_buf_log_item *bip = bp->b_log_item;
2927        struct xfs_agf          *agf = bp->b_addr;
2928        xfs_failaddr_t          fa;
2929
2930        fa = xfs_agf_verify(bp);
2931        if (fa) {
2932                xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2933                return;
2934        }
2935
2936        if (!xfs_sb_version_hascrc(&mp->m_sb))
2937                return;
2938
2939        if (bip)
2940                agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
2941
2942        xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
2943}
2944
2945const struct xfs_buf_ops xfs_agf_buf_ops = {
2946        .name = "xfs_agf",
2947        .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
2948        .verify_read = xfs_agf_read_verify,
2949        .verify_write = xfs_agf_write_verify,
2950        .verify_struct = xfs_agf_verify,
2951};
2952
2953/*
2954 * Read in the allocation group header (free/alloc section).
2955 */
2956int                                     /* error */
2957xfs_read_agf(
2958        struct xfs_mount        *mp,    /* mount point structure */
2959        struct xfs_trans        *tp,    /* transaction pointer */
2960        xfs_agnumber_t          agno,   /* allocation group number */
2961        int                     flags,  /* XFS_BUF_ */
2962        struct xfs_buf          **bpp)  /* buffer for the ag freelist header */
2963{
2964        int             error;
2965
2966        trace_xfs_read_agf(mp, agno);
2967
2968        ASSERT(agno != NULLAGNUMBER);
2969        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
2970                        XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
2971                        XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
2972        if (error)
2973                return error;
2974
2975        ASSERT(!(*bpp)->b_error);
2976        xfs_buf_set_ref(*bpp, XFS_AGF_REF);
2977        return 0;
2978}
2979
2980/*
2981 * Read in the allocation group header (free/alloc section).
2982 */
2983int                                     /* error */
2984xfs_alloc_read_agf(
2985        struct xfs_mount        *mp,    /* mount point structure */
2986        struct xfs_trans        *tp,    /* transaction pointer */
2987        xfs_agnumber_t          agno,   /* allocation group number */
2988        int                     flags,  /* XFS_ALLOC_FLAG_... */
2989        struct xfs_buf          **bpp)  /* buffer for the ag freelist header */
2990{
2991        struct xfs_agf          *agf;           /* ag freelist header */
2992        struct xfs_perag        *pag;           /* per allocation group data */
2993        int                     error;
2994
2995        trace_xfs_alloc_read_agf(mp, agno);
2996
2997        /* We don't support trylock when freeing. */
2998        ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
2999                        (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
3000        ASSERT(agno != NULLAGNUMBER);
3001        error = xfs_read_agf(mp, tp, agno,
3002                        (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
3003                        bpp);
3004        if (error)
3005                return error;
3006        ASSERT(!(*bpp)->b_error);
3007
3008        agf = (*bpp)->b_addr;
3009        pag = xfs_perag_get(mp, agno);
3010        if (!pag->pagf_init) {
3011                pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
3012                pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
3013                pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3014                pag->pagf_longest = be32_to_cpu(agf->agf_longest);
3015                pag->pagf_levels[XFS_BTNUM_BNOi] =
3016                        be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
3017                pag->pagf_levels[XFS_BTNUM_CNTi] =
3018                        be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
3019                pag->pagf_levels[XFS_BTNUM_RMAPi] =
3020                        be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
3021                pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
3022                pag->pagf_init = 1;
3023                pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
3024        }
3025#ifdef DEBUG
3026        else if (!XFS_FORCED_SHUTDOWN(mp)) {
3027                ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
3028                ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
3029                ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3030                ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
3031                ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
3032                       be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
3033                ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
3034                       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
3035        }
3036#endif
3037        xfs_perag_put(pag);
3038        return 0;
3039}
3040
3041/*
3042 * Allocate an extent (variable-size).
3043 * Depending on the allocation type, we either look in a single allocation
3044 * group or loop over the allocation groups to find the result.
3045 */
3046int                             /* error */
3047xfs_alloc_vextent(
3048        struct xfs_alloc_arg    *args)  /* allocation argument structure */
3049{
3050        xfs_agblock_t           agsize; /* allocation group size */
3051        int                     error;
3052        int                     flags;  /* XFS_ALLOC_FLAG_... locking flags */
3053        struct xfs_mount        *mp;    /* mount structure pointer */
3054        xfs_agnumber_t          sagno;  /* starting allocation group number */
3055        xfs_alloctype_t         type;   /* input allocation type */
3056        int                     bump_rotor = 0;
3057        xfs_agnumber_t          rotorstep = xfs_rotorstep; /* inode32 agf stepper */
3058
3059        mp = args->mp;
3060        type = args->otype = args->type;
3061        args->agbno = NULLAGBLOCK;
3062        /*
3063         * Just fix this up, for the case where the last a.g. is shorter
3064         * (or there's only one a.g.) and the caller couldn't easily figure
3065         * that out (xfs_bmap_alloc).
3066         */
3067        agsize = mp->m_sb.sb_agblocks;
3068        if (args->maxlen > agsize)
3069                args->maxlen = agsize;
3070        if (args->alignment == 0)
3071                args->alignment = 1;
3072        ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
3073        ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
3074        ASSERT(args->minlen <= args->maxlen);
3075        ASSERT(args->minlen <= agsize);
3076        ASSERT(args->mod < args->prod);
3077        if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
3078            XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
3079            args->minlen > args->maxlen || args->minlen > agsize ||
3080            args->mod >= args->prod) {
3081                args->fsbno = NULLFSBLOCK;
3082                trace_xfs_alloc_vextent_badargs(args);
3083                return 0;
3084        }
3085
3086        switch (type) {
3087        case XFS_ALLOCTYPE_THIS_AG:
3088        case XFS_ALLOCTYPE_NEAR_BNO:
3089        case XFS_ALLOCTYPE_THIS_BNO:
3090                /*
3091                 * These three force us into a single a.g.
3092                 */
3093                args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3094                args->pag = xfs_perag_get(mp, args->agno);
3095                error = xfs_alloc_fix_freelist(args, 0);
3096                if (error) {
3097                        trace_xfs_alloc_vextent_nofix(args);
3098                        goto error0;
3099                }
3100                if (!args->agbp) {
3101                        trace_xfs_alloc_vextent_noagbp(args);
3102                        break;
3103                }
3104                args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3105                if ((error = xfs_alloc_ag_vextent(args)))
3106                        goto error0;
3107                break;
3108        case XFS_ALLOCTYPE_START_BNO:
3109                /*
3110                 * Try near allocation first, then anywhere-in-ag after
3111                 * the first a.g. fails.
3112                 */
3113                if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3114                    (mp->m_flags & XFS_MOUNT_32BITINODES)) {
3115                        args->fsbno = XFS_AGB_TO_FSB(mp,
3116                                        ((mp->m_agfrotor / rotorstep) %
3117                                        mp->m_sb.sb_agcount), 0);
3118                        bump_rotor = 1;
3119                }
3120                args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3121                args->type = XFS_ALLOCTYPE_NEAR_BNO;
3122                /* FALLTHROUGH */
3123        case XFS_ALLOCTYPE_FIRST_AG:
3124                /*
3125                 * Rotate through the allocation groups looking for a winner.
3126                 */
3127                if (type == XFS_ALLOCTYPE_FIRST_AG) {
3128                        /*
3129                         * Start with allocation group given by bno.
3130                         */
3131                        args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3132                        args->type = XFS_ALLOCTYPE_THIS_AG;
3133                        sagno = 0;
3134                        flags = 0;
3135                } else {
3136                        /*
3137                         * Start with the given allocation group.
3138                         */
3139                        args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3140                        flags = XFS_ALLOC_FLAG_TRYLOCK;
3141                }
3142                /*
3143                 * Loop over allocation groups twice; first time with
3144                 * trylock set, second time without.
3145                 */
3146                for (;;) {
3147                        args->pag = xfs_perag_get(mp, args->agno);
3148                        error = xfs_alloc_fix_freelist(args, flags);
3149                        if (error) {
3150                                trace_xfs_alloc_vextent_nofix(args);
3151                                goto error0;
3152                        }
3153                        /*
3154                         * If we get a buffer back then the allocation will fly.
3155                         */
3156                        if (args->agbp) {
3157                                if ((error = xfs_alloc_ag_vextent(args)))
3158                                        goto error0;
3159                                break;
3160                        }
3161
3162                        trace_xfs_alloc_vextent_loopfailed(args);
3163
3164                        /*
3165                         * Didn't work, figure out the next iteration.
3166                         */
3167                        if (args->agno == sagno &&
3168                            type == XFS_ALLOCTYPE_START_BNO)
3169                                args->type = XFS_ALLOCTYPE_THIS_AG;
3170                        /*
3171                        * For the first allocation, we can try any AG to get
3172                        * space.  However, if we already have allocated a
3173                        * block, we don't want to try AGs whose number is below
3174                        * sagno. Otherwise, we may end up with out-of-order
3175                        * locking of AGF, which might cause deadlock.
3176                        */
3177                        if (++(args->agno) == mp->m_sb.sb_agcount) {
3178                                if (args->tp->t_firstblock != NULLFSBLOCK)
3179                                        args->agno = sagno;
3180                                else
3181                                        args->agno = 0;
3182                        }
3183                        /*
3184                         * Reached the starting a.g., must either be done
3185                         * or switch to non-trylock mode.
3186                         */
3187                        if (args->agno == sagno) {
3188                                if (flags == 0) {
3189                                        args->agbno = NULLAGBLOCK;
3190                                        trace_xfs_alloc_vextent_allfailed(args);
3191                                        break;
3192                                }
3193
3194                                flags = 0;
3195                                if (type == XFS_ALLOCTYPE_START_BNO) {
3196                                        args->agbno = XFS_FSB_TO_AGBNO(mp,
3197                                                args->fsbno);
3198                                        args->type = XFS_ALLOCTYPE_NEAR_BNO;
3199                                }
3200                        }
3201                        xfs_perag_put(args->pag);
3202                }
3203                if (bump_rotor) {
3204                        if (args->agno == sagno)
3205                                mp->m_agfrotor = (mp->m_agfrotor + 1) %
3206                                        (mp->m_sb.sb_agcount * rotorstep);
3207                        else
3208                                mp->m_agfrotor = (args->agno * rotorstep + 1) %
3209                                        (mp->m_sb.sb_agcount * rotorstep);
3210                }
3211                break;
3212        default:
3213                ASSERT(0);
3214                /* NOTREACHED */
3215        }
3216        if (args->agbno == NULLAGBLOCK)
3217                args->fsbno = NULLFSBLOCK;
3218        else {
3219                args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
3220#ifdef DEBUG
3221                ASSERT(args->len >= args->minlen);
3222                ASSERT(args->len <= args->maxlen);
3223                ASSERT(args->agbno % args->alignment == 0);
3224                XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
3225                        args->len);
3226#endif
3227
3228        }
3229        xfs_perag_put(args->pag);
3230        return 0;
3231error0:
3232        xfs_perag_put(args->pag);
3233        return error;
3234}
3235
3236/* Ensure that the freelist is at full capacity. */
3237int
3238xfs_free_extent_fix_freelist(
3239        struct xfs_trans        *tp,
3240        xfs_agnumber_t          agno,
3241        struct xfs_buf          **agbp)
3242{
3243        struct xfs_alloc_arg    args;
3244        int                     error;
3245
3246        memset(&args, 0, sizeof(struct xfs_alloc_arg));
3247        args.tp = tp;
3248        args.mp = tp->t_mountp;
3249        args.agno = agno;
3250
3251        /*
3252         * validate that the block number is legal - the enables us to detect
3253         * and handle a silent filesystem corruption rather than crashing.
3254         */
3255        if (args.agno >= args.mp->m_sb.sb_agcount)
3256                return -EFSCORRUPTED;
3257
3258        args.pag = xfs_perag_get(args.mp, args.agno);
3259        ASSERT(args.pag);
3260
3261        error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3262        if (error)
3263                goto out;
3264
3265        *agbp = args.agbp;
3266out:
3267        xfs_perag_put(args.pag);
3268        return error;
3269}
3270
3271/*
3272 * Free an extent.
3273 * Just break up the extent address and hand off to xfs_free_ag_extent
3274 * after fixing up the freelist.
3275 */
3276int
3277__xfs_free_extent(
3278        struct xfs_trans                *tp,
3279        xfs_fsblock_t                   bno,
3280        xfs_extlen_t                    len,
3281        const struct xfs_owner_info     *oinfo,
3282        enum xfs_ag_resv_type           type,
3283        bool                            skip_discard)
3284{
3285        struct xfs_mount                *mp = tp->t_mountp;
3286        struct xfs_buf                  *agbp;
3287        xfs_agnumber_t                  agno = XFS_FSB_TO_AGNO(mp, bno);
3288        xfs_agblock_t                   agbno = XFS_FSB_TO_AGBNO(mp, bno);
3289        struct xfs_agf                  *agf;
3290        int                             error;
3291        unsigned int                    busy_flags = 0;
3292
3293        ASSERT(len != 0);
3294        ASSERT(type != XFS_AG_RESV_AGFL);
3295
3296        if (XFS_TEST_ERROR(false, mp,
3297                        XFS_ERRTAG_FREE_EXTENT))
3298                return -EIO;
3299
3300        error = xfs_free_extent_fix_freelist(tp, agno, &agbp);
3301        if (error)
3302                return error;
3303        agf = agbp->b_addr;
3304
3305        if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
3306                error = -EFSCORRUPTED;
3307                goto err;
3308        }
3309
3310        /* validate the extent size is legal now we have the agf locked */
3311        if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
3312                error = -EFSCORRUPTED;
3313                goto err;
3314        }
3315
3316        error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
3317        if (error)
3318                goto err;
3319
3320        if (skip_discard)
3321                busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
3322        xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags);
3323        return 0;
3324
3325err:
3326        xfs_trans_brelse(tp, agbp);
3327        return error;
3328}
3329
3330struct xfs_alloc_query_range_info {
3331        xfs_alloc_query_range_fn        fn;
3332        void                            *priv;
3333};
3334
3335/* Format btree record and pass to our callback. */
3336STATIC int
3337xfs_alloc_query_range_helper(
3338        struct xfs_btree_cur            *cur,
3339        union xfs_btree_rec             *rec,
3340        void                            *priv)
3341{
3342        struct xfs_alloc_query_range_info       *query = priv;
3343        struct xfs_alloc_rec_incore             irec;
3344
3345        irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
3346        irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
3347        return query->fn(cur, &irec, query->priv);
3348}
3349
3350/* Find all free space within a given range of blocks. */
3351int
3352xfs_alloc_query_range(
3353        struct xfs_btree_cur                    *cur,
3354        struct xfs_alloc_rec_incore             *low_rec,
3355        struct xfs_alloc_rec_incore             *high_rec,
3356        xfs_alloc_query_range_fn                fn,
3357        void                                    *priv)
3358{
3359        union xfs_btree_irec                    low_brec;
3360        union xfs_btree_irec                    high_brec;
3361        struct xfs_alloc_query_range_info       query;
3362
3363        ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3364        low_brec.a = *low_rec;
3365        high_brec.a = *high_rec;
3366        query.priv = priv;
3367        query.fn = fn;
3368        return xfs_btree_query_range(cur, &low_brec, &high_brec,
3369                        xfs_alloc_query_range_helper, &query);
3370}
3371
3372/* Find all free space records. */
3373int
3374xfs_alloc_query_all(
3375        struct xfs_btree_cur                    *cur,
3376        xfs_alloc_query_range_fn                fn,
3377        void                                    *priv)
3378{
3379        struct xfs_alloc_query_range_info       query;
3380
3381        ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3382        query.priv = priv;
3383        query.fn = fn;
3384        return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
3385}
3386
3387/* Is there a record covering a given extent? */
3388int
3389xfs_alloc_has_record(
3390        struct xfs_btree_cur    *cur,
3391        xfs_agblock_t           bno,
3392        xfs_extlen_t            len,
3393        bool                    *exists)
3394{
3395        union xfs_btree_irec    low;
3396        union xfs_btree_irec    high;
3397
3398        memset(&low, 0, sizeof(low));
3399        low.a.ar_startblock = bno;
3400        memset(&high, 0xFF, sizeof(high));
3401        high.a.ar_startblock = bno + len - 1;
3402
3403        return xfs_btree_has_record(cur, &low, &high, exists);
3404}
3405
3406/*
3407 * Walk all the blocks in the AGFL.  The @walk_fn can return any negative
3408 * error code or XFS_ITER_*.
3409 */
3410int
3411xfs_agfl_walk(
3412        struct xfs_mount        *mp,
3413        struct xfs_agf          *agf,
3414        struct xfs_buf          *agflbp,
3415        xfs_agfl_walk_fn        walk_fn,
3416        void                    *priv)
3417{
3418        __be32                  *agfl_bno;
3419        unsigned int            i;
3420        int                     error;
3421
3422        agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3423        i = be32_to_cpu(agf->agf_flfirst);
3424
3425        /* Nothing to walk in an empty AGFL. */
3426        if (agf->agf_flcount == cpu_to_be32(0))
3427                return 0;
3428
3429        /* Otherwise, walk from first to last, wrapping as needed. */
3430        for (;;) {
3431                error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
3432                if (error)
3433                        return error;
3434                if (i == be32_to_cpu(agf->agf_fllast))
3435                        break;
3436                if (++i == xfs_agfl_size(mp))
3437                        i = 0;
3438        }
3439
3440        return 0;
3441}
3442