linux/fs/xfs/libxfs/xfs_alloc.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"

extern kmem_zone_t      *xfs_bmap_free_item_zone;

struct workqueue_struct *xfs_alloc_wq;

#define XFS_ABSDIFF(a,b)        (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))

#define XFSA_FIXUP_BNO_OK       1
#define XFSA_FIXUP_CNT_OK       2

STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);

/*
 * Size of the AGFL.  For CRC-enabled filesystems we steal a couple of slots in
 * the beginning of the block for a proper header with the location information
 * and CRC.
 */
unsigned int
xfs_agfl_size(
        struct xfs_mount        *mp)
{
        unsigned int            size = mp->m_sb.sb_sectsize;

        if (xfs_has_crc(mp))
                size -= sizeof(struct xfs_agfl);

        return size / sizeof(xfs_agblock_t);
}
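
/*
 * Illustrative example (not part of the original source): assuming 512-byte
 * sectors and a 36-byte v5 struct xfs_agfl header, a CRC-enabled filesystem
 * gets (512 - 36) / sizeof(xfs_agblock_t) = 476 / 4 = 119 AGFL slots, while
 * a non-CRC filesystem uses the whole sector: 512 / 4 = 128 slots.
 */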

unsigned int
xfs_refc_block(
        struct xfs_mount        *mp)
{
        if (xfs_has_rmapbt(mp))
                return XFS_RMAP_BLOCK(mp) + 1;
        if (xfs_has_finobt(mp))
                return XFS_FIBT_BLOCK(mp) + 1;
        return XFS_IBT_BLOCK(mp) + 1;
}

xfs_extlen_t
xfs_prealloc_blocks(
        struct xfs_mount        *mp)
{
        if (xfs_has_reflink(mp))
                return xfs_refc_block(mp) + 1;
        if (xfs_has_rmapbt(mp))
                return XFS_RMAP_BLOCK(mp) + 1;
        if (xfs_has_finobt(mp))
                return XFS_FIBT_BLOCK(mp) + 1;
        return XFS_IBT_BLOCK(mp) + 1;
}

/*
 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
 * AGF buffer (PV 947395), we place constraints on the relationship among
 * actual allocations for data blocks, freelist blocks, and potential file data
 * bmap btree blocks. However, these restrictions may leave no space actually
 * allocated for a delayed extent; for example, a data block in some AG is
 * allocated, but no additional block is available for the extra bmap btree
 * block that a split of the file's bmap btree would require. This can lead to
 * an infinite loop when the file gets flushed to disk and all delayed extents
 * need to be actually allocated. To get around this, we explicitly set aside
 * a few blocks which will not be reserved in delayed allocation.
 *
 * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a
 * potential split of the file's bmap btree.
 */
unsigned int
xfs_alloc_set_aside(
        struct xfs_mount        *mp)
{
        return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4);
}
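
/*
 * Worked example (illustration only): with XFS_ALLOC_AGFL_RESERVE defined as
 * 4, a filesystem with sb_agcount = 16 sets aside 16 * (4 + 4) = 128 blocks
 * that delayed allocation may never reserve.
 */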

/*
 * When deciding how much space to allocate out of an AG, we limit the
 * allocation maximum size to the size of the AG. However, we cannot use all
 * the blocks in the AG - some are permanently used by metadata. These
 * blocks are generally:
 *      - the AG superblock, AGF, AGI and AGFL
 *      - the AGF (bno and cnt) and AGI btree root blocks, and optionally
 *        the AGI free inode and rmap btree root blocks.
 *      - blocks on the AGFL according to xfs_alloc_set_aside() limits
 *      - the rmapbt root block
 *
 * The AG headers are sector sized, so the amount of space they take up is
 * dependent on filesystem geometry. The others are all single blocks.
 */
unsigned int
xfs_alloc_ag_max_usable(
        struct xfs_mount        *mp)
{
        unsigned int            blocks;

        blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
        blocks += XFS_ALLOC_AGFL_RESERVE;
        blocks += 3;                    /* AGF, AGI btree root blocks */
        if (xfs_has_finobt(mp))
                blocks++;               /* finobt root block */
        if (xfs_has_rmapbt(mp))
                blocks++;               /* rmap root block */
        if (xfs_has_reflink(mp))
                blocks++;               /* refcount root block */

        return mp->m_sb.sb_agblocks - blocks;
}
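
/*
 * Illustration (not from the original source): with 512-byte sectors and
 * 4096-byte blocks, the four sector-sized AG headers fit in one fsb, so a
 * filesystem with finobt, rmapbt and reflink enabled subtracts
 * 1 + XFS_ALLOC_AGFL_RESERVE + 3 + 1 + 1 + 1 = 11 blocks (with the reserve
 * defined as 4) from sb_agblocks.
 */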

/*
 * Lookup the record equal to [bno, len] in the btree given by cur.
 */
STATIC int                              /* error */
xfs_alloc_lookup_eq(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_agblock_t           bno,    /* starting block of extent */
        xfs_extlen_t            len,    /* length of extent */
        int                     *stat)  /* success/failure */
{
        int                     error;

        cur->bc_rec.a.ar_startblock = bno;
        cur->bc_rec.a.ar_blockcount = len;
        error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
        cur->bc_ag.abt.active = (*stat == 1);
        return error;
}

/*
 * Lookup the first record greater than or equal to [bno, len]
 * in the btree given by cur.
 */
int                             /* error */
xfs_alloc_lookup_ge(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_agblock_t           bno,    /* starting block of extent */
        xfs_extlen_t            len,    /* length of extent */
        int                     *stat)  /* success/failure */
{
        int                     error;

        cur->bc_rec.a.ar_startblock = bno;
        cur->bc_rec.a.ar_blockcount = len;
        error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
        cur->bc_ag.abt.active = (*stat == 1);
        return error;
}

/*
 * Lookup the first record less than or equal to [bno, len]
 * in the btree given by cur.
 */
int                                     /* error */
xfs_alloc_lookup_le(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_agblock_t           bno,    /* starting block of extent */
        xfs_extlen_t            len,    /* length of extent */
        int                     *stat)  /* success/failure */
{
        int                     error;

        cur->bc_rec.a.ar_startblock = bno;
        cur->bc_rec.a.ar_blockcount = len;
        error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
        cur->bc_ag.abt.active = (*stat == 1);
        return error;
}

static inline bool
xfs_alloc_cur_active(
        struct xfs_btree_cur    *cur)
{
        return cur && cur->bc_ag.abt.active;
}
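
/*
 * Typical usage sketch (illustration only, not part of the original source):
 * position a cursor with one of the lookup helpers, then pull the record it
 * points at:
 *
 *      error = xfs_alloc_lookup_ge(cur, bno, len, &i);
 *      if (error)
 *              return error;
 *      if (i) {
 *              error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
 *              ...
 *      }
 *
 * A zero *stat from the lookup means no record exists in that direction.
 */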

/*
 * Update the record referred to by cur to the value given
 * by [bno, len].
 * This either works (return 0) or gets an EFSCORRUPTED error.
 */
STATIC int                              /* error */
xfs_alloc_update(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_agblock_t           bno,    /* starting block of extent */
        xfs_extlen_t            len)    /* length of extent */
{
        union xfs_btree_rec     rec;

        rec.alloc.ar_startblock = cpu_to_be32(bno);
        rec.alloc.ar_blockcount = cpu_to_be32(len);
        return xfs_btree_update(cur, &rec);
}

/*
 * Get the data from the pointed-to record.
 */
int                                     /* error */
xfs_alloc_get_rec(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_agblock_t           *bno,   /* output: starting block of extent */
        xfs_extlen_t            *len,   /* output: length of extent */
        int                     *stat)  /* output: success/failure */
{
        struct xfs_mount        *mp = cur->bc_mp;
        xfs_agnumber_t          agno = cur->bc_ag.pag->pag_agno;
        union xfs_btree_rec     *rec;
        int                     error;

        error = xfs_btree_get_rec(cur, &rec, stat);
        if (error || !(*stat))
                return error;

        *bno = be32_to_cpu(rec->alloc.ar_startblock);
        *len = be32_to_cpu(rec->alloc.ar_blockcount);

        if (*len == 0)
                goto out_bad_rec;

        /* check for valid extent range, including overflow */
        if (!xfs_verify_agbno(mp, agno, *bno))
                goto out_bad_rec;
        if (*bno > *bno + *len)
                goto out_bad_rec;
        if (!xfs_verify_agbno(mp, agno, *bno + *len - 1))
                goto out_bad_rec;

        return 0;

out_bad_rec:
        xfs_warn(mp,
                "%s Freespace BTree record corruption in AG %d detected!",
                cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", agno);
        xfs_warn(mp,
                "start block 0x%x block count 0x%x", *bno, *len);
        return -EFSCORRUPTED;
}
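
/*
 * Note on the overflow check above (illustrative, not from the original
 * source): xfs_agblock_t is a 32-bit type, so for a corrupt record with
 * *bno = 0xfffffff0 and *len = 0x20 the sum wraps to 0x10, making
 * *bno > *bno + *len true, and the record is rejected.
 */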

/*
 * Compute aligned version of the found extent.
 * Takes alignment and min length into account.
 */
STATIC bool
xfs_alloc_compute_aligned(
        xfs_alloc_arg_t *args,          /* allocation argument structure */
        xfs_agblock_t   foundbno,       /* starting block in found extent */
        xfs_extlen_t    foundlen,       /* length in found extent */
        xfs_agblock_t   *resbno,        /* result block number */
        xfs_extlen_t    *reslen,        /* result length */
        unsigned        *busy_gen)
{
        xfs_agblock_t   bno = foundbno;
        xfs_extlen_t    len = foundlen;
        xfs_extlen_t    diff;
        bool            busy;

        /* Trim busy sections out of found extent */
        busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);

        /*
         * If we have a largish extent that happens to start before min_agbno,
         * see if we can shift it into range...
         */
        if (bno < args->min_agbno && bno + len > args->min_agbno) {
                diff = args->min_agbno - bno;
                if (len > diff) {
                        bno += diff;
                        len -= diff;
                }
        }

        if (args->alignment > 1 && len >= args->minlen) {
                xfs_agblock_t   aligned_bno = roundup(bno, args->alignment);

                diff = aligned_bno - bno;

                *resbno = aligned_bno;
                *reslen = diff >= len ? 0 : len - diff;
        } else {
                *resbno = bno;
                *reslen = len;
        }

        return busy;
}
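
/*
 * Worked example (illustration only): with alignment = 4 and minlen = 4, a
 * found extent of bno = 7, len = 10 rounds up to aligned_bno = 8, diff = 1,
 * so the aligned result is bno = 8 with len = 9. If diff were >= len, no
 * aligned space would remain and *reslen would be 0.
 */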

/*
 * Compute best start block and diff for "near" allocations.
 * freelen >= wantlen already checked by caller.
 */
STATIC xfs_extlen_t                     /* difference value (absolute) */
xfs_alloc_compute_diff(
        xfs_agblock_t   wantbno,        /* target starting block */
        xfs_extlen_t    wantlen,        /* target length */
        xfs_extlen_t    alignment,      /* target alignment */
        int             datatype,       /* are we allocating data? */
        xfs_agblock_t   freebno,        /* freespace's starting block */
        xfs_extlen_t    freelen,        /* freespace's length */
        xfs_agblock_t   *newbnop)       /* result: best start block from free */
{
        xfs_agblock_t   freeend;        /* end of freespace extent */
        xfs_agblock_t   newbno1;        /* return block number */
        xfs_agblock_t   newbno2;        /* other new block number */
        xfs_extlen_t    newlen1=0;      /* length with newbno1 */
        xfs_extlen_t    newlen2=0;      /* length with newbno2 */
        xfs_agblock_t   wantend;        /* end of target extent */
        bool            userdata = datatype & XFS_ALLOC_USERDATA;

        ASSERT(freelen >= wantlen);
        freeend = freebno + freelen;
        wantend = wantbno + wantlen;
        /*
         * We want to allocate from the start of a free extent if it is past
         * the desired block or if we are allocating user data and the free
         * extent is before desired block. The second case is there to allow
         * for contiguous allocation from the remaining free space if the file
         * grows in the short term.
         */
        if (freebno >= wantbno || (userdata && freeend < wantend)) {
                if ((newbno1 = roundup(freebno, alignment)) >= freeend)
                        newbno1 = NULLAGBLOCK;
        } else if (freeend >= wantend && alignment > 1) {
                newbno1 = roundup(wantbno, alignment);
                newbno2 = newbno1 - alignment;
                if (newbno1 >= freeend)
                        newbno1 = NULLAGBLOCK;
                else
                        newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
                if (newbno2 < freebno)
                        newbno2 = NULLAGBLOCK;
                else
                        newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
                if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
                        if (newlen1 < newlen2 ||
                            (newlen1 == newlen2 &&
                             XFS_ABSDIFF(newbno1, wantbno) >
                             XFS_ABSDIFF(newbno2, wantbno)))
                                newbno1 = newbno2;
                } else if (newbno2 != NULLAGBLOCK)
                        newbno1 = newbno2;
        } else if (freeend >= wantend) {
                newbno1 = wantbno;
        } else if (alignment > 1) {
                newbno1 = roundup(freeend - wantlen, alignment);
                if (newbno1 > freeend - wantlen &&
                    newbno1 - alignment >= freebno)
                        newbno1 -= alignment;
                else if (newbno1 >= freeend)
                        newbno1 = NULLAGBLOCK;
        } else
                newbno1 = freeend - wantlen;
        *newbnop = newbno1;
        return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
}
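
/*
 * Worked example (illustration only): wantbno = 100, wantlen = 10,
 * alignment = 1, metadata, and a free extent with freebno = 60, freelen = 80
 * (freeend = 140). Since freebno < wantbno and freeend >= wantend (110), the
 * request fits entirely and *newbnop = wantbno = 100 with a returned diff of
 * 0. Had the free extent started at 120 instead (freeend = 200), the first
 * branch would pick *newbnop = 120 and return diff = 20.
 */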

/*
 * Fix up the length, based on mod and prod.
 * len should be k * prod + mod for some k.
 * If len is too small it is returned unchanged.
 * If len hits maxlen it is left alone.
 */
STATIC void
xfs_alloc_fix_len(
        xfs_alloc_arg_t *args)          /* allocation argument structure */
{
        xfs_extlen_t    k;
        xfs_extlen_t    rlen;

        ASSERT(args->mod < args->prod);
        rlen = args->len;
        ASSERT(rlen >= args->minlen);
        ASSERT(rlen <= args->maxlen);
        if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
            (args->mod == 0 && rlen < args->prod))
                return;
        k = rlen % args->prod;
        if (k == args->mod)
                return;
        if (k > args->mod)
                rlen = rlen - (k - args->mod);
        else
                rlen = rlen - args->prod + (args->mod - k);
        /* casts to (int) catch length underflows */
        if ((int)rlen < (int)args->minlen)
                return;
        ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
        ASSERT(rlen % args->prod == args->mod);
        ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
                rlen + args->minleft);
        args->len = rlen;
}
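
/*
 * Worked example (illustration only): prod = 4, mod = 1, len = 11. Then
 * k = 11 % 4 = 3 > mod, so len becomes 11 - (3 - 1) = 9, and 9 % 4 == 1 as
 * required. With len = 5 instead, k = 1 == mod and the length is unchanged.
 */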

/*
 * Update the two btrees, logically removing from freespace the extent
 * starting at rbno, rlen blocks.  The extent is contained within the
 * actual (current) free extent fbno for flen blocks.
 * Flags are passed in indicating whether the cursors are set to the
 * relevant records.
 */
STATIC int                              /* error code */
xfs_alloc_fixup_trees(
        xfs_btree_cur_t *cnt_cur,       /* cursor for by-size btree */
        xfs_btree_cur_t *bno_cur,       /* cursor for by-block btree */
        xfs_agblock_t   fbno,           /* starting block of free extent */
        xfs_extlen_t    flen,           /* length of free extent */
        xfs_agblock_t   rbno,           /* starting block of returned extent */
        xfs_extlen_t    rlen,           /* length of returned extent */
        int             flags)          /* flags, XFSA_FIXUP_... */
{
        int             error;          /* error code */
        int             i;              /* operation results */
        xfs_agblock_t   nfbno1;         /* first new free startblock */
        xfs_agblock_t   nfbno2;         /* second new free startblock */
        xfs_extlen_t    nflen1=0;       /* first new free length */
        xfs_extlen_t    nflen2=0;       /* second new free length */
        struct xfs_mount *mp;

        mp = cnt_cur->bc_mp;

        /*
         * Look up the record in the by-size tree if necessary.
         */
        if (flags & XFSA_FIXUP_CNT_OK) {
#ifdef DEBUG
                if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
                        return error;
                if (XFS_IS_CORRUPT(mp,
                                   i != 1 ||
                                   nfbno1 != fbno ||
                                   nflen1 != flen))
                        return -EFSCORRUPTED;
#endif
        } else {
                if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
                        return error;
                if (XFS_IS_CORRUPT(mp, i != 1))
                        return -EFSCORRUPTED;
        }
        /*
         * Look up the record in the by-block tree if necessary.
         */
        if (flags & XFSA_FIXUP_BNO_OK) {
#ifdef DEBUG
                if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
                        return error;
                if (XFS_IS_CORRUPT(mp,
                                   i != 1 ||
                                   nfbno1 != fbno ||
                                   nflen1 != flen))
                        return -EFSCORRUPTED;
#endif
        } else {
                if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
                        return error;
                if (XFS_IS_CORRUPT(mp, i != 1))
                        return -EFSCORRUPTED;
        }

#ifdef DEBUG
        if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
                struct xfs_btree_block  *bnoblock;
                struct xfs_btree_block  *cntblock;

                bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
                cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);

                if (XFS_IS_CORRUPT(mp,
                                   bnoblock->bb_numrecs !=
                                   cntblock->bb_numrecs))
                        return -EFSCORRUPTED;
        }
#endif

        /*
         * Deal with all four cases: the allocated record is contained
         * within the freespace record, so we can have new freespace
         * at either (or both) end, or no freespace remaining.
         */
        if (rbno == fbno && rlen == flen)
                nfbno1 = nfbno2 = NULLAGBLOCK;
        else if (rbno == fbno) {
                nfbno1 = rbno + rlen;
                nflen1 = flen - rlen;
                nfbno2 = NULLAGBLOCK;
        } else if (rbno + rlen == fbno + flen) {
                nfbno1 = fbno;
                nflen1 = flen - rlen;
                nfbno2 = NULLAGBLOCK;
        } else {
                nfbno1 = fbno;
                nflen1 = rbno - fbno;
                nfbno2 = rbno + rlen;
                nflen2 = (fbno + flen) - nfbno2;
        }
        /*
         * Delete the entry from the by-size btree.
         */
        if ((error = xfs_btree_delete(cnt_cur, &i)))
                return error;
        if (XFS_IS_CORRUPT(mp, i != 1))
                return -EFSCORRUPTED;
        /*
         * Add new by-size btree entry(s).
         */
        if (nfbno1 != NULLAGBLOCK) {
                if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
                        return error;
                if (XFS_IS_CORRUPT(mp, i != 0))
                        return -EFSCORRUPTED;
                if ((error = xfs_btree_insert(cnt_cur, &i)))
                        return error;
                if (XFS_IS_CORRUPT(mp, i != 1))
                        return -EFSCORRUPTED;
        }
        if (nfbno2 != NULLAGBLOCK) {
                if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
                        return error;
                if (XFS_IS_CORRUPT(mp, i != 0))
                        return -EFSCORRUPTED;
                if ((error = xfs_btree_insert(cnt_cur, &i)))
                        return error;
                if (XFS_IS_CORRUPT(mp, i != 1))
                        return -EFSCORRUPTED;
        }
        /*
         * Fix up the by-block btree entry(s).
         */
        if (nfbno1 == NULLAGBLOCK) {
                /*
                 * No remaining freespace, just delete the by-block tree entry.
                 */
                if ((error = xfs_btree_delete(bno_cur, &i)))
                        return error;
                if (XFS_IS_CORRUPT(mp, i != 1))
                        return -EFSCORRUPTED;
        } else {
                /*
                 * Update the by-block entry to start later or be shorter.
                 */
                if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
                        return error;
        }
        if (nfbno2 != NULLAGBLOCK) {
                /*
                 * 2 resulting free entries, need to add one.
                 */
                if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
                        return error;
                if (XFS_IS_CORRUPT(mp, i != 0))
                        return -EFSCORRUPTED;
                if ((error = xfs_btree_insert(bno_cur, &i)))
                        return error;
                if (XFS_IS_CORRUPT(mp, i != 1))
                        return -EFSCORRUPTED;
        }
        return 0;
}
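
/*
 * Worked example of the four cases (illustration only): taking the allocated
 * extent rbno = 12, rlen = 4 out of the free extent fbno = 10, flen = 10
 * hits the middle case: nfbno1 = 10, nflen1 = 12 - 10 = 2 ahead of the
 * allocation, and nfbno2 = 16, nflen2 = (10 + 10) - 16 = 4 behind it. If
 * rbno == fbno and rlen == flen, both new extents are NULLAGBLOCK and the
 * records are simply deleted.
 */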

static xfs_failaddr_t
xfs_agfl_verify(
        struct xfs_buf  *bp)
{
        struct xfs_mount *mp = bp->b_mount;
        struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
        __be32          *agfl_bno = xfs_buf_to_agfl_bno(bp);
        int             i;

        /*
         * There is no verification of non-crc AGFLs because mkfs does not
         * initialise the AGFL to zero or NULL. Hence the only valid part of the
         * AGFL is what the AGF says is active. We can't get to the AGF, so we
         * can't verify just those entries are valid.
         */
        if (!xfs_has_crc(mp))
                return NULL;

        if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
                return __this_address;
        if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
                return __this_address;
        /*
         * during growfs operations, the perag is not fully initialised,
         * so we can't use it for any useful checking. growfs ensures we can't
         * use it by using uncached buffers that don't have the perag attached
         * so we can detect and avoid this problem.
         */
        if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
                return __this_address;

        for (i = 0; i < xfs_agfl_size(mp); i++) {
                if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
                    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
                        return __this_address;
        }

        if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
                return __this_address;
        return NULL;
}

static void
xfs_agfl_read_verify(
        struct xfs_buf  *bp)
{
        struct xfs_mount *mp = bp->b_mount;
        xfs_failaddr_t  fa;

        /*
         * There is no verification of non-crc AGFLs because mkfs does not
         * initialise the AGFL to zero or NULL. Hence the only valid part of the
         * AGFL is what the AGF says is active. We can't get to the AGF, so we
         * can't verify just those entries are valid.
         */
        if (!xfs_has_crc(mp))
                return;

        if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
                xfs_verifier_error(bp, -EFSBADCRC, __this_address);
        else {
                fa = xfs_agfl_verify(bp);
                if (fa)
                        xfs_verifier_error(bp, -EFSCORRUPTED, fa);
        }
}

static void
xfs_agfl_write_verify(
        struct xfs_buf  *bp)
{
        struct xfs_mount        *mp = bp->b_mount;
        struct xfs_buf_log_item *bip = bp->b_log_item;
        xfs_failaddr_t          fa;

        /* no verification of non-crc AGFLs */
        if (!xfs_has_crc(mp))
                return;

        fa = xfs_agfl_verify(bp);
        if (fa) {
                xfs_verifier_error(bp, -EFSCORRUPTED, fa);
                return;
        }

        if (bip)
                XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);

        xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
}

const struct xfs_buf_ops xfs_agfl_buf_ops = {
        .name = "xfs_agfl",
        .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
        .verify_read = xfs_agfl_read_verify,
        .verify_write = xfs_agfl_write_verify,
        .verify_struct = xfs_agfl_verify,
};

/*
 * Read in the allocation group free block array.
 */
int                                     /* error */
xfs_alloc_read_agfl(
        xfs_mount_t     *mp,            /* mount point structure */
        xfs_trans_t     *tp,            /* transaction pointer */
        xfs_agnumber_t  agno,           /* allocation group number */
        struct xfs_buf  **bpp)          /* buffer for the ag free block array */
{
        struct xfs_buf  *bp;            /* return value */
        int             error;

        ASSERT(agno != NULLAGNUMBER);
        error = xfs_trans_read_buf(
                        mp, tp, mp->m_ddev_targp,
                        XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
                        XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
        if (error)
                return error;
        xfs_buf_set_ref(bp, XFS_AGFL_REF);
        *bpp = bp;
        return 0;
}

STATIC int
xfs_alloc_update_counters(
        struct xfs_trans        *tp,
        struct xfs_buf          *agbp,
        long                    len)
{
        struct xfs_agf          *agf = agbp->b_addr;

        agbp->b_pag->pagf_freeblks += len;
        be32_add_cpu(&agf->agf_freeblks, len);

        if (unlikely(be32_to_cpu(agf->agf_freeblks) >
                     be32_to_cpu(agf->agf_length))) {
                xfs_buf_mark_corrupt(agbp);
                return -EFSCORRUPTED;
        }

        xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
        return 0;
}

/*
 * Block allocation algorithm and data structures.
 */
struct xfs_alloc_cur {
        struct xfs_btree_cur            *cnt;   /* btree cursors */
        struct xfs_btree_cur            *bnolt;
        struct xfs_btree_cur            *bnogt;
        xfs_extlen_t                    cur_len;/* current search length */
        xfs_agblock_t                   rec_bno;/* extent startblock */
        xfs_extlen_t                    rec_len;/* extent length */
        xfs_agblock_t                   bno;    /* alloc bno */
        xfs_extlen_t                    len;    /* alloc len */
        xfs_extlen_t                    diff;   /* diff from search bno */
        unsigned int                    busy_gen;/* busy state */
        bool                            busy;
};

/*
 * Set up cursors, etc. in the extent allocation cursor. This function can be
 * called multiple times to reset an initialized structure without having to
 * reallocate cursors.
 */
static int
xfs_alloc_cur_setup(
        struct xfs_alloc_arg    *args,
        struct xfs_alloc_cur    *acur)
{
        int                     error;
        int                     i;

        ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);

        acur->cur_len = args->maxlen;
        acur->rec_bno = 0;
        acur->rec_len = 0;
        acur->bno = 0;
        acur->len = 0;
        acur->diff = -1;
        acur->busy = false;
        acur->busy_gen = 0;

        /*
         * Perform an initial cntbt lookup to check for availability of maxlen
         * extents. If this fails, we'll return -ENOSPC to signal the caller to
         * attempt a small allocation.
         */
        if (!acur->cnt)
                acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
                                        args->agbp, args->pag, XFS_BTNUM_CNT);
        error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
        if (error)
                return error;

        /*
         * Allocate the bnobt left and right search cursors.
         */
        if (!acur->bnolt)
                acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
                                        args->agbp, args->pag, XFS_BTNUM_BNO);
        if (!acur->bnogt)
                acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
                                        args->agbp, args->pag, XFS_BTNUM_BNO);
        return i == 1 ? 0 : -ENOSPC;
}

static void
xfs_alloc_cur_close(
        struct xfs_alloc_cur    *acur,
        bool                    error)
{
        int                     cur_error = XFS_BTREE_NOERROR;

        if (error)
                cur_error = XFS_BTREE_ERROR;

        if (acur->cnt)
                xfs_btree_del_cursor(acur->cnt, cur_error);
        if (acur->bnolt)
                xfs_btree_del_cursor(acur->bnolt, cur_error);
        if (acur->bnogt)
                xfs_btree_del_cursor(acur->bnogt, cur_error);
        acur->cnt = acur->bnolt = acur->bnogt = NULL;
}

/*
 * Check an extent for allocation and track the best available candidate in the
 * allocation structure. The cursor is deactivated if it has entered an out of
 * range state based on allocation arguments. Optionally return the extent
 * geometry and allocation status if requested by the caller.
 */
static int
xfs_alloc_cur_check(
        struct xfs_alloc_arg    *args,
        struct xfs_alloc_cur    *acur,
        struct xfs_btree_cur    *cur,
        int                     *new)
{
        int                     error, i;
        xfs_agblock_t           bno, bnoa, bnew;
        xfs_extlen_t            len, lena, diff = -1;
        bool                    busy;
        unsigned                busy_gen = 0;
        bool                    deactivate = false;
        bool                    isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;

        *new = 0;

        error = xfs_alloc_get_rec(cur, &bno, &len, &i);
        if (error)
                return error;
        if (XFS_IS_CORRUPT(args->mp, i != 1))
                return -EFSCORRUPTED;

        /*
         * Check minlen and deactivate a cntbt cursor if out of acceptable size
         * range (i.e., walking backwards looking for a minlen extent).
         */
        if (len < args->minlen) {
                deactivate = !isbnobt;
                goto out;
        }

        busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
                                         &busy_gen);
        acur->busy |= busy;
        if (busy)
                acur->busy_gen = busy_gen;
        /* deactivate a bnobt cursor outside of locality range */
        if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
                deactivate = isbnobt;
                goto out;
        }
        if (lena < args->minlen)
                goto out;

        args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
        xfs_alloc_fix_len(args);
        ASSERT(args->len >= args->minlen);
        if (args->len < acur->len)
                goto out;

        /*
         * We have an aligned record that satisfies minlen and beats or matches
         * the candidate extent size. Compare locality for near allocation mode.
         */
        ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
        diff = xfs_alloc_compute_diff(args->agbno, args->len,
                                      args->alignment, args->datatype,
                                      bnoa, lena, &bnew);
        if (bnew == NULLAGBLOCK)
                goto out;

        /*
         * Deactivate a bnobt cursor with worse locality than the current best.
         */
        if (diff > acur->diff) {
                deactivate = isbnobt;
                goto out;
        }

        ASSERT(args->len > acur->len ||
               (args->len == acur->len && diff <= acur->diff));
        acur->rec_bno = bno;
        acur->rec_len = len;
        acur->bno = bnew;
        acur->len = args->len;
        acur->diff = diff;
        *new = 1;

        /*
         * We're done if we found a perfect allocation. This only deactivates
         * the current cursor, but this is just an optimization to terminate a
         * cntbt search that otherwise runs to the edge of the tree.
         */
        if (acur->diff == 0 && acur->len == args->maxlen)
                deactivate = true;
out:
        if (deactivate)
                cur->bc_ag.abt.active = false;
        trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
                                  *new);
        return 0;
}
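
/*
 * Worked example (illustration only): suppose the current best candidate has
 * acur->len = 8 and acur->diff = 5. A new record that also yields
 * args->len = 8 but diff = 3 survives both the args->len < acur->len and
 * diff > acur->diff checks and replaces the candidate; one with diff = 7
 * would instead deactivate a bnobt cursor, since walking further in that
 * direction can only increase the distance from the hint.
 */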

/*
 * Complete an allocation of a candidate extent. Remove the extent from both
 * trees and update the args structure.
 */
STATIC int
xfs_alloc_cur_finish(
        struct xfs_alloc_arg    *args,
        struct xfs_alloc_cur    *acur)
{
        struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
        int                     error;

        ASSERT(acur->cnt && acur->bnolt);
        ASSERT(acur->bno >= acur->rec_bno);
        ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
        ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));

        error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
                                      acur->rec_len, acur->bno, acur->len, 0);
        if (error)
                return error;

        args->agbno = acur->bno;
        args->len = acur->len;
        args->wasfromfl = 0;

        trace_xfs_alloc_cur(args);
        return 0;
}

/*
 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
 * bno optimized lookup to search for extents with ideal size and locality.
 */
STATIC int
xfs_alloc_cntbt_iter(
        struct xfs_alloc_arg            *args,
        struct xfs_alloc_cur            *acur)
{
        struct xfs_btree_cur    *cur = acur->cnt;
        xfs_agblock_t           bno;
        xfs_extlen_t            len, cur_len;
        int                     error;
        int                     i;

        if (!xfs_alloc_cur_active(cur))
                return 0;

        /* locality optimized lookup */
        cur_len = acur->cur_len;
        error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
        if (error)
                return error;
        if (i == 0)
                return 0;
        error = xfs_alloc_get_rec(cur, &bno, &len, &i);
        if (error)
                return error;

        /* check the current record and update search length from it */
        error = xfs_alloc_cur_check(args, acur, cur, &i);
        if (error)
                return error;
        ASSERT(len >= acur->cur_len);
        acur->cur_len = len;

        /*
         * We looked up the first record >= [agbno, len] above. The agbno is a
         * secondary key and so the current record may lie just before or after
         * agbno. If it is past agbno, check the previous record too so long as
         * the length matches as it may be closer. Don't check a smaller record
         * because that could deactivate our cursor.
         */
        if (bno > args->agbno) {
                error = xfs_btree_decrement(cur, 0, &i);
                if (!error && i) {
                        error = xfs_alloc_get_rec(cur, &bno, &len, &i);
                        if (!error && i && len == acur->cur_len)
                                error = xfs_alloc_cur_check(args, acur, cur,
                                                            &i);
                }
                if (error)
                        return error;
        }

        /*
         * Bump the search key by one if we have not yet found an allocation
         * candidate, or if the extent we just found is already at least twice
         * the old key. Otherwise, double the search key to optimize the
         * search. Efficiency is more important here than absolute best
         * locality.
         */
        cur_len <<= 1;
        if (!acur->len || acur->cur_len >= cur_len)
                acur->cur_len++;
        else
                acur->cur_len = cur_len;

        return error;
}
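
/*
 * Worked example (illustration only): with a search key of 8 and no
 * candidate yet, the next key is 9 regardless of what the lookup returned.
 * Once a candidate exists, finding a record of length 10 (< 16) doubles the
 * key to 16, while finding one of length 20 (>= 16) only bumps it to 21.
 */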

/*
 * Deal with the case where only small freespaces remain. Either return the
 * contents of the last freespace record, or allocate space from the freelist if
 * there is nothing in the tree.
 */
STATIC int                      /* error */
xfs_alloc_ag_vextent_small(
        struct xfs_alloc_arg    *args,  /* allocation argument structure */
        struct xfs_btree_cur    *ccur,  /* optional by-size cursor */
        xfs_agblock_t           *fbnop, /* result block number */
        xfs_extlen_t            *flenp, /* result length */
        int                     *stat)  /* status: 0-freelist, 1-normal/none */
{
        struct xfs_agf          *agf = args->agbp->b_addr;
        int                     error = 0;
        xfs_agblock_t           fbno = NULLAGBLOCK;
        xfs_extlen_t            flen = 0;
        int                     i = 0;

        /*
         * If a cntbt cursor is provided, try to allocate the largest record in
         * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
         * allocation. Make sure to respect minleft even when pulling from the
         * freelist.
         */
        if (ccur)
                error = xfs_btree_decrement(ccur, 0, &i);
        if (error)
                goto error;
        if (i) {
                error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
                if (error)
                        goto error;
                if (XFS_IS_CORRUPT(args->mp, i != 1)) {
                        error = -EFSCORRUPTED;
                        goto error;
                }
                goto out;
        }

        if (args->minlen != 1 || args->alignment != 1 ||
            args->resv == XFS_AG_RESV_AGFL ||
            be32_to_cpu(agf->agf_flcount) <= args->minleft)
                goto out;

        error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0);
        if (error)
                goto error;
        if (fbno == NULLAGBLOCK)
                goto out;

        xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
                              (args->datatype & XFS_ALLOC_NOBUSY));

        if (args->datatype & XFS_ALLOC_USERDATA) {
                struct xfs_buf  *bp;

                error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
                                XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
                                args->mp->m_bsize, 0, &bp);
                if (error)
                        goto error;
                xfs_trans_binval(args->tp, bp);
        }
        *fbnop = args->agbno = fbno;
        *flenp = args->len = 1;
        if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
                error = -EFSCORRUPTED;
                goto error;
        }
        args->wasfromfl = 1;
        trace_xfs_alloc_small_freelist(args);

        /*
         * If we're feeding an AGFL block to something that doesn't live in the
         * free space, we need to clear out the OWN_AG rmap.
         */
        error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
                              &XFS_RMAP_OINFO_AG);
        if (error)
                goto error;

        *stat = 0;
        return 0;

out:
        /*
         * Can't do the allocation, give up.
         */
        if (flen < args->minlen) {
                args->agbno = NULLAGBLOCK;
                trace_xfs_alloc_small_notenough(args);
                flen = 0;
        }
        *fbnop = fbno;
        *flenp = flen;
        *stat = 1;
        trace_xfs_alloc_small_done(args);
        return 0;

error:
        trace_xfs_alloc_small_error(args);
        return error;
}

/*
 * Allocate a variable extent in the allocation group agno.
 * Type and bno are used to determine where in the allocation group the
 * extent will start.
 * Extent's length (returned in *len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
STATIC int                      /* error */
xfs_alloc_ag_vextent(
        xfs_alloc_arg_t *args)  /* argument structure for allocation */
{
        int             error=0;

        ASSERT(args->minlen > 0);
        ASSERT(args->maxlen > 0);
        ASSERT(args->minlen <= args->maxlen);
        ASSERT(args->mod < args->prod);
        ASSERT(args->alignment > 0);

        /*
         * Branch to correct routine based on the type.
         */
        args->wasfromfl = 0;
        switch (args->type) {
        case XFS_ALLOCTYPE_THIS_AG:
                error = xfs_alloc_ag_vextent_size(args);
                break;
        case XFS_ALLOCTYPE_NEAR_BNO:
                error = xfs_alloc_ag_vextent_near(args);
                break;
        case XFS_ALLOCTYPE_THIS_BNO:
                error = xfs_alloc_ag_vextent_exact(args);
                break;
        default:
                ASSERT(0);
                /* NOTREACHED */
        }

        if (error || args->agbno == NULLAGBLOCK)
                return error;

        ASSERT(args->len >= args->minlen);
        ASSERT(args->len <= args->maxlen);
        ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
        ASSERT(args->agbno % args->alignment == 0);

        /* if not file data, insert new block into the reverse map btree */
        if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
                error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
                                       args->agbno, args->len, &args->oinfo);
                if (error)
                        return error;
        }

        if (!args->wasfromfl) {
                error = xfs_alloc_update_counters(args->tp, args->agbp,
                                                  -((long)(args->len)));
                if (error)
                        return error;

                ASSERT(!xfs_extent_busy_search(args->mp, args->pag,
                                              args->agbno, args->len));
        }

        xfs_ag_resv_alloc_extent(args->pag, args->resv, args);

        XFS_STATS_INC(args->mp, xs_allocx);
        XFS_STATS_ADD(args->mp, xs_allocb, args->len);
        return error;
}

/*
 * Allocate a variable extent at exactly agno/bno.
 * Extent's length (returned in *len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
 */
STATIC int                      /* error */
xfs_alloc_ag_vextent_exact(
        xfs_alloc_arg_t *args)  /* allocation argument structure */
{
        struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
        xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */
        xfs_btree_cur_t *cnt_cur;/* by count btree cursor */
        int             error;
        xfs_agblock_t   fbno;   /* start block of found extent */
        xfs_extlen_t    flen;   /* length of found extent */
        xfs_agblock_t   tbno;   /* start block of busy extent */
        xfs_extlen_t    tlen;   /* length of busy extent */
        xfs_agblock_t   tend;   /* end block of busy extent */
        int             i;      /* success/failure of operation */
        unsigned        busy_gen;

        ASSERT(args->alignment == 1);

        /*
         * Allocate/initialize a cursor for the by-number freespace btree.
         */
        bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
                                          args->pag, XFS_BTNUM_BNO);

        /*
         * Lookup bno and minlen in the btree (minlen is irrelevant, really).
         * Look for the closest free block <= bno, it must contain bno
         * if any free block does.
         */
        error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
        if (error)
                goto error0;
        if (!i)
                goto not_found;

        /*
         * Grab the freespace record.
         */
        error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
        if (error)
                goto error0;
        if (XFS_IS_CORRUPT(args->mp, i != 1)) {
                error = -EFSCORRUPTED;
                goto error0;
        }
        ASSERT(fbno <= args->agbno);

        /*
         * Check for overlapping busy extents.
         */
        tbno = fbno;
        tlen = flen;
        xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);

        /*
         * Give up if the start of the extent is busy, or the freespace isn't
         * long enough for the minimum request.
         */
        if (tbno > args->agbno)
                goto not_found;
        if (tlen < args->minlen)
                goto not_found;
        tend = tbno + tlen;
        if (tend < args->agbno + args->minlen)
                goto not_found;

        /*
         * End of extent will be smaller of the freespace end and the
         * maximal requested end.
         *
         * Fix the length according to mod and prod if given.
         */
        args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
                                                - args->agbno;
        xfs_alloc_fix_len(args);
        ASSERT(args->agbno + args->len <= tend);

        /*
         * We are allocating agbno for args->len
         * Allocate/initialize a cursor for the by-size btree.
         */
        cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
                                        args->pag, XFS_BTNUM_CNT);
        ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
        error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
                                      args->len, XFSA_FIXUP_BNO_OK);
        if (error) {
                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
                goto error0;
        }

        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);

        args->wasfromfl = 0;
        trace_xfs_alloc_exact_done(args);
        return 0;

not_found:
        /* Didn't find it, return null. */
        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
        args->agbno = NULLAGBLOCK;
        trace_xfs_alloc_exact_notfound(args);
        return 0;

error0:
        xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
        trace_xfs_alloc_exact_error(args);
        return error;
}
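
/*
 * Worked example (illustration only): for an exact request at agbno = 100
 * with minlen = 4 and maxlen = 16, suppose the bnobt holds fbno = 90,
 * flen = 40 and busy trimming shrinks it to tbno = 95, tlen = 20, so
 * tend = 115. Since tbno <= 100 and tend >= 104, the allocation succeeds
 * with args->len = min(115, 100 + 16) - 100 = 15 blocks at exactly 100.
 */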
1308
1309/*
1310 * Search a given number of btree records in a given direction. Check each
1311 * record against the good extent we've already found.
1312 */
1313STATIC int
1314xfs_alloc_walk_iter(
1315        struct xfs_alloc_arg    *args,
1316        struct xfs_alloc_cur    *acur,
1317        struct xfs_btree_cur    *cur,
1318        bool                    increment,
1319        bool                    find_one, /* quit on first candidate */
1320        int                     count,    /* rec count (-1 for infinite) */
1321        int                     *stat)
1322{
1323        int                     error;
1324        int                     i;
1325
1326        *stat = 0;
1327
1328        /*
1329         * Search so long as the cursor is active or we find a better extent.
1330         * The cursor is deactivated if it extends beyond the range of the
1331         * current allocation candidate.
1332         */
1333        while (xfs_alloc_cur_active(cur) && count) {
1334                error = xfs_alloc_cur_check(args, acur, cur, &i);
1335                if (error)
1336                        return error;
1337                if (i == 1) {
1338                        *stat = 1;
1339                        if (find_one)
1340                                break;
1341                }
1342                if (!xfs_alloc_cur_active(cur))
1343                        break;
1344
1345                if (increment)
1346                        error = xfs_btree_increment(cur, 0, &i);
1347                else
1348                        error = xfs_btree_decrement(cur, 0, &i);
1349                if (error)
1350                        return error;
1351                if (i == 0)
1352                        cur->bc_ag.abt.active = false;
1353
1354                if (count > 0)
1355                        count--;
1356        }
1357
1358        return 0;
1359}
1360
1361/*
1362 * Search the by-bno and by-size btrees in parallel in search of an extent with
1363 * ideal locality based on the NEAR mode ->agbno locality hint.
1364 */
1365STATIC int
1366xfs_alloc_ag_vextent_locality(
1367        struct xfs_alloc_arg    *args,
1368        struct xfs_alloc_cur    *acur,
1369        int                     *stat)
1370{
1371        struct xfs_btree_cur    *fbcur = NULL;
1372        int                     error;
1373        int                     i;
1374        bool                    fbinc;
1375
1376        ASSERT(acur->len == 0);
1377        ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
1378
1379        *stat = 0;
1380
1381        error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
1382        if (error)
1383                return error;
1384        error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
1385        if (error)
1386                return error;
1387        error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
1388        if (error)
1389                return error;
1390
1391        /*
1392         * Search the bnobt and cntbt in parallel. Search the bnobt left and
1393         * right and lookup the closest extent to the locality hint for each
1394         * extent size key in the cntbt. The entire search terminates
1395         * immediately on a bnobt hit because that means we've found best case
1396         * locality. Otherwise the search continues until the cntbt cursor runs
1397         * off the end of the tree. If no allocation candidate is found at this
1398         * point, give up on locality, walk backwards from the end of the cntbt
1399         * and take the first available extent.
1400         *
1401         * The parallel tree searches balance each other out to provide fairly
1402         * consistent performance for various situations. The bnobt search can
1403         * have pathological behavior in the worst case scenario of larger
1404         * allocation requests and fragmented free space. On the other hand, the
1405         * bnobt is able to satisfy most smaller allocation requests much more
1406         * quickly than the cntbt. The cntbt search can sift through fragmented
1407         * free space and sets of free extents for larger allocation requests
1408         * more quickly than the bnobt. Since the locality hint is just a hint
1409         * and we don't want to scan the entire bnobt for perfect locality, the
1410         * cntbt search essentially bounds the bnobt search such that we can
1411         * find good enough locality at reasonable performance in most cases.
1412         */
1413        while (xfs_alloc_cur_active(acur->bnolt) ||
1414               xfs_alloc_cur_active(acur->bnogt) ||
1415               xfs_alloc_cur_active(acur->cnt)) {
1416
1417                trace_xfs_alloc_cur_lookup(args);
1418
1419                /*
1420                 * Search the bnobt left and right. In the case of a hit, finish
1421                 * the search in the opposite direction and we're done.
1422                 */
1423                error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
1424                                            true, 1, &i);
1425                if (error)
1426                        return error;
1427                if (i == 1) {
1428                        trace_xfs_alloc_cur_left(args);
1429                        fbcur = acur->bnogt;
1430                        fbinc = true;
1431                        break;
1432                }
1433                error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
1434                                            1, &i);
1435                if (error)
1436                        return error;
1437                if (i == 1) {
1438                        trace_xfs_alloc_cur_right(args);
1439                        fbcur = acur->bnolt;
1440                        fbinc = false;
1441                        break;
1442                }
1443
1444                /*
1445                 * Check the extent with best locality based on the current
1446                 * extent size search key and keep track of the best candidate.
1447                 */
1448                error = xfs_alloc_cntbt_iter(args, acur);
1449                if (error)
1450                        return error;
1451                if (!xfs_alloc_cur_active(acur->cnt)) {
1452                        trace_xfs_alloc_cur_lookup_done(args);
1453                        break;
1454                }
1455        }
1456
1457        /*
1458         * If we failed to find anything due to busy extents, return
1459         * empty-handed so the caller can flush and retry. If no busy extents were
1460         * found, walk backwards from the end of the cntbt as a last resort.
1461         */
1462        if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
1463                error = xfs_btree_decrement(acur->cnt, 0, &i);
1464                if (error)
1465                        return error;
1466                if (i) {
1467                        acur->cnt->bc_ag.abt.active = true;
1468                        fbcur = acur->cnt;
1469                        fbinc = false;
1470                }
1471        }
1472
1473        /*
1474         * Search in the opposite direction for a better entry in the case of
1475         * a bnobt hit or walk backwards from the end of the cntbt.
1476         */
1477        if (fbcur) {
1478                error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
1479                                            &i);
1480                if (error)
1481                        return error;
1482        }
1483
1484        if (acur->len)
1485                *stat = 1;
1486
1487        return 0;
1488}
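
/*
 * Illustrative sketch (editor's addition, not part of the original file):
 * the candidate bookkeeping behind the walks above ultimately compares
 * distance from the ->agbno locality hint.  The real record tracking
 * (xfs_alloc_cur_check(), earlier in this file) also weighs extent length
 * and busy state; this helper models only the distance test, reusing the
 * XFS_ABSDIFF() macro defined at the top of the file.
 */
static inline bool
xfs_alloc_closer_to_hint(
        xfs_agblock_t           hint,           /* NEAR mode locality hint */
        xfs_agblock_t           newbno,         /* candidate extent start */
        xfs_agblock_t           bestbno)        /* best candidate so far */
{
        return XFS_ABSDIFF(hint, newbno) < XFS_ABSDIFF(hint, bestbno);
}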
1489
1490/* Check the last block of the cnt btree for allocations. */
1491static int
1492xfs_alloc_ag_vextent_lastblock(
1493        struct xfs_alloc_arg    *args,
1494        struct xfs_alloc_cur    *acur,
1495        xfs_agblock_t           *bno,
1496        xfs_extlen_t            *len,
1497        bool                    *allocated)
1498{
1499        int                     error;
1500        int                     i;
1501
1502#ifdef DEBUG
1503        /* Randomly don't execute the first algorithm. */
1504        if (prandom_u32() & 1)
1505                return 0;
1506#endif
1507
1508        /*
1509         * Start from the entry that lookup found, sequence through all larger
1510         * free blocks.  If we're actually pointing at a record smaller than
1511         * maxlen, go to the start of this block, and skip all those smaller
1512         * than minlen.
1513         */
1514        if (*len || args->alignment > 1) {
1515                acur->cnt->bc_ptrs[0] = 1;
1516                do {
1517                        error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
1518                        if (error)
1519                                return error;
1520                        if (XFS_IS_CORRUPT(args->mp, i != 1))
1521                                return -EFSCORRUPTED;
1522                        if (*len >= args->minlen)
1523                                break;
1524                        error = xfs_btree_increment(acur->cnt, 0, &i);
1525                        if (error)
1526                                return error;
1527                } while (i);
1528                ASSERT(*len >= args->minlen);
1529                if (!i)
1530                        return 0;
1531        }
1532
1533        error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
1534        if (error)
1535                return error;
1536
1537        /*
1538         * It didn't work.  We COULD be in a case where there's a good record
1539         * somewhere, so try again.
1540         */
1541        if (acur->len == 0)
1542                return 0;
1543
1544        trace_xfs_alloc_near_first(args);
1545        *allocated = true;
1546        return 0;
1547}
1548
1549/*
1550 * Allocate a variable extent near bno in the allocation group agno.
1551 * Extent's length (returned in len) will be between minlen and maxlen,
1552 * and of the form k * prod + mod unless there's nothing that large.
1553 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1554 */
1555STATIC int
1556xfs_alloc_ag_vextent_near(
1557        struct xfs_alloc_arg    *args)
1558{
1559        struct xfs_alloc_cur    acur = {};
1560        int                     error;          /* error code */
1561        int                     i;              /* result code, temporary */
1562        xfs_agblock_t           bno;
1563        xfs_extlen_t            len;
1564
1565        /* handle uninitialized agbno range so caller doesn't have to */
1566        if (!args->min_agbno && !args->max_agbno)
1567                args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1568        ASSERT(args->min_agbno <= args->max_agbno);
1569
1570        /* clamp agbno to the range if it's outside */
1571        if (args->agbno < args->min_agbno)
1572                args->agbno = args->min_agbno;
1573        if (args->agbno > args->max_agbno)
1574                args->agbno = args->max_agbno;
1575
1576restart:
1577        len = 0;
1578
1579        /*
1580         * Set up cursors and see if there are any free extents as big as
1581         * maxlen. If not, pick the last entry in the tree unless the tree is
1582         * empty.
1583         */
1584        error = xfs_alloc_cur_setup(args, &acur);
1585        if (error == -ENOSPC) {
1586                error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1587                                &len, &i);
1588                if (error)
1589                        goto out;
1590                if (i == 0 || len == 0) {
1591                        trace_xfs_alloc_near_noentry(args);
1592                        goto out;
1593                }
1594                ASSERT(i == 1);
1595        } else if (error) {
1596                goto out;
1597        }
1598
1599        /*
1600         * First algorithm.
1601         * If the requested extent is large relative to the free space
1602         * available in this a.g., then the cursor will be pointing to a btree entry
1603         * near the right edge of the tree.  If it's in the last btree leaf
1604         * block, then we just examine all the entries in that block
1605         * that are big enough, and pick the best one.
1606         */
1607        if (xfs_btree_islastblock(acur.cnt, 0)) {
1608                bool            allocated = false;
1609
1610                error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
1611                                &allocated);
1612                if (error)
1613                        goto out;
1614                if (allocated)
1615                        goto alloc_finish;
1616        }
1617
1618        /*
1619         * Second algorithm. Combined cntbt and bnobt search to find ideal
1620         * locality.
1621         */
1622        error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
1623        if (error)
1624                goto out;
1625
1626        /*
1627         * If we couldn't get anything, give up.
1628         */
1629        if (!acur.len) {
1630                if (acur.busy) {
1631                        trace_xfs_alloc_near_busy(args);
1632                        xfs_extent_busy_flush(args->mp, args->pag,
1633                                              acur.busy_gen);
1634                        goto restart;
1635                }
1636                trace_xfs_alloc_size_neither(args);
1637                args->agbno = NULLAGBLOCK;
1638                goto out;
1639        }
1640
1641alloc_finish:
1642        /* fix up btrees on a successful allocation */
1643        error = xfs_alloc_cur_finish(args, &acur);
1644
1645out:
1646        xfs_alloc_cur_close(&acur, error);
1647        return error;
1648}
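
/*
 * Worked example (editor's addition, hypothetical numbers): the
 * "k * prod + mod" rule above means the returned length is trimmed to the
 * largest value of that form that still fits.  With prod = 4, mod = 0,
 * minlen = 5 and maxlen = 12, a 12-block candidate is used as is, while an
 * 11-block candidate is cut back to 8 blocks (k = 2); the "unless there's
 * nothing that large" escape covers free space too fragmented to honor the
 * product.
 */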
1649
1650/*
1651 * Allocate a variable extent anywhere in the allocation group agno.
1652 * Extent's length (returned in len) will be between minlen and maxlen,
1653 * and of the form k * prod + mod unless there's nothing that large.
1654 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1655 */
1656STATIC int                              /* error */
1657xfs_alloc_ag_vextent_size(
1658        xfs_alloc_arg_t *args)          /* allocation argument structure */
1659{
1660        struct xfs_agf  *agf = args->agbp->b_addr;
1661        xfs_btree_cur_t *bno_cur;       /* cursor for bno btree */
1662        xfs_btree_cur_t *cnt_cur;       /* cursor for cnt btree */
1663        int             error;          /* error result */
1664        xfs_agblock_t   fbno;           /* start of found freespace */
1665        xfs_extlen_t    flen;           /* length of found freespace */
1666        int             i;              /* temp status variable */
1667        xfs_agblock_t   rbno;           /* returned block number */
1668        xfs_extlen_t    rlen;           /* length of returned extent */
1669        bool            busy;
1670        unsigned        busy_gen;
1671
1672restart:
1673        /*
1674         * Allocate and initialize a cursor for the by-size btree.
1675         */
1676        cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1677                                        args->pag, XFS_BTNUM_CNT);
1678        bno_cur = NULL;
1679
1680        /*
1681         * Look for an entry >= maxlen+alignment-1 blocks.
1682         */
1683        if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1684                        args->maxlen + args->alignment - 1, &i)))
1685                goto error0;
1686
1687        /*
1688         * If none was found, we have to settle for a smaller extent. If there
1689         * are no large extents at all, this will return the last entry in the
1690         * tree unless the tree is empty. If the only large extents are busy,
1691         * this will return the largest small extent unless there are no
1692         * smaller extents available.
1693         */
1694        if (!i) {
1695                error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1696                                                   &fbno, &flen, &i);
1697                if (error)
1698                        goto error0;
1699                if (i == 0 || flen == 0) {
1700                        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1701                        trace_xfs_alloc_size_noentry(args);
1702                        return 0;
1703                }
1704                ASSERT(i == 1);
1705                busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1706                                &rlen, &busy_gen);
1707        } else {
1708                /*
1709                 * Search for a non-busy extent that is large enough.
1710                 */
1711                for (;;) {
1712                        error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1713                        if (error)
1714                                goto error0;
1715                        if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1716                                error = -EFSCORRUPTED;
1717                                goto error0;
1718                        }
1719
1720                        busy = xfs_alloc_compute_aligned(args, fbno, flen,
1721                                        &rbno, &rlen, &busy_gen);
1722
1723                        if (rlen >= args->maxlen)
1724                                break;
1725
1726                        error = xfs_btree_increment(cnt_cur, 0, &i);
1727                        if (error)
1728                                goto error0;
1729                        if (i == 0) {
1730                                /*
1731                                 * Our only valid extents must have been busy.
1732                                 * Make it unbusy by forcing the log out and
1733                                 * retrying.
1734                                 */
1735                                xfs_btree_del_cursor(cnt_cur,
1736                                                     XFS_BTREE_NOERROR);
1737                                trace_xfs_alloc_size_busy(args);
1738                                xfs_extent_busy_flush(args->mp,
1739                                                        args->pag, busy_gen);
1740                                goto restart;
1741                        }
1742                }
1743        }
1744
1745        /*
1746         * In the first case above, we got the last entry in the
1747         * by-size btree.  Now we check to see if the space hits maxlen
1748         * once aligned; if not, we search left for something better.
1749         * This can't happen in the second case above.
1750         */
1751        rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1752        if (XFS_IS_CORRUPT(args->mp,
1753                           rlen != 0 &&
1754                           (rlen > flen ||
1755                            rbno + rlen > fbno + flen))) {
1756                error = -EFSCORRUPTED;
1757                goto error0;
1758        }
1759        if (rlen < args->maxlen) {
1760                xfs_agblock_t   bestfbno;
1761                xfs_extlen_t    bestflen;
1762                xfs_agblock_t   bestrbno;
1763                xfs_extlen_t    bestrlen;
1764
1765                bestrlen = rlen;
1766                bestrbno = rbno;
1767                bestflen = flen;
1768                bestfbno = fbno;
1769                for (;;) {
1770                        if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1771                                goto error0;
1772                        if (i == 0)
1773                                break;
1774                        if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1775                                        &i)))
1776                                goto error0;
1777                        if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1778                                error = -EFSCORRUPTED;
1779                                goto error0;
1780                        }
1781                        if (flen < bestrlen)
1782                                break;
1783                        busy = xfs_alloc_compute_aligned(args, fbno, flen,
1784                                        &rbno, &rlen, &busy_gen);
1785                        rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1786                        if (XFS_IS_CORRUPT(args->mp,
1787                                           rlen != 0 &&
1788                                           (rlen > flen ||
1789                                            rbno + rlen > fbno + flen))) {
1790                                error = -EFSCORRUPTED;
1791                                goto error0;
1792                        }
1793                        if (rlen > bestrlen) {
1794                                bestrlen = rlen;
1795                                bestrbno = rbno;
1796                                bestflen = flen;
1797                                bestfbno = fbno;
1798                                if (rlen == args->maxlen)
1799                                        break;
1800                        }
1801                }
1802                if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1803                                &i)))
1804                        goto error0;
1805                if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1806                        error = -EFSCORRUPTED;
1807                        goto error0;
1808                }
1809                rlen = bestrlen;
1810                rbno = bestrbno;
1811                flen = bestflen;
1812                fbno = bestfbno;
1813        }
1814        args->wasfromfl = 0;
1815        /*
1816         * Fix up the length.
1817         */
1818        args->len = rlen;
1819        if (rlen < args->minlen) {
1820                if (busy) {
1821                        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1822                        trace_xfs_alloc_size_busy(args);
1823                        xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
1824                        goto restart;
1825                }
1826                goto out_nominleft;
1827        }
1828        xfs_alloc_fix_len(args);
1829
1830        rlen = args->len;
1831        if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1832                error = -EFSCORRUPTED;
1833                goto error0;
1834        }
1835        /*
1836         * Allocate and initialize a cursor for the by-block tree.
1837         */
1838        bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1839                                        args->pag, XFS_BTNUM_BNO);
1840        if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1841                        rbno, rlen, XFSA_FIXUP_CNT_OK)))
1842                goto error0;
1843        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1844        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1845        cnt_cur = bno_cur = NULL;
1846        args->len = rlen;
1847        args->agbno = rbno;
1848        if (XFS_IS_CORRUPT(args->mp,
1849                           args->agbno + args->len >
1850                           be32_to_cpu(agf->agf_length))) {
1851                error = -EFSCORRUPTED;
1852                goto error0;
1853        }
1854        trace_xfs_alloc_size_done(args);
1855        return 0;
1856
1857error0:
1858        trace_xfs_alloc_size_error(args);
1859        if (cnt_cur)
1860                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1861        if (bno_cur)
1862                xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1863        return error;
1864
1865out_nominleft:
1866        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1867        trace_xfs_alloc_size_nominleft(args);
1868        args->agbno = NULLAGBLOCK;
1869        return 0;
1870}
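
/*
 * Illustrative sketch (editor's addition): ignoring busy extent trimming,
 * the aligned trim that xfs_alloc_compute_aligned() applies in the loops
 * above amounts to rounding the start of the found extent up to the
 * requested alignment and shortening the usable length accordingly.
 */
static void
xfs_alloc_align_sketch(
        xfs_agblock_t           fbno,   /* start of found free extent */
        xfs_extlen_t            flen,   /* length of found free extent */
        xfs_extlen_t            align,  /* requested alignment */
        xfs_agblock_t           *rbno,  /* aligned start */
        xfs_extlen_t            *rlen)  /* aligned usable length */
{
        xfs_agblock_t           aligned_bno = roundup(fbno, align);
        xfs_extlen_t            diff = aligned_bno - fbno;

        *rbno = aligned_bno;
        *rlen = diff >= flen ? 0 : flen - diff;
}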
1871
1872/*
1873 * Free the extent starting at agno/bno for length.
1874 */
1875STATIC int
1876xfs_free_ag_extent(
1877        struct xfs_trans                *tp,
1878        struct xfs_buf                  *agbp,
1879        xfs_agnumber_t                  agno,
1880        xfs_agblock_t                   bno,
1881        xfs_extlen_t                    len,
1882        const struct xfs_owner_info     *oinfo,
1883        enum xfs_ag_resv_type           type)
1884{
1885        struct xfs_mount                *mp;
1886        struct xfs_btree_cur            *bno_cur;
1887        struct xfs_btree_cur            *cnt_cur;
1888        xfs_agblock_t                   gtbno; /* start of right neighbor */
1889        xfs_extlen_t                    gtlen; /* length of right neighbor */
1890        xfs_agblock_t                   ltbno; /* start of left neighbor */
1891        xfs_extlen_t                    ltlen; /* length of left neighbor */
1892        xfs_agblock_t                   nbno; /* new starting block of freesp */
1893        xfs_extlen_t                    nlen; /* new length of freespace */
1894        int                             haveleft; /* have a left neighbor */
1895        int                             haveright; /* have a right neighbor */
1896        int                             i;
1897        int                             error;
1898        struct xfs_perag                *pag = agbp->b_pag;
1899
1900        bno_cur = cnt_cur = NULL;
1901        mp = tp->t_mountp;
1902
1903        if (!xfs_rmap_should_skip_owner_update(oinfo)) {
1904                error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
1905                if (error)
1906                        goto error0;
1907        }
1908
1909        /*
1910         * Allocate and initialize a cursor for the by-block btree.
1911         */
1912        bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
1913        /*
1914         * Look for a neighboring block on the left (lower block numbers)
1915         * that is contiguous with this space.
1916         */
1917        if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1918                goto error0;
1919        if (haveleft) {
1920                /*
1921                 * There is a block to our left.
1922                 */
1923                if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1924                        goto error0;
1925                if (XFS_IS_CORRUPT(mp, i != 1)) {
1926                        error = -EFSCORRUPTED;
1927                        goto error0;
1928                }
1929                /*
1930                 * It's not contiguous, though.
1931                 */
1932                if (ltbno + ltlen < bno)
1933                        haveleft = 0;
1934                else {
1935                        /*
1936                         * If this failure happens the request to free this
1937                         * space was invalid, it's (partly) already free.
1938                         * Very bad.
1939                         */
1940                        if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
1941                                error = -EFSCORRUPTED;
1942                                goto error0;
1943                        }
1944                }
1945        }
1946        /*
1947         * Look for a neighboring block on the right (higher block numbers)
1948         * that is contiguous with this space.
1949         */
1950        if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1951                goto error0;
1952        if (haveright) {
1953                /*
1954                 * There is a block to our right.
1955                 */
1956                if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
1957                        goto error0;
1958                if (XFS_IS_CORRUPT(mp, i != 1)) {
1959                        error = -EFSCORRUPTED;
1960                        goto error0;
1961                }
1962                /*
1963                 * It's not contiguous, though.
1964                 */
1965                if (bno + len < gtbno)
1966                        haveright = 0;
1967                else {
1968                        /*
1969                         * If this failure happens the request to free this
1970                         * space was invalid, it's (partly) already free.
1971                         * Very bad.
1972                         */
1973                        if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
1974                                error = -EFSCORRUPTED;
1975                                goto error0;
1976                        }
1977                }
1978        }
1979        /*
1980         * Now allocate and initialize a cursor for the by-size tree.
1981         */
1982        cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
1983        /*
1984         * Have both left and right contiguous neighbors.
1985         * Merge all three into a single free block.
1986         */
1987        if (haveleft && haveright) {
1988                /*
1989                 * Delete the old by-size entry on the left.
1990                 */
1991                if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
1992                        goto error0;
1993                if (XFS_IS_CORRUPT(mp, i != 1)) {
1994                        error = -EFSCORRUPTED;
1995                        goto error0;
1996                }
1997                if ((error = xfs_btree_delete(cnt_cur, &i)))
1998                        goto error0;
1999                if (XFS_IS_CORRUPT(mp, i != 1)) {
2000                        error = -EFSCORRUPTED;
2001                        goto error0;
2002                }
2003                /*
2004                 * Delete the old by-size entry on the right.
2005                 */
2006                if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2007                        goto error0;
2008                if (XFS_IS_CORRUPT(mp, i != 1)) {
2009                        error = -EFSCORRUPTED;
2010                        goto error0;
2011                }
2012                if ((error = xfs_btree_delete(cnt_cur, &i)))
2013                        goto error0;
2014                if (XFS_IS_CORRUPT(mp, i != 1)) {
2015                        error = -EFSCORRUPTED;
2016                        goto error0;
2017                }
2018                /*
2019                 * Delete the old by-block entry for the right block.
2020                 */
2021                if ((error = xfs_btree_delete(bno_cur, &i)))
2022                        goto error0;
2023                if (XFS_IS_CORRUPT(mp, i != 1)) {
2024                        error = -EFSCORRUPTED;
2025                        goto error0;
2026                }
2027                /*
2028                 * Move the by-block cursor back to the left neighbor.
2029                 */
2030                if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2031                        goto error0;
2032                if (XFS_IS_CORRUPT(mp, i != 1)) {
2033                        error = -EFSCORRUPTED;
2034                        goto error0;
2035                }
2036#ifdef DEBUG
2037                /*
2038                 * Check that this is the right record: delete didn't
2039                 * mangle the cursor.
2040                 */
2041                {
2042                        xfs_agblock_t   xxbno;
2043                        xfs_extlen_t    xxlen;
2044
2045                        if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2046                                        &i)))
2047                                goto error0;
2048                        if (XFS_IS_CORRUPT(mp,
2049                                           i != 1 ||
2050                                           xxbno != ltbno ||
2051                                           xxlen != ltlen)) {
2052                                error = -EFSCORRUPTED;
2053                                goto error0;
2054                        }
2055                }
2056#endif
2057                /*
2058                 * Update remaining by-block entry to the new, joined block.
2059                 */
2060                nbno = ltbno;
2061                nlen = len + ltlen + gtlen;
2062                if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2063                        goto error0;
2064        }
2065        /*
2066         * Have only a left contiguous neighbor.
2067         * Merge it together with the new freespace.
2068         */
2069        else if (haveleft) {
2070                /*
2071                 * Delete the old by-size entry on the left.
2072                 */
2073                if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2074                        goto error0;
2075                if (XFS_IS_CORRUPT(mp, i != 1)) {
2076                        error = -EFSCORRUPTED;
2077                        goto error0;
2078                }
2079                if ((error = xfs_btree_delete(cnt_cur, &i)))
2080                        goto error0;
2081                if (XFS_IS_CORRUPT(mp, i != 1)) {
2082                        error = -EFSCORRUPTED;
2083                        goto error0;
2084                }
2085                /*
2086                 * Back up the by-block cursor to the left neighbor, and
2087                 * update its length.
2088                 */
2089                if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2090                        goto error0;
2091                if (XFS_IS_CORRUPT(mp, i != 1)) {
2092                        error = -EFSCORRUPTED;
2093                        goto error0;
2094                }
2095                nbno = ltbno;
2096                nlen = len + ltlen;
2097                if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2098                        goto error0;
2099        }
2100        /*
2101         * Have only a right contiguous neighbor.
2102         * Merge it together with the new freespace.
2103         */
2104        else if (haveright) {
2105                /*
2106                 * Delete the old by-size entry on the right.
2107                 */
2108                if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2109                        goto error0;
2110                if (XFS_IS_CORRUPT(mp, i != 1)) {
2111                        error = -EFSCORRUPTED;
2112                        goto error0;
2113                }
2114                if ((error = xfs_btree_delete(cnt_cur, &i)))
2115                        goto error0;
2116                if (XFS_IS_CORRUPT(mp, i != 1)) {
2117                        error = -EFSCORRUPTED;
2118                        goto error0;
2119                }
2120                /*
2121                 * Update the starting block and length of the right
2122                 * neighbor in the by-block tree.
2123                 */
2124                nbno = bno;
2125                nlen = len + gtlen;
2126                if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2127                        goto error0;
2128        }
2129        /*
2130         * No contiguous neighbors.
2131         * Insert the new freespace into the by-block tree.
2132         */
2133        else {
2134                nbno = bno;
2135                nlen = len;
2136                if ((error = xfs_btree_insert(bno_cur, &i)))
2137                        goto error0;
2138                if (XFS_IS_CORRUPT(mp, i != 1)) {
2139                        error = -EFSCORRUPTED;
2140                        goto error0;
2141                }
2142        }
2143        xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2144        bno_cur = NULL;
2145        /*
2146         * In all cases we need to insert the new freespace in the by-size tree.
2147         */
2148        if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2149                goto error0;
2150        if (XFS_IS_CORRUPT(mp, i != 0)) {
2151                error = -EFSCORRUPTED;
2152                goto error0;
2153        }
2154        if ((error = xfs_btree_insert(cnt_cur, &i)))
2155                goto error0;
2156        if (XFS_IS_CORRUPT(mp, i != 1)) {
2157                error = -EFSCORRUPTED;
2158                goto error0;
2159        }
2160        xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2161        cnt_cur = NULL;
2162
2163        /*
2164         * Update the freespace totals in the ag and superblock.
2165         */
2166        error = xfs_alloc_update_counters(tp, agbp, len);
2167        xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
2168        if (error)
2169                goto error0;
2170
2171        XFS_STATS_INC(mp, xs_freex);
2172        XFS_STATS_ADD(mp, xs_freeb, len);
2173
2174        trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
2175
2176        return 0;
2177
2178 error0:
2179        trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2180        if (bno_cur)
2181                xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2182        if (cnt_cur)
2183                xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2184        return error;
2185}
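
/*
 * Illustrative sketch (editor's addition): the four merge cases above
 * collapse into one pure computation.  A neighbor only merges when it is
 * exactly contiguous with the freed extent; any overlap has already been
 * rejected by the XFS_IS_CORRUPT() checks.
 */
static void
xfs_free_merge_sketch(
        xfs_agblock_t   bno,    /* freed extent start */
        xfs_extlen_t    len,    /* freed extent length */
        xfs_agblock_t   ltbno,  /* left neighbor start */
        xfs_extlen_t    ltlen,  /* left neighbor length */
        xfs_agblock_t   gtbno,  /* right neighbor start */
        xfs_extlen_t    gtlen,  /* right neighbor length */
        xfs_agblock_t   *nbno,  /* resulting record start */
        xfs_extlen_t    *nlen)  /* resulting record length */
{
        bool            haveleft = (ltbno + ltlen == bno);
        bool            haveright = (bno + len == gtbno);

        *nbno = haveleft ? ltbno : bno;
        *nlen = len + (haveleft ? ltlen : 0) + (haveright ? gtlen : 0);
}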
2186
2187/*
2188 * Visible (exported) allocation/free functions.
2189 * Some of these are used just by xfs_alloc_btree.c and this file.
2190 */
2191
2192/*
2193 * Compute and fill in value of m_ag_maxlevels.
2194 */
2195void
2196xfs_alloc_compute_maxlevels(
2197        xfs_mount_t     *mp)    /* file system mount structure */
2198{
2199        mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
2200                        (mp->m_sb.sb_agblocks + 1) / 2);
2201}
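
/*
 * Editor's note: xfs_btree_compute_maxlevels() lives in xfs_btree.c; the
 * sketch below shows the usual shape of such a worst-case height
 * calculation (an assumption, not a copy of that routine).  Divide the
 * record count by the minimum records per leaf, then repeatedly by the
 * minimum records per node until a single block remains.
 */
static unsigned int
xfs_maxlevels_sketch(
        unsigned int    *mnr,   /* min recs per block: [0] leaf, [1] node */
        unsigned long   len)    /* worst-case record count */
{
        unsigned long   maxblocks = (len + mnr[0] - 1) / mnr[0];
        unsigned int    level;

        for (level = 1; maxblocks > 1; level++)
                maxblocks = (maxblocks + mnr[1] - 1) / mnr[1];
        return level;
}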
2202
2203/*
2204 * Find the length of the longest extent in an AG.  The 'need' parameter
2205 * specifies how much space we're going to need for the AGFL and the
2206 * 'reserved' parameter tells us how many blocks in this AG are reserved for
2207 * other callers.
2208 */
2209xfs_extlen_t
2210xfs_alloc_longest_free_extent(
2211        struct xfs_perag        *pag,
2212        xfs_extlen_t            need,
2213        xfs_extlen_t            reserved)
2214{
2215        xfs_extlen_t            delta = 0;
2216
2217        /*
2218         * If the AGFL needs a recharge, we'll have to subtract that from the
2219         * longest extent.
2220         */
2221        if (need > pag->pagf_flcount)
2222                delta = need - pag->pagf_flcount;
2223
2224        /*
2225         * If we cannot maintain others' reservations with space from the
2226         * not-longest freesp extents, we'll have to subtract /that/ from
2227         * the longest extent too.
2228         */
2229        if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2230                delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2231
2232        /*
2233         * If the longest extent is long enough to satisfy all the
2234         * reservations and AGFL rules in place, we can return this extent.
2235         */
2236        if (pag->pagf_longest > delta)
2237                return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2238                                pag->pagf_longest - delta);
2239
2240        /* Otherwise, let the caller try for 1 block if there's space. */
2241        return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
2242}
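
/*
 * Worked example (editor's addition, hypothetical numbers): with
 * pagf_longest = 100, pagf_freeblks = 120, pagf_flcount = 2, need = 8 and
 * reserved = 30, the AGFL shortfall contributes 8 - 2 = 6 to delta and the
 * reservation shortfall contributes 30 - (120 - 100) = 10, so delta = 16
 * and at most min(m_ag_max_usable, 100 - 16) = 84 blocks of the longest
 * extent are actually allocatable.
 */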
2243
2244/*
2245 * Compute the minimum length of the AGFL in the given AG.  If @pag is NULL,
2246 * return the largest possible minimum length.
2247 */
2248unsigned int
2249xfs_alloc_min_freelist(
2250        struct xfs_mount        *mp,
2251        struct xfs_perag        *pag)
2252{
2253        /* AG btrees have at least 1 level. */
2254        static const uint8_t    fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
2255        const uint8_t           *levels = pag ? pag->pagf_levels : fake_levels;
2256        unsigned int            min_free;
2257
2258        ASSERT(mp->m_ag_maxlevels > 0);
2259
2260        /* space needed by-bno freespace btree */
2261        min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
2262                                       mp->m_ag_maxlevels);
2263        /* space needed by-size freespace btree */
2264        min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
2265                                       mp->m_ag_maxlevels);
2266        /* space needed reverse mapping used space btree */
2267        if (xfs_has_rmapbt(mp))
2268                min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
2269                                                mp->m_rmap_maxlevels);
2270
2271        return min_free;
2272}
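
/*
 * Worked example (editor's addition, hypothetical numbers): the "+ 1" gives
 * each btree one block per level plus one more to cover a root split,
 * capped at the maximum possible height.  With bnobt and cntbt at 2 levels
 * each, an rmapbt at 3 levels, m_ag_maxlevels = 5 and m_rmap_maxlevels = 5,
 * the minimum AGFL length is min(3, 5) + min(3, 5) + min(4, 5) = 10 blocks.
 */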
2273
2274/*
2275 * Check if the operation we are fixing up the freelist for should go ahead or
2276 * not. If we are freeing blocks, we always allow it, otherwise the allocation
2277 * is dependent on whether the size and shape of free space available will
2278 * permit the requested allocation to take place.
2279 */
2280static bool
2281xfs_alloc_space_available(
2282        struct xfs_alloc_arg    *args,
2283        xfs_extlen_t            min_free,
2284        int                     flags)
2285{
2286        struct xfs_perag        *pag = args->pag;
2287        xfs_extlen_t            alloc_len, longest;
2288        xfs_extlen_t            reservation; /* blocks that are still reserved */
2289        int                     available;
2290        xfs_extlen_t            agflcount;
2291
2292        if (flags & XFS_ALLOC_FLAG_FREEING)
2293                return true;
2294
2295        reservation = xfs_ag_resv_needed(pag, args->resv);
2296
2297        /* do we have enough contiguous free space for the allocation? */
2298        alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
2299        longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
2300        if (longest < alloc_len)
2301                return false;
2302
2303        /*
2304         * Do we have enough free space remaining for the allocation? Don't
2305         * account extra agfl blocks because we are about to defer freeing them,
2306         * making them unavailable until the current transaction commits.
2307         */
2308        agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2309        available = (int)(pag->pagf_freeblks + agflcount -
2310                          reservation - min_free - args->minleft);
2311        if (available < (int)max(args->total, alloc_len))
2312                return false;
2313
2314        /*
2315         * Clamp maxlen to the amount of free space available for the actual
2316         * extent allocation.
2317         */
2318        if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2319                args->maxlen = available;
2320                ASSERT(args->maxlen > 0);
2321                ASSERT(args->maxlen >= args->minlen);
2322        }
2323
2324        return true;
2325}
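
/*
 * Worked example (editor's addition, hypothetical numbers): with
 * pagf_freeblks = 50, pagf_flcount = 6, min_free = 8, reservation = 10,
 * args->minleft = 4, args->minlen = 2, args->alignment = 4 and
 * args->minalignslop = 0, we get alloc_len = 2 + 3 + 0 = 5, agflcount =
 * min(6, 8) = 6 and available = 50 + 6 - 10 - 8 - 4 = 34.  Assuming the
 * longest-extent check already passed, any request with args->total <= 34
 * clears the second check.
 */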
2326
2327int
2328xfs_free_agfl_block(
2329        struct xfs_trans        *tp,
2330        xfs_agnumber_t          agno,
2331        xfs_agblock_t           agbno,
2332        struct xfs_buf          *agbp,
2333        struct xfs_owner_info   *oinfo)
2334{
2335        int                     error;
2336        struct xfs_buf          *bp;
2337
2338        error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2339                                   XFS_AG_RESV_AGFL);
2340        if (error)
2341                return error;
2342
2343        error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
2344                        XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
2345                        tp->t_mountp->m_bsize, 0, &bp);
2346        if (error)
2347                return error;
2348        xfs_trans_binval(tp, bp);
2349
2350        return 0;
2351}
2352
2353/*
2354 * Check the agfl fields of the agf for inconsistency or corruption. The purpose
2355 * is to detect an agfl header padding mismatch between current and early v5
2356 * kernels. This problem manifests as a 1-slot size difference between the
2357 * on-disk flcount and the active [first, last] range of a wrapped agfl. This
2358 * may also catch variants of agfl count corruption unrelated to padding. Either
2359 * way, we'll reset the agfl and warn the user.
2360 *
2361 * Return true if a reset is required before the agfl can be used, false
2362 * otherwise.
2363 */
2364static bool
2365xfs_agfl_needs_reset(
2366        struct xfs_mount        *mp,
2367        struct xfs_agf          *agf)
2368{
2369        uint32_t                f = be32_to_cpu(agf->agf_flfirst);
2370        uint32_t                l = be32_to_cpu(agf->agf_fllast);
2371        uint32_t                c = be32_to_cpu(agf->agf_flcount);
2372        int                     agfl_size = xfs_agfl_size(mp);
2373        int                     active;
2374
2375        /* no agfl header on v4 supers */
2376        if (!xfs_has_crc(mp))
2377                return false;
2378
2379        /*
2380         * The agf read verifier catches severe corruption of these fields.
2381         * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2382         * the verifier allows it.
2383         */
2384        if (f >= agfl_size || l >= agfl_size)
2385                return true;
2386        if (c > agfl_size)
2387                return true;
2388
2389        /*
2390         * Check consistency between the on-disk count and the active range. An
2391         * agfl padding mismatch manifests as an inconsistent flcount.
2392         */
2393        if (c && l >= f)
2394                active = l - f + 1;
2395        else if (c)
2396                active = agfl_size - f + l + 1;
2397        else
2398                active = 0;
2399
2400        return active != c;
2401}
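
/*
 * Illustrative sketch (editor's addition): the wrap-aware consistency test
 * above as a pure predicate.  With a hypothetical agfl_size of 100, a
 * wrapped list with f = 98, l = 2 and c = 5 is consistent
 * (100 - 98 + 2 + 1 == 5), while the padding mismatch this check targets
 * would leave c = 4 and force a reset.
 */
static bool
xfs_agfl_count_ok_sketch(
        uint32_t        f,      /* first active slot */
        uint32_t        l,      /* last active slot */
        uint32_t        c,      /* on-disk flcount */
        uint32_t        size)   /* total slots in the agfl */
{
        uint32_t        active;

        if (!c)
                active = 0;
        else if (l >= f)
                active = l - f + 1;
        else
                active = size - f + l + 1;
        return active == c;
}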
2402
2403/*
2404 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2405 * agfl content cannot be trusted. Warn the user that a repair is required to
2406 * recover leaked blocks.
2407 *
2408 * The purpose of this mechanism is to handle filesystems affected by the agfl
2409 * header padding mismatch problem. A reset keeps the filesystem online with a
2410 * relatively minor free space accounting inconsistency rather than suffer the
2411 * inevitable crash from use of an invalid agfl block.
2412 */
2413static void
2414xfs_agfl_reset(
2415        struct xfs_trans        *tp,
2416        struct xfs_buf          *agbp,
2417        struct xfs_perag        *pag)
2418{
2419        struct xfs_mount        *mp = tp->t_mountp;
2420        struct xfs_agf          *agf = agbp->b_addr;
2421
2422        ASSERT(pag->pagf_agflreset);
2423        trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2424
2425        xfs_warn(mp,
2426               "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2427               "Please unmount and run xfs_repair.",
2428                 pag->pag_agno, pag->pagf_flcount);
2429
2430        agf->agf_flfirst = 0;
2431        agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2432        agf->agf_flcount = 0;
2433        xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2434                                    XFS_AGF_FLCOUNT);
2435
2436        pag->pagf_flcount = 0;
2437        pag->pagf_agflreset = false;
2438}
2439
2440/*
2441 * Defer an AGFL block free. This is effectively equivalent to
2442 * xfs_bmap_add_free() with some special handling particular to AGFL blocks.
2443 *
2444 * Deferring AGFL frees helps prevent log reservation overruns due to too many
2445 * allocation operations in a transaction. AGFL frees are prone to this problem
2446 * because AGFL blocks are always freed one at a time. Further, an immediate
2447 * AGFL block free can cause a btree join and require another block free before
2448 * the real allocation can proceed. Deferring the free disconnects freeing up
2449 * the AGFL slot from freeing the block.
2450 */
2451STATIC void
2452xfs_defer_agfl_block(
2453        struct xfs_trans                *tp,
2454        xfs_agnumber_t                  agno,
2455        xfs_fsblock_t                   agbno,
2456        struct xfs_owner_info           *oinfo)
2457{
2458        struct xfs_mount                *mp = tp->t_mountp;
2459        struct xfs_extent_free_item     *new;           /* new element */
2460
2461        ASSERT(xfs_bmap_free_item_zone != NULL);
2462        ASSERT(oinfo != NULL);
2463
2464        new = kmem_cache_alloc(xfs_bmap_free_item_zone,
2465                               GFP_KERNEL | __GFP_NOFAIL);
2466        new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
2467        new->xefi_blockcount = 1;
2468        new->xefi_oinfo = *oinfo;
2469        new->xefi_skip_discard = false;
2470
2471        trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
2472
2473        xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
2474}
2475
2476#ifdef DEBUG
2477/*
2478 * Check if an AGF has a free extent record whose length is equal to
2479 * args->minlen.
2480 */
2481STATIC int
2482xfs_exact_minlen_extent_available(
2483        struct xfs_alloc_arg    *args,
2484        struct xfs_buf          *agbp,
2485        int                     *stat)
2486{
2487        struct xfs_btree_cur    *cnt_cur;
2488        xfs_agblock_t           fbno;
2489        xfs_extlen_t            flen;
2490        int                     error = 0;
2491
2492        cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
2493                                        args->pag, XFS_BTNUM_CNT);
2494        error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
2495        if (error)
2496                goto out;
2497
2498        if (*stat == 0) {
2499                error = -EFSCORRUPTED;
2500                goto out;
2501        }
2502
2503        error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
2504        if (error)
2505                goto out;
2506
2507        if (*stat == 1 && flen != args->minlen)
2508                *stat = 0;
2509
2510out:
2511        xfs_btree_del_cursor(cnt_cur, error);
2512
2513        return error;
2514}
2515#endif
2516
2517/*
2518 * Decide whether to use this allocation group for this allocation.
2519 * If so, fix up the btree freelist's size.
2520 */
2521int                     /* error */
2522xfs_alloc_fix_freelist(
2523        struct xfs_alloc_arg    *args,  /* allocation argument structure */
2524        int                     flags)  /* XFS_ALLOC_FLAG_... */
2525{
2526        struct xfs_mount        *mp = args->mp;
2527        struct xfs_perag        *pag = args->pag;
2528        struct xfs_trans        *tp = args->tp;
2529        struct xfs_buf          *agbp = NULL;
2530        struct xfs_buf          *agflbp = NULL;
2531        struct xfs_alloc_arg    targs;  /* local allocation arguments */
2532        xfs_agblock_t           bno;    /* freelist block */
2533        xfs_extlen_t            need;   /* total blocks needed in freelist */
2534        int                     error = 0;
2535
2536        /* deferred ops (AGFL block frees) require permanent transactions */
2537        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2538
2539        if (!pag->pagf_init) {
2540                error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
2541                if (error) {
2542                        /* Couldn't lock the AGF so skip this AG. */
2543                        if (error == -EAGAIN)
2544                                error = 0;
2545                        goto out_no_agbp;
2546                }
2547        }
2548
2549        /*
2550         * If this is a metadata-preferred pag and we are allocating user
2551         * data, try somewhere else unless we are being asked to try harder
2552         * at this point.
2553         */
2554        if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
2555            (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2556                ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
2557                goto out_agbp_relse;
2558        }
2559
2560        need = xfs_alloc_min_freelist(mp, pag);
2561        if (!xfs_alloc_space_available(args, need, flags |
2562                        XFS_ALLOC_FLAG_CHECK))
2563                goto out_agbp_relse;
2564
2565        /*
2566         * Get the a.g. freespace buffer.
2567         * Can fail if we're not blocking on locks, and it's held.
2568         */
2569        if (!agbp) {
2570                error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp);
2571                if (error) {
2572                        /* Couldn't lock the AGF so skip this AG. */
2573                        if (error == -EAGAIN)
2574                                error = 0;
2575                        goto out_no_agbp;
2576                }
2577        }
2578
2579        /* reset a padding-mismatched agfl before the final free space check */
2580        if (pag->pagf_agflreset)
2581                xfs_agfl_reset(tp, agbp, pag);
2582
2583        /* If there isn't enough total space or a long enough extent, reject it. */
2584        need = xfs_alloc_min_freelist(mp, pag);
2585        if (!xfs_alloc_space_available(args, need, flags))
2586                goto out_agbp_relse;
2587
2588#ifdef DEBUG
2589        if (args->alloc_minlen_only) {
2590                int stat;
2591
2592                error = xfs_exact_minlen_extent_available(args, agbp, &stat);
2593                if (error || !stat)
2594                        goto out_agbp_relse;
2595        }
2596#endif
2597        /*
2598         * Make the freelist shorter if it's too long.
2599         *
2600         * Note that from this point onwards, we will always release the agf and
2601         * agfl buffers on error. This handles the case where we error out and
2602         * the buffers are clean or may not have been joined to the transaction
2603         * and hence need to be released manually. If they have been joined to
2604         * the transaction, then xfs_trans_brelse() will handle them
2605         * appropriately based on the recursion count and dirty state of the
2606         * buffer.
2607         *
2608         * XXX (dgc): When we have lots of free space, does this buy us
2609         * anything other than extra overhead when we need to put more blocks
2610         * back on the free list? Maybe we should only do this when space is
2611         * getting low or the AGFL is more than half full?
2612         *
2613         * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2614         * big; the NORMAP flag prevents AGFL expand/shrink operations from
2615         * updating the rmapbt.  Both flags are used in xfs_repair while we're
2616         * rebuilding the rmapbt, and neither are used by the kernel.  They're
2617         * both required to ensure that rmaps are correctly recorded for the
2618         * regenerated AGFL, bnobt, and cntbt.  See repair/phase5.c and
2619         * repair/rmap.c in xfsprogs for details.
2620         */
2621        memset(&targs, 0, sizeof(targs));
2622        /* struct copy below */
2623        if (flags & XFS_ALLOC_FLAG_NORMAP)
2624                targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
2625        else
2626                targs.oinfo = XFS_RMAP_OINFO_AG;
2627        while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
2628                error = xfs_alloc_get_freelist(tp, agbp, &bno, 0);
2629                if (error)
2630                        goto out_agbp_relse;
2631
2632                /* defer agfl frees */
2633                xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2634        }
2635
2636        targs.tp = tp;
2637        targs.mp = mp;
2638        targs.agbp = agbp;
2639        targs.agno = args->agno;
2640        targs.alignment = targs.minlen = targs.prod = 1;
2641        targs.type = XFS_ALLOCTYPE_THIS_AG;
2642        targs.pag = pag;
2643        error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp);
2644        if (error)
2645                goto out_agbp_relse;
2646
2647        /* Make the freelist longer if it's too short. */
2648        while (pag->pagf_flcount < need) {
2649                targs.agbno = 0;
2650                targs.maxlen = need - pag->pagf_flcount;
2651                targs.resv = XFS_AG_RESV_AGFL;
2652
2653                /* Allocate as many blocks as possible at once. */
2654                error = xfs_alloc_ag_vextent(&targs);
2655                if (error)
2656                        goto out_agflbp_relse;
2657
2658                /*
2659                 * Stop if we run out.  Won't happen if callers are obeying
2660                 * the restrictions correctly.  Can happen for free calls
2661                 * on a completely full ag.
2662                 */
2663                if (targs.agbno == NULLAGBLOCK) {
2664                        if (flags & XFS_ALLOC_FLAG_FREEING)
2665                                break;
2666                        goto out_agflbp_relse;
2667                }
2668                /*
2669                 * Put each allocated block on the list.
2670                 */
2671                for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
2672                        error = xfs_alloc_put_freelist(tp, agbp,
2673                                                        agflbp, bno, 0);
2674                        if (error)
2675                                goto out_agflbp_relse;
2676                }
2677        }
2678        xfs_trans_brelse(tp, agflbp);
2679        args->agbp = agbp;
2680        return 0;
2681
2682out_agflbp_relse:
2683        xfs_trans_brelse(tp, agflbp);
2684out_agbp_relse:
2685        if (agbp)
2686                xfs_trans_brelse(tp, agbp);
2687out_no_agbp:
2688        args->agbp = NULL;
2689        return error;
2690}
2691
2692/*
2693 * Get a block from the freelist.
2694 * On success, *bnop is set to the block obtained, or NULLAGBLOCK if empty.
2695 */
2696int
2697xfs_alloc_get_freelist(
2698        struct xfs_trans        *tp,
2699        struct xfs_buf          *agbp,
2700        xfs_agblock_t           *bnop,
2701        int                     btreeblk)
2702{
2703        struct xfs_agf          *agf = agbp->b_addr;
2704        struct xfs_buf          *agflbp;
2705        xfs_agblock_t           bno;
2706        __be32                  *agfl_bno;
2707        int                     error;
2708        int                     logflags;
2709        struct xfs_mount        *mp = tp->t_mountp;
2710        struct xfs_perag        *pag;
2711
2712        /*
2713         * Freelist is empty; give up.
2714         */
2715        if (!agf->agf_flcount) {
2716                *bnop = NULLAGBLOCK;
2717                return 0;
2718        }
2719        /*
2720         * Read the array of free blocks.
2721         */
2722        error = xfs_alloc_read_agfl(mp, tp, be32_to_cpu(agf->agf_seqno),
2723                                    &agflbp);
2724        if (error)
2725                return error;
2726
2727
2728        /*
2729         * Get the block number and update the data structures.
2730         */
2731        agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2732        bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
2733        be32_add_cpu(&agf->agf_flfirst, 1);
2734        xfs_trans_brelse(tp, agflbp);
2735        if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
2736                agf->agf_flfirst = 0;
2737
2738        pag = agbp->b_pag;
2739        ASSERT(!pag->pagf_agflreset);
2740        be32_add_cpu(&agf->agf_flcount, -1);
2741        pag->pagf_flcount--;
2742
2743        logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
2744        if (btreeblk) {
2745                be32_add_cpu(&agf->agf_btreeblks, 1);
2746                pag->pagf_btreeblks++;
2747                logflags |= XFS_AGF_BTREEBLKS;
2748        }
2749
2750        xfs_alloc_log_agf(tp, agbp, logflags);
2751        *bnop = bno;
2752
2753        return 0;
2754}
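
/*
 * Illustrative sketch (editor's addition): the AGFL is a ring of
 * xfs_agfl_size() slots, so consuming the head above is just a modular
 * increment of agf_flfirst; xfs_alloc_put_freelist() below advances
 * agf_fllast the same way.
 */
static uint32_t
xfs_agfl_advance_sketch(
        uint32_t        slot,   /* current head or tail slot */
        uint32_t        size)   /* xfs_agfl_size(mp) */
{
        return (slot + 1 == size) ? 0 : slot + 1;
}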
2755
2756/*
2757 * Log the given fields from the agf structure.
2758 */
2759void
2760xfs_alloc_log_agf(
2761        xfs_trans_t     *tp,    /* transaction pointer */
2762        struct xfs_buf  *bp,    /* buffer for a.g. freelist header */
2763        int             fields) /* mask of fields to be logged (XFS_AGF_...) */
2764{
2765        int     first;          /* first byte offset */
2766        int     last;           /* last byte offset */
2767        static const short      offsets[] = {
2768                offsetof(xfs_agf_t, agf_magicnum),
2769                offsetof(xfs_agf_t, agf_versionnum),
2770                offsetof(xfs_agf_t, agf_seqno),
2771                offsetof(xfs_agf_t, agf_length),
2772                offsetof(xfs_agf_t, agf_roots[0]),
2773                offsetof(xfs_agf_t, agf_levels[0]),
2774                offsetof(xfs_agf_t, agf_flfirst),
2775                offsetof(xfs_agf_t, agf_fllast),
2776                offsetof(xfs_agf_t, agf_flcount),
2777                offsetof(xfs_agf_t, agf_freeblks),
2778                offsetof(xfs_agf_t, agf_longest),
2779                offsetof(xfs_agf_t, agf_btreeblks),
2780                offsetof(xfs_agf_t, agf_uuid),
2781                offsetof(xfs_agf_t, agf_rmap_blocks),
2782                offsetof(xfs_agf_t, agf_refcount_blocks),
2783                offsetof(xfs_agf_t, agf_refcount_root),
2784                offsetof(xfs_agf_t, agf_refcount_level),
2785                /* needed so that we don't log the whole rest of the structure: */
2786                offsetof(xfs_agf_t, agf_spare64),
2787                sizeof(xfs_agf_t)
2788        };
2789
2790        trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
2791
2792        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
2793
2794        xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2795        xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2796}
2797
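/*
 * Example (sketch): callers hand in an OR'ed mask of XFS_AGF_* bits;
 * xfs_btree_offsets() maps that mask through the offsets[] table above
 * onto the smallest [first, last] byte range that must be logged:
 *
 *        xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT);
 */
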
2798/*
2799 * Interface for inode allocation to force the pag data to be initialized.
2800 */
2801int                                     /* error */
2802xfs_alloc_pagf_init(
2803        xfs_mount_t             *mp,    /* file system mount structure */
2804        xfs_trans_t             *tp,    /* transaction pointer */
2805        xfs_agnumber_t          agno,   /* allocation group number */
2806        int                     flags)  /* XFS_ALLOC_FLAGS_... */
2807{
2808        struct xfs_buf          *bp;
2809        int                     error;
2810
2811        error = xfs_alloc_read_agf(mp, tp, agno, flags, &bp);
2812        if (!error)
2813                xfs_trans_brelse(tp, bp);
2814        return error;
2815}
2816
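/*
 * Example (sketch of a hypothetical inode-allocation caller): populate
 * the pagf fields before inspecting them; the error handling shown is
 * illustrative only:
 *
 *        if (!pag->pagf_init) {
 *                error = xfs_alloc_pagf_init(mp, tp, agno, flags);
 *                if (error)
 *                        return error;
 *        }
 *        ... pag->pagf_freeblks and pag->pagf_longest are now valid ...
 */
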
2817/*
2818 * Put the block on the freelist for the allocation group.
2819 */
2820int
2821xfs_alloc_put_freelist(
2822        struct xfs_trans        *tp,
2823        struct xfs_buf          *agbp,
2824        struct xfs_buf          *agflbp,
2825        xfs_agblock_t           bno,
2826        int                     btreeblk)
2827{
2828        struct xfs_mount        *mp = tp->t_mountp;
2829        struct xfs_agf          *agf = agbp->b_addr;
2830        struct xfs_perag        *pag;
2831        __be32                  *blockp;
2832        int                     error;
2833        int                     logflags;
2834        __be32                  *agfl_bno;
2835        int                     startoff;
2836
2837        if (!agflbp && (error = xfs_alloc_read_agfl(mp, tp,
2838                        be32_to_cpu(agf->agf_seqno), &agflbp)))
2839                return error;
2840        be32_add_cpu(&agf->agf_fllast, 1);
2841        if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
2842                agf->agf_fllast = 0;
2843
2844        pag = agbp->b_pag;
2845        ASSERT(!pag->pagf_agflreset);
2846        be32_add_cpu(&agf->agf_flcount, 1);
2847        pag->pagf_flcount++;
2848
2849        logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
2850        if (btreeblk) {
2851                be32_add_cpu(&agf->agf_btreeblks, -1);
2852                pag->pagf_btreeblks--;
2853                logflags |= XFS_AGF_BTREEBLKS;
2854        }
2855
2856        xfs_alloc_log_agf(tp, agbp, logflags);
2857
2858        ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
2859
2860        agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2861        blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
2862        *blockp = cpu_to_be32(bno);
2863        startoff = (char *)blockp - (char *)agflbp->b_addr;
2864
2865        xfs_alloc_log_agf(tp, agbp, logflags);
2866
2867        xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
2868        xfs_trans_log_buf(tp, agflbp, startoff,
2869                          startoff + sizeof(xfs_agblock_t) - 1);
2870        return 0;
2871}
2872
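/*
 * Example (sketch): returning a freed btree block to the AGFL, much as
 * xfs_allocbt_free_block() does; passing a NULL agflbp makes this
 * function read the AGFL buffer itself:
 *
 *        error = xfs_alloc_put_freelist(tp, agbp, NULL, bno, 1);
 *        if (error)
 *                return error;
 */
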
2873static xfs_failaddr_t
2874xfs_agf_verify(
2875        struct xfs_buf          *bp)
2876{
2877        struct xfs_mount        *mp = bp->b_mount;
2878        struct xfs_agf          *agf = bp->b_addr;
2879
2880        if (xfs_has_crc(mp)) {
2881                if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
2882                        return __this_address;
2883                if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
2884                        return __this_address;
2885        }
2886
2887        if (!xfs_verify_magic(bp, agf->agf_magicnum))
2888                return __this_address;
2889
2890        if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2891              be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2892              be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
2893              be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
2894              be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
2895                return __this_address;
2896
2897        if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
2898                return __this_address;
2899
2900        if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
2901            be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
2902                return __this_address;
2903
2904        if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
2905            be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
2906            be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > mp->m_ag_maxlevels ||
2907            be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > mp->m_ag_maxlevels)
2908                return __this_address;
2909
2910        if (xfs_has_rmapbt(mp) &&
2911            (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
2912             be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > mp->m_rmap_maxlevels))
2913                return __this_address;
2914
2915        if (xfs_has_rmapbt(mp) &&
2916            be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
2917                return __this_address;
2918
2919        /*
2920         * During growfs operations the perag is not fully initialised, so it
2921         * cannot be used for any useful checking. growfs makes this situation
2922         * detectable by using uncached buffers, which carry no perag, so the
2923         * seqno check below is skipped in that case.
2924         */
2925        if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
2926                return __this_address;
2927
2928        if (xfs_has_lazysbcount(mp) &&
2929            be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
2930                return __this_address;
2931
2932        if (xfs_has_reflink(mp) &&
2933            be32_to_cpu(agf->agf_refcount_blocks) >
2934            be32_to_cpu(agf->agf_length))
2935                return __this_address;
2936
2937        if (xfs_has_reflink(mp) &&
2938            (be32_to_cpu(agf->agf_refcount_level) < 1 ||
2939             be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels))
2940                return __this_address;
2941
2942        return NULL;
2943
2944}
2945
2946static void
2947xfs_agf_read_verify(
2948        struct xfs_buf  *bp)
2949{
2950        struct xfs_mount *mp = bp->b_mount;
2951        xfs_failaddr_t  fa;
2952
2953        if (xfs_has_crc(mp) &&
2954            !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
2955                xfs_verifier_error(bp, -EFSBADCRC, __this_address);
2956        else {
2957                fa = xfs_agf_verify(bp);
2958                if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
2959                        xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2960        }
2961}
2962
2963static void
2964xfs_agf_write_verify(
2965        struct xfs_buf  *bp)
2966{
2967        struct xfs_mount        *mp = bp->b_mount;
2968        struct xfs_buf_log_item *bip = bp->b_log_item;
2969        struct xfs_agf          *agf = bp->b_addr;
2970        xfs_failaddr_t          fa;
2971
2972        fa = xfs_agf_verify(bp);
2973        if (fa) {
2974                xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2975                return;
2976        }
2977
2978        if (!xfs_has_crc(mp))
2979                return;
2980
2981        if (bip)
2982                agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
2983
2984        xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
2985}
2986
2987const struct xfs_buf_ops xfs_agf_buf_ops = {
2988        .name = "xfs_agf",
2989        .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
2990        .verify_read = xfs_agf_read_verify,
2991        .verify_write = xfs_agf_write_verify,
2992        .verify_struct = xfs_agf_verify,
2993};
2994
2995/*
2996 * Read in the allocation group header (free/alloc section).
2997 */
2998int                                     /* error */
2999xfs_read_agf(
3000        struct xfs_mount        *mp,    /* mount point structure */
3001        struct xfs_trans        *tp,    /* transaction pointer */
3002        xfs_agnumber_t          agno,   /* allocation group number */
3003        int                     flags,  /* XFS_BUF_ */
3004        struct xfs_buf          **bpp)  /* buffer for the ag freelist header */
3005{
3006        int             error;
3007
3008        trace_xfs_read_agf(mp, agno);
3009
3010        ASSERT(agno != NULLAGNUMBER);
3011        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3012                        XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
3013                        XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
3014        if (error)
3015                return error;
3016
3017        ASSERT(!(*bpp)->b_error);
3018        xfs_buf_set_ref(*bpp, XFS_AGF_REF);
3019        return 0;
3020}
3021
3022/*
3023 * Read in the AGF and initialise the per-ag free space summary data.
3024 */
3025int                                     /* error */
3026xfs_alloc_read_agf(
3027        struct xfs_mount        *mp,    /* mount point structure */
3028        struct xfs_trans        *tp,    /* transaction pointer */
3029        xfs_agnumber_t          agno,   /* allocation group number */
3030        int                     flags,  /* XFS_ALLOC_FLAG_... */
3031        struct xfs_buf          **bpp)  /* buffer for the ag freelist header */
3032{
3033        struct xfs_agf          *agf;           /* ag freelist header */
3034        struct xfs_perag        *pag;           /* per allocation group data */
3035        int                     error;
3036        int                     allocbt_blks;
3037
3038        trace_xfs_alloc_read_agf(mp, agno);
3039
3040        /* We don't support trylock when freeing. */
3041        ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
3042                        (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
3043        ASSERT(agno != NULLAGNUMBER);
3044        error = xfs_read_agf(mp, tp, agno,
3045                        (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
3046                        bpp);
3047        if (error)
3048                return error;
3049        ASSERT(!(*bpp)->b_error);
3050
3051        agf = (*bpp)->b_addr;
3052        pag = (*bpp)->b_pag;
3053        if (!pag->pagf_init) {
3054                pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
3055                pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
3056                pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3057                pag->pagf_longest = be32_to_cpu(agf->agf_longest);
3058                pag->pagf_levels[XFS_BTNUM_BNOi] =
3059                        be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
3060                pag->pagf_levels[XFS_BTNUM_CNTi] =
3061                        be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
3062                pag->pagf_levels[XFS_BTNUM_RMAPi] =
3063                        be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
3064                pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
3065                pag->pagf_init = 1;
3066                pag->pagf_agflreset = xfs_agfl_needs_reset(mp, agf);
3067
3068                /*
3069                 * Update the in-core allocbt counter. Filter out the rmapbt
3070                 * subset of the btreeblks counter because the rmapbt is managed
3071                 * by perag reservation. Subtract one for the rmapbt root block
3072                 * because the rmap counter includes it while the btreeblks
3073                 * counter only tracks non-root blocks.
3074                 */
3075                allocbt_blks = pag->pagf_btreeblks;
3076                if (xfs_has_rmapbt(mp))
3077                        allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
3078                if (allocbt_blks > 0)
3079                        atomic64_add(allocbt_blks, &mp->m_allocbt_blks);
3080        }
3081#ifdef DEBUG
3082        else if (!xfs_is_shutdown(mp)) {
3083                ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
3084                ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
3085                ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3086                ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
3087                ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
3088                       be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
3089                ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
3090                       be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
3091        }
3092#endif
3093        return 0;
3094}
3095
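/*
 * Example (sketch): an opportunistic read that skips a busy AG instead
 * of blocking; a failed XBF_TRYLOCK attempt surfaces as -EAGAIN, which
 * is how xfs_alloc_fix_freelist() skips contended AGs:
 *
 *        error = xfs_alloc_read_agf(mp, tp, agno, XFS_ALLOC_FLAG_TRYLOCK,
 *                        &agbp);
 *        if (error == -EAGAIN)
 *                ... skip this AG, try the next one ...
 *        else if (error)
 *                return error;
 */
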
3096/*
3097 * Allocate an extent (variable-size).
3098 * Depending on the allocation type, we either look in a single allocation
3099 * group or loop over the allocation groups to find the result.
3100 */
3101int                             /* error */
3102xfs_alloc_vextent(
3103        struct xfs_alloc_arg    *args)  /* allocation argument structure */
3104{
3105        xfs_agblock_t           agsize; /* allocation group size */
3106        int                     error;
3107        int                     flags;  /* XFS_ALLOC_FLAG_... locking flags */
3108        struct xfs_mount        *mp;    /* mount structure pointer */
3109        xfs_agnumber_t          sagno;  /* starting allocation group number */
3110        xfs_alloctype_t         type;   /* input allocation type */
3111        int                     bump_rotor = 0;
3112        xfs_agnumber_t          rotorstep = xfs_rotorstep; /* inode32 agf stepper */
3113
3114        mp = args->mp;
3115        type = args->otype = args->type;
3116        args->agbno = NULLAGBLOCK;
3117        /*
3118         * Clamp maxlen to the a.g. size, for the case where the last a.g.
3119         * is shorter (or there's only one a.g.) and the caller couldn't
3120         * easily figure that out (xfs_bmap_alloc).
3121         */
3122        agsize = mp->m_sb.sb_agblocks;
3123        if (args->maxlen > agsize)
3124                args->maxlen = agsize;
3125        if (args->alignment == 0)
3126                args->alignment = 1;
3127        ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
3128        ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
3129        ASSERT(args->minlen <= args->maxlen);
3130        ASSERT(args->minlen <= agsize);
3131        ASSERT(args->mod < args->prod);
3132        if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
3133            XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
3134            args->minlen > args->maxlen || args->minlen > agsize ||
3135            args->mod >= args->prod) {
3136                args->fsbno = NULLFSBLOCK;
3137                trace_xfs_alloc_vextent_badargs(args);
3138                return 0;
3139        }
3140
3141        switch (type) {
3142        case XFS_ALLOCTYPE_THIS_AG:
3143        case XFS_ALLOCTYPE_NEAR_BNO:
3144        case XFS_ALLOCTYPE_THIS_BNO:
3145                /*
3146                 * These three force us into a single a.g.
3147                 */
3148                args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3149                args->pag = xfs_perag_get(mp, args->agno);
3150                error = xfs_alloc_fix_freelist(args, 0);
3151                if (error) {
3152                        trace_xfs_alloc_vextent_nofix(args);
3153                        goto error0;
3154                }
3155                if (!args->agbp) {
3156                        trace_xfs_alloc_vextent_noagbp(args);
3157                        break;
3158                }
3159                args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3160                if ((error = xfs_alloc_ag_vextent(args)))
3161                        goto error0;
3162                break;
3163        case XFS_ALLOCTYPE_START_BNO:
3164                /*
3165                 * Try near allocation first; fall back to anywhere-in-ag
3166                 * once allocation in the first a.g. fails.
3167                 */
3168                if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3169                    xfs_is_inode32(mp)) {
3170                        args->fsbno = XFS_AGB_TO_FSB(mp,
3171                                        ((mp->m_agfrotor / rotorstep) %
3172                                        mp->m_sb.sb_agcount), 0);
3173                        bump_rotor = 1;
3174                }
3175                args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3176                args->type = XFS_ALLOCTYPE_NEAR_BNO;
3177                fallthrough;
3178        case XFS_ALLOCTYPE_FIRST_AG:
3179                /*
3180                 * Rotate through the allocation groups looking for a winner.
3181                 */
3182                if (type == XFS_ALLOCTYPE_FIRST_AG) {
3183                        /*
3184                         * Start with allocation group given by bno.
3185                         */
3186                        args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3187                        args->type = XFS_ALLOCTYPE_THIS_AG;
3188                        sagno = 0;
3189                        flags = 0;
3190                } else {
3191                        /*
3192                         * Start with the given allocation group.
3193                         */
3194                        args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3195                        flags = XFS_ALLOC_FLAG_TRYLOCK;
3196                }
3197                /*
3198                 * Loop over allocation groups twice; first time with
3199                 * trylock set, second time without.
3200                 */
3201                for (;;) {
3202                        args->pag = xfs_perag_get(mp, args->agno);
3203                        error = xfs_alloc_fix_freelist(args, flags);
3204                        if (error) {
3205                                trace_xfs_alloc_vextent_nofix(args);
3206                                goto error0;
3207                        }
3208                        /*
3209                         * If we get a buffer back then the allocation will fly.
3210                         */
3211                        if (args->agbp) {
3212                                if ((error = xfs_alloc_ag_vextent(args)))
3213                                        goto error0;
3214                                break;
3215                        }
3216
3217                        trace_xfs_alloc_vextent_loopfailed(args);
3218
3219                        /*
3220                         * Didn't work, figure out the next iteration.
3221                         */
3222                        if (args->agno == sagno &&
3223                            type == XFS_ALLOCTYPE_START_BNO)
3224                                args->type = XFS_ALLOCTYPE_THIS_AG;
3225                        /*
3226                         * For the first allocation, we can try any AG to get
3227                         * space.  However, if we already have allocated a
3228                         * block, we don't want to try AGs whose number is below
3229                         * sagno. Otherwise, we may end up with out-of-order
3230                         * locking of AGF, which might cause deadlock.
3231                         */
3232                        if (++(args->agno) == mp->m_sb.sb_agcount) {
3233                                if (args->tp->t_firstblock != NULLFSBLOCK)
3234                                        args->agno = sagno;
3235                                else
3236                                        args->agno = 0;
3237                        }
3238                        /*
3239                         * Reached the starting a.g., must either be done
3240                         * or switch to non-trylock mode.
3241                         */
3242                        if (args->agno == sagno) {
3243                                if (flags == 0) {
3244                                        args->agbno = NULLAGBLOCK;
3245                                        trace_xfs_alloc_vextent_allfailed(args);
3246                                        break;
3247                                }
3248
3249                                flags = 0;
3250                                if (type == XFS_ALLOCTYPE_START_BNO) {
3251                                        args->agbno = XFS_FSB_TO_AGBNO(mp,
3252                                                args->fsbno);
3253                                        args->type = XFS_ALLOCTYPE_NEAR_BNO;
3254                                }
3255                        }
3256                        xfs_perag_put(args->pag);
3257                }
3258                if (bump_rotor) {
3259                        if (args->agno == sagno)
3260                                mp->m_agfrotor = (mp->m_agfrotor + 1) %
3261                                        (mp->m_sb.sb_agcount * rotorstep);
3262                        else
3263                                mp->m_agfrotor = (args->agno * rotorstep + 1) %
3264                                        (mp->m_sb.sb_agcount * rotorstep);
3265                }
3266                break;
3267        default:
3268                ASSERT(0);
3269                /* NOTREACHED */
3270        }
3271        if (args->agbno == NULLAGBLOCK)
3272                args->fsbno = NULLFSBLOCK;
3273        else {
3274                args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
3275#ifdef DEBUG
3276                ASSERT(args->len >= args->minlen);
3277                ASSERT(args->len <= args->maxlen);
3278                ASSERT(args->agbno % args->alignment == 0);
3279                XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
3280                        args->len);
3281#endif
3282
3283        }
3284        xfs_perag_put(args->pag);
3285        return 0;
3286error0:
3287        xfs_perag_put(args->pag);
3288        return error;
3289}
3290
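/*
 * Example (sketch): minimal argument setup for a near-bno allocation;
 * the field values are illustrative, not a recommended policy:
 *
 *        struct xfs_alloc_arg        args = {
 *                .tp        = tp,
 *                .mp        = mp,
 *                .fsbno     = target,        /* allocation hint */
 *                .type      = XFS_ALLOCTYPE_START_BNO,
 *                .minlen    = 1,
 *                .maxlen    = len,
 *                .prod      = 1,
 *                .alignment = 1,
 *        };
 *
 *        error = xfs_alloc_vextent(&args);
 *        if (!error && args.fsbno != NULLFSBLOCK)
 *                ... [args.fsbno, args.fsbno + args.len) is now allocated ...
 */
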
3291/* Ensure that the freelist is at full capacity. */
3292int
3293xfs_free_extent_fix_freelist(
3294        struct xfs_trans        *tp,
3295        struct xfs_perag        *pag,
3296        struct xfs_buf          **agbp)
3297{
3298        struct xfs_alloc_arg    args;
3299        int                     error;
3300
3301        memset(&args, 0, sizeof(struct xfs_alloc_arg));
3302        args.tp = tp;
3303        args.mp = tp->t_mountp;
3304        args.agno = pag->pag_agno;
3305        args.pag = pag;
3306
3307        /*
3308         * validate that the block number is legal - this enables us to detect
3309         * and handle silent filesystem corruption rather than crashing.
3310         */
3311        if (args.agno >= args.mp->m_sb.sb_agcount)
3312                return -EFSCORRUPTED;
3313
3314        error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3315        if (error)
3316                return error;
3317
3318        *agbp = args.agbp;
3319        return 0;
3320}
3321
3322/*
3323 * Free an extent.
3324 * Just break up the extent address and hand off to xfs_free_ag_extent
3325 * after fixing up the freelist.
3326 */
3327int
3328__xfs_free_extent(
3329        struct xfs_trans                *tp,
3330        xfs_fsblock_t                   bno,
3331        xfs_extlen_t                    len,
3332        const struct xfs_owner_info     *oinfo,
3333        enum xfs_ag_resv_type           type,
3334        bool                            skip_discard)
3335{
3336        struct xfs_mount                *mp = tp->t_mountp;
3337        struct xfs_buf                  *agbp;
3338        xfs_agnumber_t                  agno = XFS_FSB_TO_AGNO(mp, bno);
3339        xfs_agblock_t                   agbno = XFS_FSB_TO_AGBNO(mp, bno);
3340        struct xfs_agf                  *agf;
3341        int                             error;
3342        unsigned int                    busy_flags = 0;
3343        struct xfs_perag                *pag;
3344
3345        ASSERT(len != 0);
3346        ASSERT(type != XFS_AG_RESV_AGFL);
3347
3348        if (XFS_TEST_ERROR(false, mp,
3349                        XFS_ERRTAG_FREE_EXTENT))
3350                return -EIO;
3351
3352        pag = xfs_perag_get(mp, agno);
3353        error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
3354        if (error)
3355                goto err;
3356        agf = agbp->b_addr;
3357
3358        if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
3359                error = -EFSCORRUPTED;
3360                goto err_release;
3361        }
3362
3363        /* validate the extent size is legal now that we have the agf locked */
3364        if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
3365                error = -EFSCORRUPTED;
3366                goto err_release;
3367        }
3368
3369        error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
3370        if (error)
3371                goto err_release;
3372
3373        if (skip_discard)
3374                busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
3375        xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
3376        xfs_perag_put(pag);
3377        return 0;
3378
3379err_release:
3380        xfs_trans_brelse(tp, agbp);
3381err:
3382        xfs_perag_put(pag);
3383        return error;
3384}
3385
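/*
 * Example (sketch): most callers use the xfs_free_extent() wrapper from
 * xfs_alloc.h, which is __xfs_free_extent() with skip_discard = false:
 *
 *        error = xfs_free_extent(tp, fsbno, len, &oinfo, XFS_AG_RESV_NONE);
 */
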
3386struct xfs_alloc_query_range_info {
3387        xfs_alloc_query_range_fn        fn;
3388        void                            *priv;
3389};
3390
3391/* Format btree record and pass to our callback. */
3392STATIC int
3393xfs_alloc_query_range_helper(
3394        struct xfs_btree_cur            *cur,
3395        const union xfs_btree_rec       *rec,
3396        void                            *priv)
3397{
3398        struct xfs_alloc_query_range_info       *query = priv;
3399        struct xfs_alloc_rec_incore             irec;
3400
3401        irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
3402        irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
3403        return query->fn(cur, &irec, query->priv);
3404}
3405
3406/* Find all free space within a given range of blocks. */
3407int
3408xfs_alloc_query_range(
3409        struct xfs_btree_cur                    *cur,
3410        const struct xfs_alloc_rec_incore       *low_rec,
3411        const struct xfs_alloc_rec_incore       *high_rec,
3412        xfs_alloc_query_range_fn                fn,
3413        void                                    *priv)
3414{
3415        union xfs_btree_irec                    low_brec;
3416        union xfs_btree_irec                    high_brec;
3417        struct xfs_alloc_query_range_info       query;
3418
3419        ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3420        low_brec.a = *low_rec;
3421        high_brec.a = *high_rec;
3422        query.priv = priv;
3423        query.fn = fn;
3424        return xfs_btree_query_range(cur, &low_brec, &high_brec,
3425                        xfs_alloc_query_range_helper, &query);
3426}
3427
3428/* Find all free space records. */
3429int
3430xfs_alloc_query_all(
3431        struct xfs_btree_cur                    *cur,
3432        xfs_alloc_query_range_fn                fn,
3433        void                                    *priv)
3434{
3435        struct xfs_alloc_query_range_info       query;
3436
3437        ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3438        query.priv = priv;
3439        query.fn = fn;
3440        return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
3441}
3442
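/*
 * Example (sketch): a callback matching xfs_alloc_query_range_fn that
 * tallies free blocks; the names count_free and freeblks are
 * hypothetical:
 *
 *        STATIC int
 *        count_free(
 *                struct xfs_btree_cur                    *cur,
 *                const struct xfs_alloc_rec_incore       *rec,
 *                void                                    *priv)
 *        {
 *                *(xfs_extlen_t *)priv += rec->ar_blockcount;
 *                return 0;
 *        }
 *
 *        error = xfs_alloc_query_all(cur, count_free, &freeblks);
 */
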
3443/* Is there a record covering any part of a given extent? */
3444int
3445xfs_alloc_has_record(
3446        struct xfs_btree_cur    *cur,
3447        xfs_agblock_t           bno,
3448        xfs_extlen_t            len,
3449        bool                    *exists)
3450{
3451        union xfs_btree_irec    low;
3452        union xfs_btree_irec    high;
3453
3454        memset(&low, 0, sizeof(low));
3455        low.a.ar_startblock = bno;
3456        memset(&high, 0xFF, sizeof(high));
3457        high.a.ar_startblock = bno + len - 1;
3458
3459        return xfs_btree_has_record(cur, &low, &high, exists);
3460}
3461
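/*
 * Example (sketch): scrub-style cross-reference that asks whether any
 * bnobt record touches the extent; cur is assumed to be a bnobt cursor:
 *
 *        error = xfs_alloc_has_record(cur, agbno, len, &exists);
 *        if (!error && exists)
 *                ... some part of [agbno, agbno + len) is free space ...
 */
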
3462/*
3463 * Walk all the blocks in the AGFL.  The @walk_fn can return any negative
3464 * error code or XFS_ITER_*.
3465 */
3466int
3467xfs_agfl_walk(
3468        struct xfs_mount        *mp,
3469        struct xfs_agf          *agf,
3470        struct xfs_buf          *agflbp,
3471        xfs_agfl_walk_fn        walk_fn,
3472        void                    *priv)
3473{
3474        __be32                  *agfl_bno;
3475        unsigned int            i;
3476        int                     error;
3477
3478        agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3479        i = be32_to_cpu(agf->agf_flfirst);
3480
3481        /* Nothing to walk in an empty AGFL. */
3482        if (agf->agf_flcount == cpu_to_be32(0))
3483                return 0;
3484
3485        /* Otherwise, walk from first to last, wrapping as needed. */
3486        for (;;) {
3487                error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
3488                if (error)
3489                        return error;
3490                if (i == be32_to_cpu(agf->agf_fllast))
3491                        break;
3492                if (++i == xfs_agfl_size(mp))
3493                        i = 0;
3494        }
3495
3496        return 0;
3497}
3498
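/*
 * Example (sketch): counting AGFL blocks with a trivial walk_fn; the
 * callback must match xfs_agfl_walk_fn, and count_agfl_block is a
 * hypothetical name:
 *
 *        static int
 *        count_agfl_block(
 *                struct xfs_mount        *mp,
 *                xfs_agblock_t           bno,
 *                void                    *priv)
 *        {
 *                (*(unsigned int *)priv)++;
 *                return 0;
 *        }
 *
 *        error = xfs_agfl_walk(mp, agf, agflbp, count_agfl_block, &count);
 */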