linux/fs/xfs/libxfs/xfs_ialloc.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   4 * All Rights Reserved.
   5 */
   6#include "xfs.h"
   7#include "xfs_fs.h"
   8#include "xfs_shared.h"
   9#include "xfs_format.h"
  10#include "xfs_log_format.h"
  11#include "xfs_trans_resv.h"
  12#include "xfs_bit.h"
  13#include "xfs_sb.h"
  14#include "xfs_mount.h"
  15#include "xfs_defer.h"
  16#include "xfs_inode.h"
  17#include "xfs_btree.h"
  18#include "xfs_ialloc.h"
  19#include "xfs_ialloc_btree.h"
  20#include "xfs_alloc.h"
  21#include "xfs_rtalloc.h"
  22#include "xfs_errortag.h"
  23#include "xfs_error.h"
  24#include "xfs_bmap.h"
  25#include "xfs_cksum.h"
  26#include "xfs_trans.h"
  27#include "xfs_buf_item.h"
  28#include "xfs_icreate_item.h"
  29#include "xfs_icache.h"
  30#include "xfs_trace.h"
  31#include "xfs_log.h"
  32#include "xfs_rmap.h"
  33
  34
  35/*
  36 * Allocation group level functions.
  37 */
  38int
  39xfs_ialloc_cluster_alignment(
  40        struct xfs_mount        *mp)
  41{
  42        if (xfs_sb_version_hasalign(&mp->m_sb) &&
  43            mp->m_sb.sb_inoalignmt >= xfs_icluster_size_fsb(mp))
  44                return mp->m_sb.sb_inoalignmt;
  45        return 1;
  46}
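     /*
      * Illustrative example (editorial, not in the original source): on a
      * filesystem with 4k blocks and an 8k inode cluster,
      * xfs_icluster_size_fsb() is 2, so an sb_inoalignmt of 4 satisfies the
      * check above and chunk allocations are aligned to 4 fsblocks. Were
      * sb_inoalignmt smaller than the cluster size, the alignment would
      * fall back to 1 fsblock.
      */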
  47
  48/*
  49 * Lookup a record by ino in the btree given by cur.
  50 */
  51int                                     /* error */
  52xfs_inobt_lookup(
  53        struct xfs_btree_cur    *cur,   /* btree cursor */
  54        xfs_agino_t             ino,    /* starting inode of chunk */
  55        xfs_lookup_t            dir,    /* <=, >=, == */
  56        int                     *stat)  /* success/failure */
  57{
  58        cur->bc_rec.i.ir_startino = ino;
  59        cur->bc_rec.i.ir_holemask = 0;
  60        cur->bc_rec.i.ir_count = 0;
  61        cur->bc_rec.i.ir_freecount = 0;
  62        cur->bc_rec.i.ir_free = 0;
  63        return xfs_btree_lookup(cur, dir, stat);
  64}
  65
  66/*
  67 * Update the record referred to by cur to the value given.
  68 * This either works (return 0) or gets an EFSCORRUPTED error.
  69 */
  70STATIC int                              /* error */
  71xfs_inobt_update(
  72        struct xfs_btree_cur    *cur,   /* btree cursor */
  73        xfs_inobt_rec_incore_t  *irec)  /* btree record */
  74{
  75        union xfs_btree_rec     rec;
  76
  77        rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
  78        if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
  79                rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
  80                rec.inobt.ir_u.sp.ir_count = irec->ir_count;
  81                rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
  82        } else {
  83                /* ir_holemask/ir_count not supported on-disk */
  84                rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
  85        }
  86        rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
  87        return xfs_btree_update(cur, &rec);
  88}
  89
  90/* Convert on-disk btree record to incore inobt record. */
  91void
  92xfs_inobt_btrec_to_irec(
  93        struct xfs_mount                *mp,
  94        union xfs_btree_rec             *rec,
  95        struct xfs_inobt_rec_incore     *irec)
  96{
  97        irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
  98        if (xfs_sb_version_hassparseinodes(&mp->m_sb)) {
  99                irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
 100                irec->ir_count = rec->inobt.ir_u.sp.ir_count;
 101                irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
 102        } else {
 103                /*
 104                 * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
 105                 * values for full inode chunks.
 106                 */
 107                irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
 108                irec->ir_count = XFS_INODES_PER_CHUNK;
 109                irec->ir_freecount =
 110                                be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
 111        }
 112        irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
 113}
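     /*
      * Worked example (editorial): each of the 16 holemask bits covers
      * XFS_INODES_PER_HOLEMASK_BIT (64 / 16 = 4) inodes, and a set bit
      * marks a hole. A sparse record with ir_holemask 0xff00 thus has its
      * upper 32 inodes unallocated and carries ir_count = 32, while a full
      * chunk uses the hardcoded XFS_INOBT_HOLEMASK_FULL (0) and
      * XFS_INODES_PER_CHUNK (64) seen above.
      */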
 114
 115/*
 116 * Get the data from the pointed-to record.
 117 */
 118int
 119xfs_inobt_get_rec(
 120        struct xfs_btree_cur            *cur,
 121        struct xfs_inobt_rec_incore     *irec,
 122        int                             *stat)
 123{
 124        struct xfs_mount                *mp = cur->bc_mp;
 125        xfs_agnumber_t                  agno = cur->bc_private.a.agno;
 126        union xfs_btree_rec             *rec;
 127        int                             error;
 128        uint64_t                        realfree;
 129
 130        error = xfs_btree_get_rec(cur, &rec, stat);
 131        if (error || *stat == 0)
 132                return error;
 133
 134        xfs_inobt_btrec_to_irec(mp, rec, irec);
 135
 136        if (!xfs_verify_agino(mp, agno, irec->ir_startino))
 137                goto out_bad_rec;
 138        if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
 139            irec->ir_count > XFS_INODES_PER_CHUNK)
 140                goto out_bad_rec;
 141        if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
 142                goto out_bad_rec;
 143
  144        /* the free mask, ignoring any hole bits, must agree with ir_freecount */
 145        if (!xfs_inobt_issparse(irec->ir_holemask))
 146                realfree = irec->ir_free;
 147        else
 148                realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec);
 149        if (hweight64(realfree) != irec->ir_freecount)
 150                goto out_bad_rec;
 151
 152        return 0;
 153
 154out_bad_rec:
 155        xfs_warn(mp,
 156                "%s Inode BTree record corruption in AG %d detected!",
 157                cur->bc_btnum == XFS_BTNUM_INO ? "Used" : "Free", agno);
 158        xfs_warn(mp,
 159"start inode 0x%x, count 0x%x, free 0x%x freemask 0x%llx, holemask 0x%x",
 160                irec->ir_startino, irec->ir_count, irec->ir_freecount,
 161                irec->ir_free, irec->ir_holemask);
 162        return -EFSCORRUPTED;
 163}
 164
 165/*
 166 * Insert a single inobt record. Cursor must already point to desired location.
 167 */
 168int
 169xfs_inobt_insert_rec(
 170        struct xfs_btree_cur    *cur,
 171        uint16_t                holemask,
 172        uint8_t                 count,
 173        int32_t                 freecount,
 174        xfs_inofree_t           free,
 175        int                     *stat)
 176{
 177        cur->bc_rec.i.ir_holemask = holemask;
 178        cur->bc_rec.i.ir_count = count;
 179        cur->bc_rec.i.ir_freecount = freecount;
 180        cur->bc_rec.i.ir_free = free;
 181        return xfs_btree_insert(cur, stat);
 182}
 183
 184/*
 185 * Insert records describing a newly allocated inode chunk into the inobt.
 186 */
 187STATIC int
 188xfs_inobt_insert(
 189        struct xfs_mount        *mp,
 190        struct xfs_trans        *tp,
 191        struct xfs_buf          *agbp,
 192        xfs_agino_t             newino,
 193        xfs_agino_t             newlen,
 194        xfs_btnum_t             btnum)
 195{
 196        struct xfs_btree_cur    *cur;
 197        struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
 198        xfs_agnumber_t          agno = be32_to_cpu(agi->agi_seqno);
 199        xfs_agino_t             thisino;
 200        int                     i;
 201        int                     error;
 202
 203        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
 204
 205        for (thisino = newino;
 206             thisino < newino + newlen;
 207             thisino += XFS_INODES_PER_CHUNK) {
 208                error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
 209                if (error) {
 210                        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 211                        return error;
 212                }
 213                ASSERT(i == 0);
 214
 215                error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
 216                                             XFS_INODES_PER_CHUNK,
 217                                             XFS_INODES_PER_CHUNK,
 218                                             XFS_INOBT_ALL_FREE, &i);
 219                if (error) {
 220                        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 221                        return error;
 222                }
 223                ASSERT(i == 1);
 224        }
 225
 226        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 227
 228        return 0;
 229}
 230
 231/*
 232 * Verify that the number of free inodes in the AGI is correct.
 233 */
 234#ifdef DEBUG
 235STATIC int
 236xfs_check_agi_freecount(
 237        struct xfs_btree_cur    *cur,
 238        struct xfs_agi          *agi)
 239{
 240        if (cur->bc_nlevels == 1) {
 241                xfs_inobt_rec_incore_t rec;
 242                int             freecount = 0;
 243                int             error;
 244                int             i;
 245
 246                error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
 247                if (error)
 248                        return error;
 249
 250                do {
 251                        error = xfs_inobt_get_rec(cur, &rec, &i);
 252                        if (error)
 253                                return error;
 254
 255                        if (i) {
 256                                freecount += rec.ir_freecount;
 257                                error = xfs_btree_increment(cur, 0, &i);
 258                                if (error)
 259                                        return error;
 260                        }
 261                } while (i == 1);
 262
 263                if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
 264                        ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
 265        }
 266        return 0;
 267}
 268#else
 269#define xfs_check_agi_freecount(cur, agi)       0
 270#endif
 271
 272/*
 273 * Initialise a new set of inodes. When called without a transaction context
 274 * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
 275 * than logging them (which in a transaction context puts them into the AIL
 276 * for writeback rather than the xfsbufd queue).
 277 */
 278int
 279xfs_ialloc_inode_init(
 280        struct xfs_mount        *mp,
 281        struct xfs_trans        *tp,
 282        struct list_head        *buffer_list,
 283        int                     icount,
 284        xfs_agnumber_t          agno,
 285        xfs_agblock_t           agbno,
 286        xfs_agblock_t           length,
 287        unsigned int            gen)
 288{
 289        struct xfs_buf          *fbuf;
 290        struct xfs_dinode       *free;
 291        int                     nbufs, blks_per_cluster, inodes_per_cluster;
 292        int                     version;
 293        int                     i, j;
 294        xfs_daddr_t             d;
 295        xfs_ino_t               ino = 0;
 296
 297        /*
 298         * Loop over the new block(s), filling in the inodes.  For small block
  299         * sizes, manipulate the inodes in buffers which are multiples of the
  300         * block size.
 301         */
 302        blks_per_cluster = xfs_icluster_size_fsb(mp);
 303        inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
 304        nbufs = length / blks_per_cluster;
 305
 306        /*
 307         * Figure out what version number to use in the inodes we create.  If
 308         * the superblock version has caught up to the one that supports the new
 309         * inode format, then use the new inode version.  Otherwise use the old
 310         * version so that old kernels will continue to be able to use the file
 311         * system.
 312         *
 313         * For v3 inodes, we also need to write the inode number into the inode,
 314         * so calculate the first inode number of the chunk here as
 315         * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
 316         * across multiple filesystem blocks (such as a cluster) and so cannot
 317         * be used in the cluster buffer loop below.
 318         *
 319         * Further, because we are writing the inode directly into the buffer
  320         * and calculating a CRC on the entire inode, we have to log the entire
  321         * inode so that the entire range the CRC covers is present in the log.
  322         * That means for v3 inodes we log the entire buffer rather than just the
 323         * inode cores.
 324         */
 325        if (xfs_sb_version_hascrc(&mp->m_sb)) {
 326                version = 3;
 327                ino = XFS_AGINO_TO_INO(mp, agno,
 328                                       XFS_OFFBNO_TO_AGINO(mp, agbno, 0));
 329
 330                /*
  331                 * Log the initialisation that is about to take place as a
  332                 * logical operation. This means the transaction does not
 333                 * need to log the physical changes to the inode buffers as log
 334                 * recovery will know what initialisation is actually needed.
 335                 * Hence we only need to log the buffers as "ordered" buffers so
 336                 * they track in the AIL as if they were physically logged.
 337                 */
 338                if (tp)
 339                        xfs_icreate_log(tp, agno, agbno, icount,
 340                                        mp->m_sb.sb_inodesize, length, gen);
 341        } else
 342                version = 2;
 343
 344        for (j = 0; j < nbufs; j++) {
 345                /*
 346                 * Get the block.
 347                 */
 348                d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
 349                fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
 350                                         mp->m_bsize * blks_per_cluster,
 351                                         XBF_UNMAPPED);
 352                if (!fbuf)
 353                        return -ENOMEM;
 354
 355                /* Initialize the inode buffers and log them appropriately. */
 356                fbuf->b_ops = &xfs_inode_buf_ops;
 357                xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
 358                for (i = 0; i < inodes_per_cluster; i++) {
 359                        int     ioffset = i << mp->m_sb.sb_inodelog;
 360                        uint    isize = xfs_dinode_size(version);
 361
 362                        free = xfs_make_iptr(mp, fbuf, i);
 363                        free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
 364                        free->di_version = version;
 365                        free->di_gen = cpu_to_be32(gen);
 366                        free->di_next_unlinked = cpu_to_be32(NULLAGINO);
 367
 368                        if (version == 3) {
 369                                free->di_ino = cpu_to_be64(ino);
 370                                ino++;
 371                                uuid_copy(&free->di_uuid,
 372                                          &mp->m_sb.sb_meta_uuid);
 373                                xfs_dinode_calc_crc(mp, free);
 374                        } else if (tp) {
 375                                /* just log the inode core */
 376                                xfs_trans_log_buf(tp, fbuf, ioffset,
 377                                                  ioffset + isize - 1);
 378                        }
 379                }
 380
 381                if (tp) {
 382                        /*
  383                         * Mark the buffer as an inode allocation buffer so it
  384                         * sticks in the AIL at the point of this allocation
  385                         * transaction. This ensures that it is on disk before
 386                         * the tail of the log can be moved past this
 387                         * transaction (i.e. by preventing relogging from moving
 388                         * it forward in the log).
 389                         */
 390                        xfs_trans_inode_alloc_buf(tp, fbuf);
 391                        if (version == 3) {
 392                                /*
  393                                 * Mark the buffer as ordered so that it is
  394                                 * not physically logged in the transaction but
  395                                 * is still tracked in the AIL as part of the
  396                                 * transaction and pins the log appropriately.
 397                                 */
 398                                xfs_trans_ordered_buf(tp, fbuf);
 399                        }
 400                } else {
 401                        fbuf->b_flags |= XBF_DONE;
 402                        xfs_buf_delwri_queue(fbuf, buffer_list);
 403                        xfs_buf_relse(fbuf);
 404                }
 405        }
 406        return 0;
 407}
 408
 409/*
 410 * Align startino and allocmask for a recently allocated sparse chunk such that
 411 * they are fit for insertion (or merge) into the on-disk inode btrees.
 412 *
 413 * Background:
 414 *
 415 * When enabled, sparse inode support increases the inode alignment from cluster
 416 * size to inode chunk size. This means that the minimum range between two
 417 * non-adjacent inode records in the inobt is large enough for a full inode
 418 * record. This allows for cluster sized, cluster aligned block allocation
 419 * without need to worry about whether the resulting inode record overlaps with
 420 * another record in the tree. Without this basic rule, we would have to deal
 421 * with the consequences of overlap by potentially undoing recent allocations in
 422 * the inode allocation codepath.
 423 *
 424 * Because of this alignment rule (which is enforced on mount), there are two
 425 * inobt possibilities for newly allocated sparse chunks. One is that the
 426 * aligned inode record for the chunk covers a range of inodes not already
 427 * covered in the inobt (i.e., it is safe to insert a new sparse record). The
 428 * other is that a record already exists at the aligned startino that considers
 429 * the newly allocated range as sparse. In the latter case, record content is
 430 * merged in hope that sparse inode chunks fill to full chunks over time.
 431 */
 432STATIC void
 433xfs_align_sparse_ino(
 434        struct xfs_mount                *mp,
 435        xfs_agino_t                     *startino,
 436        uint16_t                        *allocmask)
 437{
 438        xfs_agblock_t                   agbno;
 439        xfs_agblock_t                   mod;
 440        int                             offset;
 441
 442        agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
 443        mod = agbno % mp->m_sb.sb_inoalignmt;
 444        if (!mod)
 445                return;
 446
 447        /* calculate the inode offset and align startino */
 448        offset = mod << mp->m_sb.sb_inopblog;
 449        *startino -= offset;
 450
 451        /*
 452         * Since startino has been aligned down, left shift allocmask such that
 453         * it continues to represent the same physical inodes relative to the
 454         * new startino.
 455         */
 456        *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
 457}
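     /*
      * Worked example (editorial): assume 512 byte inodes in 4k blocks
      * (sb_inopblog = 3) and sb_inoalignmt = 8 fsblocks. A sparse chunk
      * allocated at agbno 20 gives mod = 20 % 8 = 4, so startino is pulled
      * back by 4 << 3 = 32 inodes to chunk-aligned agbno 16 and allocmask
      * is shifted left by 32 / 4 = 8 holemask bits, keeping it aimed at
      * the same physical inodes.
      */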
 458
 459/*
 460 * Determine whether the source inode record can merge into the target. Both
 461 * records must be sparse, the inode ranges must match and there must be no
 462 * allocation overlap between the records.
 463 */
 464STATIC bool
 465__xfs_inobt_can_merge(
 466        struct xfs_inobt_rec_incore     *trec,  /* tgt record */
 467        struct xfs_inobt_rec_incore     *srec)  /* src record */
 468{
 469        uint64_t                        talloc;
 470        uint64_t                        salloc;
 471
 472        /* records must cover the same inode range */
 473        if (trec->ir_startino != srec->ir_startino)
 474                return false;
 475
 476        /* both records must be sparse */
 477        if (!xfs_inobt_issparse(trec->ir_holemask) ||
 478            !xfs_inobt_issparse(srec->ir_holemask))
 479                return false;
 480
 481        /* both records must track some inodes */
 482        if (!trec->ir_count || !srec->ir_count)
 483                return false;
 484
 485        /* can't exceed capacity of a full record */
 486        if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
 487                return false;
 488
 489        /* verify there is no allocation overlap */
 490        talloc = xfs_inobt_irec_to_allocmask(trec);
 491        salloc = xfs_inobt_irec_to_allocmask(srec);
 492        if (talloc & salloc)
 493                return false;
 494
 495        return true;
 496}
 497
 498/*
 499 * Merge the source inode record into the target. The caller must call
 500 * __xfs_inobt_can_merge() to ensure the merge is valid.
 501 */
 502STATIC void
 503__xfs_inobt_rec_merge(
 504        struct xfs_inobt_rec_incore     *trec,  /* target */
 505        struct xfs_inobt_rec_incore     *srec)  /* src */
 506{
 507        ASSERT(trec->ir_startino == srec->ir_startino);
 508
 509        /* combine the counts */
 510        trec->ir_count += srec->ir_count;
 511        trec->ir_freecount += srec->ir_freecount;
 512
 513        /*
 514         * Merge the holemask and free mask. For both fields, 0 bits refer to
 515         * allocated inodes. We combine the allocated ranges with bitwise AND.
 516         */
 517        trec->ir_holemask &= srec->ir_holemask;
 518        trec->ir_free &= srec->ir_free;
 519}
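     /*
      * Illustrative merge (editorial): a target with holemask 0xff00
      * (upper half sparse) and a source with holemask 0x00ff (lower half
      * sparse) combine to holemask 0, i.e. a full chunk. The counts sum,
      * and ANDing the free masks leaves a bit set only where the record
      * that actually owns that inode marked it free.
      */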
 520
 521/*
 522 * Insert a new sparse inode chunk into the associated inode btree. The inode
 523 * record for the sparse chunk is pre-aligned to a startino that should match
 524 * any pre-existing sparse inode record in the tree. This allows sparse chunks
 525 * to fill over time.
 526 *
 527 * This function supports two modes of handling preexisting records depending on
 528 * the merge flag. If merge is true, the provided record is merged with the
 529 * existing record and updated in place. The merged record is returned in nrec.
 530 * If merge is false, an existing record is replaced with the provided record.
 531 * If no preexisting record exists, the provided record is always inserted.
 532 *
 533 * It is considered corruption if a merge is requested and not possible. Given
 534 * the sparse inode alignment constraints, this should never happen.
 535 */
 536STATIC int
 537xfs_inobt_insert_sprec(
 538        struct xfs_mount                *mp,
 539        struct xfs_trans                *tp,
 540        struct xfs_buf                  *agbp,
 541        int                             btnum,
 542        struct xfs_inobt_rec_incore     *nrec,  /* in/out: new/merged rec. */
 543        bool                            merge)  /* merge or replace */
 544{
 545        struct xfs_btree_cur            *cur;
 546        struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
 547        xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
 548        int                             error;
 549        int                             i;
 550        struct xfs_inobt_rec_incore     rec;
 551
 552        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);
 553
 554        /* the new record is pre-aligned so we know where to look */
 555        error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
 556        if (error)
 557                goto error;
 558        /* if nothing there, insert a new record and return */
 559        if (i == 0) {
 560                error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
 561                                             nrec->ir_count, nrec->ir_freecount,
 562                                             nrec->ir_free, &i);
 563                if (error)
 564                        goto error;
 565                XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
 566
 567                goto out;
 568        }
 569
 570        /*
 571         * A record exists at this startino. Merge or replace the record
 572         * depending on what we've been asked to do.
 573         */
 574        if (merge) {
 575                error = xfs_inobt_get_rec(cur, &rec, &i);
 576                if (error)
 577                        goto error;
 578                XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
 579                XFS_WANT_CORRUPTED_GOTO(mp,
 580                                        rec.ir_startino == nrec->ir_startino,
 581                                        error);
 582
 583                /*
 584                 * This should never fail. If we have coexisting records that
 585                 * cannot merge, something is seriously wrong.
 586                 */
 587                XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
 588                                        error);
 589
 590                trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
 591                                         rec.ir_holemask, nrec->ir_startino,
 592                                         nrec->ir_holemask);
 593
  594                /* merge into nrec so it carries the updated record */
 595                __xfs_inobt_rec_merge(nrec, &rec);
 596
 597                trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
 598                                          nrec->ir_holemask);
 599
 600                error = xfs_inobt_rec_check_count(mp, nrec);
 601                if (error)
 602                        goto error;
 603        }
 604
 605        error = xfs_inobt_update(cur, nrec);
 606        if (error)
 607                goto error;
 608
 609out:
 610        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 611        return 0;
 612error:
 613        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 614        return error;
 615}
 616
 617/*
 618 * Allocate new inodes in the allocation group specified by agbp.
 619 * Return 0 for success, else error code.
 620 */
 621STATIC int                              /* error code or 0 */
 622xfs_ialloc_ag_alloc(
 623        xfs_trans_t     *tp,            /* transaction pointer */
 624        xfs_buf_t       *agbp,          /* alloc group buffer */
 625        int             *alloc)
 626{
 627        xfs_agi_t       *agi;           /* allocation group header */
 628        xfs_alloc_arg_t args;           /* allocation argument structure */
 629        xfs_agnumber_t  agno;
 630        int             error;
 631        xfs_agino_t     newino;         /* new first inode's number */
 632        xfs_agino_t     newlen;         /* new number of inodes */
 633        int             isaligned = 0;  /* inode allocation at stripe unit */
 634                                        /* boundary */
 635        uint16_t        allocmask = (uint16_t) -1; /* init. to full chunk */
 636        struct xfs_inobt_rec_incore rec;
 637        struct xfs_perag *pag;
 638        int             do_sparse = 0;
 639
 640        memset(&args, 0, sizeof(args));
 641        args.tp = tp;
 642        args.mp = tp->t_mountp;
 643        args.fsbno = NULLFSBLOCK;
 644        xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INODES);
 645
 646#ifdef DEBUG
 647        /* randomly do sparse inode allocations */
 648        if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) &&
 649            args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks)
 650                do_sparse = prandom_u32() & 1;
 651#endif
 652
 653        /*
 654         * Locking will ensure that we don't have two callers in here
 655         * at one time.
 656         */
 657        newlen = args.mp->m_ialloc_inos;
 658        if (args.mp->m_maxicount &&
 659            percpu_counter_read_positive(&args.mp->m_icount) + newlen >
 660                                                        args.mp->m_maxicount)
 661                return -ENOSPC;
 662        args.minlen = args.maxlen = args.mp->m_ialloc_blks;
 663        /*
 664         * First try to allocate inodes contiguous with the last-allocated
 665         * chunk of inodes.  If the filesystem is striped, this will fill
 666         * an entire stripe unit with inodes.
 667         */
 668        agi = XFS_BUF_TO_AGI(agbp);
 669        newino = be32_to_cpu(agi->agi_newino);
 670        agno = be32_to_cpu(agi->agi_seqno);
 671        args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
 672                     args.mp->m_ialloc_blks;
 673        if (do_sparse)
 674                goto sparse_alloc;
 675        if (likely(newino != NULLAGINO &&
 676                  (args.agbno < be32_to_cpu(agi->agi_length)))) {
 677                args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 678                args.type = XFS_ALLOCTYPE_THIS_BNO;
 679                args.prod = 1;
 680
 681                /*
 682                 * We need to take into account alignment here to ensure that
 683                 * we don't modify the free list if we fail to have an exact
  684                 * block. If we don't have an exact match, and every other
  685                 * allocation attempt fails, we'll end up cancelling a
  686                 * dirty transaction and shutting down.
 687                 *
  688                 * For an exact allocation, alignment must be 1;
  689                 * however, we need to take cluster alignment into account when
 690                 * fixing up the freelist. Use the minalignslop field to
 691                 * indicate that extra blocks might be required for alignment,
 692                 * but not to use them in the actual exact allocation.
 693                 */
 694                args.alignment = 1;
 695                args.minalignslop = xfs_ialloc_cluster_alignment(args.mp) - 1;
 696
 697                /* Allow space for the inode btree to split. */
 698                args.minleft = args.mp->m_in_maxlevels - 1;
 699                if ((error = xfs_alloc_vextent(&args)))
 700                        return error;
 701
 702                /*
 703                 * This request might have dirtied the transaction if the AG can
 704                 * satisfy the request, but the exact block was not available.
 705                 * If the allocation did fail, subsequent requests will relax
 706                 * the exact agbno requirement and increase the alignment
 707                 * instead. It is critical that the total size of the request
 708                 * (len + alignment + slop) does not increase from this point
 709                 * on, so reset minalignslop to ensure it is not included in
 710                 * subsequent requests.
 711                 */
 712                args.minalignslop = 0;
 713        }
 714
 715        if (unlikely(args.fsbno == NULLFSBLOCK)) {
 716                /*
 717                 * Set the alignment for the allocation.
 718                 * If stripe alignment is turned on then align at stripe unit
 719                 * boundary.
 720                 * If the cluster size is smaller than a filesystem block
 721                 * then we're doing I/O for inodes in filesystem block size
 722                 * pieces, so don't need alignment anyway.
 723                 */
 724                isaligned = 0;
 725                if (args.mp->m_sinoalign) {
 726                        ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
 727                        args.alignment = args.mp->m_dalign;
 728                        isaligned = 1;
 729                } else
 730                        args.alignment = xfs_ialloc_cluster_alignment(args.mp);
 731                /*
 732                 * Need to figure out where to allocate the inode blocks.
  733                 * Ideally they should be spaced out through the AG.
 734                 * For now, just allocate blocks up front.
 735                 */
 736                args.agbno = be32_to_cpu(agi->agi_root);
 737                args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 738                /*
 739                 * Allocate a fixed-size extent of inodes.
 740                 */
 741                args.type = XFS_ALLOCTYPE_NEAR_BNO;
 742                args.prod = 1;
 743                /*
 744                 * Allow space for the inode btree to split.
 745                 */
 746                args.minleft = args.mp->m_in_maxlevels - 1;
 747                if ((error = xfs_alloc_vextent(&args)))
 748                        return error;
 749        }
 750
 751        /*
 752         * If stripe alignment is turned on, then try again with cluster
 753         * alignment.
 754         */
 755        if (isaligned && args.fsbno == NULLFSBLOCK) {
 756                args.type = XFS_ALLOCTYPE_NEAR_BNO;
 757                args.agbno = be32_to_cpu(agi->agi_root);
 758                args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 759                args.alignment = xfs_ialloc_cluster_alignment(args.mp);
 760                if ((error = xfs_alloc_vextent(&args)))
 761                        return error;
 762        }
 763
 764        /*
 765         * Finally, try a sparse allocation if the filesystem supports it and
 766         * the sparse allocation length is smaller than a full chunk.
 767         */
 768        if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
 769            args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
 770            args.fsbno == NULLFSBLOCK) {
 771sparse_alloc:
 772                args.type = XFS_ALLOCTYPE_NEAR_BNO;
 773                args.agbno = be32_to_cpu(agi->agi_root);
 774                args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 775                args.alignment = args.mp->m_sb.sb_spino_align;
 776                args.prod = 1;
 777
 778                args.minlen = args.mp->m_ialloc_min_blks;
 779                args.maxlen = args.minlen;
 780
 781                /*
 782                 * The inode record will be aligned to full chunk size. We must
 783                 * prevent sparse allocation from AG boundaries that result in
 784                 * invalid inode records, such as records that start at agbno 0
 785                 * or extend beyond the AG.
 786                 *
 787                 * Set min agbno to the first aligned, non-zero agbno and max to
 788                 * the last aligned agbno that is at least one full chunk from
 789                 * the end of the AG.
 790                 */
 791                args.min_agbno = args.mp->m_sb.sb_inoalignmt;
 792                args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
 793                                            args.mp->m_sb.sb_inoalignmt) -
 794                                 args.mp->m_ialloc_blks;
 795
 796                error = xfs_alloc_vextent(&args);
 797                if (error)
 798                        return error;
 799
 800                newlen = args.len << args.mp->m_sb.sb_inopblog;
 801                ASSERT(newlen <= XFS_INODES_PER_CHUNK);
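             /*
              * Editorial note: with XFS_INODES_PER_HOLEMASK_BIT == 4, a 32
              * inode sparse allocation, for example, yields allocmask
              * (1 << 8) - 1 = 0xff, i.e. the low eight holemask-granularity
              * bits (inodes 0-31) marked allocated.
              */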
 802                allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
 803        }
 804
 805        if (args.fsbno == NULLFSBLOCK) {
 806                *alloc = 0;
 807                return 0;
 808        }
 809        ASSERT(args.len == args.minlen);
 810
 811        /*
 812         * Stamp and write the inode buffers.
 813         *
 814         * Seed the new inode cluster with a random generation number. This
 815         * prevents short-term reuse of generation numbers if a chunk is
 816         * freed and then immediately reallocated. We use random numbers
 817         * rather than a linear progression to prevent the next generation
 818         * number from being easily guessable.
 819         */
 820        error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno,
 821                        args.agbno, args.len, prandom_u32());
 822
 823        if (error)
 824                return error;
 825        /*
 826         * Convert the results.
 827         */
 828        newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
 829
 830        if (xfs_inobt_issparse(~allocmask)) {
 831                /*
 832                 * We've allocated a sparse chunk. Align the startino and mask.
 833                 */
 834                xfs_align_sparse_ino(args.mp, &newino, &allocmask);
 835
 836                rec.ir_startino = newino;
 837                rec.ir_holemask = ~allocmask;
 838                rec.ir_count = newlen;
 839                rec.ir_freecount = newlen;
 840                rec.ir_free = XFS_INOBT_ALL_FREE;
 841
 842                /*
 843                 * Insert the sparse record into the inobt and allow for a merge
 844                 * if necessary. If a merge does occur, rec is updated to the
 845                 * merged record.
 846                 */
 847                error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
 848                                               &rec, true);
 849                if (error == -EFSCORRUPTED) {
 850                        xfs_alert(args.mp,
 851        "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
 852                                  XFS_AGINO_TO_INO(args.mp, agno,
 853                                                   rec.ir_startino),
 854                                  rec.ir_holemask, rec.ir_count);
 855                        xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
 856                }
 857                if (error)
 858                        return error;
 859
 860                /*
  861                 * For the finobt we can't merge the newly allocated range
  862                 * as we just did for the inobt: the original finobt record
  863                 * may or may not exist independent of whether physical
  864                 * inodes exist in this sparse chunk.
 865                 *
 866                 * We must update the finobt record based on the inobt record.
 867                 * rec contains the fully merged and up to date inobt record
 868                 * from the previous call. Set merge false to replace any
 869                 * existing record with this one.
 870                 */
 871                if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
 872                        error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
 873                                                       XFS_BTNUM_FINO, &rec,
 874                                                       false);
 875                        if (error)
 876                                return error;
 877                }
 878        } else {
 879                /* full chunk - insert new records to both btrees */
 880                error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
 881                                         XFS_BTNUM_INO);
 882                if (error)
 883                        return error;
 884
 885                if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
 886                        error = xfs_inobt_insert(args.mp, tp, agbp, newino,
 887                                                 newlen, XFS_BTNUM_FINO);
 888                        if (error)
 889                                return error;
 890                }
 891        }
 892
 893        /*
 894         * Update AGI counts and newino.
 895         */
 896        be32_add_cpu(&agi->agi_count, newlen);
 897        be32_add_cpu(&agi->agi_freecount, newlen);
 898        pag = xfs_perag_get(args.mp, agno);
 899        pag->pagi_freecount += newlen;
 900        pag->pagi_count += newlen;
 901        xfs_perag_put(pag);
 902        agi->agi_newino = cpu_to_be32(newino);
 903
 904        /*
 905         * Log allocation group header fields
 906         */
 907        xfs_ialloc_log_agi(tp, agbp,
 908                XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
 909        /*
 910         * Modify/log superblock values for inode count and inode free count.
 911         */
 912        xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
 913        xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
 914        *alloc = 1;
 915        return 0;
 916}
 917
 918STATIC xfs_agnumber_t
 919xfs_ialloc_next_ag(
 920        xfs_mount_t     *mp)
 921{
 922        xfs_agnumber_t  agno;
 923
 924        spin_lock(&mp->m_agirotor_lock);
 925        agno = mp->m_agirotor;
 926        if (++mp->m_agirotor >= mp->m_maxagi)
 927                mp->m_agirotor = 0;
 928        spin_unlock(&mp->m_agirotor_lock);
 929
 930        return agno;
 931}
 932
 933/*
 934 * Select an allocation group to look for a free inode in, based on the parent
 935 * inode and the mode.  Return the allocation group buffer.
 936 */
 937STATIC xfs_agnumber_t
 938xfs_ialloc_ag_select(
 939        xfs_trans_t     *tp,            /* transaction pointer */
 940        xfs_ino_t       parent,         /* parent directory inode number */
 941        umode_t         mode)           /* bits set to indicate file type */
 942{
 943        xfs_agnumber_t  agcount;        /* number of ag's in the filesystem */
 944        xfs_agnumber_t  agno;           /* current ag number */
 945        int             flags;          /* alloc buffer locking flags */
 946        xfs_extlen_t    ineed;          /* blocks needed for inode allocation */
 947        xfs_extlen_t    longest = 0;    /* longest extent available */
 948        xfs_mount_t     *mp;            /* mount point structure */
 949        int             needspace;      /* file mode implies space allocated */
 950        xfs_perag_t     *pag;           /* per allocation group data */
 951        xfs_agnumber_t  pagno;          /* parent (starting) ag number */
 952        int             error;
 953
 954        /*
 955         * Files of these types need at least one block if length > 0
 956         * (and they won't fit in the inode, but that's hard to figure out).
 957         */
 958        needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
 959        mp = tp->t_mountp;
 960        agcount = mp->m_maxagi;
 961        if (S_ISDIR(mode))
 962                pagno = xfs_ialloc_next_ag(mp);
 963        else {
 964                pagno = XFS_INO_TO_AGNO(mp, parent);
 965                if (pagno >= agcount)
 966                        pagno = 0;
 967        }
 968
 969        ASSERT(pagno < agcount);
 970
 971        /*
 972         * Loop through allocation groups, looking for one with a little
 973         * free space in it.  Note we don't look for free inodes, exactly.
  974                 * Instead, we account for the possibility that inodes will
  975                 * need to be allocated: if none are currently free, there
  976                 * must be room to allocate blocks for a new chunk.
 977         */
 978        agno = pagno;
 979        flags = XFS_ALLOC_FLAG_TRYLOCK;
 980        for (;;) {
 981                pag = xfs_perag_get(mp, agno);
 982                if (!pag->pagi_inodeok) {
 983                        xfs_ialloc_next_ag(mp);
 984                        goto nextag;
 985                }
 986
 987                if (!pag->pagi_init) {
 988                        error = xfs_ialloc_pagi_init(mp, tp, agno);
 989                        if (error)
 990                                goto nextag;
 991                }
 992
 993                if (pag->pagi_freecount) {
 994                        xfs_perag_put(pag);
 995                        return agno;
 996                }
 997
 998                if (!pag->pagf_init) {
 999                        error = xfs_alloc_pagf_init(mp, tp, agno, flags);
1000                        if (error)
1001                                goto nextag;
1002                }
1003
1004                /*
1005                 * Check that there is enough free space for the file plus a
1006                 * chunk of inodes if we need to allocate some. If this is the
1007                 * first pass across the AGs, take into account the potential
1008                 * space needed for alignment of inode chunks when checking the
1009                 * longest contiguous free space in the AG - this prevents us
1010                 * from getting ENOSPC because we have free space larger than
1011                 * m_ialloc_blks but alignment constraints prevent us from using
1012                 * it.
1013                 *
1014                 * If we can't find an AG with space for full alignment slack to
1015                 * be taken into account, we must be near ENOSPC in all AGs.
1016                 * Hence we don't include alignment for the second pass and so
1017                 * if we fail allocation due to alignment issues then it is most
1018                 * likely a real ENOSPC condition.
1019                 */
1020                ineed = mp->m_ialloc_min_blks;
1021                if (flags && ineed > 1)
1022                        ineed += xfs_ialloc_cluster_alignment(mp);
1023                longest = pag->pagf_longest;
1024                if (!longest)
1025                        longest = pag->pagf_flcount > 0;
1026
1027                if (pag->pagf_freeblks >= needspace + ineed &&
1028                    longest >= ineed) {
1029                        xfs_perag_put(pag);
1030                        return agno;
1031                }
1032nextag:
1033                xfs_perag_put(pag);
1034                /*
1035                 * No point in iterating over the rest, if we're shutting
1036                 * down.
1037                 */
1038                if (XFS_FORCED_SHUTDOWN(mp))
1039                        return NULLAGNUMBER;
1040                agno++;
1041                if (agno >= agcount)
1042                        agno = 0;
1043                if (agno == pagno) {
1044                        if (flags == 0)
1045                                return NULLAGNUMBER;
1046                        flags = 0;
1047                }
1048        }
1049}
1050
1051/*
1052 * Try to retrieve the next record to the left/right from the current one.
1053 */
1054STATIC int
1055xfs_ialloc_next_rec(
1056        struct xfs_btree_cur    *cur,
1057        xfs_inobt_rec_incore_t  *rec,
1058        int                     *done,
1059        int                     left)
1060{
1061        int                     error;
1062        int                     i;
1063
1064        if (left)
1065                error = xfs_btree_decrement(cur, 0, &i);
1066        else
1067                error = xfs_btree_increment(cur, 0, &i);
1068
1069        if (error)
1070                return error;
1071        *done = !i;
1072        if (i) {
1073                error = xfs_inobt_get_rec(cur, rec, &i);
1074                if (error)
1075                        return error;
1076                XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1077        }
1078
1079        return 0;
1080}
1081
1082STATIC int
1083xfs_ialloc_get_rec(
1084        struct xfs_btree_cur    *cur,
1085        xfs_agino_t             agino,
1086        xfs_inobt_rec_incore_t  *rec,
1087        int                     *done)
1088{
1089        int                     error;
1090        int                     i;
1091
1092        error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
1093        if (error)
1094                return error;
1095        *done = !i;
1096        if (i) {
1097                error = xfs_inobt_get_rec(cur, rec, &i);
1098                if (error)
1099                        return error;
1100                XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1101        }
1102
1103        return 0;
1104}
1105
1106/*
1107 * Return the offset of the first free inode in the record. If the inode chunk
1108 * is sparsely allocated, we convert the record holemask to inode granularity
1109 * and mask off the unallocated regions from the inode free mask.
1110 */
1111STATIC int
1112xfs_inobt_first_free_inode(
1113        struct xfs_inobt_rec_incore     *rec)
1114{
1115        xfs_inofree_t                   realfree;
1116
1117        /* if there are no holes, return the first available offset */
1118        if (!xfs_inobt_issparse(rec->ir_holemask))
1119                return xfs_lowbit64(rec->ir_free);
1120
1121        realfree = xfs_inobt_irec_to_allocmask(rec);
1122        realfree &= rec->ir_free;
1123
1124        return xfs_lowbit64(realfree);
1125}
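     /*
      * Worked example (editorial): for a sparse record with holemask
      * 0x00ff, the low 32 inodes are holes and the allocmask covers only
      * offsets 32-63. Masking ir_free with it discards the hole bits, so
      * xfs_lowbit64() returns the first genuinely free inode, e.g. offset
      * 32 when every allocated inode is free.
      */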
1126
1127/*
1128 * Allocate an inode using the inobt-only algorithm.
1129 */
1130STATIC int
1131xfs_dialloc_ag_inobt(
1132        struct xfs_trans        *tp,
1133        struct xfs_buf          *agbp,
1134        xfs_ino_t               parent,
1135        xfs_ino_t               *inop)
1136{
1137        struct xfs_mount        *mp = tp->t_mountp;
1138        struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
1139        xfs_agnumber_t          agno = be32_to_cpu(agi->agi_seqno);
1140        xfs_agnumber_t          pagno = XFS_INO_TO_AGNO(mp, parent);
1141        xfs_agino_t             pagino = XFS_INO_TO_AGINO(mp, parent);
1142        struct xfs_perag        *pag;
1143        struct xfs_btree_cur    *cur, *tcur;
1144        struct xfs_inobt_rec_incore rec, trec;
1145        xfs_ino_t               ino;
1146        int                     error;
1147        int                     offset;
1148        int                     i, j;
1149        int                     searchdistance = 10;
1150
1151        pag = xfs_perag_get(mp, agno);
1152
1153        ASSERT(pag->pagi_init);
1154        ASSERT(pag->pagi_inodeok);
1155        ASSERT(pag->pagi_freecount > 0);
1156
1157 restart_pagno:
1158        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
1159        /*
1160         * If pagino is 0 (this is the root inode allocation) use newino.
1161         * This must work because we've just allocated some.
1162         */
1163        if (!pagino)
1164                pagino = be32_to_cpu(agi->agi_newino);
1165
1166        error = xfs_check_agi_freecount(cur, agi);
1167        if (error)
1168                goto error0;
1169
1170        /*
1171         * If in the same AG as the parent, try to get near the parent.
1172         */
1173        if (pagno == agno) {
1174                int             doneleft;       /* done, to the left */
1175                int             doneright;      /* done, to the right */
1176
1177                error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
1178                if (error)
1179                        goto error0;
1180                XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1181
1182                error = xfs_inobt_get_rec(cur, &rec, &j);
1183                if (error)
1184                        goto error0;
1185                XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0);
1186
1187                if (rec.ir_freecount > 0) {
1188                        /*
1189                         * Found a free inode in the same chunk
1190                         * as the parent, done.
1191                         */
1192                        goto alloc_inode;
1193                }
1194
1195
1196                /*
1197                 * In the same AG as parent, but parent's chunk is full.
1198                 */
1199
1200                /* duplicate the cursor, search left & right simultaneously */
1201                error = xfs_btree_dup_cursor(cur, &tcur);
1202                if (error)
1203                        goto error0;
1204
1205                /*
1206                 * Skip to last blocks looked up if same parent inode.
1207                 */
1208                if (pagino != NULLAGINO &&
1209                    pag->pagl_pagino == pagino &&
1210                    pag->pagl_leftrec != NULLAGINO &&
1211                    pag->pagl_rightrec != NULLAGINO) {
1212                        error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
1213                                                   &trec, &doneleft);
1214                        if (error)
1215                                goto error1;
1216
1217                        error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
1218                                                   &rec, &doneright);
1219                        if (error)
1220                                goto error1;
1221                } else {
1222                        /* search left with tcur, back up 1 record */
1223                        error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
1224                        if (error)
1225                                goto error1;
1226
1227                        /* search right with cur, go forward 1 record. */
1228                        error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
1229                        if (error)
1230                                goto error1;
1231                }
1232
1233                /*
1234                 * Loop until we find an inode chunk with a free inode.
1235                 */
1236                while (--searchdistance > 0 && (!doneleft || !doneright)) {
1237                        int     useleft;  /* using left inode chunk this time */
1238
1239                        /* figure out the closer block if both are valid. */
1240                        if (!doneleft && !doneright) {
1241                                useleft = pagino -
1242                                 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
1243                                  rec.ir_startino - pagino;
1244                        } else {
1245                                useleft = !doneleft;
1246                        }
1247
1248                        /* free inodes to the left? */
1249                        if (useleft && trec.ir_freecount) {
1250                                xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1251                                cur = tcur;
1252
1253                                pag->pagl_leftrec = trec.ir_startino;
1254                                pag->pagl_rightrec = rec.ir_startino;
1255                                pag->pagl_pagino = pagino;
1256                                rec = trec;
1257                                goto alloc_inode;
1258                        }
1259
1260                        /* free inodes to the right? */
1261                        if (!useleft && rec.ir_freecount) {
1262                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1263
1264                                pag->pagl_leftrec = trec.ir_startino;
1265                                pag->pagl_rightrec = rec.ir_startino;
1266                                pag->pagl_pagino = pagino;
1267                                goto alloc_inode;
1268                        }
1269
1270                        /* get next record to check */
1271                        if (useleft) {
1272                                error = xfs_ialloc_next_rec(tcur, &trec,
1273                                                                 &doneleft, 1);
1274                        } else {
1275                                error = xfs_ialloc_next_rec(cur, &rec,
1276                                                                 &doneright, 0);
1277                        }
1278                        if (error)
1279                                goto error1;
1280                }
1281
1282                if (searchdistance <= 0) {
1283                        /*
1284                         * Not in range - save last search
1285                         * location and allocate a new inode
1286                         */
1287                        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1288                        pag->pagl_leftrec = trec.ir_startino;
1289                        pag->pagl_rightrec = rec.ir_startino;
1290                        pag->pagl_pagino = pagino;
1291
1292                } else {
1293                        /*
 1294                         * We've reached the end of the btree. Because we
 1295                         * only search a small chunk of the btree on each
 1296                         * search, there are obviously free inodes closer
 1297                         * to the parent inode than we are now. Restart
 1298                         * the search again.
1299                         */
1300                        pag->pagl_pagino = NULLAGINO;
1301                        pag->pagl_leftrec = NULLAGINO;
1302                        pag->pagl_rightrec = NULLAGINO;
1303                        xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1304                        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1305                        goto restart_pagno;
1306                }
1307        }
1308
1309        /*
1310         * In a different AG from the parent.
1311         * See if the most recently allocated block has any free.
1312         */
1313        if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
1314                error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
1315                                         XFS_LOOKUP_EQ, &i);
1316                if (error)
1317                        goto error0;
1318
1319                if (i == 1) {
1320                        error = xfs_inobt_get_rec(cur, &rec, &j);
1321                        if (error)
1322                                goto error0;
1323
1324                        if (j == 1 && rec.ir_freecount > 0) {
1325                                /*
1326                                 * The last chunk allocated in the group
1327                                 * still has a free inode.
1328                                 */
1329                                goto alloc_inode;
1330                        }
1331                }
1332        }
1333
1334        /*
1335         * None left in the last group, search the whole AG
1336         */
1337        error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
1338        if (error)
1339                goto error0;
1340        XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1341
1342        for (;;) {
1343                error = xfs_inobt_get_rec(cur, &rec, &i);
1344                if (error)
1345                        goto error0;
1346                XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1347                if (rec.ir_freecount > 0)
1348                        break;
1349                error = xfs_btree_increment(cur, 0, &i);
1350                if (error)
1351                        goto error0;
1352                XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1353        }
1354
1355alloc_inode:
1356        offset = xfs_inobt_first_free_inode(&rec);
1357        ASSERT(offset >= 0);
1358        ASSERT(offset < XFS_INODES_PER_CHUNK);
1359        ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
1360                                   XFS_INODES_PER_CHUNK) == 0);
1361        ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
1362        rec.ir_free &= ~XFS_INOBT_MASK(offset);
1363        rec.ir_freecount--;
1364        error = xfs_inobt_update(cur, &rec);
1365        if (error)
1366                goto error0;
1367        be32_add_cpu(&agi->agi_freecount, -1);
1368        xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1369        pag->pagi_freecount--;
1370
1371        error = xfs_check_agi_freecount(cur, agi);
1372        if (error)
1373                goto error0;
1374
1375        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1376        xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
1377        xfs_perag_put(pag);
1378        *inop = ino;
1379        return 0;
1380error1:
1381        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1382error0:
1383        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1384        xfs_perag_put(pag);
1385        return error;
1386}
1387
1388/*
1389 * Use the free inode btree to allocate an inode based on distance from the
1390 * parent. Note that the provided cursor may be deleted and replaced.
1391 */
1392STATIC int
1393xfs_dialloc_ag_finobt_near(
1394        xfs_agino_t                     pagino,
1395        struct xfs_btree_cur            **ocur,
1396        struct xfs_inobt_rec_incore     *rec)
1397{
1398        struct xfs_btree_cur            *lcur = *ocur;  /* left search cursor */
1399        struct xfs_btree_cur            *rcur;  /* right search cursor */
1400        struct xfs_inobt_rec_incore     rrec;
1401        int                             error;
1402        int                             i, j;
1403
1404        error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
1405        if (error)
1406                return error;
1407
1408        if (i == 1) {
1409                error = xfs_inobt_get_rec(lcur, rec, &i);
1410                if (error)
1411                        return error;
1412                XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1);
1413
1414                /*
1415                 * See if we've landed in the parent inode record. The finobt
1416                 * only tracks chunks with at least one free inode, so record
1417                 * existence is enough.
1418                 */
1419                if (pagino >= rec->ir_startino &&
1420                    pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
1421                        return 0;
1422        }
1423
1424        error = xfs_btree_dup_cursor(lcur, &rcur);
1425        if (error)
1426                return error;
1427
1428        error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
1429        if (error)
1430                goto error_rcur;
1431        if (j == 1) {
1432                error = xfs_inobt_get_rec(rcur, &rrec, &j);
1433                if (error)
1434                        goto error_rcur;
1435                XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur);
1436        }
1437
1438        XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur);
1439        if (i == 1 && j == 1) {
1440                /*
1441                 * Both the left and right records are valid. Choose the closer
1442                 * inode chunk to the target.
1443                 */
1444                if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
1445                    (rrec.ir_startino - pagino)) {
1446                        *rec = rrec;
1447                        xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
1448                        *ocur = rcur;
1449                } else {
1450                        xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
1451                }
1452        } else if (j == 1) {
1453                /* only the right record is valid */
1454                *rec = rrec;
1455                xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
1456                *ocur = rcur;
1457        } else if (i == 1) {
1458                /* only the left record is valid */
1459                xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
1460        }
1461
1462        return 0;
1463
1464error_rcur:
1465        xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
1466        return error;
1467}
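
/*
 * Editor's note (illustrative fragment, not kernel source): because the
 * function above may delete *ocur and replace it with the right-hand
 * cursor, callers must keep using the pointer they passed in. A sketch of
 * that contract, assuming mp, tp, agbp, agno and pagino are in scope as in
 * xfs_dialloc_ag() below:
 */
#if 0
        struct xfs_btree_cur            *cur;
        struct xfs_inobt_rec_incore     rec;
        int                             error;

        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
        error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
        /* 'cur' may now be a different cursor than the one created above */
        xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
#endif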
1468
1469/*
1470 * Use the free inode btree to find a free inode based on a newino hint. If
1471 * the hint is NULLAGINO, find the first free inode in the AG.
1472 */
1473STATIC int
1474xfs_dialloc_ag_finobt_newino(
1475        struct xfs_agi                  *agi,
1476        struct xfs_btree_cur            *cur,
1477        struct xfs_inobt_rec_incore     *rec)
1478{
1479        int error;
1480        int i;
1481
1482        if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
1483                error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
1484                                         XFS_LOOKUP_EQ, &i);
1485                if (error)
1486                        return error;
1487                if (i == 1) {
1488                        error = xfs_inobt_get_rec(cur, rec, &i);
1489                        if (error)
1490                                return error;
1491                        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1492                        return 0;
1493                }
1494        }
1495
1496        /*
1497         * Find the first inode available in the AG.
1498         */
1499        error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
1500        if (error)
1501                return error;
1502        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1503
1504        error = xfs_inobt_get_rec(cur, rec, &i);
1505        if (error)
1506                return error;
1507        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1508
1509        return 0;
1510}
1511
1512/*
1513 * Update the inobt based on a modification made to the finobt. Also ensure that
1514 * the records from both trees are equivalent post-modification.
1515 */
1516STATIC int
1517xfs_dialloc_ag_update_inobt(
1518        struct xfs_btree_cur            *cur,   /* inobt cursor */
1519        struct xfs_inobt_rec_incore     *frec,  /* finobt record */
1520        int                             offset) /* inode offset */
1521{
1522        struct xfs_inobt_rec_incore     rec;
1523        int                             error;
1524        int                             i;
1525
1526        error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
1527        if (error)
1528                return error;
1529        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1530
1531        error = xfs_inobt_get_rec(cur, &rec, &i);
1532        if (error)
1533                return error;
1534        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1535        ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
1536                                   XFS_INODES_PER_CHUNK) == 0);
1537
1538        rec.ir_free &= ~XFS_INOBT_MASK(offset);
1539        rec.ir_freecount--;
1540
1541        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) &&
1542                                  (rec.ir_freecount == frec->ir_freecount));
1543
1544        return xfs_inobt_update(cur, &rec);
1545}
1546
1547/*
1548 * Allocate an inode using the free inode btree, if available. Otherwise, fall
1549 * back to the inobt search algorithm.
1550 *
1551 * The caller selected an AG for us, and made sure that free inodes are
1552 * available.
1553 */
1554STATIC int
1555xfs_dialloc_ag(
1556        struct xfs_trans        *tp,
1557        struct xfs_buf          *agbp,
1558        xfs_ino_t               parent,
1559        xfs_ino_t               *inop)
1560{
1561        struct xfs_mount                *mp = tp->t_mountp;
1562        struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
1563        xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
1564        xfs_agnumber_t                  pagno = XFS_INO_TO_AGNO(mp, parent);
1565        xfs_agino_t                     pagino = XFS_INO_TO_AGINO(mp, parent);
1566        struct xfs_perag                *pag;
1567        struct xfs_btree_cur            *cur;   /* finobt cursor */
1568        struct xfs_btree_cur            *icur;  /* inobt cursor */
1569        struct xfs_inobt_rec_incore     rec;
1570        xfs_ino_t                       ino;
1571        int                             error;
1572        int                             offset;
1573        int                             i;
1574
1575        if (!xfs_sb_version_hasfinobt(&mp->m_sb))
1576                return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
1577
1578        pag = xfs_perag_get(mp, agno);
1579
1580        /*
1581         * If pagino is 0 (this is the root inode allocation) use newino.
1582         * This must work because we've just allocated some.
1583         */
1584        if (!pagino)
1585                pagino = be32_to_cpu(agi->agi_newino);
1586
1587        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
1588
1589        error = xfs_check_agi_freecount(cur, agi);
1590        if (error)
1591                goto error_cur;
1592
1593        /*
1594         * The search algorithm depends on whether we're in the same AG as the
1595         * parent. If so, find the closest available inode to the parent. If
1596         * not, consider the agi hint or find the first free inode in the AG.
1597         */
1598        if (agno == pagno)
1599                error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
1600        else
1601                error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
1602        if (error)
1603                goto error_cur;
1604
1605        offset = xfs_inobt_first_free_inode(&rec);
1606        ASSERT(offset >= 0);
1607        ASSERT(offset < XFS_INODES_PER_CHUNK);
1608        ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
1609                                   XFS_INODES_PER_CHUNK) == 0);
1610        ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
1611
1612        /*
1613         * Modify or remove the finobt record.
1614         */
1615        rec.ir_free &= ~XFS_INOBT_MASK(offset);
1616        rec.ir_freecount--;
1617        if (rec.ir_freecount)
1618                error = xfs_inobt_update(cur, &rec);
1619        else
1620                error = xfs_btree_delete(cur, &i);
1621        if (error)
1622                goto error_cur;
1623
1624        /*
1625         * The finobt has now been updated appropriately. We haven't updated the
1626         * agi and superblock yet, so we can create an inobt cursor and validate
1627         * the original freecount. If all is well, make the equivalent update to
1628         * the inobt using the finobt record and offset information.
1629         */
1630        icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
1631
1632        error = xfs_check_agi_freecount(icur, agi);
1633        if (error)
1634                goto error_icur;
1635
1636        error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
1637        if (error)
1638                goto error_icur;
1639
1640        /*
1641         * Both trees have now been updated. We must update the perag and
1642         * superblock before we can check the freecount for each btree.
1643         */
1644        be32_add_cpu(&agi->agi_freecount, -1);
1645        xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1646        pag->pagi_freecount--;
1647
1648        xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
1649
1650        error = xfs_check_agi_freecount(icur, agi);
1651        if (error)
1652                goto error_icur;
1653        error = xfs_check_agi_freecount(cur, agi);
1654        if (error)
1655                goto error_icur;
1656
1657        xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
1658        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1659        xfs_perag_put(pag);
1660        *inop = ino;
1661        return 0;
1662
1663error_icur:
1664        xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
1665error_cur:
1666        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1667        xfs_perag_put(pag);
1668        return error;
1669}
1670
1671/*
1672 * Allocate an inode on disk.
1673 *
1674 * Mode is used to tell whether the new inode will need space, and whether it
1675 * is a directory.
1676 *
1677 * This function is designed to be called twice if it has to do an allocation
1678 * to make more free inodes.  On the first call, *IO_agbp should be set to NULL.
1679 * If an inode is available without having to perform an allocation, an inode
1680 * number is returned.  In this case, *IO_agbp is set to NULL.  If an allocation
1681 * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
1682 * The caller should then commit the current transaction, allocate a
1683 * new transaction, and call xfs_dialloc() again, passing in the previous value
1684 * of *IO_agbp.  IO_agbp should be held across the transactions. Since the AGI
1685 * buffer is locked across the two calls, the second call is guaranteed to have
1686 * a free inode available.
1687 *
1688 * Once we successfully pick an inode, its number is returned and the on-disk
1689 * data structures are updated.  The inode itself is not read in, since doing so
1690 * would break ordering constraints with xfs_reclaim.
1691 */
1692int
1693xfs_dialloc(
1694        struct xfs_trans        *tp,
1695        xfs_ino_t               parent,
1696        umode_t                 mode,
1697        struct xfs_buf          **IO_agbp,
1698        xfs_ino_t               *inop)
1699{
1700        struct xfs_mount        *mp = tp->t_mountp;
1701        struct xfs_buf          *agbp;
1702        xfs_agnumber_t          agno;
1703        int                     error;
1704        int                     ialloced;
1705        int                     noroom = 0;
1706        xfs_agnumber_t          start_agno;
1707        struct xfs_perag        *pag;
1708        int                     okalloc = 1;
1709
1710        if (*IO_agbp) {
1711                /*
1712                 * If the caller passes in a pointer to the AGI buffer,
1713                 * continue where we left off before.  In this case, we
1714                 * know that the allocation group has free inodes.
1715                 */
1716                agbp = *IO_agbp;
1717                goto out_alloc;
1718        }
1719
1720        /*
1721         * We do not have an agbp, so select an initial allocation
1722         * group for inode allocation.
1723         */
1724        start_agno = xfs_ialloc_ag_select(tp, parent, mode);
1725        if (start_agno == NULLAGNUMBER) {
1726                *inop = NULLFSINO;
1727                return 0;
1728        }
1729
1730        /*
1731         * If we have already hit the ceiling of inode blocks then clear
1732         * okalloc so we scan all available agi structures for a free
1733         * inode.
1734         *
1735         * Read a rough value of mp->m_icount via percpu_counter_read_positive(),
1736         * which sacrifices precision but improves performance.
1737         */
1738        if (mp->m_maxicount &&
1739            percpu_counter_read_positive(&mp->m_icount) + mp->m_ialloc_inos
1740                                                        > mp->m_maxicount) {
1741                noroom = 1;
1742                okalloc = 0;
1743        }
1744
1745        /*
1746         * Loop until we find an allocation group that either has free inodes
1747         * or in which we can allocate some inodes.  Iterate through the
1748         * allocation groups upward, wrapping at the end.
1749         */
1750        agno = start_agno;
1751        for (;;) {
1752                pag = xfs_perag_get(mp, agno);
1753                if (!pag->pagi_inodeok) {
1754                        xfs_ialloc_next_ag(mp);
1755                        goto nextag;
1756                }
1757
1758                if (!pag->pagi_init) {
1759                        error = xfs_ialloc_pagi_init(mp, tp, agno);
1760                        if (error)
1761                                goto out_error;
1762                }
1763
1764                /*
1765                 * Do a first racy fast path check if this AG is usable.
1766                 */
1767                if (!pag->pagi_freecount && !okalloc)
1768                        goto nextag;
1769
1770                /*
1771                 * Then read in the AGI buffer and recheck with the AGI buffer
1772                 * lock held.
1773                 */
1774                error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1775                if (error)
1776                        goto out_error;
1777
1778                if (pag->pagi_freecount) {
1779                        xfs_perag_put(pag);
1780                        goto out_alloc;
1781                }
1782
1783                if (!okalloc)
1784                        goto nextag_relse_buffer;
1785
1787                error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
1788                if (error) {
1789                        xfs_trans_brelse(tp, agbp);
1790
1791                        if (error != -ENOSPC)
1792                                goto out_error;
1793
1794                        xfs_perag_put(pag);
1795                        *inop = NULLFSINO;
1796                        return 0;
1797                }
1798
1799                if (ialloced) {
1800                        /*
1801                         * We successfully allocated some inodes, return
1802                         * the current context to the caller so that it
1803                         * can commit the current transaction and call
1804                         * us again where we left off.
1805                         */
1806                        ASSERT(pag->pagi_freecount > 0);
1807                        xfs_perag_put(pag);
1808
1809                        *IO_agbp = agbp;
1810                        *inop = NULLFSINO;
1811                        return 0;
1812                }
1813
1814nextag_relse_buffer:
1815                xfs_trans_brelse(tp, agbp);
1816nextag:
1817                xfs_perag_put(pag);
1818                if (++agno == mp->m_sb.sb_agcount)
1819                        agno = 0;
1820                if (agno == start_agno) {
1821                        *inop = NULLFSINO;
1822                        return noroom ? -ENOSPC : 0;
1823                }
1824        }
1825
1826out_alloc:
1827        *IO_agbp = NULL;
1828        return xfs_dialloc_ag(tp, agbp, parent, inop);
1829out_error:
1830        xfs_perag_put(pag);
1831        return error;
1832}
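
/*
 * Editor's sketch (not part of the kernel source): the shape of a caller
 * honoring the two-call protocol described above. ex_roll_transaction() is
 * a hypothetical stand-in for the caller's own commit-and-renew logic;
 * real callers (e.g. xfs_dir_ialloc()) manage the transaction themselves.
 */
#if 0
static int
ex_alloc_disk_inode(
        struct xfs_trans        **tpp,
        xfs_ino_t               parent,
        umode_t                 mode,
        xfs_ino_t               *inop)
{
        struct xfs_buf          *agibp = NULL;
        int                     error;

        error = xfs_dialloc(*tpp, parent, mode, &agibp, inop);
        if (error || !agibp)
                return error;   /* done (*inop may be NULLFSINO if full) */

        /*
         * xfs_dialloc() handed back the locked AGI buffer: commit the
         * current transaction, start a new one, and call again with the
         * same agibp so the second call is guaranteed a free inode.
         */
        error = ex_roll_transaction(tpp);       /* hypothetical helper */
        if (error)
                return error;
        return xfs_dialloc(*tpp, parent, mode, &agibp, inop);
}
#endif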
1833
1834/*
1835 * Free the blocks of an inode chunk. We must consider that the inode chunk
1836 * might be sparse and only free the regions that are allocated as part of the
1837 * chunk.
1838 */
1839STATIC void
1840xfs_difree_inode_chunk(
1841        struct xfs_mount                *mp,
1842        xfs_agnumber_t                  agno,
1843        struct xfs_inobt_rec_incore     *rec,
1844        struct xfs_defer_ops            *dfops)
1845{
1846        xfs_agblock_t   sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
1847        int             startidx, endidx;
1848        int             nextbit;
1849        xfs_agblock_t   agbno;
1850        int             contigblk;
1851        struct xfs_owner_info   oinfo;
1852        DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
1853        xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
1854
1855        if (!xfs_inobt_issparse(rec->ir_holemask)) {
1856                /* not sparse, calculate extent info directly */
1857                xfs_bmap_add_free(mp, dfops, XFS_AGB_TO_FSB(mp, agno, sagbno),
1858                                  mp->m_ialloc_blks, &oinfo);
1859                return;
1860        }
1861
1862        /* holemask is only 16 bits (fits in an unsigned long) */
1863        ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
1864        holemask[0] = rec->ir_holemask;
1865
1866        /*
1867         * Find contiguous ranges of zeroes (i.e., allocated regions) in the
1868         * holemask and convert the start/end index of each range to an extent.
1869         * We start with the start and end index both pointing at the first 0 in
1870         * the mask.
1871         */
1872        startidx = endidx = find_first_zero_bit(holemask,
1873                                                XFS_INOBT_HOLEMASK_BITS);
1874        nextbit = startidx + 1;
1875        while (startidx < XFS_INOBT_HOLEMASK_BITS) {
1876                nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
1877                                             nextbit);
1878                /*
1879                 * If the next zero bit is contiguous, update the end index of
1880                 * the current range and continue.
1881                 */
1882                if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
1883                    nextbit == endidx + 1) {
1884                        endidx = nextbit;
1885                        goto next;
1886                }
1887
1888                /*
1889                 * nextbit is not contiguous with the current end index. Convert
1890                 * the current start/end to an extent and add it to the free
1891                 * list.
1892                 */
1893                agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
1894                                  mp->m_sb.sb_inopblock;
1895                contigblk = ((endidx - startidx + 1) *
1896                             XFS_INODES_PER_HOLEMASK_BIT) /
1897                            mp->m_sb.sb_inopblock;
1898
1899                ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
1900                ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
1901                xfs_bmap_add_free(mp, dfops, XFS_AGB_TO_FSB(mp, agno, agbno),
1902                                  contigblk, &oinfo);
1903
1904                /* reset range to current bit and carry on... */
1905                startidx = endidx = nextbit;
1906
1907next:
1908                nextbit++;
1909        }
1910}
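
/*
 * Editor's sketch (not part of the kernel source): a standalone walk of a
 * 16-bit holemask that prints the contiguous zero-bit (allocated) runs,
 * mirroring the loop above. The 16-bit width matches
 * XFS_INOBT_HOLEMASK_BITS; the helper name is an assumption.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static void
ex_print_allocated_runs(
        uint16_t        holemask)
{
        int             startidx = -1;
        int             i;

        for (i = 0; i <= 16; i++) {
                /* treat the position one past the end as a hole */
                int hole = (i == 16) ? 1 : (holemask >> i) & 1;

                if (!hole && startidx < 0) {
                        startidx = i;           /* a zero run begins */
                } else if (hole && startidx >= 0) {
                        printf("bits %d-%d allocated\n", startidx, i - 1);
                        startidx = -1;          /* the run ends */
                }
        }
}

/* e.g. ex_print_allocated_runs(0xff0f) prints "bits 4-7 allocated" */
#endif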
1911
1912STATIC int
1913xfs_difree_inobt(
1914        struct xfs_mount                *mp,
1915        struct xfs_trans                *tp,
1916        struct xfs_buf                  *agbp,
1917        xfs_agino_t                     agino,
1918        struct xfs_defer_ops            *dfops,
1919        struct xfs_icluster             *xic,
1920        struct xfs_inobt_rec_incore     *orec)
1921{
1922        struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
1923        xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
1924        struct xfs_perag                *pag;
1925        struct xfs_btree_cur            *cur;
1926        struct xfs_inobt_rec_incore     rec;
1927        int                             ilen;
1928        int                             error;
1929        int                             i;
1930        int                             off;
1931
1932        ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
1933        ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
1934
1935        /*
1936         * Initialize the cursor.
1937         */
1938        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
1939
1940        error = xfs_check_agi_freecount(cur, agi);
1941        if (error)
1942                goto error0;
1943
1944        /*
1945         * Look for the entry describing this inode.
1946         */
1947        if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1948                xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
1949                        __func__, error);
1950                goto error0;
1951        }
1952        XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1953        error = xfs_inobt_get_rec(cur, &rec, &i);
1954        if (error) {
1955                xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
1956                        __func__, error);
1957                goto error0;
1958        }
1959        XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1960        /*
1961         * Get the offset in the inode chunk.
1962         */
1963        off = agino - rec.ir_startino;
1964        ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
1965        ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
1966        /*
1967         * Mark the inode free & increment the count.
1968         */
1969        rec.ir_free |= XFS_INOBT_MASK(off);
1970        rec.ir_freecount++;
1971
1972        /*
1973         * When an inode chunk is free, it becomes eligible for removal. Don't
1974         * remove the chunk if the block size is large enough for multiple inode
1975         * chunks (that might not be free).
1976         */
1977        if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
1978            rec.ir_free == XFS_INOBT_ALL_FREE &&
1979            mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
1980                xic->deleted = true;
1981                xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
1982                xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
1983
1984                /*
1985                 * Remove the inode cluster from the AGI B+Tree, adjust the
1986                 * AGI and Superblock inode counts, and mark the disk space
1987                 * to be freed when the transaction is committed.
1988                 */
1989                ilen = rec.ir_freecount;
1990                be32_add_cpu(&agi->agi_count, -ilen);
1991                be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1992                xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
1993                pag = xfs_perag_get(mp, agno);
1994                pag->pagi_freecount -= ilen - 1;
1995                pag->pagi_count -= ilen;
1996                xfs_perag_put(pag);
1997                xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1998                xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1999
2000                if ((error = xfs_btree_delete(cur, &i))) {
2001                        xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
2002                                __func__, error);
2003                        goto error0;
2004                }
2005
2006                xfs_difree_inode_chunk(mp, agno, &rec, dfops);
2007        } else {
2008                xic->deleted = false;
2009
2010                error = xfs_inobt_update(cur, &rec);
2011                if (error) {
2012                        xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
2013                                __func__, error);
2014                        goto error0;
2015                }
2016
2017                /*
2018                 * Change the inode free counts and log the ag/sb changes.
2019                 */
2020                be32_add_cpu(&agi->agi_freecount, 1);
2021                xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
2022                pag = xfs_perag_get(mp, agno);
2023                pag->pagi_freecount++;
2024                xfs_perag_put(pag);
2025                xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
2026        }
2027
2028        error = xfs_check_agi_freecount(cur, agi);
2029        if (error)
2030                goto error0;
2031
2032        *orec = rec;
2033        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
2034        return 0;
2035
2036error0:
2037        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
2038        return error;
2039}
2040
2041/*
2042 * Free an inode in the free inode btree.
2043 */
2044STATIC int
2045xfs_difree_finobt(
2046        struct xfs_mount                *mp,
2047        struct xfs_trans                *tp,
2048        struct xfs_buf                  *agbp,
2049        xfs_agino_t                     agino,
2050        struct xfs_inobt_rec_incore     *ibtrec) /* inobt record */
2051{
2052        struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
2053        xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
2054        struct xfs_btree_cur            *cur;
2055        struct xfs_inobt_rec_incore     rec;
2056        int                             offset = agino - ibtrec->ir_startino;
2057        int                             error;
2058        int                             i;
2059
2060        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
2061
2062        error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
2063        if (error)
2064                goto error;
2065        if (i == 0) {
2066                /*
2067                 * If the record does not exist in the finobt, we must have just
2068                 * freed an inode in a previously fully allocated chunk. If not,
2069                 * something is out of sync.
2070                 */
2071                XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
2072
2073                error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
2074                                             ibtrec->ir_count,
2075                                             ibtrec->ir_freecount,
2076                                             ibtrec->ir_free, &i);
2077                if (error)
2078                        goto error;
2079                ASSERT(i == 1);
2080
2081                goto out;
2082        }
2083
2084        /*
2085         * Read and update the existing record. We could just copy the ibtrec
2086         * across here, but that would defeat the purpose of having redundant
2087         * metadata. By making the modifications independently, we can catch
2088         * corruptions that we wouldn't see if we just copied from one record
2089         * to another.
2090         */
2091        error = xfs_inobt_get_rec(cur, &rec, &i);
2092        if (error)
2093                goto error;
2094        XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
2095
2096        rec.ir_free |= XFS_INOBT_MASK(offset);
2097        rec.ir_freecount++;
2098
2099        XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) &&
2100                                (rec.ir_freecount == ibtrec->ir_freecount),
2101                                error);
2102
2103        /*
2104         * The content of inobt records should always match between the inobt
2105         * and finobt. The lifecycle of records in the finobt is different from
2106         * the inobt in that the finobt only tracks records with at least one
2107         * free inode. Hence, if all of the inodes are free and we aren't
2108         * keeping inode chunks permanently on disk, remove the record.
2109         * Otherwise, update the record with the new information.
2110         *
2111         * Note that we currently can't free chunks when the block size is large
2112         * enough for multiple chunks. In that case the finobt record is left in
2113         * place so it stays in sync with the inobt.
2114         */
2115        if (rec.ir_free == XFS_INOBT_ALL_FREE &&
2116            mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK &&
2117            !(mp->m_flags & XFS_MOUNT_IKEEP)) {
2118                error = xfs_btree_delete(cur, &i);
2119                if (error)
2120                        goto error;
2121                ASSERT(i == 1);
2122        } else {
2123                error = xfs_inobt_update(cur, &rec);
2124                if (error)
2125                        goto error;
2126        }
2127
2128out:
2129        error = xfs_check_agi_freecount(cur, agi);
2130        if (error)
2131                goto error;
2132
2133        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
2134        return 0;
2135
2136error:
2137        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
2138        return error;
2139}
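
/*
 * Editor's sketch (not part of the kernel source): the delete-vs-update
 * decision above, reduced to a standalone predicate. The helper name and
 * the hard-coded 64-inode chunk are assumptions for the example.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

#define EX_ALL_FREE     ((uint64_t)-1)  /* XFS_INOBT_ALL_FREE: 64 set bits */

static bool
ex_should_delete_finobt_rec(
        uint64_t        ir_free,        /* free-inode bitmask */
        bool            ikeep,          /* XFS_MOUNT_IKEEP set? */
        unsigned int    inopblock)      /* inodes per fs block */
{
        /* remove only fully-free chunks that span at least one whole block */
        return ir_free == EX_ALL_FREE && !ikeep && inopblock <= 64;
}
#endif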
2140
2141/*
2142 * Free disk inode.  Carefully avoids touching the incore inode, all
2143 * manipulations incore are the caller's responsibility.
2144 * The on-disk inode is not changed by this operation, only the
2145 * btree (free inode mask) is changed.
2146 */
2147int
2148xfs_difree(
2149        struct xfs_trans        *tp,            /* transaction pointer */
2150        xfs_ino_t               inode,          /* inode to be freed */
2151        struct xfs_defer_ops    *dfops,         /* extents to free */
2152        struct xfs_icluster     *xic)   /* cluster info if deleted */
2153{
2154        /* REFERENCED */
2155        xfs_agblock_t           agbno;  /* block number containing inode */
2156        struct xfs_buf          *agbp;  /* buffer for allocation group header */
2157        xfs_agino_t             agino;  /* allocation group inode number */
2158        xfs_agnumber_t          agno;   /* allocation group number */
2159        int                     error;  /* error return value */
2160        struct xfs_mount        *mp;    /* mount structure for filesystem */
2161        struct xfs_inobt_rec_incore rec;/* btree record */
2162
2163        mp = tp->t_mountp;
2164
2165        /*
2166         * Break up inode number into its components.
2167         */
2168        agno = XFS_INO_TO_AGNO(mp, inode);
2169        if (agno >= mp->m_sb.sb_agcount)  {
2170                xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
2171                        __func__, agno, mp->m_sb.sb_agcount);
2172                ASSERT(0);
2173                return -EINVAL;
2174        }
2175        agino = XFS_INO_TO_AGINO(mp, inode);
2176        if (inode != XFS_AGINO_TO_INO(mp, agno, agino))  {
2177                xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
2178                        __func__, (unsigned long long)inode,
2179                        (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
2180                ASSERT(0);
2181                return -EINVAL;
2182        }
2183        agbno = XFS_AGINO_TO_AGBNO(mp, agino);
2184        if (agbno >= mp->m_sb.sb_agblocks)  {
2185                xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
2186                        __func__, agbno, mp->m_sb.sb_agblocks);
2187                ASSERT(0);
2188                return -EINVAL;
2189        }
2190        /*
2191         * Get the allocation group header.
2192         */
2193        error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
2194        if (error) {
2195                xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
2196                        __func__, error);
2197                return error;
2198        }
2199
2200        /*
2201         * Fix up the inode allocation btree.
2202         */
2203        error = xfs_difree_inobt(mp, tp, agbp, agino, dfops, xic, &rec);
2204        if (error)
2205                goto error0;
2206
2207        /*
2208         * Fix up the free inode btree.
2209         */
2210        if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
2211                error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
2212                if (error)
2213                        goto error0;
2214        }
2215
2216        return 0;
2217
2218error0:
2219        return error;
2220}
2221
2222STATIC int
2223xfs_imap_lookup(
2224        struct xfs_mount        *mp,
2225        struct xfs_trans        *tp,
2226        xfs_agnumber_t          agno,
2227        xfs_agino_t             agino,
2228        xfs_agblock_t           agbno,
2229        xfs_agblock_t           *chunk_agbno,
2230        xfs_agblock_t           *offset_agbno,
2231        int                     flags)
2232{
2233        struct xfs_inobt_rec_incore rec;
2234        struct xfs_btree_cur    *cur;
2235        struct xfs_buf          *agbp;
2236        int                     error;
2237        int                     i;
2238
2239        error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
2240        if (error) {
2241                xfs_alert(mp,
2242                        "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
2243                        __func__, error, agno);
2244                return error;
2245        }
2246
2247        /*
2248         * Lookup the inode record for the given agino. If the record cannot be
2249         * found, then it's an invalid inode number and we should abort. Once
2250         * we have a record, we need to ensure it contains the inode number
2251         * we are looking up.
2252         */
2253        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
2254        error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
2255        if (!error) {
2256                if (i)
2257                        error = xfs_inobt_get_rec(cur, &rec, &i);
2258                if (!error && i == 0)
2259                        error = -EINVAL;
2260        }
2261
2262        xfs_trans_brelse(tp, agbp);
2263        xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
2264        if (error)
2265                return error;
2266
2267        /* check that the returned record contains the required inode */
2268        if (rec.ir_startino > agino ||
2269            rec.ir_startino + mp->m_ialloc_inos <= agino)
2270                return -EINVAL;
2271
2272        /* for untrusted inodes check it is allocated first */
2273        if ((flags & XFS_IGET_UNTRUSTED) &&
2274            (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
2275                return -EINVAL;
2276
2277        *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
2278        *offset_agbno = agbno - *chunk_agbno;
2279        return 0;
2280}
2281
2282/*
2283 * Return the location of the inode in imap, for mapping it into a buffer.
2284 */
2285int
2286xfs_imap(
2287        xfs_mount_t      *mp,   /* file system mount structure */
2288        xfs_trans_t      *tp,   /* transaction pointer */
2289        xfs_ino_t       ino,    /* inode to locate */
2290        struct xfs_imap *imap,  /* location map structure */
2291        uint            flags)  /* flags for inode btree lookup */
2292{
2293        xfs_agblock_t   agbno;  /* block number of inode in the alloc group */
2294        xfs_agino_t     agino;  /* inode number within alloc group */
2295        xfs_agnumber_t  agno;   /* allocation group number */
2296        int             blks_per_cluster; /* num blocks per inode cluster */
2297        xfs_agblock_t   chunk_agbno;    /* first block in inode chunk */
2298        xfs_agblock_t   cluster_agbno;  /* first block in inode cluster */
2299        int             error;  /* error code */
2300        int             offset; /* index of inode in its buffer */
2301        xfs_agblock_t   offset_agbno;   /* blks from chunk start to inode */
2302
2303        ASSERT(ino != NULLFSINO);
2304
2305        /*
2306         * Split up the inode number into its parts.
2307         */
2308        agno = XFS_INO_TO_AGNO(mp, ino);
2309        agino = XFS_INO_TO_AGINO(mp, ino);
2310        agbno = XFS_AGINO_TO_AGBNO(mp, agino);
2311        if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
2312            ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
2313#ifdef DEBUG
2314                /*
2315                 * Don't output diagnostic information for untrusted inodes
2316                 * as they can be invalid without implying corruption.
2317                 */
2318                if (flags & XFS_IGET_UNTRUSTED)
2319                        return -EINVAL;
2320                if (agno >= mp->m_sb.sb_agcount) {
2321                        xfs_alert(mp,
2322                                "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
2323                                __func__, agno, mp->m_sb.sb_agcount);
2324                }
2325                if (agbno >= mp->m_sb.sb_agblocks) {
2326                        xfs_alert(mp,
2327                "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
2328                                __func__, (unsigned long long)agbno,
2329                                (unsigned long)mp->m_sb.sb_agblocks);
2330                }
2331                if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
2332                        xfs_alert(mp,
2333                "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
2334                                __func__, ino,
2335                                XFS_AGINO_TO_INO(mp, agno, agino));
2336                }
2337                xfs_stack_trace();
2338#endif /* DEBUG */
2339                return -EINVAL;
2340        }
2341
2342        blks_per_cluster = xfs_icluster_size_fsb(mp);
2343
2344        /*
2345         * For bulkstat and handle lookups, we have an untrusted inode number
2346         * that we have to verify is valid. We cannot do this just by reading
2347         * the inode buffer as it may have been unlinked and removed leaving
2348         * inodes in stale state on disk. Hence we have to do a btree lookup
2349         * in all cases where an untrusted inode number is passed.
2350         */
2351        if (flags & XFS_IGET_UNTRUSTED) {
2352                error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
2353                                        &chunk_agbno, &offset_agbno, flags);
2354                if (error)
2355                        return error;
2356                goto out_map;
2357        }
2358
2359        /*
2360         * If the inode cluster size is the same as the blocksize or
2361         * smaller, we can get to the buffer by simple arithmetic.
2362         */
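        /*
         * Editor's worked example (illustrative, assuming 4k blocks and
         * 512-byte inodes): sb_inopblock == 8 and sb_inodelog == 9, so an
         * inode at offset 5 within its block gets
         * im_boffset = 5 << 9 = 2560 bytes into the buffer.
         */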
2363        if (blks_per_cluster == 1) {
2364                offset = XFS_INO_TO_OFFSET(mp, ino);
2365                ASSERT(offset < mp->m_sb.sb_inopblock);
2366
2367                imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
2368                imap->im_len = XFS_FSB_TO_BB(mp, 1);
2369                imap->im_boffset = (unsigned short)(offset <<
2370                                                        mp->m_sb.sb_inodelog);
2371                return 0;
2372        }
2373
2374        /*
2375         * If the inode chunks are aligned then use simple maths to
2376         * find the location. Otherwise we have to do a btree
2377         * lookup to find the location.
2378         */
2379        if (mp->m_inoalign_mask) {
2380                offset_agbno = agbno & mp->m_inoalign_mask;
2381                chunk_agbno = agbno - offset_agbno;
2382        } else {
2383                error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
2384                                        &chunk_agbno, &offset_agbno, flags);
2385                if (error)
2386                        return error;
2387        }
2388
2389out_map:
2390        ASSERT(agbno >= chunk_agbno);
2391        cluster_agbno = chunk_agbno +
2392                ((offset_agbno / blks_per_cluster) * blks_per_cluster);
2393        offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
2394                XFS_INO_TO_OFFSET(mp, ino);
2395
2396        imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
2397        imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
2398        imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog);
2399
2400        /*
2401         * If the inode number maps to a block outside the bounds
2402         * of the file system then return an error rather than calling
2403         * read_buf and panicking when we get an error from the
2404         * driver.
2405         */
2406        if ((imap->im_blkno + imap->im_len) >
2407            XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
2408                xfs_alert(mp,
2409        "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
2410                        __func__, (unsigned long long) imap->im_blkno,
2411                        (unsigned long long) imap->im_len,
2412                        XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
2413                return -EINVAL;
2414        }
2415        return 0;
2416}
2417
2418/*
2419 * Compute and fill in value of m_in_maxlevels.
2420 */
2421void
2422xfs_ialloc_compute_maxlevels(
2423        xfs_mount_t     *mp)            /* file system mount structure */
2424{
2425        uint            inodes;
2426
2427        inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
2428        mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp->m_inobt_mnr,
2429                                                         inodes);
2430}
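
/*
 * Editor's worked example (illustrative): if XFS_INO_AGINO_BITS(mp) were 32,
 * then with 64-inode chunks (XFS_INODES_PER_CHUNK_LOG == 6) the inobt could
 * hold at most 2^32 / 64 = 2^26 records, and m_in_maxlevels is the btree
 * height needed to index that many records given the per-block minimum
 * record counts in m_inobt_mnr.
 */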
2431
2432/*
2433 * Log specified fields for the ag hdr (inode section). The growth of the agi
2434 * structure over time requires that we interpret the buffer as two logical
2435 * regions delineated by the end of the unlinked list. This is due to the size
2436 * of the hash table and its location in the middle of the agi.
2437 *
2438 * For example, a request to log a field before agi_unlinked and a field after
2439 * agi_unlinked could cause us to log the entire hash table and use an excessive
2440 * amount of log space. To avoid this behavior, log the region up through
2441 * agi_unlinked in one call and the region after agi_unlinked through the end of
2442 * the structure in another.
2443 */
2444void
2445xfs_ialloc_log_agi(
2446        xfs_trans_t     *tp,            /* transaction pointer */
2447        xfs_buf_t       *bp,            /* allocation group header buffer */
2448        int             fields)         /* bitmask of fields to log */
2449{
2450        int                     first;          /* first byte number */
2451        int                     last;           /* last byte number */
2452        static const short      offsets[] = {   /* field starting offsets */
2453                                        /* keep in sync with bit definitions */
2454                offsetof(xfs_agi_t, agi_magicnum),
2455                offsetof(xfs_agi_t, agi_versionnum),
2456                offsetof(xfs_agi_t, agi_seqno),
2457                offsetof(xfs_agi_t, agi_length),
2458                offsetof(xfs_agi_t, agi_count),
2459                offsetof(xfs_agi_t, agi_root),
2460                offsetof(xfs_agi_t, agi_level),
2461                offsetof(xfs_agi_t, agi_freecount),
2462                offsetof(xfs_agi_t, agi_newino),
2463                offsetof(xfs_agi_t, agi_dirino),
2464                offsetof(xfs_agi_t, agi_unlinked),
2465                offsetof(xfs_agi_t, agi_free_root),
2466                offsetof(xfs_agi_t, agi_free_level),
2467                sizeof(xfs_agi_t)
2468        };
2469#ifdef DEBUG
2470        xfs_agi_t               *agi;   /* allocation group header */
2471
2472        agi = XFS_BUF_TO_AGI(bp);
2473        ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
2474#endif
2475
2476        /*
2477         * Compute byte offsets for the first and last fields in the first
2478         * region and log the agi buffer. This only logs up through
2479         * agi_unlinked.
2480         */
2481        if (fields & XFS_AGI_ALL_BITS_R1) {
2482                xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
2483                                  &first, &last);
2484                xfs_trans_log_buf(tp, bp, first, last);
2485        }
2486
2487        /*
2488         * Mask off the bits in the first region and calculate the first and
2489         * last field offsets for any bits in the second region.
2490         */
2491        fields &= ~XFS_AGI_ALL_BITS_R1;
2492        if (fields) {
2493                xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
2494                                  &first, &last);
2495                xfs_trans_log_buf(tp, bp, first, last);
2496        }
2497}
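
/*
 * Editor's illustration (not kernel source): a fields mask spanning
 * agi_unlinked, e.g.
 *
 *      xfs_ialloc_log_agi(tp, bp, XFS_AGI_COUNT | XFS_AGI_FREE_ROOT);
 *
 * is logged above as two byte ranges, one per region, so the unlinked
 * bucket table in the middle of the AGI is never logged wholesale.
 */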
2498
2499static xfs_failaddr_t
2500xfs_agi_verify(
2501        struct xfs_buf  *bp)
2502{
2503        struct xfs_mount *mp = bp->b_target->bt_mount;
2504        struct xfs_agi  *agi = XFS_BUF_TO_AGI(bp);
2505        int             i;
2506
2507        if (xfs_sb_version_hascrc(&mp->m_sb)) {
2508                if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
2509                        return __this_address;
2510                if (!xfs_log_check_lsn(mp,
2511                                be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn)))
2512                        return __this_address;
2513        }
2514
2515        /*
2516         * Validate the magic number of the agi block.
2517         */
2518        if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
2519                return __this_address;
2520        if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
2521                return __this_address;
2522
2523        if (be32_to_cpu(agi->agi_level) < 1 ||
2524            be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS)
2525                return __this_address;
2526
2527        if (xfs_sb_version_hasfinobt(&mp->m_sb) &&
2528            (be32_to_cpu(agi->agi_free_level) < 1 ||
2529             be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS))
2530                return __this_address;
2531
2532        /*
2533         * During growfs operations, the perag is not fully initialised,
2534         * so we can't use it for any useful checking. growfs ensures this by
2535         * using uncached buffers that don't have the perag attached, so we
2536         * can detect and avoid this problem.
2537         */
2538        if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
2539                return __this_address;
2540
2541        for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++) {
2542                if (agi->agi_unlinked[i] == cpu_to_be32(NULLAGINO))
2543                        continue;
2544                if (!xfs_verify_ino(mp, be32_to_cpu(agi->agi_unlinked[i])))
2545                        return __this_address;
2546        }
2547
2548        return NULL;
2549}
2550
2551static void
2552xfs_agi_read_verify(
2553        struct xfs_buf  *bp)
2554{
2555        struct xfs_mount *mp = bp->b_target->bt_mount;
2556        xfs_failaddr_t  fa;
2557
2558        if (xfs_sb_version_hascrc(&mp->m_sb) &&
2559            !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
2560                xfs_verifier_error(bp, -EFSBADCRC, __this_address);
2561        else {
2562                fa = xfs_agi_verify(bp);
2563                if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_IALLOC_READ_AGI))
2564                        xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2565        }
2566}
2567
2568static void
2569xfs_agi_write_verify(
2570        struct xfs_buf  *bp)
2571{
2572        struct xfs_mount        *mp = bp->b_target->bt_mount;
2573        struct xfs_buf_log_item *bip = bp->b_log_item;
2574        xfs_failaddr_t          fa;
2575
2576        fa = xfs_agi_verify(bp);
2577        if (fa) {
2578                xfs_verifier_error(bp, -EFSCORRUPTED, fa);
2579                return;
2580        }
2581
2582        if (!xfs_sb_version_hascrc(&mp->m_sb))
2583                return;
2584
2585        if (bip)
2586                XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
2587        xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
2588}
2589
2590const struct xfs_buf_ops xfs_agi_buf_ops = {
2591        .name = "xfs_agi",
2592        .verify_read = xfs_agi_read_verify,
2593        .verify_write = xfs_agi_write_verify,
2594        .verify_struct = xfs_agi_verify,
2595};
2596
2597/*
2598 * Read in the allocation group header (inode allocation section)
2599 */
2600int
2601xfs_read_agi(
2602        struct xfs_mount        *mp,    /* file system mount structure */
2603        struct xfs_trans        *tp,    /* transaction pointer */
2604        xfs_agnumber_t          agno,   /* allocation group number */
2605        struct xfs_buf          **bpp)  /* allocation group hdr buf */
2606{
2607        int                     error;
2608
2609        trace_xfs_read_agi(mp, agno);
2610
2611        ASSERT(agno != NULLAGNUMBER);
2612        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
2613                        XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
2614                        XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
2615        if (error)
2616                return error;
2617        if (tp)
2618                xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF);
2619
2620        xfs_buf_set_ref(*bpp, XFS_AGI_REF);
2621        return 0;
2622}
2623
2624int
2625xfs_ialloc_read_agi(
2626        struct xfs_mount        *mp,    /* file system mount structure */
2627        struct xfs_trans        *tp,    /* transaction pointer */
2628        xfs_agnumber_t          agno,   /* allocation group number */
2629        struct xfs_buf          **bpp)  /* allocation group hdr buf */
2630{
2631        struct xfs_agi          *agi;   /* allocation group header */
2632        struct xfs_perag        *pag;   /* per allocation group data */
2633        int                     error;
2634
2635        trace_xfs_ialloc_read_agi(mp, agno);
2636
2637        error = xfs_read_agi(mp, tp, agno, bpp);
2638        if (error)
2639                return error;
2640
2641        agi = XFS_BUF_TO_AGI(*bpp);
2642        pag = xfs_perag_get(mp, agno);
2643        if (!pag->pagi_init) {
2644                pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
2645                pag->pagi_count = be32_to_cpu(agi->agi_count);
2646                pag->pagi_init = 1;
2647        }
2648
2649        /*
2650         * It's possible for these to be out of sync if
2651         * we are in the middle of a forced shutdown.
2652         */
2653        ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
2654                XFS_FORCED_SHUTDOWN(mp));
2655        xfs_perag_put(pag);
2656        return 0;
2657}
2658
2659/*
2660 * Read in the agi to initialise the per-ag data in the mount structure
2661 */
2662int
2663xfs_ialloc_pagi_init(
2664        xfs_mount_t     *mp,            /* file system mount structure */
2665        xfs_trans_t     *tp,            /* transaction pointer */
2666        xfs_agnumber_t  agno)           /* allocation group number */
2667{
2668        xfs_buf_t       *bp = NULL;
2669        int             error;
2670
2671        error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
2672        if (error)
2673                return error;
2674        if (bp)
2675                xfs_trans_brelse(tp, bp);
2676        return 0;
2677}
2678
2679/* Is there an inode record covering a given range of inode numbers? */
2680int
2681xfs_ialloc_has_inode_record(
2682        struct xfs_btree_cur    *cur,
2683        xfs_agino_t             low,
2684        xfs_agino_t             high,
2685        bool                    *exists)
2686{
2687        struct xfs_inobt_rec_incore     irec;
2688        xfs_agino_t             agino;
2689        uint16_t                holemask;
2690        int                     has_record;
2691        int                     i;
2692        int                     error;
2693
2694        *exists = false;
2695        error = xfs_inobt_lookup(cur, low, XFS_LOOKUP_LE, &has_record);
2696        while (error == 0 && has_record) {
2697                error = xfs_inobt_get_rec(cur, &irec, &has_record);
2698                if (error || irec.ir_startino > high)
2699                        break;
2700
2701                agino = irec.ir_startino;
2702                holemask = irec.ir_holemask;
2703                for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1,
2704                                i++, agino += XFS_INODES_PER_HOLEMASK_BIT) {
2705                        if (holemask & 1)
2706                                continue;
2707                        if (agino + XFS_INODES_PER_HOLEMASK_BIT > low &&
2708                                        agino <= high) {
2709                                *exists = true;
2710                                return 0;
2711                        }
2712                }
2713
2714                error = xfs_btree_increment(cur, 0, &has_record);
2715        }
2716        return error;
2717}
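
/*
 * Editor's sketch (not part of the kernel source): the range-overlap test
 * applied to each holemask bit above, as a standalone helper. Four inodes
 * per holemask bit matches XFS_INODES_PER_HOLEMASK_BIT for 64-inode chunks
 * with a 16-bit mask; the helper name is an assumption.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

#define EX_INODES_PER_HOLEMASK_BIT      4

/* true if [agino, agino + 4) intersects the query range [low, high] */
static bool
ex_bit_overlaps(
        uint32_t        agino,
        uint32_t        low,
        uint32_t        high)
{
        return agino + EX_INODES_PER_HOLEMASK_BIT > low && agino <= high;
}
#endif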
2718
2719/* Is there an inode record covering a given extent? */
2720int
2721xfs_ialloc_has_inodes_at_extent(
2722        struct xfs_btree_cur    *cur,
2723        xfs_agblock_t           bno,
2724        xfs_extlen_t            len,
2725        bool                    *exists)
2726{
2727        xfs_agino_t             low;
2728        xfs_agino_t             high;
2729
2730        low = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno, 0);
2731        high = XFS_OFFBNO_TO_AGINO(cur->bc_mp, bno + len, 0) - 1;
2732
2733        return xfs_ialloc_has_inode_record(cur, low, high, exists);
2734}
2735
2736struct xfs_ialloc_count_inodes {
2737        xfs_agino_t                     count;
2738        xfs_agino_t                     freecount;
2739};
2740
2741/* Record inode counts across all inobt records. */
2742STATIC int
2743xfs_ialloc_count_inodes_rec(
2744        struct xfs_btree_cur            *cur,
2745        union xfs_btree_rec             *rec,
2746        void                            *priv)
2747{
2748        struct xfs_inobt_rec_incore     irec;
2749        struct xfs_ialloc_count_inodes  *ci = priv;
2750
2751        xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
2752        ci->count += irec.ir_count;
2753        ci->freecount += irec.ir_freecount;
2754
2755        return 0;
2756}
2757
2758/* Count allocated and free inodes under an inobt. */
2759int
2760xfs_ialloc_count_inodes(
2761        struct xfs_btree_cur            *cur,
2762        xfs_agino_t                     *count,
2763        xfs_agino_t                     *freecount)
2764{
2765        struct xfs_ialloc_count_inodes  ci = {0};
2766        int                             error;
2767
2768        ASSERT(cur->bc_btnum == XFS_BTNUM_INO);
2769        error = xfs_btree_query_all(cur, xfs_ialloc_count_inodes_rec, &ci);
2770        if (error)
2771                return error;
2772
2773        *count = ci.count;
2774        *freecount = ci.freecount;
2775        return 0;
2776}
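
/*
 * Editor's sketch (not part of the kernel source): a checker-style use of
 * xfs_ialloc_count_inodes(), comparing the btree totals against the AGI
 * header. The helper name is an assumption; 'cur' must be an inobt
 * (XFS_BTNUM_INO) cursor.
 */
#if 0
static int
ex_verify_agi_counts(
        struct xfs_btree_cur    *cur,
        struct xfs_agi          *agi)
{
        xfs_agino_t             count;
        xfs_agino_t             freecount;
        int                     error;

        error = xfs_ialloc_count_inodes(cur, &count, &freecount);
        if (error)
                return error;
        if (count != be32_to_cpu(agi->agi_count) ||
            freecount != be32_to_cpu(agi->agi_freecount))
                return -EFSCORRUPTED;
        return 0;
}
#endif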
2777