linux/fs/xfs/libxfs/xfs_ialloc.c
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_rtalloc.h"
#include "xfs_error.h"
#include "xfs_bmap.h"
#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_icreate_item.h"
#include "xfs_icache.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_rmap.h"


/*
 * Allocation group level functions.
 */
static inline int
xfs_ialloc_cluster_alignment(
        struct xfs_mount        *mp)
{
        if (xfs_sb_version_hasalign(&mp->m_sb) &&
            mp->m_sb.sb_inoalignmt >=
                        XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size))
                return mp->m_sb.sb_inoalignmt;
        return 1;
}
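
/*
 * A worked sketch of the helper above, with invented geometry: 4k blocks and
 * an 8k inode cluster, so XFS_B_TO_FSBT() yields 2 fsblocks. If the align
 * feature bit is set and sb_inoalignmt == 4, then 4 >= 2 holds and new inode
 * chunks are aligned to 4 fsblocks; otherwise the alignment falls back to 1
 * (no constraint). Hypothetical caller shown below.
 */
#if 0
        args.alignment = xfs_ialloc_cluster_alignment(mp);     /* 4 here */
#endif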

/*
 * Lookup a record by ino in the btree given by cur.
 */
int                                     /* error */
xfs_inobt_lookup(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_agino_t             ino,    /* starting inode of chunk */
        xfs_lookup_t            dir,    /* <=, >=, == */
        int                     *stat)  /* success/failure */
{
        cur->bc_rec.i.ir_startino = ino;
        cur->bc_rec.i.ir_holemask = 0;
        cur->bc_rec.i.ir_count = 0;
        cur->bc_rec.i.ir_freecount = 0;
        cur->bc_rec.i.ir_free = 0;
        return xfs_btree_lookup(cur, dir, stat);
}

/*
 * Update the record referred to by cur to the value given.
 * This either works (return 0) or gets an EFSCORRUPTED error.
 */
STATIC int                              /* error */
xfs_inobt_update(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_inobt_rec_incore_t  *irec)  /* btree record */
{
        union xfs_btree_rec     rec;

        rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
        if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
                rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask);
                rec.inobt.ir_u.sp.ir_count = irec->ir_count;
                rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount;
        } else {
                /* ir_holemask/ir_count not supported on-disk */
                rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount);
        }
        rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
        return xfs_btree_update(cur, &rec);
}

/*
 * Get the data from the pointed-to record.
 */
int                                     /* error */
xfs_inobt_get_rec(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_inobt_rec_incore_t  *irec,  /* btree record */
        int                     *stat)  /* output: success/failure */
{
        union xfs_btree_rec     *rec;
        int                     error;

        error = xfs_btree_get_rec(cur, &rec, stat);
        if (error || *stat == 0)
                return error;

        irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
        if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) {
                irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask);
                irec->ir_count = rec->inobt.ir_u.sp.ir_count;
                irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount;
        } else {
                /*
                 * ir_holemask/ir_count not supported on-disk. Fill in hardcoded
                 * values for full inode chunks.
                 */
                irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL;
                irec->ir_count = XFS_INODES_PER_CHUNK;
                irec->ir_freecount =
                                be32_to_cpu(rec->inobt.ir_u.f.ir_freecount);
        }
        irec->ir_free = be64_to_cpu(rec->inobt.ir_free);

        return 0;
}

/*
 * Insert a single inobt record. Cursor must already point to desired location.
 */
STATIC int
xfs_inobt_insert_rec(
        struct xfs_btree_cur    *cur,
        __uint16_t              holemask,
        __uint8_t               count,
        __int32_t               freecount,
        xfs_inofree_t           free,
        int                     *stat)
{
        cur->bc_rec.i.ir_holemask = holemask;
        cur->bc_rec.i.ir_count = count;
        cur->bc_rec.i.ir_freecount = freecount;
        cur->bc_rec.i.ir_free = free;
        return xfs_btree_insert(cur, stat);
}

/*
 * Insert records describing a newly allocated inode chunk into the inobt.
 */
STATIC int
xfs_inobt_insert(
        struct xfs_mount        *mp,
        struct xfs_trans        *tp,
        struct xfs_buf          *agbp,
        xfs_agino_t             newino,
        xfs_agino_t             newlen,
        xfs_btnum_t             btnum)
{
        struct xfs_btree_cur    *cur;
        struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
        xfs_agnumber_t          agno = be32_to_cpu(agi->agi_seqno);
        xfs_agino_t             thisino;
        int                     i;
        int                     error;

        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);

        for (thisino = newino;
             thisino < newino + newlen;
             thisino += XFS_INODES_PER_CHUNK) {
                error = xfs_inobt_lookup(cur, thisino, XFS_LOOKUP_EQ, &i);
                if (error) {
                        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
                        return error;
                }
                ASSERT(i == 0);

                error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL,
                                             XFS_INODES_PER_CHUNK,
                                             XFS_INODES_PER_CHUNK,
                                             XFS_INOBT_ALL_FREE, &i);
                if (error) {
                        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
                        return error;
                }
                ASSERT(i == 1);
        }

        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);

        return 0;
}
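
/*
 * A hedged example of the insert loop above, with invented numbers: a new
 * allocation of newlen == 128 inodes starting at newino == 256 spans two
 * chunks of XFS_INODES_PER_CHUNK (64) inodes, so two fully free records are
 * inserted, at startino 256 and startino 320, each with ir_count == 64,
 * ir_freecount == 64 and ir_free == XFS_INOBT_ALL_FREE.
 */
#if 0
        error = xfs_inobt_insert(mp, tp, agbp, 256, 128, XFS_BTNUM_INO);
#endif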

/*
 * Verify that the number of free inodes in the AGI is correct.
 */
#ifdef DEBUG
STATIC int
xfs_check_agi_freecount(
        struct xfs_btree_cur    *cur,
        struct xfs_agi          *agi)
{
        if (cur->bc_nlevels == 1) {
                xfs_inobt_rec_incore_t rec;
                int             freecount = 0;
                int             error;
                int             i;

                error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
                if (error)
                        return error;

                do {
                        error = xfs_inobt_get_rec(cur, &rec, &i);
                        if (error)
                                return error;

                        if (i) {
                                freecount += rec.ir_freecount;
                                error = xfs_btree_increment(cur, 0, &i);
                                if (error)
                                        return error;
                        }
                } while (i == 1);

                if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
                        ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
        }
        return 0;
}
#else
#define xfs_check_agi_freecount(cur, agi)       0
#endif

/*
 * Initialise a new set of inodes. When called without a transaction context
 * (e.g. from recovery) we initiate a delayed write of the inode buffers rather
 * than logging them (which in a transaction context puts them into the AIL
 * for writeback rather than the xfsbufd queue).
 */
int
xfs_ialloc_inode_init(
        struct xfs_mount        *mp,
        struct xfs_trans        *tp,
        struct list_head        *buffer_list,
        int                     icount,
        xfs_agnumber_t          agno,
        xfs_agblock_t           agbno,
        xfs_agblock_t           length,
        unsigned int            gen)
{
        struct xfs_buf          *fbuf;
        struct xfs_dinode       *free;
        int                     nbufs, blks_per_cluster, inodes_per_cluster;
        int                     version;
        int                     i, j;
        xfs_daddr_t             d;
        xfs_ino_t               ino = 0;

        /*
         * Loop over the new block(s), filling in the inodes.  For small block
         * sizes, manipulate the inodes in buffers which are multiples of the
         * block size.
         */
        blks_per_cluster = xfs_icluster_size_fsb(mp);
        inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
        nbufs = length / blks_per_cluster;

        /*
         * Figure out what version number to use in the inodes we create.  If
         * the superblock version has caught up to the one that supports the new
         * inode format, then use the new inode version.  Otherwise use the old
         * version so that old kernels will continue to be able to use the file
         * system.
         *
         * For v3 inodes, we also need to write the inode number into the inode,
         * so calculate the first inode number of the chunk here as
         * XFS_OFFBNO_TO_AGINO() only works within a filesystem block, not
         * across multiple filesystem blocks (such as a cluster) and so cannot
         * be used in the cluster buffer loop below.
         *
         * Further, because we are writing the inode directly into the buffer
         * and calculating a CRC on the entire inode, we have to log the entire
         * inode so that the entire range the CRC covers is present in the log.
         * That means for v3 inodes we log the entire buffer rather than just
         * the inode cores.
         */
        if (xfs_sb_version_hascrc(&mp->m_sb)) {
                version = 3;
                ino = XFS_AGINO_TO_INO(mp, agno,
                                       XFS_OFFBNO_TO_AGINO(mp, agbno, 0));

                /*
                 * Log the initialisation that is about to take place as a
                 * logical operation. This means the transaction does not
                 * need to log the physical changes to the inode buffers as log
                 * recovery will know what initialisation is actually needed.
                 * Hence we only need to log the buffers as "ordered" buffers so
                 * they track in the AIL as if they were physically logged.
                 */
                if (tp)
                        xfs_icreate_log(tp, agno, agbno, icount,
                                        mp->m_sb.sb_inodesize, length, gen);
        } else
                version = 2;

        for (j = 0; j < nbufs; j++) {
                /*
                 * Get the block.
                 */
                d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
                fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
                                         mp->m_bsize * blks_per_cluster,
                                         XBF_UNMAPPED);
                if (!fbuf)
                        return -ENOMEM;

                /* Initialize the inode buffers and log them appropriately. */
                fbuf->b_ops = &xfs_inode_buf_ops;
                xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
                for (i = 0; i < inodes_per_cluster; i++) {
                        int     ioffset = i << mp->m_sb.sb_inodelog;
                        uint    isize = xfs_dinode_size(version);

                        free = xfs_make_iptr(mp, fbuf, i);
                        free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
                        free->di_version = version;
                        free->di_gen = cpu_to_be32(gen);
                        free->di_next_unlinked = cpu_to_be32(NULLAGINO);

                        if (version == 3) {
                                free->di_ino = cpu_to_be64(ino);
                                ino++;
                                uuid_copy(&free->di_uuid,
                                          &mp->m_sb.sb_meta_uuid);
                                xfs_dinode_calc_crc(mp, free);
                        } else if (tp) {
                                /* just log the inode core */
                                xfs_trans_log_buf(tp, fbuf, ioffset,
                                                  ioffset + isize - 1);
                        }
                }

                if (tp) {
                        /*
                         * Mark the buffer as an inode allocation buffer so it
                         * sticks in the AIL at the point of this allocation
                         * transaction. This ensures they are on disk before
                         * the tail of the log can be moved past this
                         * transaction (i.e. by preventing relogging from moving
                         * it forward in the log).
                         */
                        xfs_trans_inode_alloc_buf(tp, fbuf);
                        if (version == 3) {
                                /*
                                 * Mark the buffer as ordered so that it is not
                                 * physically logged in the transaction but is
                                 * still tracked in the AIL as part of the
                                 * transaction and pins the log appropriately.
                                 */
                                xfs_trans_ordered_buf(tp, fbuf);
                                xfs_trans_log_buf(tp, fbuf, 0,
                                                  BBTOB(fbuf->b_length) - 1);
                        }
                } else {
                        fbuf->b_flags |= XBF_DONE;
                        xfs_buf_delwri_queue(fbuf, buffer_list);
                        xfs_buf_relse(fbuf);
                }
        }
        return 0;
}
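
/*
 * A worked sketch of the cluster geometry used above, assuming 4k blocks,
 * 512 byte inodes (sb_inopblog == 3, i.e. 8 inodes per block), an 8k inode
 * cluster and a full 64-inode (8 block) chunk; all values are invented for
 * illustration.
 */
#if 0
        blks_per_cluster = xfs_icluster_size_fsb(mp);   /* 8k / 4k == 2 */
        inodes_per_cluster = 2 << 3;                    /* 16 per buffer */
        nbufs = 8 / 2;                                  /* 4 cluster buffers */
#endif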

/*
 * Align startino and allocmask for a recently allocated sparse chunk such that
 * they are fit for insertion (or merge) into the on-disk inode btrees.
 *
 * Background:
 *
 * When enabled, sparse inode support increases the inode alignment from cluster
 * size to inode chunk size. This means that the minimum range between two
 * non-adjacent inode records in the inobt is large enough for a full inode
 * record. This allows for cluster-sized, cluster-aligned block allocation
 * without needing to worry about whether the resulting inode record overlaps with
 * another record in the tree. Without this basic rule, we would have to deal
 * with the consequences of overlap by potentially undoing recent allocations in
 * the inode allocation codepath.
 *
 * Because of this alignment rule (which is enforced on mount), there are two
 * inobt possibilities for newly allocated sparse chunks. One is that the
 * aligned inode record for the chunk covers a range of inodes not already
 * covered in the inobt (i.e., it is safe to insert a new sparse record). The
 * other is that a record already exists at the aligned startino that considers
 * the newly allocated range as sparse. In the latter case, record content is
 * merged in hope that sparse inode chunks fill to full chunks over time.
 */
STATIC void
xfs_align_sparse_ino(
        struct xfs_mount                *mp,
        xfs_agino_t                     *startino,
        uint16_t                        *allocmask)
{
        xfs_agblock_t                   agbno;
        xfs_agblock_t                   mod;
        int                             offset;

        agbno = XFS_AGINO_TO_AGBNO(mp, *startino);
        mod = agbno % mp->m_sb.sb_inoalignmt;
        if (!mod)
                return;

        /* calculate the inode offset and align startino */
        offset = mod << mp->m_sb.sb_inopblog;
        *startino -= offset;

        /*
         * Since startino has been aligned down, left shift allocmask such that
         * it continues to represent the same physical inodes relative to the
         * new startino.
         */
        *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT;
}
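
/*
 * A worked example of the alignment above, with invented values: 512 byte
 * inodes in 4k blocks (sb_inopblog == 3), sb_inoalignmt == 8 and a 32-inode
 * sparse chunk allocated at agbno 12, i.e. startino 96 with the low 8
 * holemask bits covering the allocated inodes.
 */
#if 0
        xfs_agino_t     startino = 96;          /* agbno 12, 8 inodes/block */
        uint16_t        allocmask = 0xff;       /* 32 inodes / 4 per bit */

        xfs_align_sparse_ino(mp, &startino, &allocmask);
        /* 12 % 8 == 4, so offset == 4 << 3 == 32 inodes */
        ASSERT(startino == 64);                 /* aligned down to agbno 8 */
        ASSERT(allocmask == 0xff00);            /* shifted by 32 / 4 == 8 */
#endif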

/*
 * Determine whether the source inode record can merge into the target. Both
 * records must be sparse, the inode ranges must match and there must be no
 * allocation overlap between the records.
 */
STATIC bool
__xfs_inobt_can_merge(
        struct xfs_inobt_rec_incore     *trec,  /* tgt record */
        struct xfs_inobt_rec_incore     *srec)  /* src record */
{
        uint64_t                        talloc;
        uint64_t                        salloc;

        /* records must cover the same inode range */
        if (trec->ir_startino != srec->ir_startino)
                return false;

        /* both records must be sparse */
        if (!xfs_inobt_issparse(trec->ir_holemask) ||
            !xfs_inobt_issparse(srec->ir_holemask))
                return false;

        /* both records must track some inodes */
        if (!trec->ir_count || !srec->ir_count)
                return false;

        /* can't exceed capacity of a full record */
        if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK)
                return false;

        /* verify there is no allocation overlap */
        talloc = xfs_inobt_irec_to_allocmask(trec);
        salloc = xfs_inobt_irec_to_allocmask(srec);
        if (talloc & salloc)
                return false;

        return true;
}
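
/*
 * An illustration of the checks above, with invented records: two sparse
 * records at the same startino covering opposite 32-inode halves of one
 * chunk (holemasks 0xff00 and 0x00ff) have allocmasks of
 * 0x00000000ffffffff and 0xffffffff00000000, so talloc & salloc == 0 and
 * the merge is allowed. Two records covering the same half would produce a
 * non-zero AND and be rejected.
 */
#if 0
        ASSERT(__xfs_inobt_can_merge(&trec, &srec));    /* no overlap */
#endif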

/*
 * Merge the source inode record into the target. The caller must call
 * __xfs_inobt_can_merge() to ensure the merge is valid.
 */
STATIC void
__xfs_inobt_rec_merge(
        struct xfs_inobt_rec_incore     *trec,  /* target */
        struct xfs_inobt_rec_incore     *srec)  /* src */
{
        ASSERT(trec->ir_startino == srec->ir_startino);

        /* combine the counts */
        trec->ir_count += srec->ir_count;
        trec->ir_freecount += srec->ir_freecount;

        /*
         * Merge the holemask and free mask. For both fields, 0 bits refer to
         * allocated inodes. We combine the allocated ranges with bitwise AND.
         */
        trec->ir_holemask &= srec->ir_holemask;
        trec->ir_free &= srec->ir_free;
}
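
/*
 * A worked example of the merge above, continuing the invented records from
 * the __xfs_inobt_can_merge() sketch: both halves freshly allocated, fully
 * free and with ir_free == XFS_INOBT_ALL_FREE.
 */
#if 0
        __xfs_inobt_rec_merge(&trec, &srec);
        ASSERT(trec.ir_holemask == 0);          /* 0xff00 & 0x00ff: full */
        ASSERT(trec.ir_count == 64);            /* 32 + 32 */
        ASSERT(trec.ir_free == XFS_INOBT_ALL_FREE);
#endif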

/*
 * Insert a new sparse inode chunk into the associated inode btree. The inode
 * record for the sparse chunk is pre-aligned to a startino that should match
 * any pre-existing sparse inode record in the tree. This allows sparse chunks
 * to fill over time.
 *
 * This function supports two modes of handling preexisting records depending on
 * the merge flag. If merge is true, the provided record is merged with the
 * existing record and updated in place. The merged record is returned in nrec.
 * If merge is false, an existing record is replaced with the provided record.
 * If no preexisting record exists, the provided record is always inserted.
 *
 * It is considered corruption if a merge is requested and not possible. Given
 * the sparse inode alignment constraints, this should never happen.
 */
STATIC int
xfs_inobt_insert_sprec(
        struct xfs_mount                *mp,
        struct xfs_trans                *tp,
        struct xfs_buf                  *agbp,
        int                             btnum,
        struct xfs_inobt_rec_incore     *nrec,  /* in/out: new/merged rec. */
        bool                            merge)  /* merge or replace */
{
        struct xfs_btree_cur            *cur;
        struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
        xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
        int                             error;
        int                             i;
        struct xfs_inobt_rec_incore     rec;

        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum);

        /* the new record is pre-aligned so we know where to look */
        error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i);
        if (error)
                goto error;
        /* if nothing there, insert a new record and return */
        if (i == 0) {
                error = xfs_inobt_insert_rec(cur, nrec->ir_holemask,
                                             nrec->ir_count, nrec->ir_freecount,
                                             nrec->ir_free, &i);
                if (error)
                        goto error;
                XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);

                goto out;
        }

        /*
         * A record exists at this startino. Merge or replace the record
         * depending on what we've been asked to do.
         */
        if (merge) {
                error = xfs_inobt_get_rec(cur, &rec, &i);
                if (error)
                        goto error;
                XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
                XFS_WANT_CORRUPTED_GOTO(mp,
                                        rec.ir_startino == nrec->ir_startino,
                                        error);

                /*
                 * This should never fail. If we have coexisting records that
                 * cannot merge, something is seriously wrong.
                 */
                XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec),
                                        error);

                trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino,
                                         rec.ir_holemask, nrec->ir_startino,
                                         nrec->ir_holemask);

                /* merge to nrec to output the updated record */
                __xfs_inobt_rec_merge(nrec, &rec);

                trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino,
                                          nrec->ir_holemask);

                error = xfs_inobt_rec_check_count(mp, nrec);
                if (error)
                        goto error;
        }

        error = xfs_inobt_update(cur, nrec);
        if (error)
                goto error;

out:
        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
        return 0;
error:
        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
        return error;
}

/*
 * Allocate new inodes in the allocation group specified by agbp.
 * Return 0 for success, else error code.
 */
STATIC int                              /* error code or 0 */
xfs_ialloc_ag_alloc(
        xfs_trans_t     *tp,            /* transaction pointer */
        xfs_buf_t       *agbp,          /* alloc group buffer */
        int             *alloc)
{
        xfs_agi_t       *agi;           /* allocation group header */
        xfs_alloc_arg_t args;           /* allocation argument structure */
        xfs_agnumber_t  agno;
        int             error;
        xfs_agino_t     newino;         /* new first inode's number */
        xfs_agino_t     newlen;         /* new number of inodes */
        int             isaligned = 0;  /* inode allocation at stripe unit */
                                        /* boundary */
        uint16_t        allocmask = (uint16_t) -1; /* init. to full chunk */
        struct xfs_inobt_rec_incore rec;
        struct xfs_perag *pag;
        int             do_sparse = 0;

        memset(&args, 0, sizeof(args));
        args.tp = tp;
        args.mp = tp->t_mountp;
        args.fsbno = NULLFSBLOCK;
        xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_INODES);

#ifdef DEBUG
        /* randomly do sparse inode allocations */
        if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) &&
            args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks)
                do_sparse = prandom_u32() & 1;
#endif

        /*
         * Locking will ensure that we don't have two callers in here
         * at one time.
         */
        newlen = args.mp->m_ialloc_inos;
        if (args.mp->m_maxicount &&
            percpu_counter_read_positive(&args.mp->m_icount) + newlen >
                                                        args.mp->m_maxicount)
                return -ENOSPC;
        args.minlen = args.maxlen = args.mp->m_ialloc_blks;
        /*
         * First try to allocate inodes contiguous with the last-allocated
         * chunk of inodes.  If the filesystem is striped, this will fill
         * an entire stripe unit with inodes.
         */
        agi = XFS_BUF_TO_AGI(agbp);
        newino = be32_to_cpu(agi->agi_newino);
        agno = be32_to_cpu(agi->agi_seqno);
        args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
                     args.mp->m_ialloc_blks;
        if (do_sparse)
                goto sparse_alloc;
        if (likely(newino != NULLAGINO &&
                  (args.agbno < be32_to_cpu(agi->agi_length)))) {
                args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
                args.type = XFS_ALLOCTYPE_THIS_BNO;
                args.prod = 1;

                /*
                 * We need to take into account alignment here to ensure that
                 * we don't modify the free list if we fail to get an exact
                 * block. If we don't have an exact match, and every other
                 * allocation attempt fails, we'll end up cancelling a dirty
                 * transaction and shutting down.
                 *
                 * For an exact allocation, alignment must be 1,
                 * however we need to take cluster alignment into account when
                 * fixing up the freelist. Use the minalignslop field to
                 * indicate that extra blocks might be required for alignment,
                 * but not to use them in the actual exact allocation.
                 */
                args.alignment = 1;
                args.minalignslop = xfs_ialloc_cluster_alignment(args.mp) - 1;

                /* Allow space for the inode btree to split. */
                args.minleft = args.mp->m_in_maxlevels - 1;
                if ((error = xfs_alloc_vextent(&args)))
                        return error;

                /*
                 * This request might have dirtied the transaction if the AG can
                 * satisfy the request, but the exact block was not available.
                 * If the allocation did fail, subsequent requests will relax
                 * the exact agbno requirement and increase the alignment
                 * instead. It is critical that the total size of the request
                 * (len + alignment + slop) does not increase from this point
                 * on, so reset minalignslop to ensure it is not included in
                 * subsequent requests.
                 */
                args.minalignslop = 0;
        }

        if (unlikely(args.fsbno == NULLFSBLOCK)) {
                /*
                 * Set the alignment for the allocation.
                 * If stripe alignment is turned on then align at stripe unit
                 * boundary.
                 * If the cluster size is smaller than a filesystem block
                 * then we're doing I/O for inodes in filesystem block size
                 * pieces, so don't need alignment anyway.
                 */
                isaligned = 0;
                if (args.mp->m_sinoalign) {
                        ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
                        args.alignment = args.mp->m_dalign;
                        isaligned = 1;
                } else
                        args.alignment = xfs_ialloc_cluster_alignment(args.mp);
                /*
                 * Need to figure out where to allocate the inode blocks.
                 * Ideally they should be spaced out through the a.g.
                 * For now, just allocate blocks up front.
                 */
                args.agbno = be32_to_cpu(agi->agi_root);
                args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
                /*
                 * Allocate a fixed-size extent of inodes.
                 */
                args.type = XFS_ALLOCTYPE_NEAR_BNO;
                args.prod = 1;
                /*
                 * Allow space for the inode btree to split.
                 */
                args.minleft = args.mp->m_in_maxlevels - 1;
                if ((error = xfs_alloc_vextent(&args)))
                        return error;
        }

        /*
         * If stripe alignment is turned on, then try again with cluster
         * alignment.
         */
        if (isaligned && args.fsbno == NULLFSBLOCK) {
                args.type = XFS_ALLOCTYPE_NEAR_BNO;
                args.agbno = be32_to_cpu(agi->agi_root);
                args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
                args.alignment = xfs_ialloc_cluster_alignment(args.mp);
                if ((error = xfs_alloc_vextent(&args)))
                        return error;
        }

        /*
         * Finally, try a sparse allocation if the filesystem supports it and
         * the sparse allocation length is smaller than a full chunk.
         */
        if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) &&
            args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks &&
            args.fsbno == NULLFSBLOCK) {
sparse_alloc:
                args.type = XFS_ALLOCTYPE_NEAR_BNO;
                args.agbno = be32_to_cpu(agi->agi_root);
                args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
                args.alignment = args.mp->m_sb.sb_spino_align;
                args.prod = 1;

                args.minlen = args.mp->m_ialloc_min_blks;
                args.maxlen = args.minlen;

                /*
                 * The inode record will be aligned to full chunk size. We must
                 * prevent sparse allocation from AG boundaries that result in
                 * invalid inode records, such as records that start at agbno 0
                 * or extend beyond the AG.
                 *
                 * Set min agbno to the first aligned, non-zero agbno and max to
                 * the last aligned agbno that is at least one full chunk from
                 * the end of the AG.
                 */
                args.min_agbno = args.mp->m_sb.sb_inoalignmt;
                args.max_agbno = round_down(args.mp->m_sb.sb_agblocks,
                                            args.mp->m_sb.sb_inoalignmt) -
                                 args.mp->m_ialloc_blks;

                error = xfs_alloc_vextent(&args);
                if (error)
                        return error;

                newlen = args.len << args.mp->m_sb.sb_inopblog;
                ASSERT(newlen <= XFS_INODES_PER_CHUNK);
                allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1;
        }

        if (args.fsbno == NULLFSBLOCK) {
                *alloc = 0;
                return 0;
        }
        ASSERT(args.len == args.minlen);

        /*
         * Stamp and write the inode buffers.
         *
         * Seed the new inode cluster with a random generation number. This
         * prevents short-term reuse of generation numbers if a chunk is
         * freed and then immediately reallocated. We use random numbers
         * rather than a linear progression to prevent the next generation
         * number from being easily guessable.
         */
        error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno,
                        args.agbno, args.len, prandom_u32());

        if (error)
                return error;
        /*
         * Convert the results.
         */
        newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);

        if (xfs_inobt_issparse(~allocmask)) {
                /*
                 * We've allocated a sparse chunk. Align the startino and mask.
                 */
                xfs_align_sparse_ino(args.mp, &newino, &allocmask);

                rec.ir_startino = newino;
                rec.ir_holemask = ~allocmask;
                rec.ir_count = newlen;
                rec.ir_freecount = newlen;
                rec.ir_free = XFS_INOBT_ALL_FREE;

                /*
                 * Insert the sparse record into the inobt and allow for a merge
                 * if necessary. If a merge does occur, rec is updated to the
                 * merged record.
                 */
                error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO,
                                               &rec, true);
                if (error == -EFSCORRUPTED) {
                        xfs_alert(args.mp,
        "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u",
                                  XFS_AGINO_TO_INO(args.mp, agno,
                                                   rec.ir_startino),
                                  rec.ir_holemask, rec.ir_count);
                        xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE);
                }
                if (error)
                        return error;

                /*
                 * We can't merge the part we've just allocated as we did for
                 * the inobt, due to finobt semantics. The original record may
                 * or may not exist independent of whether physical inodes
                 * exist in this sparse chunk.
                 *
                 * We must update the finobt record based on the inobt record.
                 * rec contains the fully merged and up-to-date inobt record
                 * from the previous call. Set merge false to replace any
                 * existing record with this one.
                 */
                if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
                        error = xfs_inobt_insert_sprec(args.mp, tp, agbp,
                                                       XFS_BTNUM_FINO, &rec,
                                                       false);
                        if (error)
                                return error;
                }
        } else {
                /* full chunk - insert new records to both btrees */
                error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen,
                                         XFS_BTNUM_INO);
                if (error)
                        return error;

                if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) {
                        error = xfs_inobt_insert(args.mp, tp, agbp, newino,
                                                 newlen, XFS_BTNUM_FINO);
                        if (error)
                                return error;
                }
        }

        /*
         * Update AGI counts and newino.
         */
        be32_add_cpu(&agi->agi_count, newlen);
        be32_add_cpu(&agi->agi_freecount, newlen);
        pag = xfs_perag_get(args.mp, agno);
        pag->pagi_freecount += newlen;
        xfs_perag_put(pag);
        agi->agi_newino = cpu_to_be32(newino);

        /*
         * Log allocation group header fields
         */
        xfs_ialloc_log_agi(tp, agbp,
                XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
        /*
         * Modify/log superblock values for inode count and inode free count.
         */
        xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
        xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
        *alloc = 1;
        return 0;
}
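
/*
 * A worked example of the sparse allocmask arithmetic above, with invented
 * geometry: a 4 block sparse allocation with 8 inodes per block
 * (sb_inopblog == 3) gives newlen == 4 << 3 == 32 inodes, so
 * allocmask == (1 << (32 / XFS_INODES_PER_HOLEMASK_BIT)) - 1 == 0xff and
 * the record's ir_holemask becomes ~0xff == 0xff00, marking the
 * unallocated upper half of the chunk as holes.
 */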

STATIC xfs_agnumber_t
xfs_ialloc_next_ag(
        xfs_mount_t     *mp)
{
        xfs_agnumber_t  agno;

        spin_lock(&mp->m_agirotor_lock);
        agno = mp->m_agirotor;
        if (++mp->m_agirotor >= mp->m_maxagi)
                mp->m_agirotor = 0;
        spin_unlock(&mp->m_agirotor_lock);

        return agno;
}
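
/*
 * A small sketch of the rotor above, assuming m_maxagi == 4: successive
 * calls return AG 0, 1, 2, 3, 0, ... so that new directories are spread
 * round-robin across the allocation groups.
 */
#if 0
        agno = xfs_ialloc_next_ag(mp);          /* 0, then 1, 2, 3, 0, ... */
#endif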

/*
 * Select an allocation group to look for a free inode in, based on the parent
 * inode and the mode.  Return the allocation group number.
 */
STATIC xfs_agnumber_t
xfs_ialloc_ag_select(
        xfs_trans_t     *tp,            /* transaction pointer */
        xfs_ino_t       parent,         /* parent directory inode number */
        umode_t         mode,           /* bits set to indicate file type */
        int             okalloc)        /* ok to allocate more space */
{
        xfs_agnumber_t  agcount;        /* number of ag's in the filesystem */
        xfs_agnumber_t  agno;           /* current ag number */
        int             flags;          /* alloc buffer locking flags */
        xfs_extlen_t    ineed;          /* blocks needed for inode allocation */
        xfs_extlen_t    longest = 0;    /* longest extent available */
        xfs_mount_t     *mp;            /* mount point structure */
        int             needspace;      /* file mode implies space allocated */
        xfs_perag_t     *pag;           /* per allocation group data */
        xfs_agnumber_t  pagno;          /* parent (starting) ag number */
        int             error;

        /*
         * Files of these types need at least one block if length > 0
         * (and they won't fit in the inode, but that's hard to figure out).
         */
        needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
        mp = tp->t_mountp;
        agcount = mp->m_maxagi;
        if (S_ISDIR(mode))
                pagno = xfs_ialloc_next_ag(mp);
        else {
                pagno = XFS_INO_TO_AGNO(mp, parent);
                if (pagno >= agcount)
                        pagno = 0;
        }

        ASSERT(pagno < agcount);

        /*
         * Loop through allocation groups, looking for one with a little
         * free space in it.  Note we don't look for free inodes exactly;
         * instead, when inodes may need to be allocated, we account for
         * the blocks that would have to be allocated to hold them if none
         * are currently free.
         */
        agno = pagno;
        flags = XFS_ALLOC_FLAG_TRYLOCK;
        for (;;) {
                pag = xfs_perag_get(mp, agno);
                if (!pag->pagi_inodeok) {
                        xfs_ialloc_next_ag(mp);
                        goto nextag;
                }

                if (!pag->pagi_init) {
                        error = xfs_ialloc_pagi_init(mp, tp, agno);
                        if (error)
                                goto nextag;
                }

                if (pag->pagi_freecount) {
                        xfs_perag_put(pag);
                        return agno;
                }

                if (!okalloc)
                        goto nextag;

                if (!pag->pagf_init) {
                        error = xfs_alloc_pagf_init(mp, tp, agno, flags);
                        if (error)
                                goto nextag;
                }

                /*
                 * Check that there is enough free space for the file plus a
                 * chunk of inodes if we need to allocate some. If this is the
                 * first pass across the AGs, take into account the potential
                 * space needed for alignment of inode chunks when checking the
                 * longest contiguous free space in the AG - this prevents us
                 * from getting ENOSPC because we have free space larger than
                 * m_ialloc_blks but alignment constraints prevent us from using
                 * it.
                 *
                 * If we can't find an AG with space for full alignment slack to
                 * be taken into account, we must be near ENOSPC in all AGs.
                 * Hence we don't include alignment for the second pass and so
                 * if we fail allocation due to alignment issues then it is most
                 * likely a real ENOSPC condition.
                 */
                ineed = mp->m_ialloc_min_blks;
                if (flags && ineed > 1)
                        ineed += xfs_ialloc_cluster_alignment(mp);
                longest = pag->pagf_longest;
                if (!longest)
                        longest = pag->pagf_flcount > 0;

                if (pag->pagf_freeblks >= needspace + ineed &&
                    longest >= ineed) {
                        xfs_perag_put(pag);
                        return agno;
                }
nextag:
                xfs_perag_put(pag);
                /*
                 * No point in iterating over the rest, if we're shutting
                 * down.
                 */
                if (XFS_FORCED_SHUTDOWN(mp))
                        return NULLAGNUMBER;
                agno++;
                if (agno >= agcount)
                        agno = 0;
                if (agno == pagno) {
                        if (flags == 0)
                                return NULLAGNUMBER;
                        flags = 0;
                }
        }
}
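
/*
 * A worked example of the space check above, with invented numbers:
 * m_ialloc_min_blks == 8 and a cluster alignment of 4. On the first pass
 * (flags == XFS_ALLOC_FLAG_TRYLOCK) ineed == 8 + 4 == 12 blocks, leaving
 * slack for alignment; on the second pass (flags == 0) ineed drops back to
 * 8, so a nearly full AG can still be selected before giving up with
 * NULLAGNUMBER.
 */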

/*
 * Try to retrieve the next record to the left/right from the current one.
 */
STATIC int
xfs_ialloc_next_rec(
        struct xfs_btree_cur    *cur,
        xfs_inobt_rec_incore_t  *rec,
        int                     *done,
        int                     left)
{
        int                     error;
        int                     i;

        if (left)
                error = xfs_btree_decrement(cur, 0, &i);
        else
                error = xfs_btree_increment(cur, 0, &i);

        if (error)
                return error;
        *done = !i;
        if (i) {
                error = xfs_inobt_get_rec(cur, rec, &i);
                if (error)
                        return error;
                XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
        }

        return 0;
}

STATIC int
xfs_ialloc_get_rec(
        struct xfs_btree_cur    *cur,
        xfs_agino_t             agino,
        xfs_inobt_rec_incore_t  *rec,
        int                     *done)
{
        int                     error;
        int                     i;

        error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
        if (error)
                return error;
        *done = !i;
        if (i) {
                error = xfs_inobt_get_rec(cur, rec, &i);
                if (error)
                        return error;
                XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
        }

        return 0;
}

/*
 * Return the offset of the first free inode in the record. If the inode chunk
 * is sparsely allocated, we convert the record holemask to inode granularity
 * and mask off the unallocated regions from the inode free mask.
 */
STATIC int
xfs_inobt_first_free_inode(
        struct xfs_inobt_rec_incore     *rec)
{
        xfs_inofree_t                   realfree;

        /* if there are no holes, return the first available offset */
        if (!xfs_inobt_issparse(rec->ir_holemask))
                return xfs_lowbit64(rec->ir_free);

        realfree = xfs_inobt_irec_to_allocmask(rec);
        realfree &= rec->ir_free;

        return xfs_lowbit64(realfree);
}
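
/*
 * A hedged sketch of the sparse case above, with invented values: a record
 * whose low 32 inodes are holes (holemask 0x00ff) while the free mask still
 * has every bit set. The hole region must be masked off before picking the
 * first free offset.
 */
#if 0
        struct xfs_inobt_rec_incore     rec = {
                .ir_holemask    = 0x00ff,       /* inodes 0-31 are holes */
                .ir_free        = XFS_INOBT_ALL_FREE,
        };

        /* realfree == 0xffffffff00000000, so the first usable offset is 32 */
        ASSERT(xfs_inobt_first_free_inode(&rec) == 32);
#endif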

/*
 * Allocate an inode using the inobt-only algorithm.
 */
STATIC int
xfs_dialloc_ag_inobt(
        struct xfs_trans        *tp,
        struct xfs_buf          *agbp,
        xfs_ino_t               parent,
        xfs_ino_t               *inop)
{
        struct xfs_mount        *mp = tp->t_mountp;
        struct xfs_agi          *agi = XFS_BUF_TO_AGI(agbp);
        xfs_agnumber_t          agno = be32_to_cpu(agi->agi_seqno);
        xfs_agnumber_t          pagno = XFS_INO_TO_AGNO(mp, parent);
        xfs_agino_t             pagino = XFS_INO_TO_AGINO(mp, parent);
        struct xfs_perag        *pag;
        struct xfs_btree_cur    *cur, *tcur;
        struct xfs_inobt_rec_incore rec, trec;
        xfs_ino_t               ino;
        int                     error;
        int                     offset;
        int                     i, j;

        pag = xfs_perag_get(mp, agno);

        ASSERT(pag->pagi_init);
        ASSERT(pag->pagi_inodeok);
        ASSERT(pag->pagi_freecount > 0);

 restart_pagno:
        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
        /*
         * If pagino is 0 (this is the root inode allocation) use newino.
         * This must work because we've just allocated some.
         */
        if (!pagino)
                pagino = be32_to_cpu(agi->agi_newino);

        error = xfs_check_agi_freecount(cur, agi);
        if (error)
                goto error0;

        /*
         * If in the same AG as the parent, try to get near the parent.
         */
        if (pagno == agno) {
                int             doneleft;       /* done, to the left */
                int             doneright;      /* done, to the right */
                int             searchdistance = 10;

                error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
                if (error)
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);

                error = xfs_inobt_get_rec(cur, &rec, &j);
                if (error)
                        goto error0;
                XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0);

                if (rec.ir_freecount > 0) {
                        /*
                         * Found a free inode in the same chunk
                         * as the parent, done.
                         */
                        goto alloc_inode;
                }


                /*
                 * In the same AG as parent, but parent's chunk is full.
                 */

                /* duplicate the cursor, search left & right simultaneously */
                error = xfs_btree_dup_cursor(cur, &tcur);
                if (error)
                        goto error0;

                /*
                 * Skip to last blocks looked up if same parent inode.
                 */
                if (pagino != NULLAGINO &&
                    pag->pagl_pagino == pagino &&
                    pag->pagl_leftrec != NULLAGINO &&
                    pag->pagl_rightrec != NULLAGINO) {
                        error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
                                                   &trec, &doneleft);
                        if (error)
                                goto error1;

                        error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
                                                   &rec, &doneright);
                        if (error)
                                goto error1;
                } else {
                        /* search left with tcur, back up 1 record */
                        error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
                        if (error)
                                goto error1;

                        /* search right with cur, go forward 1 record. */
                        error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
                        if (error)
                                goto error1;
                }

                /*
                 * Loop until we find an inode chunk with a free inode.
                 */
                while (!doneleft || !doneright) {
                        int     useleft;  /* using left inode chunk this time */

                        if (!--searchdistance) {
                                /*
                                 * Not in range - save last search
                                 * location and allocate a new inode
                                 */
                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
                                pag->pagl_leftrec = trec.ir_startino;
                                pag->pagl_rightrec = rec.ir_startino;
                                pag->pagl_pagino = pagino;
                                goto newino;
                        }

                        /* figure out the closer block if both are valid. */
                        if (!doneleft && !doneright) {
                                useleft = pagino -
                                 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
                                  rec.ir_startino - pagino;
                        } else {
                                useleft = !doneleft;
                        }

                        /* free inodes to the left? */
                        if (useleft && trec.ir_freecount) {
                                rec = trec;
                                xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
                                cur = tcur;

                                pag->pagl_leftrec = trec.ir_startino;
                                pag->pagl_rightrec = rec.ir_startino;
                                pag->pagl_pagino = pagino;
                                goto alloc_inode;
                        }

                        /* free inodes to the right? */
                        if (!useleft && rec.ir_freecount) {
                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);

                                pag->pagl_leftrec = trec.ir_startino;
                                pag->pagl_rightrec = rec.ir_startino;
                                pag->pagl_pagino = pagino;
                                goto alloc_inode;
                        }

                        /* get next record to check */
                        if (useleft) {
                                error = xfs_ialloc_next_rec(tcur, &trec,
                                                                 &doneleft, 1);
                        } else {
                                error = xfs_ialloc_next_rec(cur, &rec,
                                                                 &doneright, 0);
                        }
                        if (error)
                                goto error1;
                }

                /*
                 * We've reached the end of the btree. Because we only search
                 * a small chunk of the btree on each pass, there are
                 * obviously free inodes closer to the parent inode than
                 * where we are now. Restart the search.
                 */
1279                pag->pagl_pagino = NULLAGINO;
1280                pag->pagl_leftrec = NULLAGINO;
1281                pag->pagl_rightrec = NULLAGINO;
1282                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
1283                xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1284                goto restart_pagno;
1285        }
1286
1287        /*
1288         * In a different AG from the parent.
1289         * See if the most recently allocated block has any free.
1290         */
1291newino:
1292        if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
1293                error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
1294                                         XFS_LOOKUP_EQ, &i);
1295                if (error)
1296                        goto error0;
1297
1298                if (i == 1) {
1299                        error = xfs_inobt_get_rec(cur, &rec, &j);
1300                        if (error)
1301                                goto error0;
1302
1303                        if (j == 1 && rec.ir_freecount > 0) {
1304                                /*
1305                                 * The last chunk allocated in the group
1306                                 * still has a free inode.
1307                                 */
1308                                goto alloc_inode;
1309                        }
1310                }
1311        }
1312
1313        /*
1314         * None left in the last chunk allocated, so search the whole AG.
1315         */
1316        error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
1317        if (error)
1318                goto error0;
1319        XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1320
1321        for (;;) {
1322                error = xfs_inobt_get_rec(cur, &rec, &i);
1323                if (error)
1324                        goto error0;
1325                XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1326                if (rec.ir_freecount > 0)
1327                        break;
1328                error = xfs_btree_increment(cur, 0, &i);
1329                if (error)
1330                        goto error0;
1331                XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1332        }
1333
1334alloc_inode:
1335        offset = xfs_inobt_first_free_inode(&rec);
1336        ASSERT(offset >= 0);
1337        ASSERT(offset < XFS_INODES_PER_CHUNK);
1338        ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
1339                                   XFS_INODES_PER_CHUNK) == 0);
1340        ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
1341        rec.ir_free &= ~XFS_INOBT_MASK(offset);
1342        rec.ir_freecount--;
1343        error = xfs_inobt_update(cur, &rec);
1344        if (error)
1345                goto error0;
1346        be32_add_cpu(&agi->agi_freecount, -1);
1347        xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1348        pag->pagi_freecount--;
1349
1350        error = xfs_check_agi_freecount(cur, agi);
1351        if (error)
1352                goto error0;
1353
1354        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1355        xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
1356        xfs_perag_put(pag);
1357        *inop = ino;
1358        return 0;
1359error1:
1360        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1361error0:
1362        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1363        xfs_perag_put(pag);
1364        return error;
1365}
1366
1367/*
1368 * Use the free inode btree to allocate an inode based on distance from the
1369 * parent. Note that the provided cursor may be deleted and replaced.
1370 */
1371STATIC int
1372xfs_dialloc_ag_finobt_near(
1373        xfs_agino_t                     pagino,
1374        struct xfs_btree_cur            **ocur,
1375        struct xfs_inobt_rec_incore     *rec)
1376{
1377        struct xfs_btree_cur            *lcur = *ocur;  /* left search cursor */
1378        struct xfs_btree_cur            *rcur;  /* right search cursor */
1379        struct xfs_inobt_rec_incore     rrec;
1380        int                             error;
1381        int                             i, j;
1382
1383        error = xfs_inobt_lookup(lcur, pagino, XFS_LOOKUP_LE, &i);
1384        if (error)
1385                return error;
1386
1387        if (i == 1) {
1388                error = xfs_inobt_get_rec(lcur, rec, &i);
1389                if (error)
1390                        return error;
1391                XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1);
1392
1393                /*
1394                 * See if we've landed in the parent inode record. The finobt
1395                 * only tracks chunks with at least one free inode, so record
1396                 * existence is enough.
1397                 */
1398                if (pagino >= rec->ir_startino &&
1399                    pagino < (rec->ir_startino + XFS_INODES_PER_CHUNK))
1400                        return 0;
1401        }
1402
1403        error = xfs_btree_dup_cursor(lcur, &rcur);
1404        if (error)
1405                return error;
1406
1407        error = xfs_inobt_lookup(rcur, pagino, XFS_LOOKUP_GE, &j);
1408        if (error)
1409                goto error_rcur;
1410        if (j == 1) {
1411                error = xfs_inobt_get_rec(rcur, &rrec, &j);
1412                if (error)
1413                        goto error_rcur;
1414                XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur);
1415        }
1416
1417        XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur);
1418        if (i == 1 && j == 1) {
1419                /*
1420                 * Both the left and right records are valid. Choose the closer
1421                 * inode chunk to the target.
1422                 */
1423                if ((pagino - rec->ir_startino + XFS_INODES_PER_CHUNK - 1) >
1424                    (rrec.ir_startino - pagino)) {
1425                        *rec = rrec;
1426                        xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
1427                        *ocur = rcur;
1428                } else {
1429                        xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
1430                }
1431        } else if (j == 1) {
1432                /* only the right record is valid */
1433                *rec = rrec;
1434                xfs_btree_del_cursor(lcur, XFS_BTREE_NOERROR);
1435                *ocur = rcur;
1436        } else if (i == 1) {
1437                /* only the left record is valid */
1438                xfs_btree_del_cursor(rcur, XFS_BTREE_NOERROR);
1439        }
1440
1441        return 0;
1442
1443error_rcur:
1444        xfs_btree_del_cursor(rcur, XFS_BTREE_ERROR);
1445        return error;
1446}
1447
1448/*
1449 * Use the free inode btree to find a free inode based on a newino hint. If
1450 * the hint is NULLAGINO, find the first free inode in the AG.
1451 */
1452STATIC int
1453xfs_dialloc_ag_finobt_newino(
1454        struct xfs_agi                  *agi,
1455        struct xfs_btree_cur            *cur,
1456        struct xfs_inobt_rec_incore     *rec)
1457{
1458        int error;
1459        int i;
1460
1461        if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
1462                error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
1463                                         XFS_LOOKUP_EQ, &i);
1464                if (error)
1465                        return error;
1466                if (i == 1) {
1467                        error = xfs_inobt_get_rec(cur, rec, &i);
1468                        if (error)
1469                                return error;
1470                        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1471                        return 0;
1472                }
1473        }
1474
1475        /*
1476         * Find the first inode available in the AG.
1477         */
1478        error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
1479        if (error)
1480                return error;
1481        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1482
1483        error = xfs_inobt_get_rec(cur, rec, &i);
1484        if (error)
1485                return error;
1486        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1487
1488        return 0;
1489}
1490
1491/*
1492 * Update the inobt based on a modification made to the finobt. Also ensure that
1493 * the records from both trees are equivalent post-modification.
1494 */
1495STATIC int
1496xfs_dialloc_ag_update_inobt(
1497        struct xfs_btree_cur            *cur,   /* inobt cursor */
1498        struct xfs_inobt_rec_incore     *frec,  /* finobt record */
1499        int                             offset) /* inode offset */
1500{
1501        struct xfs_inobt_rec_incore     rec;
1502        int                             error;
1503        int                             i;
1504
1505        error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i);
1506        if (error)
1507                return error;
1508        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1509
1510        error = xfs_inobt_get_rec(cur, &rec, &i);
1511        if (error)
1512                return error;
1513        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1514        ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) %
1515                                   XFS_INODES_PER_CHUNK) == 0);
1516
1517        rec.ir_free &= ~XFS_INOBT_MASK(offset);
1518        rec.ir_freecount--;
1519
1520        XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) &&
1521                                  (rec.ir_freecount == frec->ir_freecount));
1522
1523        return xfs_inobt_update(cur, &rec);
1524}
1525
1526/*
1527 * Allocate an inode using the free inode btree, if available. Otherwise, fall
1528 * back to the inobt search algorithm.
1529 *
1530 * The caller selected an AG for us, and made sure that free inodes are
1531 * available.
1532 */
1533STATIC int
1534xfs_dialloc_ag(
1535        struct xfs_trans        *tp,
1536        struct xfs_buf          *agbp,
1537        xfs_ino_t               parent,
1538        xfs_ino_t               *inop)
1539{
1540        struct xfs_mount                *mp = tp->t_mountp;
1541        struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
1542        xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
1543        xfs_agnumber_t                  pagno = XFS_INO_TO_AGNO(mp, parent);
1544        xfs_agino_t                     pagino = XFS_INO_TO_AGINO(mp, parent);
1545        struct xfs_perag                *pag;
1546        struct xfs_btree_cur            *cur;   /* finobt cursor */
1547        struct xfs_btree_cur            *icur;  /* inobt cursor */
1548        struct xfs_inobt_rec_incore     rec;
1549        xfs_ino_t                       ino;
1550        int                             error;
1551        int                             offset;
1552        int                             i;
1553
1554        if (!xfs_sb_version_hasfinobt(&mp->m_sb))
1555                return xfs_dialloc_ag_inobt(tp, agbp, parent, inop);
1556
1557        pag = xfs_perag_get(mp, agno);
1558
1559        /*
1560         * If pagino is 0 (this is the root inode allocation), use newino.
1561         * This must work because we've just allocated some.
1562         */
1563        if (!pagino)
1564                pagino = be32_to_cpu(agi->agi_newino);
1565
1566        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
1567
1568        error = xfs_check_agi_freecount(cur, agi);
1569        if (error)
1570                goto error_cur;
1571
1572        /*
1573         * The search algorithm depends on whether we're in the same AG as the
1574         * parent. If so, find the closest available inode to the parent. If
1575         * not, consider the agi hint or find the first free inode in the AG.
1576         */
1577        if (agno == pagno)
1578                error = xfs_dialloc_ag_finobt_near(pagino, &cur, &rec);
1579        else
1580                error = xfs_dialloc_ag_finobt_newino(agi, cur, &rec);
1581        if (error)
1582                goto error_cur;
1583
1584        offset = xfs_inobt_first_free_inode(&rec);
1585        ASSERT(offset >= 0);
1586        ASSERT(offset < XFS_INODES_PER_CHUNK);
1587        ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
1588                                   XFS_INODES_PER_CHUNK) == 0);
1589        ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
1590
1591        /*
1592         * Modify or remove the finobt record.
1593         */
1594        rec.ir_free &= ~XFS_INOBT_MASK(offset);
1595        rec.ir_freecount--;
1596        if (rec.ir_freecount)
1597                error = xfs_inobt_update(cur, &rec);
1598        else
1599                error = xfs_btree_delete(cur, &i);
1600        if (error)
1601                goto error_cur;
1602
1603        /*
1604         * The finobt has now been updated appropriately. We haven't updated the
1605         * agi and superblock yet, so we can create an inobt cursor and validate
1606         * the original freecount. If all is well, make the equivalent update to
1607         * the inobt using the finobt record and offset information.
1608         */
1609        icur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
1610
1611        error = xfs_check_agi_freecount(icur, agi);
1612        if (error)
1613                goto error_icur;
1614
1615        error = xfs_dialloc_ag_update_inobt(icur, &rec, offset);
1616        if (error)
1617                goto error_icur;
1618
1619        /*
1620         * Both trees have now been updated. We must update the perag and
1621         * superblock before we can check the freecount for each btree.
1622         */
1623        be32_add_cpu(&agi->agi_freecount, -1);
1624        xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1625        pag->pagi_freecount--;
1626
1627        xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
1628
1629        error = xfs_check_agi_freecount(icur, agi);
1630        if (error)
1631                goto error_icur;
1632        error = xfs_check_agi_freecount(cur, agi);
1633        if (error)
1634                goto error_icur;
1635
1636        xfs_btree_del_cursor(icur, XFS_BTREE_NOERROR);
1637        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1638        xfs_perag_put(pag);
1639        *inop = ino;
1640        return 0;
1641
1642error_icur:
1643        xfs_btree_del_cursor(icur, XFS_BTREE_ERROR);
1644error_cur:
1645        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1646        xfs_perag_put(pag);
1647        return error;
1648}
1649
1650/*
1651 * Allocate an inode on disk.
1652 *
1653 * Mode is used to tell whether the new inode will need space, and whether it
1654 * is a directory.
1655 *
1656 * This function is designed to be called twice if it has to do an allocation
1657 * to make more free inodes.  On the first call, *IO_agbp should be set to NULL.
1658 * If an inode is available without having to perform an allocation, an inode
1659 * number is returned.  In this case, *IO_agbp is set to NULL.  If an allocation
1660 * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
1661 * The caller should then commit the current transaction, allocate a
1662 * new transaction, and call xfs_dialloc() again, passing in the previous value
1663 * of *IO_agbp.  IO_agbp should be held across the transactions. Since the AGI
1664 * buffer is locked across the two calls, the second call is guaranteed to have
1665 * a free inode available.
1666 *
1667 * Once we successfully pick an inode its number is returned and the on-disk
1668 * data structures are updated.  The inode itself is not read in, since doing so
1669 * would break ordering constraints with xfs_reclaim.
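     *
     * A rough sketch of that two-call pattern from the caller's side,
     * assuming a hypothetical commit_and_roll() helper in place of the
     * real transaction handling done by the actual callers:
     *
     *	struct xfs_buf	*agibp = NULL;
     *	xfs_ino_t	ino;
     *	int		error;
     *
     *	error = xfs_dialloc(tp, parent, mode, okalloc, &agibp, &ino);
     *	if (!error && agibp) {
     *		error = commit_and_roll(&tp);
     *		if (!error)
     *			error = xfs_dialloc(tp, parent, mode, okalloc,
     *					    &agibp, &ino);
     *	}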
1670 */
1671int
1672xfs_dialloc(
1673        struct xfs_trans        *tp,
1674        xfs_ino_t               parent,
1675        umode_t                 mode,
1676        int                     okalloc,
1677        struct xfs_buf          **IO_agbp,
1678        xfs_ino_t               *inop)
1679{
1680        struct xfs_mount        *mp = tp->t_mountp;
1681        struct xfs_buf          *agbp;
1682        xfs_agnumber_t          agno;
1683        int                     error;
1684        int                     ialloced;
1685        int                     noroom = 0;
1686        xfs_agnumber_t          start_agno;
1687        struct xfs_perag        *pag;
1688
1689        if (*IO_agbp) {
1690                /*
1691                 * If the caller passes in a pointer to the AGI buffer,
1692                 * continue where we left off before.  In this case, we
1693                 * know that the allocation group has free inodes.
1694                 */
1695                agbp = *IO_agbp;
1696                goto out_alloc;
1697        }
1698
1699        /*
1700         * We do not have an agbp, so select an initial allocation
1701         * group for inode allocation.
1702         */
1703        start_agno = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
1704        if (start_agno == NULLAGNUMBER) {
1705                *inop = NULLFSINO;
1706                return 0;
1707        }
1708
1709        /*
1710         * If we have already hit the ceiling of inode blocks then clear
1711         * okalloc so we scan all available agi structures for a free
1712         * inode.
1713         *
1714         * Read a rough value of mp->m_icount via percpu_counter_read_positive,
1715         * which sacrifices precision for performance.
1716         */
1717        if (mp->m_maxicount &&
1718            percpu_counter_read_positive(&mp->m_icount) + mp->m_ialloc_inos
1719                                                        > mp->m_maxicount) {
1720                noroom = 1;
1721                okalloc = 0;
1722        }
1723
1724        /*
1725         * Loop until we find an allocation group that either has free inodes
1726         * or in which we can allocate some inodes.  Iterate through the
1727         * allocation groups upward, wrapping at the end.
1728         */
1729        agno = start_agno;
1730        for (;;) {
1731                pag = xfs_perag_get(mp, agno);
1732                if (!pag->pagi_inodeok) {
1733                        xfs_ialloc_next_ag(mp);
1734                        goto nextag;
1735                }
1736
1737                if (!pag->pagi_init) {
1738                        error = xfs_ialloc_pagi_init(mp, tp, agno);
1739                        if (error)
1740                                goto out_error;
1741                }
1742
1743                /*
1744                 * Do a first racy fast path check if this AG is usable.
1745                 */
1746                if (!pag->pagi_freecount && !okalloc)
1747                        goto nextag;
1748
1749                /*
1750                 * Then read in the AGI buffer and recheck with the AGI buffer
1751                 * lock held.
1752                 */
1753                error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1754                if (error)
1755                        goto out_error;
1756
1757                if (pag->pagi_freecount) {
1758                        xfs_perag_put(pag);
1759                        goto out_alloc;
1760                }
1761
1762                if (!okalloc)
1763                        goto nextag_relse_buffer;
1764
1765
1766                error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced);
1767                if (error) {
1768                        xfs_trans_brelse(tp, agbp);
1769
1770                        if (error != -ENOSPC)
1771                                goto out_error;
1772
1773                        xfs_perag_put(pag);
1774                        *inop = NULLFSINO;
1775                        return 0;
1776                }
1777
1778                if (ialloced) {
1779                        /*
1780                         * We successfully allocated some inodes, return
1781                         * the current context to the caller so that it
1782                         * can commit the current transaction and call
1783                         * us again where we left off.
1784                         */
1785                        ASSERT(pag->pagi_freecount > 0);
1786                        xfs_perag_put(pag);
1787
1788                        *IO_agbp = agbp;
1789                        *inop = NULLFSINO;
1790                        return 0;
1791                }
1792
1793nextag_relse_buffer:
1794                xfs_trans_brelse(tp, agbp);
1795nextag:
1796                xfs_perag_put(pag);
1797                if (++agno == mp->m_sb.sb_agcount)
1798                        agno = 0;
1799                if (agno == start_agno) {
1800                        *inop = NULLFSINO;
1801                        return noroom ? -ENOSPC : 0;
1802                }
1803        }
1804
1805out_alloc:
1806        *IO_agbp = NULL;
1807        return xfs_dialloc_ag(tp, agbp, parent, inop);
1808out_error:
1809        xfs_perag_put(pag);
1810        return error;
1811}
1812
1813/*
1814 * Free the blocks of an inode chunk. We must consider that the inode chunk
1815 * might be sparse and only free the regions that are allocated as part of the
1816 * chunk.
1817 */
1818STATIC void
1819xfs_difree_inode_chunk(
1820        struct xfs_mount                *mp,
1821        xfs_agnumber_t                  agno,
1822        struct xfs_inobt_rec_incore     *rec,
1823        struct xfs_defer_ops            *dfops)
1824{
1825        xfs_agblock_t   sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino);
1826        int             startidx, endidx;
1827        int             nextbit;
1828        xfs_agblock_t   agbno;
1829        int             contigblk;
1830        struct xfs_owner_info   oinfo;
1831        DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS);
1832        xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INODES);
1833
1834        if (!xfs_inobt_issparse(rec->ir_holemask)) {
1835                /* not sparse, calculate extent info directly */
1836                xfs_bmap_add_free(mp, dfops, XFS_AGB_TO_FSB(mp, agno, sagbno),
1837                                  mp->m_ialloc_blks, &oinfo);
1838                return;
1839        }
1840
1841        /* the holemask is only 16 bits (fits in an unsigned long) */
1842        ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0]));
1843        holemask[0] = rec->ir_holemask;
1844
1845        /*
1846         * Find contiguous ranges of zeroes (i.e., allocated regions) in the
1847         * holemask and convert the start/end index of each range to an extent.
1848         * We start with the start and end index both pointing at the first 0 in
1849         * the mask.
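             *
             * For example (sketch): a holemask of 0xff00 marks the upper
             * half of the chunk as a hole, leaving bits 0-7 zero. Those
             * zero bits describe one allocated region of
             * 8 * XFS_INODES_PER_HOLEMASK_BIT inodes starting at
             * ir_startino; with, say, 8 inodes per block that region
             * spans 4 blocks.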
1850         */
1851        startidx = endidx = find_first_zero_bit(holemask,
1852                                                XFS_INOBT_HOLEMASK_BITS);
1853        nextbit = startidx + 1;
1854        while (startidx < XFS_INOBT_HOLEMASK_BITS) {
1855                nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS,
1856                                             nextbit);
1857                /*
1858                 * If the next zero bit is contiguous, update the end index of
1859                 * the current range and continue.
1860                 */
1861                if (nextbit != XFS_INOBT_HOLEMASK_BITS &&
1862                    nextbit == endidx + 1) {
1863                        endidx = nextbit;
1864                        goto next;
1865                }
1866
1867                /*
1868                 * nextbit is not contiguous with the current end index. Convert
1869                 * the current start/end to an extent and add it to the free
1870                 * list.
1871                 */
1872                agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) /
1873                                  mp->m_sb.sb_inopblock;
1874                contigblk = ((endidx - startidx + 1) *
1875                             XFS_INODES_PER_HOLEMASK_BIT) /
1876                            mp->m_sb.sb_inopblock;
1877
1878                ASSERT(agbno % mp->m_sb.sb_spino_align == 0);
1879                ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
1880                xfs_bmap_add_free(mp, dfops, XFS_AGB_TO_FSB(mp, agno, agbno),
1881                                  contigblk, &oinfo);
1882
1883                /* reset range to current bit and carry on... */
1884                startidx = endidx = nextbit;
1885
1886next:
1887                nextbit++;
1888        }
1889}
1890
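    /*
     * Free an inode in the inode btree: clear its bit in the record that
     * covers it. If that leaves the whole chunk free and we aren't keeping
     * empty inode chunks on disk, delete the record and free the chunk's
     * blocks; otherwise write back the updated record. The resulting record
     * is returned in *orec so the caller can bring the finobt in sync.
     */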
1891STATIC int
1892xfs_difree_inobt(
1893        struct xfs_mount                *mp,
1894        struct xfs_trans                *tp,
1895        struct xfs_buf                  *agbp,
1896        xfs_agino_t                     agino,
1897        struct xfs_defer_ops            *dfops,
1898        struct xfs_icluster             *xic,
1899        struct xfs_inobt_rec_incore     *orec)
1900{
1901        struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
1902        xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
1903        struct xfs_perag                *pag;
1904        struct xfs_btree_cur            *cur;
1905        struct xfs_inobt_rec_incore     rec;
1906        int                             ilen;
1907        int                             error;
1908        int                             i;
1909        int                             off;
1910
1911        ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
1912        ASSERT(XFS_AGINO_TO_AGBNO(mp, agino) < be32_to_cpu(agi->agi_length));
1913
1914        /*
1915         * Initialize the cursor.
1916         */
1917        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
1918
1919        error = xfs_check_agi_freecount(cur, agi);
1920        if (error)
1921                goto error0;
1922
1923        /*
1924         * Look for the entry describing this inode.
1925         */
1926        if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1927                xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
1928                        __func__, error);
1929                goto error0;
1930        }
1931        XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1932        error = xfs_inobt_get_rec(cur, &rec, &i);
1933        if (error) {
1934                xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
1935                        __func__, error);
1936                goto error0;
1937        }
1938        XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0);
1939        /*
1940         * Get the offset in the inode chunk.
1941         */
1942        off = agino - rec.ir_startino;
1943        ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
1944        ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
1945        /*
1946         * Mark the inode free & increment the count.
1947         */
1948        rec.ir_free |= XFS_INOBT_MASK(off);
1949        rec.ir_freecount++;
1950
1951        /*
1952         * When an inode chunk is free, it becomes eligible for removal. Don't
1953         * remove the chunk if the block size is large enough for multiple inode
1954         * chunks (that might not be free).
1955         */
1956        if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
1957            rec.ir_free == XFS_INOBT_ALL_FREE &&
1958            mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) {
1959                xic->deleted = 1;
1960                xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
1961                xic->alloc = xfs_inobt_irec_to_allocmask(&rec);
1962
1963                /*
1964                 * Remove the inode cluster from the AGI B+Tree, adjust the
1965                 * AGI and Superblock inode counts, and mark the disk space
1966                 * to be freed when the transaction is committed.
1967                 */
1968                ilen = rec.ir_freecount;
1969                be32_add_cpu(&agi->agi_count, -ilen);
1970                be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1971                xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
1972                pag = xfs_perag_get(mp, agno);
1973                pag->pagi_freecount -= ilen - 1;
1974                xfs_perag_put(pag);
1975                xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1976                xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1977
1978                if ((error = xfs_btree_delete(cur, &i))) {
1979                        xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
1980                                __func__, error);
1981                        goto error0;
1982                }
1983
1984                xfs_difree_inode_chunk(mp, agno, &rec, dfops);
1985        } else {
1986                xic->deleted = 0;
1987
1988                error = xfs_inobt_update(cur, &rec);
1989                if (error) {
1990                        xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
1991                                __func__, error);
1992                        goto error0;
1993                }
1994
1995                /*
1996                 * Change the inode free counts and log the ag/sb changes.
1997                 */
1998                be32_add_cpu(&agi->agi_freecount, 1);
1999                xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
2000                pag = xfs_perag_get(mp, agno);
2001                pag->pagi_freecount++;
2002                xfs_perag_put(pag);
2003                xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
2004        }
2005
2006        error = xfs_check_agi_freecount(cur, agi);
2007        if (error)
2008                goto error0;
2009
2010        *orec = rec;
2011        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
2012        return 0;
2013
2014error0:
2015        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
2016        return error;
2017}
2018
2019/*
2020 * Free an inode in the free inode btree.
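 * If the chunk was previously fully allocated there is no finobt record
 * yet, so insert one; otherwise update the existing record, removing it
 * entirely once every inode in the chunk is free and the chunk itself
 * can be removed.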
2021 */
2022STATIC int
2023xfs_difree_finobt(
2024        struct xfs_mount                *mp,
2025        struct xfs_trans                *tp,
2026        struct xfs_buf                  *agbp,
2027        xfs_agino_t                     agino,
2028        struct xfs_inobt_rec_incore     *ibtrec) /* inobt record */
2029{
2030        struct xfs_agi                  *agi = XFS_BUF_TO_AGI(agbp);
2031        xfs_agnumber_t                  agno = be32_to_cpu(agi->agi_seqno);
2032        struct xfs_btree_cur            *cur;
2033        struct xfs_inobt_rec_incore     rec;
2034        int                             offset = agino - ibtrec->ir_startino;
2035        int                             error;
2036        int                             i;
2037
2038        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_FINO);
2039
2040        error = xfs_inobt_lookup(cur, ibtrec->ir_startino, XFS_LOOKUP_EQ, &i);
2041        if (error)
2042                goto error;
2043        if (i == 0) {
2044                /*
2045                 * If the record does not exist in the finobt, we must have just
2046                 * freed an inode in a previously fully allocated chunk. If not,
2047                 * something is out of sync.
2048                 */
2049                XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error);
2050
2051                error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask,
2052                                             ibtrec->ir_count,
2053                                             ibtrec->ir_freecount,
2054                                             ibtrec->ir_free, &i);
2055                if (error)
2056                        goto error;
2057                ASSERT(i == 1);
2058
2059                goto out;
2060        }
2061
2062        /*
2063         * Read and update the existing record. We could just copy the ibtrec
2064         * across here, but that would defeat the purpose of having redundant
2065         * metadata. By making the modifications independently, we can catch
2066         * corruptions that we wouldn't see if we just copied from one record
2067         * to another.
2068         */
2069        error = xfs_inobt_get_rec(cur, &rec, &i);
2070        if (error)
2071                goto error;
2072        XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error);
2073
2074        rec.ir_free |= XFS_INOBT_MASK(offset);
2075        rec.ir_freecount++;
2076
2077        XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) &&
2078                                (rec.ir_freecount == ibtrec->ir_freecount),
2079                                error);
2080
2081        /*
2082         * The content of inobt records should always match between the inobt
2083         * and finobt. The lifecycle of records in the finobt is different from
2084         * the inobt in that the finobt only tracks records with at least one
2085         * free inode. Hence, if all of the inodes are free and we aren't
2086         * keeping inode chunks permanently on disk, remove the record.
2087         * Otherwise, update the record with the new information.
2088         *
2089         * Note that we currently can't free chunks when the block size is large
2090         * enough for multiple chunks; in that case, leave the finobt record in
2091         * place so it remains in sync with the inobt.
2092         */
2093        if (rec.ir_free == XFS_INOBT_ALL_FREE &&
2094            mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK &&
2095            !(mp->m_flags & XFS_MOUNT_IKEEP)) {
2096                error = xfs_btree_delete(cur, &i);
2097                if (error)
2098                        goto error;
2099                ASSERT(i == 1);
2100        } else {
2101                error = xfs_inobt_update(cur, &rec);
2102                if (error)
2103                        goto error;
2104        }
2105
2106out:
2107        error = xfs_check_agi_freecount(cur, agi);
2108        if (error)
2109                goto error;
2110
2111        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
2112        return 0;
2113
2114error:
2115        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
2116        return error;
2117}
2118
2119/*
2120 * Free a disk inode.  This carefully avoids touching the incore inode;
2121 * all incore manipulations are the caller's responsibility.
2122 * The on-disk inode is not changed by this operation, only the
2123 * btree (free inode mask) is changed.
2124 */
2125int
2126xfs_difree(
2127        struct xfs_trans        *tp,            /* transaction pointer */
2128        xfs_ino_t               inode,          /* inode to be freed */
2129        struct xfs_defer_ops    *dfops,         /* extents to free */
2130        struct xfs_icluster     *xic)   /* cluster info if deleted */
2131{
2132        /* REFERENCED */
2133        xfs_agblock_t           agbno;  /* block number containing inode */
2134        struct xfs_buf          *agbp;  /* buffer for allocation group header */
2135        xfs_agino_t             agino;  /* allocation group inode number */
2136        xfs_agnumber_t          agno;   /* allocation group number */
2137        int                     error;  /* error return value */
2138        struct xfs_mount        *mp;    /* mount structure for filesystem */
2139        struct xfs_inobt_rec_incore rec;/* btree record */
2140
2141        mp = tp->t_mountp;
2142
2143        /*
2144         * Break up inode number into its components.
2145         */
2146        agno = XFS_INO_TO_AGNO(mp, inode);
2147        if (agno >= mp->m_sb.sb_agcount)  {
2148                xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
2149                        __func__, agno, mp->m_sb.sb_agcount);
2150                ASSERT(0);
2151                return -EINVAL;
2152        }
2153        agino = XFS_INO_TO_AGINO(mp, inode);
2154        if (inode != XFS_AGINO_TO_INO(mp, agno, agino))  {
2155                xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
2156                        __func__, (unsigned long long)inode,
2157                        (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
2158                ASSERT(0);
2159                return -EINVAL;
2160        }
2161        agbno = XFS_AGINO_TO_AGBNO(mp, agino);
2162        if (agbno >= mp->m_sb.sb_agblocks)  {
2163                xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
2164                        __func__, agbno, mp->m_sb.sb_agblocks);
2165                ASSERT(0);
2166                return -EINVAL;
2167        }
2168        /*
2169         * Get the allocation group header.
2170         */
2171        error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
2172        if (error) {
2173                xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
2174                        __func__, error);
2175                return error;
2176        }
2177
2178        /*
2179         * Fix up the inode allocation btree.
2180         */
2181        error = xfs_difree_inobt(mp, tp, agbp, agino, dfops, xic, &rec);
2182        if (error)
2183                goto error0;
2184
2185        /*
2186         * Fix up the free inode btree.
2187         */
2188        if (xfs_sb_version_hasfinobt(&mp->m_sb)) {
2189                error = xfs_difree_finobt(mp, tp, agbp, agino, &rec);
2190                if (error)
2191                        goto error0;
2192        }
2193
2194        return 0;
2195
2196error0:
2197        return error;
2198}
2199
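    /*
     * Look up the inobt record covering @agino to derive the first block of
     * its inode chunk and the inode's block offset within that chunk. This
     * is used for untrusted inode numbers and whenever the location can't
     * be computed from alignment arithmetic alone.
     */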
2200STATIC int
2201xfs_imap_lookup(
2202        struct xfs_mount        *mp,
2203        struct xfs_trans        *tp,
2204        xfs_agnumber_t          agno,
2205        xfs_agino_t             agino,
2206        xfs_agblock_t           agbno,
2207        xfs_agblock_t           *chunk_agbno,
2208        xfs_agblock_t           *offset_agbno,
2209        int                     flags)
2210{
2211        struct xfs_inobt_rec_incore rec;
2212        struct xfs_btree_cur    *cur;
2213        struct xfs_buf          *agbp;
2214        int                     error;
2215        int                     i;
2216
2217        error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
2218        if (error) {
2219                xfs_alert(mp,
2220                        "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
2221                        __func__, error, agno);
2222                return error;
2223        }
2224
2225        /*
2226         * Lookup the inode record for the given agino. If the record cannot be
2227         * found, then it's an invalid inode number and we should abort. Once
2228         * we have a record, we need to ensure it contains the inode number
2229         * we are looking up.
2230         */
2231        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO);
2232        error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
2233        if (!error) {
2234                if (i)
2235                        error = xfs_inobt_get_rec(cur, &rec, &i);
2236                if (!error && i == 0)
2237                        error = -EINVAL;
2238        }
2239
2240        xfs_trans_brelse(tp, agbp);
2241        xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
2242        if (error)
2243                return error;
2244
2245        /* check that the returned record contains the required inode */
2246        if (rec.ir_startino > agino ||
2247            rec.ir_startino + mp->m_ialloc_inos <= agino)
2248                return -EINVAL;
2249
2250        /* for untrusted inodes check it is allocated first */
2251        if ((flags & XFS_IGET_UNTRUSTED) &&
2252            (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
2253                return -EINVAL;
2254
2255        *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
2256        *offset_agbno = agbno - *chunk_agbno;
2257        return 0;
2258}
2259
2260/*
2261 * Return the location of the inode in imap, for mapping it into a buffer.
2262 */
2263int
2264xfs_imap(
2265        xfs_mount_t      *mp,   /* file system mount structure */
2266        xfs_trans_t      *tp,   /* transaction pointer */
2267        xfs_ino_t       ino,    /* inode to locate */
2268        struct xfs_imap *imap,  /* location map structure */
2269        uint            flags)  /* flags for inode btree lookup */
2270{
2271        xfs_agblock_t   agbno;  /* block number of inode in the alloc group */
2272        xfs_agino_t     agino;  /* inode number within alloc group */
2273        xfs_agnumber_t  agno;   /* allocation group number */
2274        int             blks_per_cluster; /* num blocks per inode cluster */
2275        xfs_agblock_t   chunk_agbno;    /* first block in inode chunk */
2276        xfs_agblock_t   cluster_agbno;  /* first block in inode cluster */
2277        int             error;  /* error code */
2278        int             offset; /* index of inode in its buffer */
2279        xfs_agblock_t   offset_agbno;   /* blks from chunk start to inode */
2280
2281        ASSERT(ino != NULLFSINO);
2282
2283        /*
2284         * Split up the inode number into its parts.
2285         */
2286        agno = XFS_INO_TO_AGNO(mp, ino);
2287        agino = XFS_INO_TO_AGINO(mp, ino);
2288        agbno = XFS_AGINO_TO_AGBNO(mp, agino);
2289        if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
2290            ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
2291#ifdef DEBUG
2292                /*
2293                 * Don't output diagnostic information for untrusted inodes
2294                 * as they can be invalid without implying corruption.
2295                 */
2296                if (flags & XFS_IGET_UNTRUSTED)
2297                        return -EINVAL;
2298                if (agno >= mp->m_sb.sb_agcount) {
2299                        xfs_alert(mp,
2300                                "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
2301                                __func__, agno, mp->m_sb.sb_agcount);
2302                }
2303                if (agbno >= mp->m_sb.sb_agblocks) {
2304                        xfs_alert(mp,
2305                "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
2306                                __func__, (unsigned long long)agbno,
2307                                (unsigned long)mp->m_sb.sb_agblocks);
2308                }
2309                if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
2310                        xfs_alert(mp,
2311                "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
2312                                __func__, ino,
2313                                XFS_AGINO_TO_INO(mp, agno, agino));
2314                }
2315                xfs_stack_trace();
2316#endif /* DEBUG */
2317                return -EINVAL;
2318        }
2319
2320        blks_per_cluster = xfs_icluster_size_fsb(mp);
2321
2322        /*
2323         * For bulkstat and handle lookups, we have an untrusted inode number
2324         * that we have to verify is valid. We cannot do this just by reading
2325         * the inode buffer as it may have been unlinked and removed leaving
2326         * inodes in stale state on disk. Hence we have to do a btree lookup
2327         * in all cases where an untrusted inode number is passed.
2328         */
2329        if (flags & XFS_IGET_UNTRUSTED) {
2330                error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
2331                                        &chunk_agbno, &offset_agbno, flags);
2332                if (error)
2333                        return error;
2334                goto out_map;
2335        }
2336
2337        /*
2338         * If the inode cluster size is the same as the blocksize or
2339         * smaller, we can get to the buffer with simple arithmetic.
2340         */
2341        if (blks_per_cluster == 1) {
2342                offset = XFS_INO_TO_OFFSET(mp, ino);
2343                ASSERT(offset < mp->m_sb.sb_inopblock);
2344
2345                imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
2346                imap->im_len = XFS_FSB_TO_BB(mp, 1);
2347                imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
2348                return 0;
2349        }
2350
2351        /*
2352         * If the inode chunks are aligned then use simple arithmetic to
2353         * find the location. Otherwise we have to do a btree
2354         * lookup to find the location.
2355         */
2356        if (mp->m_inoalign_mask) {
2357                offset_agbno = agbno & mp->m_inoalign_mask;
2358                chunk_agbno = agbno - offset_agbno;
2359        } else {
2360                error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
2361                                        &chunk_agbno, &offset_agbno, flags);
2362                if (error)
2363                        return error;
2364        }
2365
2366out_map:
2367        ASSERT(agbno >= chunk_agbno);
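            /*
             * Round the chunk-relative block offset down to the start of
             * the inode's cluster, then compute the inode's index within
             * that cluster's buffer.
             */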
2368        cluster_agbno = chunk_agbno +
2369                ((offset_agbno / blks_per_cluster) * blks_per_cluster);
2370        offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
2371                XFS_INO_TO_OFFSET(mp, ino);
2372
2373        imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
2374        imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
2375        imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
2376
2377        /*
2378         * If the inode number maps to a block outside the bounds
2379         * of the file system then return an error rather than calling
2380         * read_buf and panicking when we get an error from the
2381         * driver.
2382         */
2383        if ((imap->im_blkno + imap->im_len) >
2384            XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
2385                xfs_alert(mp,
2386        "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
2387                        __func__, (unsigned long long) imap->im_blkno,
2388                        (unsigned long long) imap->im_len,
2389                        XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
2390                return -EINVAL;
2391        }
2392        return 0;
2393}
2394
2395/*
2396 * Compute and fill in value of m_in_maxlevels.
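     * The worst case is one inobt record per inode chunk, so the tree is
     * sized for (2^(AG inode bits) / XFS_INODES_PER_CHUNK) records.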
2397 */
2398void
2399xfs_ialloc_compute_maxlevels(
2400        xfs_mount_t     *mp)            /* file system mount structure */
2401{
2402        uint            inodes;
2403
2404        inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG;
2405        mp->m_in_maxlevels = xfs_btree_compute_maxlevels(mp, mp->m_inobt_mnr,
2406                                                         inodes);
2407}
2408
2409/*
2410 * Log specified fields for the ag hdr (inode section). The growth of the agi
2411 * structure over time requires that we interpret the buffer as two logical
2412 * regions delineated by the end of the unlinked list. This is due to the size
2413 * of the hash table and its location in the middle of the agi.
2414 *
2415 * For example, a request to log a field before agi_unlinked and a field after
2416 * agi_unlinked could cause us to log the entire hash table and use an excessive
2417 * amount of log space. To avoid this behavior, log the region up through
2418 * agi_unlinked in one call and the region after agi_unlinked through the end of
2419 * the structure in another.
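     *
     * For example, logging XFS_AGI_FREECOUNT (first region) together with
     * XFS_AGI_FREE_ROOT (second region) results in two xfs_trans_log_buf()
     * calls instead of one range spanning the unlinked hash table.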
2420 */
2421void
2422xfs_ialloc_log_agi(
2423        xfs_trans_t     *tp,            /* transaction pointer */
2424        xfs_buf_t       *bp,            /* allocation group header buffer */
2425        int             fields)         /* bitmask of fields to log */
2426{
2427        int                     first;          /* first byte number */
2428        int                     last;           /* last byte number */
2429        static const short      offsets[] = {   /* field starting offsets */
2430                                        /* keep in sync with bit definitions */
2431                offsetof(xfs_agi_t, agi_magicnum),
2432                offsetof(xfs_agi_t, agi_versionnum),
2433                offsetof(xfs_agi_t, agi_seqno),
2434                offsetof(xfs_agi_t, agi_length),
2435                offsetof(xfs_agi_t, agi_count),
2436                offsetof(xfs_agi_t, agi_root),
2437                offsetof(xfs_agi_t, agi_level),
2438                offsetof(xfs_agi_t, agi_freecount),
2439                offsetof(xfs_agi_t, agi_newino),
2440                offsetof(xfs_agi_t, agi_dirino),
2441                offsetof(xfs_agi_t, agi_unlinked),
2442                offsetof(xfs_agi_t, agi_free_root),
2443                offsetof(xfs_agi_t, agi_free_level),
2444                sizeof(xfs_agi_t)
2445        };
2446#ifdef DEBUG
2447        xfs_agi_t               *agi;   /* allocation group header */
2448
2449        agi = XFS_BUF_TO_AGI(bp);
2450        ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
2451#endif
2452
2453        xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF);
2454
2455        /*
2456         * Compute byte offsets for the first and last fields in the first
2457         * region and log the agi buffer. This only logs up through
2458         * agi_unlinked.
2459         */
2460        if (fields & XFS_AGI_ALL_BITS_R1) {
2461                xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R1,
2462                                  &first, &last);
2463                xfs_trans_log_buf(tp, bp, first, last);
2464        }
2465
2466        /*
2467         * Mask off the bits in the first region and calculate the first and
2468         * last field offsets for any bits in the second region.
2469         */
2470        fields &= ~XFS_AGI_ALL_BITS_R1;
2471        if (fields) {
2472                xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS_R2,
2473                                  &first, &last);
2474                xfs_trans_log_buf(tp, bp, first, last);
2475        }
2476}
2477
2478#ifdef DEBUG
2479STATIC void
2480xfs_check_agi_unlinked(
2481        struct xfs_agi          *agi)
2482{
2483        int                     i;
2484
2485        for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
2486                ASSERT(agi->agi_unlinked[i]);
2487}
2488#else
2489#define xfs_check_agi_unlinked(agi)
2490#endif
2491
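    /*
     * Structural sanity checks applied to the AGI both when it is read and
     * before it is written back.
     */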
2492static bool
2493xfs_agi_verify(
2494        struct xfs_buf  *bp)
2495{
2496        struct xfs_mount *mp = bp->b_target->bt_mount;
2497        struct xfs_agi  *agi = XFS_BUF_TO_AGI(bp);
2498
2499        if (xfs_sb_version_hascrc(&mp->m_sb)) {
2500                if (!uuid_equal(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid))
2501                        return false;
2502                if (!xfs_log_check_lsn(mp,
2503                                be64_to_cpu(XFS_BUF_TO_AGI(bp)->agi_lsn)))
2504                        return false;
2505        }
2506
2507        /*
2508         * Validate the magic number of the agi block.
2509         */
2510        if (agi->agi_magicnum != cpu_to_be32(XFS_AGI_MAGIC))
2511                return false;
2512        if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)))
2513                return false;
2514
2515        if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS)
2516                return false;
2517        /*
2518         * During growfs operations, the perag is not fully initialised,
2519         * so we can't use it for any useful checking. growfs ensures we
2520         * can't use it by using uncached buffers that don't have the perag
2521         * attached, so we can detect and avoid this problem.
2522         */
2523        if (bp->b_pag && be32_to_cpu(agi->agi_seqno) != bp->b_pag->pag_agno)
2524                return false;
2525
2526        xfs_check_agi_unlinked(agi);
2527        return true;
2528}
2529
2530static void
2531xfs_agi_read_verify(
2532        struct xfs_buf  *bp)
2533{
2534        struct xfs_mount *mp = bp->b_target->bt_mount;
2535
2536        if (xfs_sb_version_hascrc(&mp->m_sb) &&
2537            !xfs_buf_verify_cksum(bp, XFS_AGI_CRC_OFF))
2538                xfs_buf_ioerror(bp, -EFSBADCRC);
2539        else if (XFS_TEST_ERROR(!xfs_agi_verify(bp), mp,
2540                                XFS_ERRTAG_IALLOC_READ_AGI,
2541                                XFS_RANDOM_IALLOC_READ_AGI))
2542                xfs_buf_ioerror(bp, -EFSCORRUPTED);
2543
2544        if (bp->b_error)
2545                xfs_verifier_error(bp);
2546}
2547
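    /*
     * Before write-out: re-run the structural checks, then stamp the AGI
     * with the last-modified LSN and recompute the CRC on v5 filesystems.
     */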
2548static void
2549xfs_agi_write_verify(
2550        struct xfs_buf  *bp)
2551{
2552        struct xfs_mount *mp = bp->b_target->bt_mount;
2553        struct xfs_buf_log_item *bip = bp->b_fspriv;
2554
2555        if (!xfs_agi_verify(bp)) {
2556                xfs_buf_ioerror(bp, -EFSCORRUPTED);
2557                xfs_verifier_error(bp);
2558                return;
2559        }
2560
2561        if (!xfs_sb_version_hascrc(&mp->m_sb))
2562                return;
2563
2564        if (bip)
2565                XFS_BUF_TO_AGI(bp)->agi_lsn = cpu_to_be64(bip->bli_item.li_lsn);
2566        xfs_buf_update_cksum(bp, XFS_AGI_CRC_OFF);
2567}
2568
2569const struct xfs_buf_ops xfs_agi_buf_ops = {
2570        .name = "xfs_agi",
2571        .verify_read = xfs_agi_read_verify,
2572        .verify_write = xfs_agi_write_verify,
2573};
2574
2575/*
2576 * Read in the allocation group header (inode allocation section)
2577 */
2578int
2579xfs_read_agi(
2580        struct xfs_mount        *mp,    /* file system mount structure */
2581        struct xfs_trans        *tp,    /* transaction pointer */
2582        xfs_agnumber_t          agno,   /* allocation group number */
2583        struct xfs_buf          **bpp)  /* allocation group hdr buf */
2584{
2585        int                     error;
2586
2587        trace_xfs_read_agi(mp, agno);
2588
2589        ASSERT(agno != NULLAGNUMBER);
2590        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
2591                        XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
2592                        XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
2593        if (error)
2594                return error;
2595
2596        xfs_buf_set_ref(*bpp, XFS_AGI_REF);
2597        return 0;
2598}
2599
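    /*
     * Read in the AGI and, on first use, initialise the per-ag inode
     * counters from it.
     */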
2600int
2601xfs_ialloc_read_agi(
2602        struct xfs_mount        *mp,    /* file system mount structure */
2603        struct xfs_trans        *tp,    /* transaction pointer */
2604        xfs_agnumber_t          agno,   /* allocation group number */
2605        struct xfs_buf          **bpp)  /* allocation group hdr buf */
2606{
2607        struct xfs_agi          *agi;   /* allocation group header */
2608        struct xfs_perag        *pag;   /* per allocation group data */
2609        int                     error;
2610
2611        trace_xfs_ialloc_read_agi(mp, agno);
2612
2613        error = xfs_read_agi(mp, tp, agno, bpp);
2614        if (error)
2615                return error;
2616
2617        agi = XFS_BUF_TO_AGI(*bpp);
2618        pag = xfs_perag_get(mp, agno);
2619        if (!pag->pagi_init) {
2620                pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
2621                pag->pagi_count = be32_to_cpu(agi->agi_count);
2622                pag->pagi_init = 1;
2623        }
2624
2625        /*
2626         * It's possible for these to be out of sync if
2627         * we are in the middle of a forced shutdown.
2628         */
2629        ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
2630                XFS_FORCED_SHUTDOWN(mp));
2631        xfs_perag_put(pag);
2632        return 0;
2633}
2634
2635/*
2636 * Read in the agi to initialise the per-ag data in the mount structure
2637 */
2638int
2639xfs_ialloc_pagi_init(
2640        xfs_mount_t     *mp,            /* file system mount structure */
2641        xfs_trans_t     *tp,            /* transaction pointer */
2642        xfs_agnumber_t  agno)           /* allocation group number */
2643{
2644        xfs_buf_t       *bp = NULL;
2645        int             error;
2646
2647        error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
2648        if (error)
2649                return error;
2650        if (bp)
2651                xfs_trans_brelse(tp, bp);
2652        return 0;
2653}
2654