linux/fs/xfs/xfs_ialloc.c
   1/*
   2 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_types.h"
  21#include "xfs_bit.h"
  22#include "xfs_log.h"
  23#include "xfs_inum.h"
  24#include "xfs_trans.h"
  25#include "xfs_sb.h"
  26#include "xfs_ag.h"
  27#include "xfs_mount.h"
  28#include "xfs_bmap_btree.h"
  29#include "xfs_alloc_btree.h"
  30#include "xfs_ialloc_btree.h"
  31#include "xfs_dinode.h"
  32#include "xfs_inode.h"
  33#include "xfs_btree.h"
  34#include "xfs_ialloc.h"
  35#include "xfs_alloc.h"
  36#include "xfs_rtalloc.h"
  37#include "xfs_error.h"
  38#include "xfs_bmap.h"
  39
  40
  41/*
  42 * Allocation group level functions.
  43 */
  44static inline int
  45xfs_ialloc_cluster_alignment(
  46        xfs_alloc_arg_t *args)
  47{
  48        if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
  49            args->mp->m_sb.sb_inoalignmt >=
  50             XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp)))
  51                return args->mp->m_sb.sb_inoalignmt;
  52        return 1;
  53}
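/*
 * Worked example (illustrative geometry only, not taken from this file):
 * with 4096-byte filesystem blocks, an 8192-byte inode cluster
 * (XFS_INODE_CLUSTER_SIZE) and sb_inoalignmt = 4, the cluster is
 * XFS_B_TO_FSBT() = 2 fsblocks, 4 >= 2 holds, and the function returns 4.
 * If the alignment feature bit is clear, or sb_inoalignmt were smaller
 * than the cluster size in fsblocks, it would return 1 (no alignment).
 */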
  54
  55/*
  56 * Lookup a record by ino in the btree given by cur.
  57 */
  58int                                     /* error */
  59xfs_inobt_lookup(
  60        struct xfs_btree_cur    *cur,   /* btree cursor */
  61        xfs_agino_t             ino,    /* starting inode of chunk */
  62        xfs_lookup_t            dir,    /* <=, >=, == */
  63        int                     *stat)  /* success/failure */
  64{
  65        cur->bc_rec.i.ir_startino = ino;
  66        cur->bc_rec.i.ir_freecount = 0;
  67        cur->bc_rec.i.ir_free = 0;
  68        return xfs_btree_lookup(cur, dir, stat);
  69}
  70
  71/*
  72 * Update the record referred to by cur to the value given.
  73 * This either works (return 0) or gets an EFSCORRUPTED error.
  74 */
  75STATIC int                              /* error */
  76xfs_inobt_update(
  77        struct xfs_btree_cur    *cur,   /* btree cursor */
  78        xfs_inobt_rec_incore_t  *irec)  /* btree record */
  79{
  80        union xfs_btree_rec     rec;
  81
  82        rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino);
  83        rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount);
  84        rec.inobt.ir_free = cpu_to_be64(irec->ir_free);
  85        return xfs_btree_update(cur, &rec);
  86}
  87
  88/*
  89 * Get the data from the pointed-to record.
  90 */
  91int                                     /* error */
  92xfs_inobt_get_rec(
  93        struct xfs_btree_cur    *cur,   /* btree cursor */
  94        xfs_inobt_rec_incore_t  *irec,  /* btree record */
  95        int                     *stat)  /* output: success/failure */
  96{
  97        union xfs_btree_rec     *rec;
  98        int                     error;
  99
 100        error = xfs_btree_get_rec(cur, &rec, stat);
 101        if (!error && *stat == 1) {
 102                irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino);
 103                irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount);
 104                irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
 105        }
 106        return error;
 107}
 108
 109/*
 110 * Verify that the number of free inodes in the AGI is correct.
 111 */
 112#ifdef DEBUG
 113STATIC int
 114xfs_check_agi_freecount(
 115        struct xfs_btree_cur    *cur,
 116        struct xfs_agi          *agi)
 117{
 118        if (cur->bc_nlevels == 1) {
 119                xfs_inobt_rec_incore_t rec;
 120                int             freecount = 0;
 121                int             error;
 122                int             i;
 123
 124                error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
 125                if (error)
 126                        return error;
 127
 128                do {
 129                        error = xfs_inobt_get_rec(cur, &rec, &i);
 130                        if (error)
 131                                return error;
 132
 133                        if (i) {
 134                                freecount += rec.ir_freecount;
 135                                error = xfs_btree_increment(cur, 0, &i);
 136                                if (error)
 137                                        return error;
 138                        }
 139                } while (i == 1);
 140
 141                if (!XFS_FORCED_SHUTDOWN(cur->bc_mp))
 142                        ASSERT(freecount == be32_to_cpu(agi->agi_freecount));
 143        }
 144        return 0;
 145}
 146#else
 147#define xfs_check_agi_freecount(cur, agi)       0
 148#endif
 149
 150/*
 151 * Initialise a new set of inodes.
 152 */
 153STATIC int
 154xfs_ialloc_inode_init(
 155        struct xfs_mount        *mp,
 156        struct xfs_trans        *tp,
 157        xfs_agnumber_t          agno,
 158        xfs_agblock_t           agbno,
 159        xfs_agblock_t           length,
 160        unsigned int            gen)
 161{
 162        struct xfs_buf          *fbuf;
 163        struct xfs_dinode       *free;
 164        int                     blks_per_cluster, nbufs, ninodes;
 165        int                     version;
 166        int                     i, j;
 167        xfs_daddr_t             d;
 168
 169        /*
 170         * Loop over the new block(s), filling in the inodes.
 171         * For small block sizes, manipulate the inodes in buffers
  172         * which are multiples of the block size.
 173         */
 174        if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
 175                blks_per_cluster = 1;
 176                nbufs = length;
 177                ninodes = mp->m_sb.sb_inopblock;
 178        } else {
 179                blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
 180                                   mp->m_sb.sb_blocksize;
 181                nbufs = length / blks_per_cluster;
 182                ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
 183        }
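        /*
         * Worked example with illustrative (not authoritative) geometry:
         * sb_blocksize = 1024, XFS_INODE_CLUSTER_SIZE = 8192 and 256-byte
         * inodes give blks_per_cluster = 8, sb_inopblock = 4 and therefore
         * ninodes = 32 inodes per buffer; a 16-block chunk is then written
         * as nbufs = 2 cluster-sized buffers.  With sb_blocksize = 16384
         * the first branch is taken instead: one block per buffer and
         * ninodes = sb_inopblock.
         */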
 184
 185        /*
 186         * Figure out what version number to use in the inodes we create.
 187         * If the superblock version has caught up to the one that supports
 188         * the new inode format, then use the new inode version.  Otherwise
 189         * use the old version so that old kernels will continue to be
 190         * able to use the file system.
 191         */
 192        if (xfs_sb_version_hasnlink(&mp->m_sb))
 193                version = 2;
 194        else
 195                version = 1;
 196
 197        for (j = 0; j < nbufs; j++) {
 198                /*
 199                 * Get the block.
 200                 */
 201                d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
 202                fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
 203                                         mp->m_bsize * blks_per_cluster,
 204                                         XBF_LOCK);
 205                if (!fbuf)
 206                        return ENOMEM;
 207                /*
 208                 * Initialize all inodes in this buffer and then log them.
 209                 *
 210                 * XXX: It would be much better if we had just one transaction
 211                 *      to log a whole cluster of inodes instead of all the
 212                 *      individual transactions causing a lot of log traffic.
 213                 */
 214                xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
 215                for (i = 0; i < ninodes; i++) {
 216                        int     ioffset = i << mp->m_sb.sb_inodelog;
 217                        uint    isize = sizeof(struct xfs_dinode);
 218
 219                        free = xfs_make_iptr(mp, fbuf, i);
 220                        free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
 221                        free->di_version = version;
 222                        free->di_gen = cpu_to_be32(gen);
 223                        free->di_next_unlinked = cpu_to_be32(NULLAGINO);
 224                        xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
 225                }
 226                xfs_trans_inode_alloc_buf(tp, fbuf);
 227        }
 228        return 0;
 229}
 230
 231/*
 232 * Allocate new inodes in the allocation group specified by agbp.
 233 * Return 0 for success, else error code.
 234 */
 235STATIC int                              /* error code or 0 */
 236xfs_ialloc_ag_alloc(
 237        xfs_trans_t     *tp,            /* transaction pointer */
 238        xfs_buf_t       *agbp,          /* alloc group buffer */
 239        int             *alloc)
 240{
 241        xfs_agi_t       *agi;           /* allocation group header */
 242        xfs_alloc_arg_t args;           /* allocation argument structure */
 243        xfs_btree_cur_t *cur;           /* inode btree cursor */
 244        xfs_agnumber_t  agno;
 245        int             error;
 246        int             i;
 247        xfs_agino_t     newino;         /* new first inode's number */
 248        xfs_agino_t     newlen;         /* new number of inodes */
 249        xfs_agino_t     thisino;        /* current inode number, for loop */
 250        int             isaligned = 0;  /* inode allocation at stripe unit */
 251                                        /* boundary */
 252        struct xfs_perag *pag;
 253
 254        args.tp = tp;
 255        args.mp = tp->t_mountp;
 256
 257        /*
 258         * Locking will ensure that we don't have two callers in here
 259         * at one time.
 260         */
 261        newlen = XFS_IALLOC_INODES(args.mp);
 262        if (args.mp->m_maxicount &&
 263            args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount)
 264                return XFS_ERROR(ENOSPC);
 265        args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp);
 266        /*
 267         * First try to allocate inodes contiguous with the last-allocated
 268         * chunk of inodes.  If the filesystem is striped, this will fill
 269         * an entire stripe unit with inodes.
 270         */
 271        agi = XFS_BUF_TO_AGI(agbp);
 272        newino = be32_to_cpu(agi->agi_newino);
 273        agno = be32_to_cpu(agi->agi_seqno);
 274        args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) +
 275                        XFS_IALLOC_BLOCKS(args.mp);
 276        if (likely(newino != NULLAGINO &&
 277                  (args.agbno < be32_to_cpu(agi->agi_length)))) {
 278                args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 279                args.type = XFS_ALLOCTYPE_THIS_BNO;
 280                args.mod = args.total = args.wasdel = args.isfl =
 281                        args.userdata = args.minalignslop = 0;
 282                args.prod = 1;
 283
 284                /*
 285                 * We need to take into account alignment here to ensure that
 286                 * we don't modify the free list if we fail to have an exact
  287         * block. If we don't have an exact match, and every other
  288         * allocation attempt fails, we'll end up cancelling
 289                 * a dirty transaction and shutting down.
 290                 *
 291                 * For an exact allocation, alignment must be 1,
 292                 * however we need to take cluster alignment into account when
 293                 * fixing up the freelist. Use the minalignslop field to
 294                 * indicate that extra blocks might be required for alignment,
 295                 * but not to use them in the actual exact allocation.
 296                 */
 297                args.alignment = 1;
 298                args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
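                /*
                 * For instance (numbers illustrative only): a cluster
                 * alignment of 4 fsblocks gives minalignslop = 3, so the
                 * allocator keeps 3 blocks of slack in its free space
                 * calculations without rounding the exact THIS_BNO request
                 * itself.
                 */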
 299
 300                /* Allow space for the inode btree to split. */
 301                args.minleft = args.mp->m_in_maxlevels - 1;
 302                if ((error = xfs_alloc_vextent(&args)))
 303                        return error;
 304        } else
 305                args.fsbno = NULLFSBLOCK;
 306
 307        if (unlikely(args.fsbno == NULLFSBLOCK)) {
 308                /*
 309                 * Set the alignment for the allocation.
 310                 * If stripe alignment is turned on then align at stripe unit
 311                 * boundary.
 312                 * If the cluster size is smaller than a filesystem block
 313                 * then we're doing I/O for inodes in filesystem block size
 314                 * pieces, so don't need alignment anyway.
 315                 */
 316                isaligned = 0;
 317                if (args.mp->m_sinoalign) {
 318                        ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
 319                        args.alignment = args.mp->m_dalign;
 320                        isaligned = 1;
 321                } else
 322                        args.alignment = xfs_ialloc_cluster_alignment(&args);
 323                /*
 324                 * Need to figure out where to allocate the inode blocks.
 325                 * Ideally they should be spaced out through the a.g.
 326                 * For now, just allocate blocks up front.
 327                 */
 328                args.agbno = be32_to_cpu(agi->agi_root);
 329                args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 330                /*
 331                 * Allocate a fixed-size extent of inodes.
 332                 */
 333                args.type = XFS_ALLOCTYPE_NEAR_BNO;
 334                args.mod = args.total = args.wasdel = args.isfl =
 335                        args.userdata = args.minalignslop = 0;
 336                args.prod = 1;
 337                /*
 338                 * Allow space for the inode btree to split.
 339                 */
 340                args.minleft = args.mp->m_in_maxlevels - 1;
 341                if ((error = xfs_alloc_vextent(&args)))
 342                        return error;
 343        }
 344
 345        /*
 346         * If stripe alignment is turned on, then try again with cluster
 347         * alignment.
 348         */
 349        if (isaligned && args.fsbno == NULLFSBLOCK) {
 350                args.type = XFS_ALLOCTYPE_NEAR_BNO;
 351                args.agbno = be32_to_cpu(agi->agi_root);
 352                args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno);
 353                args.alignment = xfs_ialloc_cluster_alignment(&args);
 354                if ((error = xfs_alloc_vextent(&args)))
 355                        return error;
 356        }
 357
 358        if (args.fsbno == NULLFSBLOCK) {
 359                *alloc = 0;
 360                return 0;
 361        }
 362        ASSERT(args.len == args.minlen);
 363
 364        /*
 365         * Stamp and write the inode buffers.
 366         *
 367         * Seed the new inode cluster with a random generation number. This
 368         * prevents short-term reuse of generation numbers if a chunk is
 369         * freed and then immediately reallocated. We use random numbers
 370         * rather than a linear progression to prevent the next generation
 371         * number from being easily guessable.
 372         */
 373        error = xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno,
 374                        args.len, random32());
 375
 376        if (error)
 377                return error;
 378        /*
 379         * Convert the results.
 380         */
 381        newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0);
 382        be32_add_cpu(&agi->agi_count, newlen);
 383        be32_add_cpu(&agi->agi_freecount, newlen);
 384        pag = xfs_perag_get(args.mp, agno);
 385        pag->pagi_freecount += newlen;
 386        xfs_perag_put(pag);
 387        agi->agi_newino = cpu_to_be32(newino);
 388
 389        /*
 390         * Insert records describing the new inode chunk into the btree.
 391         */
 392        cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno);
 393        for (thisino = newino;
 394             thisino < newino + newlen;
 395             thisino += XFS_INODES_PER_CHUNK) {
 396                cur->bc_rec.i.ir_startino = thisino;
 397                cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK;
 398                cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE;
 399                error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i);
 400                if (error) {
 401                        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 402                        return error;
 403                }
 404                ASSERT(i == 0);
 405                error = xfs_btree_insert(cur, &i);
 406                if (error) {
 407                        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 408                        return error;
 409                }
 410                ASSERT(i == 1);
 411        }
 412        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 413        /*
 414         * Log allocation group header fields
 415         */
 416        xfs_ialloc_log_agi(tp, agbp,
 417                XFS_AGI_COUNT | XFS_AGI_FREECOUNT | XFS_AGI_NEWINO);
 418        /*
 419         * Modify/log superblock values for inode count and inode free count.
 420         */
 421        xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, (long)newlen);
 422        xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, (long)newlen);
 423        *alloc = 1;
 424        return 0;
 425}
 426
 427STATIC xfs_agnumber_t
 428xfs_ialloc_next_ag(
 429        xfs_mount_t     *mp)
 430{
 431        xfs_agnumber_t  agno;
 432
 433        spin_lock(&mp->m_agirotor_lock);
 434        agno = mp->m_agirotor;
 435        if (++mp->m_agirotor == mp->m_maxagi)
 436                mp->m_agirotor = 0;
 437        spin_unlock(&mp->m_agirotor_lock);
 438
 439        return agno;
 440}
 441
 442/*
 443 * Select an allocation group to look for a free inode in, based on the parent
  444 * inode and the mode.  Return the allocation group buffer.
 445 */
 446STATIC xfs_buf_t *                      /* allocation group buffer */
 447xfs_ialloc_ag_select(
 448        xfs_trans_t     *tp,            /* transaction pointer */
 449        xfs_ino_t       parent,         /* parent directory inode number */
 450        umode_t         mode,           /* bits set to indicate file type */
 451        int             okalloc)        /* ok to allocate more space */
 452{
 453        xfs_buf_t       *agbp;          /* allocation group header buffer */
 454        xfs_agnumber_t  agcount;        /* number of ag's in the filesystem */
 455        xfs_agnumber_t  agno;           /* current ag number */
 456        int             flags;          /* alloc buffer locking flags */
 457        xfs_extlen_t    ineed;          /* blocks needed for inode allocation */
 458        xfs_extlen_t    longest = 0;    /* longest extent available */
 459        xfs_mount_t     *mp;            /* mount point structure */
 460        int             needspace;      /* file mode implies space allocated */
 461        xfs_perag_t     *pag;           /* per allocation group data */
 462        xfs_agnumber_t  pagno;          /* parent (starting) ag number */
 463
 464        /*
 465         * Files of these types need at least one block if length > 0
 466         * (and they won't fit in the inode, but that's hard to figure out).
 467         */
 468        needspace = S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode);
 469        mp = tp->t_mountp;
 470        agcount = mp->m_maxagi;
 471        if (S_ISDIR(mode))
 472                pagno = xfs_ialloc_next_ag(mp);
 473        else {
 474                pagno = XFS_INO_TO_AGNO(mp, parent);
 475                if (pagno >= agcount)
 476                        pagno = 0;
 477        }
 478        ASSERT(pagno < agcount);
 479        /*
 480         * Loop through allocation groups, looking for one with a little
 481         * free space in it.  Note we don't look for free inodes, exactly.
  482         * Instead, we account for the fact that blocks must be
  483         * allocated for new inodes if none are currently free in
  484         * the group.
 485         */
 486        agno = pagno;
 487        flags = XFS_ALLOC_FLAG_TRYLOCK;
 488        for (;;) {
 489                pag = xfs_perag_get(mp, agno);
 490                if (!pag->pagi_init) {
 491                        if (xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
 492                                agbp = NULL;
 493                                goto nextag;
 494                        }
 495                } else
 496                        agbp = NULL;
 497
 498                if (!pag->pagi_inodeok) {
 499                        xfs_ialloc_next_ag(mp);
 500                        goto unlock_nextag;
 501                }
 502
 503                /*
 504                 * Is there enough free space for the file plus a block
 505                 * of inodes (if we need to allocate some)?
 506                 */
 507                ineed = pag->pagi_freecount ? 0 : XFS_IALLOC_BLOCKS(mp);
 508                if (ineed && !pag->pagf_init) {
 509                        if (agbp == NULL &&
 510                            xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
 511                                agbp = NULL;
 512                                goto nextag;
 513                        }
 514                        (void)xfs_alloc_pagf_init(mp, tp, agno, flags);
 515                }
 516                if (!ineed || pag->pagf_init) {
 517                        if (ineed && !(longest = pag->pagf_longest))
 518                                longest = pag->pagf_flcount > 0;
 519                        if (!ineed ||
 520                            (pag->pagf_freeblks >= needspace + ineed &&
 521                             longest >= ineed &&
 522                             okalloc)) {
 523                                if (agbp == NULL &&
 524                                    xfs_ialloc_read_agi(mp, tp, agno, &agbp)) {
 525                                        agbp = NULL;
 526                                        goto nextag;
 527                                }
 528                                xfs_perag_put(pag);
 529                                return agbp;
 530                        }
 531                }
 532unlock_nextag:
 533                if (agbp)
 534                        xfs_trans_brelse(tp, agbp);
 535nextag:
 536                xfs_perag_put(pag);
 537                /*
 538                 * No point in iterating over the rest, if we're shutting
 539                 * down.
 540                 */
 541                if (XFS_FORCED_SHUTDOWN(mp))
 542                        return NULL;
 543                agno++;
 544                if (agno >= agcount)
 545                        agno = 0;
 546                if (agno == pagno) {
 547                        if (flags == 0)
 548                                return NULL;
 549                        flags = 0;
 550                }
 551        }
 552}
 553
 554/*
 555 * Try to retrieve the next record to the left/right from the current one.
 556 */
 557STATIC int
 558xfs_ialloc_next_rec(
 559        struct xfs_btree_cur    *cur,
 560        xfs_inobt_rec_incore_t  *rec,
 561        int                     *done,
 562        int                     left)
 563{
 564        int                     error;
 565        int                     i;
 566
 567        if (left)
 568                error = xfs_btree_decrement(cur, 0, &i);
 569        else
 570                error = xfs_btree_increment(cur, 0, &i);
 571
 572        if (error)
 573                return error;
 574        *done = !i;
 575        if (i) {
 576                error = xfs_inobt_get_rec(cur, rec, &i);
 577                if (error)
 578                        return error;
 579                XFS_WANT_CORRUPTED_RETURN(i == 1);
 580        }
 581
 582        return 0;
 583}
 584
 585STATIC int
 586xfs_ialloc_get_rec(
 587        struct xfs_btree_cur    *cur,
 588        xfs_agino_t             agino,
 589        xfs_inobt_rec_incore_t  *rec,
 590        int                     *done,
 591        int                     left)
 592{
 593        int                     error;
 594        int                     i;
 595
 596        error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i);
 597        if (error)
 598                return error;
 599        *done = !i;
 600        if (i) {
 601                error = xfs_inobt_get_rec(cur, rec, &i);
 602                if (error)
 603                        return error;
 604                XFS_WANT_CORRUPTED_RETURN(i == 1);
 605        }
 606
 607        return 0;
 608}
 609
 610/*
 611 * Visible inode allocation functions.
 612 */
 613
 614/*
 615 * Allocate an inode on disk.
 616 * Mode is used to tell whether the new inode will need space, and whether
 617 * it is a directory.
 618 *
 619 * The arguments IO_agbp and alloc_done are defined to work within
 620 * the constraint of one allocation per transaction.
 621 * xfs_dialloc() is designed to be called twice if it has to do an
 622 * allocation to make more free inodes.  On the first call,
 623 * IO_agbp should be set to NULL. If an inode is available,
 624 * i.e., xfs_dialloc() did not need to do an allocation, an inode
 625 * number is returned.  In this case, IO_agbp would be set to the
 626 * current ag_buf and alloc_done set to false.
 627 * If an allocation needed to be done, xfs_dialloc would return
 628 * the current ag_buf in IO_agbp and set alloc_done to true.
 629 * The caller should then commit the current transaction, allocate a new
 630 * transaction, and call xfs_dialloc() again, passing in the previous
 631 * value of IO_agbp.  IO_agbp should be held across the transactions.
 632 * Since the agbp is locked across the two calls, the second call is
 633 * guaranteed to have a free inode available.
 634 *
 635 * Once we successfully pick an inode its number is returned and the
 636 * on-disk data structures are updated.  The inode itself is not read
 637 * in, since doing so would break ordering constraints with xfs_reclaim.
 638 */
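/*
 * Minimal sketch of the two-call pattern described above; this is not code
 * from this file, the commit/renew step is elided, and the local variable
 * names are hypothetical:
 *
 *        agbp = NULL;
 *        error = xfs_dialloc(tp, parent, mode, okalloc, &agbp,
 *                            &alloc_done, &ino);
 *        if (!error && agbp) {
 *                (commit tp, allocate a fresh transaction)
 *                error = xfs_dialloc(tp, parent, mode, okalloc, &agbp,
 *                                    &alloc_done, &ino);
 *        }
 *        (on success, ino holds the new inode number, or NULLFSINO if no
 *         inode could be allocated)
 */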
 639int
 640xfs_dialloc(
 641        xfs_trans_t     *tp,            /* transaction pointer */
 642        xfs_ino_t       parent,         /* parent inode (directory) */
 643        umode_t         mode,           /* mode bits for new inode */
 644        int             okalloc,        /* ok to allocate more space */
 645        xfs_buf_t       **IO_agbp,      /* in/out ag header's buffer */
 646        boolean_t       *alloc_done,    /* true if we needed to replenish
 647                                           inode freelist */
 648        xfs_ino_t       *inop)          /* inode number allocated */
 649{
 650        xfs_agnumber_t  agcount;        /* number of allocation groups */
 651        xfs_buf_t       *agbp;          /* allocation group header's buffer */
 652        xfs_agnumber_t  agno;           /* allocation group number */
 653        xfs_agi_t       *agi;           /* allocation group header structure */
 654        xfs_btree_cur_t *cur;           /* inode allocation btree cursor */
 655        int             error;          /* error return value */
 656        int             i;              /* result code */
 657        int             ialloced;       /* inode allocation status */
 658        int             noroom = 0;     /* no space for inode blk allocation */
 659        xfs_ino_t       ino;            /* fs-relative inode to be returned */
 660        /* REFERENCED */
 661        int             j;              /* result code */
 662        xfs_mount_t     *mp;            /* file system mount structure */
 663        int             offset;         /* index of inode in chunk */
 664        xfs_agino_t     pagino;         /* parent's AG relative inode # */
 665        xfs_agnumber_t  pagno;          /* parent's AG number */
 666        xfs_inobt_rec_incore_t rec;     /* inode allocation record */
 667        xfs_agnumber_t  tagno;          /* testing allocation group number */
 668        xfs_btree_cur_t *tcur;          /* temp cursor */
 669        xfs_inobt_rec_incore_t trec;    /* temp inode allocation record */
 670        struct xfs_perag *pag;
 671
 672
 673        if (*IO_agbp == NULL) {
 674                /*
 675                 * We do not have an agbp, so select an initial allocation
 676                 * group for inode allocation.
 677                 */
 678                agbp = xfs_ialloc_ag_select(tp, parent, mode, okalloc);
 679                /*
 680                 * Couldn't find an allocation group satisfying the
 681                 * criteria, give up.
 682                 */
 683                if (!agbp) {
 684                        *inop = NULLFSINO;
 685                        return 0;
 686                }
 687                agi = XFS_BUF_TO_AGI(agbp);
 688                ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
 689        } else {
 690                /*
 691                 * Continue where we left off before.  In this case, we
 692                 * know that the allocation group has free inodes.
 693                 */
 694                agbp = *IO_agbp;
 695                agi = XFS_BUF_TO_AGI(agbp);
 696                ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
 697                ASSERT(be32_to_cpu(agi->agi_freecount) > 0);
 698        }
 699        mp = tp->t_mountp;
 700        agcount = mp->m_sb.sb_agcount;
 701        agno = be32_to_cpu(agi->agi_seqno);
 702        tagno = agno;
 703        pagno = XFS_INO_TO_AGNO(mp, parent);
 704        pagino = XFS_INO_TO_AGINO(mp, parent);
 705
 706        /*
 707         * If we have already hit the ceiling of inode blocks then clear
 708         * okalloc so we scan all available agi structures for a free
 709         * inode.
 710         */
 711
 712        if (mp->m_maxicount &&
 713            mp->m_sb.sb_icount + XFS_IALLOC_INODES(mp) > mp->m_maxicount) {
 714                noroom = 1;
 715                okalloc = 0;
 716        }
 717
 718        /*
 719         * Loop until we find an allocation group that either has free inodes
 720         * or in which we can allocate some inodes.  Iterate through the
 721         * allocation groups upward, wrapping at the end.
 722         */
 723        *alloc_done = B_FALSE;
 724        while (!agi->agi_freecount) {
 725                /*
 726                 * Don't do anything if we're not supposed to allocate
 727                 * any blocks, just go on to the next ag.
 728                 */
 729                if (okalloc) {
 730                        /*
 731                         * Try to allocate some new inodes in the allocation
 732                         * group.
 733                         */
 734                        if ((error = xfs_ialloc_ag_alloc(tp, agbp, &ialloced))) {
 735                                xfs_trans_brelse(tp, agbp);
 736                                if (error == ENOSPC) {
 737                                        *inop = NULLFSINO;
 738                                        return 0;
 739                                } else
 740                                        return error;
 741                        }
 742                        if (ialloced) {
 743                                /*
 744                                 * We successfully allocated some inodes, return
 745                                 * the current context to the caller so that it
 746                                 * can commit the current transaction and call
 747                                 * us again where we left off.
 748                                 */
 749                                ASSERT(be32_to_cpu(agi->agi_freecount) > 0);
 750                                *alloc_done = B_TRUE;
 751                                *IO_agbp = agbp;
 752                                *inop = NULLFSINO;
 753                                return 0;
 754                        }
 755                }
 756                /*
 757                 * If it failed, give up on this ag.
 758                 */
 759                xfs_trans_brelse(tp, agbp);
 760                /*
 761                 * Go on to the next ag: get its ag header.
 762                 */
 763nextag:
 764                if (++tagno == agcount)
 765                        tagno = 0;
 766                if (tagno == agno) {
 767                        *inop = NULLFSINO;
 768                        return noroom ? ENOSPC : 0;
 769                }
 770                pag = xfs_perag_get(mp, tagno);
 771                if (pag->pagi_inodeok == 0) {
 772                        xfs_perag_put(pag);
 773                        goto nextag;
 774                }
 775                error = xfs_ialloc_read_agi(mp, tp, tagno, &agbp);
 776                xfs_perag_put(pag);
 777                if (error)
 778                        goto nextag;
 779                agi = XFS_BUF_TO_AGI(agbp);
 780                ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
 781        }
 782        /*
 783         * Here with an allocation group that has a free inode.
 784         * Reset agno since we may have chosen a new ag in the
 785         * loop above.
 786         */
 787        agno = tagno;
 788        *IO_agbp = NULL;
 789        pag = xfs_perag_get(mp, agno);
 790
 791 restart_pagno:
 792        cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
 793        /*
 794         * If pagino is 0 (this is the root inode allocation) use newino.
 795         * This must work because we've just allocated some.
 796         */
 797        if (!pagino)
 798                pagino = be32_to_cpu(agi->agi_newino);
 799
 800        error = xfs_check_agi_freecount(cur, agi);
 801        if (error)
 802                goto error0;
 803
 804        /*
 805         * If in the same AG as the parent, try to get near the parent.
 806         */
 807        if (pagno == agno) {
 808                int             doneleft;       /* done, to the left */
 809                int             doneright;      /* done, to the right */
 810                int             searchdistance = 10;
 811
 812                error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i);
 813                if (error)
 814                        goto error0;
 815                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 816
 817                error = xfs_inobt_get_rec(cur, &rec, &j);
 818                if (error)
 819                        goto error0;
  820                XFS_WANT_CORRUPTED_GOTO(j == 1, error0);
 821
 822                if (rec.ir_freecount > 0) {
 823                        /*
 824                         * Found a free inode in the same chunk
 825                         * as the parent, done.
 826                         */
 827                        goto alloc_inode;
 828                }
 829
 830
 831                /*
 832                 * In the same AG as parent, but parent's chunk is full.
 833                 */
 834
 835                /* duplicate the cursor, search left & right simultaneously */
 836                error = xfs_btree_dup_cursor(cur, &tcur);
 837                if (error)
 838                        goto error0;
 839
 840                /*
 841                 * Skip to last blocks looked up if same parent inode.
 842                 */
 843                if (pagino != NULLAGINO &&
 844                    pag->pagl_pagino == pagino &&
 845                    pag->pagl_leftrec != NULLAGINO &&
 846                    pag->pagl_rightrec != NULLAGINO) {
 847                        error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec,
 848                                                   &trec, &doneleft, 1);
 849                        if (error)
 850                                goto error1;
 851
 852                        error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec,
 853                                                   &rec, &doneright, 0);
 854                        if (error)
 855                                goto error1;
 856                } else {
 857                        /* search left with tcur, back up 1 record */
 858                        error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);
 859                        if (error)
 860                                goto error1;
 861
 862                        /* search right with cur, go forward 1 record. */
 863                        error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0);
 864                        if (error)
 865                                goto error1;
 866                }
 867
 868                /*
 869                 * Loop until we find an inode chunk with a free inode.
 870                 */
 871                while (!doneleft || !doneright) {
 872                        int     useleft;  /* using left inode chunk this time */
 873
 874                        if (!--searchdistance) {
 875                                /*
 876                                 * Not in range - save last search
 877                                 * location and allocate a new inode
 878                                 */
 879                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
 880                                pag->pagl_leftrec = trec.ir_startino;
 881                                pag->pagl_rightrec = rec.ir_startino;
 882                                pag->pagl_pagino = pagino;
 883                                goto newino;
 884                        }
 885
 886                        /* figure out the closer block if both are valid. */
 887                        if (!doneleft && !doneright) {
 888                                useleft = pagino -
 889                                 (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) <
 890                                  rec.ir_startino - pagino;
 891                        } else {
 892                                useleft = !doneleft;
 893                        }
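                        /*
                         * Distance check by example (values illustrative):
                         * with pagino = 200, a left chunk starting at 128
                         * (last inode 191) and a right chunk starting at
                         * 256, the left gap is 200 - 191 = 9 and the right
                         * gap is 256 - 200 = 56, so useleft is true.
                         */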
 894
 895                        /* free inodes to the left? */
 896                        if (useleft && trec.ir_freecount) {
 897                                rec = trec;
 898                                xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 899                                cur = tcur;
 900
 901                                pag->pagl_leftrec = trec.ir_startino;
 902                                pag->pagl_rightrec = rec.ir_startino;
 903                                pag->pagl_pagino = pagino;
 904                                goto alloc_inode;
 905                        }
 906
 907                        /* free inodes to the right? */
 908                        if (!useleft && rec.ir_freecount) {
 909                                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
 910
 911                                pag->pagl_leftrec = trec.ir_startino;
 912                                pag->pagl_rightrec = rec.ir_startino;
 913                                pag->pagl_pagino = pagino;
 914                                goto alloc_inode;
 915                        }
 916
 917                        /* get next record to check */
 918                        if (useleft) {
 919                                error = xfs_ialloc_next_rec(tcur, &trec,
 920                                                                 &doneleft, 1);
 921                        } else {
 922                                error = xfs_ialloc_next_rec(cur, &rec,
 923                                                                 &doneright, 0);
 924                        }
 925                        if (error)
 926                                goto error1;
 927                }
 928
 929                /*
  930         * We've reached the end of the btree. Because
  931         * we are only searching a small chunk of the
  932         * btree each search, there are obviously free
  933         * inodes closer to the parent inode than we
  934         * are now. Restart the search again.
 935                 */
 936                pag->pagl_pagino = NULLAGINO;
 937                pag->pagl_leftrec = NULLAGINO;
 938                pag->pagl_rightrec = NULLAGINO;
 939                xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
 940                xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
 941                goto restart_pagno;
 942        }
 943
 944        /*
 945         * In a different AG from the parent.
 946         * See if the most recently allocated block has any free.
 947         */
 948newino:
 949        if (agi->agi_newino != cpu_to_be32(NULLAGINO)) {
 950                error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino),
 951                                         XFS_LOOKUP_EQ, &i);
 952                if (error)
 953                        goto error0;
 954
 955                if (i == 1) {
 956                        error = xfs_inobt_get_rec(cur, &rec, &j);
 957                        if (error)
 958                                goto error0;
 959
 960                        if (j == 1 && rec.ir_freecount > 0) {
 961                                /*
 962                                 * The last chunk allocated in the group
 963                                 * still has a free inode.
 964                                 */
 965                                goto alloc_inode;
 966                        }
 967                }
 968        }
 969
 970        /*
 971         * None left in the last group, search the whole AG
 972         */
 973        error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i);
 974        if (error)
 975                goto error0;
 976        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 977
 978        for (;;) {
 979                error = xfs_inobt_get_rec(cur, &rec, &i);
 980                if (error)
 981                        goto error0;
 982                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 983                if (rec.ir_freecount > 0)
 984                        break;
 985                error = xfs_btree_increment(cur, 0, &i);
 986                if (error)
 987                        goto error0;
 988                XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
 989        }
 990
 991alloc_inode:
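        /*
         * Example of the bit manipulation below (record contents are
         * illustrative): if rec.ir_free == 0x6 (bits 1 and 2 set), then
         * xfs_ialloc_find_free() returns the lowest free offset, 1; the
         * new inode is rec.ir_startino + 1 within this AG, bit 1 is
         * cleared from ir_free and ir_freecount drops from 2 to 1.
         */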
 992        offset = xfs_ialloc_find_free(&rec.ir_free);
 993        ASSERT(offset >= 0);
 994        ASSERT(offset < XFS_INODES_PER_CHUNK);
 995        ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) %
 996                                   XFS_INODES_PER_CHUNK) == 0);
 997        ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset);
 998        rec.ir_free &= ~XFS_INOBT_MASK(offset);
 999        rec.ir_freecount--;
1000        error = xfs_inobt_update(cur, &rec);
1001        if (error)
1002                goto error0;
1003        be32_add_cpu(&agi->agi_freecount, -1);
1004        xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1005        pag->pagi_freecount--;
1006
1007        error = xfs_check_agi_freecount(cur, agi);
1008        if (error)
1009                goto error0;
1010
1011        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1012        xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1);
1013        xfs_perag_put(pag);
1014        *inop = ino;
1015        return 0;
1016error1:
1017        xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
1018error0:
1019        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1020        xfs_perag_put(pag);
1021        return error;
1022}
1023
1024/*
1025 * Free disk inode.  Carefully avoids touching the incore inode, all
1026 * manipulations incore are the caller's responsibility.
1027 * The on-disk inode is not changed by this operation, only the
1028 * btree (free inode mask) is changed.
1029 */
1030int
1031xfs_difree(
1032        xfs_trans_t     *tp,            /* transaction pointer */
1033        xfs_ino_t       inode,          /* inode to be freed */
1034        xfs_bmap_free_t *flist,         /* extents to free */
1035        int             *delete,        /* set if inode cluster was deleted */
1036        xfs_ino_t       *first_ino)     /* first inode in deleted cluster */
1037{
1038        /* REFERENCED */
1039        xfs_agblock_t   agbno;  /* block number containing inode */
1040        xfs_buf_t       *agbp;  /* buffer containing allocation group header */
1041        xfs_agino_t     agino;  /* inode number relative to allocation group */
1042        xfs_agnumber_t  agno;   /* allocation group number */
1043        xfs_agi_t       *agi;   /* allocation group header */
1044        xfs_btree_cur_t *cur;   /* inode btree cursor */
1045        int             error;  /* error return value */
1046        int             i;      /* result code */
1047        int             ilen;   /* inodes in an inode cluster */
1048        xfs_mount_t     *mp;    /* mount structure for filesystem */
1049        int             off;    /* offset of inode in inode chunk */
1050        xfs_inobt_rec_incore_t rec;     /* btree record */
1051        struct xfs_perag *pag;
1052
1053        mp = tp->t_mountp;
1054
1055        /*
1056         * Break up inode number into its components.
1057         */
1058        agno = XFS_INO_TO_AGNO(mp, inode);
1059        if (agno >= mp->m_sb.sb_agcount)  {
1060                xfs_warn(mp, "%s: agno >= mp->m_sb.sb_agcount (%d >= %d).",
1061                        __func__, agno, mp->m_sb.sb_agcount);
1062                ASSERT(0);
1063                return XFS_ERROR(EINVAL);
1064        }
1065        agino = XFS_INO_TO_AGINO(mp, inode);
1066        if (inode != XFS_AGINO_TO_INO(mp, agno, agino))  {
1067                xfs_warn(mp, "%s: inode != XFS_AGINO_TO_INO() (%llu != %llu).",
1068                        __func__, (unsigned long long)inode,
1069                        (unsigned long long)XFS_AGINO_TO_INO(mp, agno, agino));
1070                ASSERT(0);
1071                return XFS_ERROR(EINVAL);
1072        }
1073        agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1074        if (agbno >= mp->m_sb.sb_agblocks)  {
1075                xfs_warn(mp, "%s: agbno >= mp->m_sb.sb_agblocks (%d >= %d).",
1076                        __func__, agbno, mp->m_sb.sb_agblocks);
1077                ASSERT(0);
1078                return XFS_ERROR(EINVAL);
1079        }
1080        /*
1081         * Get the allocation group header.
1082         */
1083        error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1084        if (error) {
1085                xfs_warn(mp, "%s: xfs_ialloc_read_agi() returned error %d.",
1086                        __func__, error);
1087                return error;
1088        }
1089        agi = XFS_BUF_TO_AGI(agbp);
1090        ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
1091        ASSERT(agbno < be32_to_cpu(agi->agi_length));
1092        /*
1093         * Initialize the cursor.
1094         */
1095        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1096
1097        error = xfs_check_agi_freecount(cur, agi);
1098        if (error)
1099                goto error0;
1100
1101        /*
1102         * Look for the entry describing this inode.
1103         */
1104        if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) {
1105                xfs_warn(mp, "%s: xfs_inobt_lookup() returned error %d.",
1106                        __func__, error);
1107                goto error0;
1108        }
1109        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1110        error = xfs_inobt_get_rec(cur, &rec, &i);
1111        if (error) {
1112                xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.",
1113                        __func__, error);
1114                goto error0;
1115        }
1116        XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
1117        /*
1118         * Get the offset in the inode chunk.
1119         */
1120        off = agino - rec.ir_startino;
1121        ASSERT(off >= 0 && off < XFS_INODES_PER_CHUNK);
1122        ASSERT(!(rec.ir_free & XFS_INOBT_MASK(off)));
1123        /*
1124         * Mark the inode free & increment the count.
1125         */
1126        rec.ir_free |= XFS_INOBT_MASK(off);
1127        rec.ir_freecount++;
1128
1129        /*
1130         * When an inode cluster is free, it becomes eligible for removal
1131         */
1132        if (!(mp->m_flags & XFS_MOUNT_IKEEP) &&
1133            (rec.ir_freecount == XFS_IALLOC_INODES(mp))) {
1134
1135                *delete = 1;
1136                *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino);
1137
1138                /*
1139                 * Remove the inode cluster from the AGI B+Tree, adjust the
1140                 * AGI and Superblock inode counts, and mark the disk space
1141                 * to be freed when the transaction is committed.
1142                 */
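                /*
                 * Note on the arithmetic below: the chunk's other
                 * ilen - 1 inodes were already accounted as free in the
                 * AGI and superblock counters, while the inode being
                 * freed here was still in use.  Removing the whole chunk
                 * therefore drops agi_count by ilen but the free-inode
                 * counters by only ilen - 1.
                 */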
1143                ilen = XFS_IALLOC_INODES(mp);
1144                be32_add_cpu(&agi->agi_count, -ilen);
1145                be32_add_cpu(&agi->agi_freecount, -(ilen - 1));
1146                xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
1147                pag = xfs_perag_get(mp, agno);
1148                pag->pagi_freecount -= ilen - 1;
1149                xfs_perag_put(pag);
1150                xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
1151                xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
1152
1153                if ((error = xfs_btree_delete(cur, &i))) {
1154                        xfs_warn(mp, "%s: xfs_btree_delete returned error %d.",
1155                                __func__, error);
1156                        goto error0;
1157                }
1158
1159                xfs_bmap_add_free(XFS_AGB_TO_FSB(mp,
1160                                agno, XFS_INO_TO_AGBNO(mp,rec.ir_startino)),
1161                                XFS_IALLOC_BLOCKS(mp), flist, mp);
1162        } else {
1163                *delete = 0;
1164
1165                error = xfs_inobt_update(cur, &rec);
1166                if (error) {
1167                        xfs_warn(mp, "%s: xfs_inobt_update returned error %d.",
1168                                __func__, error);
1169                        goto error0;
1170                }
1171
1172                /* 
1173                 * Change the inode free counts and log the ag/sb changes.
1174                 */
1175                be32_add_cpu(&agi->agi_freecount, 1);
1176                xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT);
1177                pag = xfs_perag_get(mp, agno);
1178                pag->pagi_freecount++;
1179                xfs_perag_put(pag);
1180                xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1);
1181        }
1182
1183        error = xfs_check_agi_freecount(cur, agi);
1184        if (error)
1185                goto error0;
1186
1187        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1188        return 0;
1189
1190error0:
1191        xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
1192        return error;
1193}
1194
1195STATIC int
1196xfs_imap_lookup(
1197        struct xfs_mount        *mp,
1198        struct xfs_trans        *tp,
1199        xfs_agnumber_t          agno,
1200        xfs_agino_t             agino,
1201        xfs_agblock_t           agbno,
1202        xfs_agblock_t           *chunk_agbno,
1203        xfs_agblock_t           *offset_agbno,
1204        int                     flags)
1205{
1206        struct xfs_inobt_rec_incore rec;
1207        struct xfs_btree_cur    *cur;
1208        struct xfs_buf          *agbp;
1209        int                     error;
1210        int                     i;
1211
1212        error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
1213        if (error) {
1214                xfs_alert(mp,
1215                        "%s: xfs_ialloc_read_agi() returned error %d, agno %d",
1216                        __func__, error, agno);
1217                return error;
1218        }
1219
1220        /*
1221         * Lookup the inode record for the given agino. If the record cannot be
1222         * found, then it's an invalid inode number and we should abort. Once
1223         * we have a record, we need to ensure it contains the inode number
1224         * we are looking up.
1225         */
1226        cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
1227        error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
1228        if (!error) {
1229                if (i)
1230                        error = xfs_inobt_get_rec(cur, &rec, &i);
1231                if (!error && i == 0)
1232                        error = EINVAL;
1233        }
1234
1235        xfs_trans_brelse(tp, agbp);
1236        xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
1237        if (error)
1238                return error;
1239
1240        /* check that the returned record contains the required inode */
1241        if (rec.ir_startino > agino ||
1242            rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
1243                return EINVAL;
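        /*
         * E.g. (illustrative values): a record with ir_startino = 128 and
         * XFS_IALLOC_INODES() = 64 only covers aginos 128..191, so an
         * agino below 128 or at 192 and above is rejected here.
         */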
1244
1245        /* for untrusted inodes check it is allocated first */
1246        if ((flags & XFS_IGET_UNTRUSTED) &&
1247            (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
1248                return EINVAL;
1249
1250        *chunk_agbno = XFS_AGINO_TO_AGBNO(mp, rec.ir_startino);
1251        *offset_agbno = agbno - *chunk_agbno;
1252        return 0;
1253}
1254
1255/*
1256 * Return the location of the inode in imap, for mapping it into a buffer.
1257 */
1258int
1259xfs_imap(
1260        xfs_mount_t      *mp,   /* file system mount structure */
1261        xfs_trans_t      *tp,   /* transaction pointer */
1262        xfs_ino_t       ino,    /* inode to locate */
1263        struct xfs_imap *imap,  /* location map structure */
1264        uint            flags)  /* flags for inode btree lookup */
1265{
1266        xfs_agblock_t   agbno;  /* block number of inode in the alloc group */
1267        xfs_agino_t     agino;  /* inode number within alloc group */
1268        xfs_agnumber_t  agno;   /* allocation group number */
1269        int             blks_per_cluster; /* num blocks per inode cluster */
1270        xfs_agblock_t   chunk_agbno;    /* first block in inode chunk */
1271        xfs_agblock_t   cluster_agbno;  /* first block in inode cluster */
1272        int             error;  /* error code */
1273        int             offset; /* index of inode in its buffer */
1274        int             offset_agbno;   /* blks from chunk start to inode */
1275
1276        ASSERT(ino != NULLFSINO);
1277
1278        /*
1279         * Split up the inode number into its parts.
1280         */
1281        agno = XFS_INO_TO_AGNO(mp, ino);
1282        agino = XFS_INO_TO_AGINO(mp, ino);
1283        agbno = XFS_AGINO_TO_AGBNO(mp, agino);
1284        if (agno >= mp->m_sb.sb_agcount || agbno >= mp->m_sb.sb_agblocks ||
1285            ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1286#ifdef DEBUG
1287                /*
1288                 * Don't output diagnostic information for untrusted inodes
1289                 * as they can be invalid without implying corruption.
1290                 */
1291                if (flags & XFS_IGET_UNTRUSTED)
1292                        return XFS_ERROR(EINVAL);
1293                if (agno >= mp->m_sb.sb_agcount) {
1294                        xfs_alert(mp,
1295                                "%s: agno (%d) >= mp->m_sb.sb_agcount (%d)",
1296                                __func__, agno, mp->m_sb.sb_agcount);
1297                }
1298                if (agbno >= mp->m_sb.sb_agblocks) {
1299                        xfs_alert(mp,
1300                "%s: agbno (0x%llx) >= mp->m_sb.sb_agblocks (0x%lx)",
1301                                __func__, (unsigned long long)agbno,
1302                                (unsigned long)mp->m_sb.sb_agblocks);
1303                }
1304                if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
1305                        xfs_alert(mp,
1306                "%s: ino (0x%llx) != XFS_AGINO_TO_INO() (0x%llx)",
1307                                __func__, ino,
1308                                XFS_AGINO_TO_INO(mp, agno, agino));
1309                }
1310                xfs_stack_trace();
1311#endif /* DEBUG */
1312                return XFS_ERROR(EINVAL);
1313        }
1314
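            /*
             * Number of filesystem blocks per inode cluster buffer.  With
             * the common (but not guaranteed) 8k cluster size and 4k
             * blocks this is 8192 >> 12 == 2; when the cluster fits in a
             * single block the simple case below returns early and this
             * value is never used.
             */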
1315        blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
1316
1317        /*
1318         * For bulkstat and handle lookups, we have an untrusted inode number
1319         * that we have to verify is valid. We cannot do this just by reading
1320         * the inode buffer as it may have been unlinked and removed leaving
1321         * inodes in stale state on disk. Hence we have to do a btree lookup
1322         * in all cases where an untrusted inode number is passed.
1323         */
1324        if (flags & XFS_IGET_UNTRUSTED) {
1325                error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
1326                                        &chunk_agbno, &offset_agbno, flags);
1327                if (error)
1328                        return error;
1329                goto out_map;
1330        }
1331
1332        /*
1333         * If the inode cluster size is the same as the blocksize or
1334         * smaller we can get to the buffer by simple arithmetic.
1335         */
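            /*
             * The whole cluster lives inside one filesystem block, so the
             * buffer is the single block at agbno and im_boffset is just
             * the inode's byte offset within it (e.g. inode index 3 with
             * an assumed 256-byte inode size gives im_boffset 0x300).
             */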
1336        if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) {
1337                offset = XFS_INO_TO_OFFSET(mp, ino);
1338                ASSERT(offset < mp->m_sb.sb_inopblock);
1339
1340                imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
1341                imap->im_len = XFS_FSB_TO_BB(mp, 1);
1342                imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1343                return 0;
1344        }
1345
1346        /*
1347         * If the inode chunks are aligned then use simple arithmetic
1348         * to find the location. Otherwise we have to do a btree
1349         * lookup to find it.
1350         */
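            /*
             * m_inoalign_mask is the chunk alignment in blocks minus one,
             * so masking agbno with it gives the distance from the chunk
             * start.  E.g. with a hypothetical 4-block alignment (mask
             * 0x3) an agbno of 103 yields offset_agbno 3 and chunk_agbno
             * 100.
             */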
1351        if (mp->m_inoalign_mask) {
1352                offset_agbno = agbno & mp->m_inoalign_mask;
1353                chunk_agbno = agbno - offset_agbno;
1354        } else {
1355                error = xfs_imap_lookup(mp, tp, agno, agino, agbno,
1356                                        &chunk_agbno, &offset_agbno, flags);
1357                if (error)
1358                        return error;
1359        }
1360
1361out_map:
1362        ASSERT(agbno >= chunk_agbno);
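            /*
             * Round the block offset within the chunk down to a cluster
             * boundary to find the buffer, then work out how far into
             * that buffer the inode sits.  E.g. with hypothetical 2-block
             * clusters and 16 inodes per block, an inode 3 blocks into
             * its chunk maps to the cluster starting 2 blocks in, at
             * inode index 16 + (agino & 15) within the buffer.
             */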
1363        cluster_agbno = chunk_agbno +
1364                ((offset_agbno / blks_per_cluster) * blks_per_cluster);
1365        offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
1366                XFS_INO_TO_OFFSET(mp, ino);
1367
1368        imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
1369        imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
1370        imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
1371
1372        /*
1373         * If the inode number maps to a block outside the bounds
1374         * of the file system then return an error rather than reading
1375         * the buffer and panicking when we get an error from the
1376         * driver.
1377         */
1378        if ((imap->im_blkno + imap->im_len) >
1379            XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
1380                xfs_alert(mp,
1381        "%s: (im_blkno (0x%llx) + im_len (0x%llx)) > sb_dblocks (0x%llx)",
1382                        __func__, (unsigned long long) imap->im_blkno,
1383                        (unsigned long long) imap->im_len,
1384                        XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
1385                return XFS_ERROR(EINVAL);
1386        }
1387        return 0;
1388}
1389
1390/*
1391 * Compute and fill in value of m_in_maxlevels.
1392 */
1393void
1394xfs_ialloc_compute_maxlevels(
1395        xfs_mount_t     *mp)            /* file system mount structure */
1396{
1397        int             level;
1398        uint            maxblocks;
1399        uint            maxleafents;
1400        int             minleafrecs;
1401        int             minnoderecs;
1402
1403        maxleafents = (1LL << XFS_INO_AGINO_BITS(mp)) >>
1404                XFS_INODES_PER_CHUNK_LOG;
1405        minleafrecs = mp->m_alloc_mnr[0];
1406        minnoderecs = mp->m_alloc_mnr[1];
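            /*
             * maxleafents is the largest possible number of inode chunk
             * records in one AG (one record per 64 inodes).  Size the
             * leaf level for that many records using the minimum leaf
             * fanout, then keep dividing by the minimum node fanout,
             * adding a level each time, until everything hangs off a
             * single root block.
             */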
1407        maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
1408        for (level = 1; maxblocks > 1; level++)
1409                maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
1410        mp->m_in_maxlevels = level;
1411}
1412
1413/*
1414 * Log specified fields for the ag hdr (inode section)
1415 */
1416void
1417xfs_ialloc_log_agi(
1418        xfs_trans_t     *tp,            /* transaction pointer */
1419        xfs_buf_t       *bp,            /* allocation group header buffer */
1420        int             fields)         /* bitmask of fields to log */
1421{
1422        int                     first;          /* first byte number */
1423        int                     last;           /* last byte number */
1424        static const short      offsets[] = {   /* field starting offsets */
1425                                        /* keep in sync with bit definitions */
1426                offsetof(xfs_agi_t, agi_magicnum),
1427                offsetof(xfs_agi_t, agi_versionnum),
1428                offsetof(xfs_agi_t, agi_seqno),
1429                offsetof(xfs_agi_t, agi_length),
1430                offsetof(xfs_agi_t, agi_count),
1431                offsetof(xfs_agi_t, agi_root),
1432                offsetof(xfs_agi_t, agi_level),
1433                offsetof(xfs_agi_t, agi_freecount),
1434                offsetof(xfs_agi_t, agi_newino),
1435                offsetof(xfs_agi_t, agi_dirino),
1436                offsetof(xfs_agi_t, agi_unlinked),
1437                sizeof(xfs_agi_t)
1438        };
1439#ifdef DEBUG
1440        xfs_agi_t               *agi;   /* allocation group header */
1441
1442        agi = XFS_BUF_TO_AGI(bp);
1443        ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC));
1444#endif
1445        /*
1446         * Compute byte offsets for the first and last fields.
1447         */
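            /*
             * xfs_btree_offsets() maps the fields bitmask onto the
             * offsets[] table: first is the offset of the lowest field
             * whose bit is set, last the byte just before the field that
             * follows the highest set bit.  E.g. logging only
             * XFS_AGI_FREECOUNT covers the bytes from agi_freecount up
             * to (but not including) agi_newino.
             */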
1448        xfs_btree_offsets(fields, offsets, XFS_AGI_NUM_BITS, &first, &last);
1449        /*
1450         * Log the allocation group inode header buffer.
1451         */
1452        xfs_trans_log_buf(tp, bp, first, last);
1453}
1454
1455#ifdef DEBUG
1456STATIC void
1457xfs_check_agi_unlinked(
1458        struct xfs_agi          *agi)
1459{
1460        int                     i;
1461
1462        for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
1463                ASSERT(agi->agi_unlinked[i]);
1464}
1465#else
1466#define xfs_check_agi_unlinked(agi)
1467#endif
1468
1469/*
1470 * Read in the allocation group header (inode allocation section)
1471 */
1472int
1473xfs_read_agi(
1474        struct xfs_mount        *mp,    /* file system mount structure */
1475        struct xfs_trans        *tp,    /* transaction pointer */
1476        xfs_agnumber_t          agno,   /* allocation group number */
1477        struct xfs_buf          **bpp)  /* allocation group hdr buf */
1478{
1479        struct xfs_agi          *agi;   /* allocation group header */
1480        int                     agi_ok; /* agi is consistent */
1481        int                     error;
1482
1483        ASSERT(agno != NULLAGNUMBER);
1484
1485        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
1486                        XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
1487                        XFS_FSS_TO_BB(mp, 1), 0, bpp);
1488        if (error)
1489                return error;
1490
1491        ASSERT(!xfs_buf_geterror(*bpp));
1492        agi = XFS_BUF_TO_AGI(*bpp);
1493
1494        /*
1495         * Validate the magic, version and sequence number of the agi block.
1496         */
1497        agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
1498                XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
1499                be32_to_cpu(agi->agi_seqno) == agno;
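            /*
             * On DEBUG kernels XFS_TEST_ERROR() can also inject a failure
             * here via the XFS_ERRTAG_IALLOC_READ_AGI error tag, so the
             * corruption path below can be exercised without an actually
             * damaged AGI.
             */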
1500        if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
1501                        XFS_RANDOM_IALLOC_READ_AGI))) {
1502                XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
1503                                     mp, agi);
1504                xfs_trans_brelse(tp, *bpp);
1505                return XFS_ERROR(EFSCORRUPTED);
1506        }
1507
1508        xfs_buf_set_ref(*bpp, XFS_AGI_REF);
1509
1510        xfs_check_agi_unlinked(agi);
1511        return 0;
1512}
1513
1514int
1515xfs_ialloc_read_agi(
1516        struct xfs_mount        *mp,    /* file system mount structure */
1517        struct xfs_trans        *tp,    /* transaction pointer */
1518        xfs_agnumber_t          agno,   /* allocation group number */
1519        struct xfs_buf          **bpp)  /* allocation group hdr buf */
1520{
1521        struct xfs_agi          *agi;   /* allocation group header */
1522        struct xfs_perag        *pag;   /* per allocation group data */
1523        int                     error;
1524
1525        error = xfs_read_agi(mp, tp, agno, bpp);
1526        if (error)
1527                return error;
1528
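            /*
             * First read of this AG's AGI: seed the in-core per-AG
             * counters from the on-disk header.  Once pagi_init is set
             * the counters are maintained as inodes are allocated and
             * freed, so this initialisation happens only once per AG.
             */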
1529        agi = XFS_BUF_TO_AGI(*bpp);
1530        pag = xfs_perag_get(mp, agno);
1531        if (!pag->pagi_init) {
1532                pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
1533                pag->pagi_count = be32_to_cpu(agi->agi_count);
1534                pag->pagi_init = 1;
1535        }
1536
1537        /*
1538         * It's possible for these to be out of sync if
1539         * we are in the middle of a forced shutdown.
1540         */
1541        ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
1542                XFS_FORCED_SHUTDOWN(mp));
1543        xfs_perag_put(pag);
1544        return 0;
1545}
1546
1547/*
1548 * Read in the agi to initialise the per-ag data in the mount structure
1549 */
1550int
1551xfs_ialloc_pagi_init(
1552        xfs_mount_t     *mp,            /* file system mount structure */
1553        xfs_trans_t     *tp,            /* transaction pointer */
1554        xfs_agnumber_t  agno)           /* allocation group number */
1555{
1556        xfs_buf_t       *bp = NULL;
1557        int             error;
1558
1559        error = xfs_ialloc_read_agi(mp, tp, agno, &bp);
1560        if (error)
1561                return error;
1562        if (bp)
1563                xfs_trans_brelse(tp, bp);
1564        return 0;
1565}
1566