linux/fs/xfs/xfs_inode_buf.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_format.h"
  21#include "xfs_log.h"
  22#include "xfs_trans.h"
  23#include "xfs_sb.h"
  24#include "xfs_ag.h"
  25#include "xfs_mount.h"
  26#include "xfs_bmap_btree.h"
  27#include "xfs_ialloc_btree.h"
  28#include "xfs_dinode.h"
  29#include "xfs_inode.h"
  30#include "xfs_error.h"
  31#include "xfs_cksum.h"
  32#include "xfs_icache.h"
  33#include "xfs_ialloc.h"
  34
  35/*
  36 * Check that none of the inode's in the buffer have a next
  37 * unlinked field of 0.
  38 */
  39#if defined(DEBUG)
  40void
  41xfs_inobp_check(
  42        xfs_mount_t     *mp,
  43        xfs_buf_t       *bp)
  44{
  45        int             i;
  46        int             j;
  47        xfs_dinode_t    *dip;
  48
  49        j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
  50
  51        for (i = 0; i < j; i++) {
  52                dip = (xfs_dinode_t *)xfs_buf_offset(bp,
  53                                        i * mp->m_sb.sb_inodesize);
  54                if (!dip->di_next_unlinked)  {
  55                        xfs_alert(mp,
  56        "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
  57                                i, (long long)bp->b_bn);
  58                }
  59        }
  60}
  61#endif
  62
  63/*
  64 * If we are doing readahead on an inode buffer, we might be in log recovery
  65 * reading an inode allocation buffer that hasn't yet been replayed, and hence
  66 * has not had the inode cores stamped into it. Hence for readahead, the buffer
  67 * may be potentially invalid.
  68 *
  69 * If the readahead buffer is invalid, we don't want to mark it with an error,
  70 * but we do want to clear the DONE status of the buffer so that a followup read
  71 * will re-read it from disk. This will ensure that we don't get an unnecessary
  72 * warnings during log recovery and we don't get unnecssary panics on debug
  73 * kernels.
  74 */
  75static void
  76xfs_inode_buf_verify(
  77        struct xfs_buf  *bp,
  78        bool            readahead)
  79{
  80        struct xfs_mount *mp = bp->b_target->bt_mount;
  81        int             i;
  82        int             ni;
  83
  84        /*
  85         * Validate the magic number and version of every inode in the buffer
  86         */
  87        ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
  88        for (i = 0; i < ni; i++) {
  89                int             di_ok;
  90                xfs_dinode_t    *dip;
  91
  92                dip = (struct xfs_dinode *)xfs_buf_offset(bp,
  93                                        (i << mp->m_sb.sb_inodelog));
  94                di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
  95                            XFS_DINODE_GOOD_VERSION(dip->di_version);
  96                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
  97                                                XFS_ERRTAG_ITOBP_INOTOBP,
  98                                                XFS_RANDOM_ITOBP_INOTOBP))) {
  99                        if (readahead) {
 100                                bp->b_flags &= ~XBF_DONE;
 101                                return;
 102                        }
 103
 104                        xfs_buf_ioerror(bp, EFSCORRUPTED);
 105                        XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
 106                                             mp, dip);
 107#ifdef DEBUG
 108                        xfs_alert(mp,
 109                                "bad inode magic/vsn daddr %lld #%d (magic=%x)",
 110                                (unsigned long long)bp->b_bn, i,
 111                                be16_to_cpu(dip->di_magic));
 112#endif
 113                }
 114        }
 115        xfs_inobp_check(mp, bp);
 116}
 117
 118
 119static void
 120xfs_inode_buf_read_verify(
 121        struct xfs_buf  *bp)
 122{
 123        xfs_inode_buf_verify(bp, false);
 124}
 125
 126static void
 127xfs_inode_buf_readahead_verify(
 128        struct xfs_buf  *bp)
 129{
 130        xfs_inode_buf_verify(bp, true);
 131}
 132
 133static void
 134xfs_inode_buf_write_verify(
 135        struct xfs_buf  *bp)
 136{
 137        xfs_inode_buf_verify(bp, false);
 138}
 139
 140const struct xfs_buf_ops xfs_inode_buf_ops = {
 141        .verify_read = xfs_inode_buf_read_verify,
 142        .verify_write = xfs_inode_buf_write_verify,
 143};
 144
 145const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
 146        .verify_read = xfs_inode_buf_readahead_verify,
 147        .verify_write = xfs_inode_buf_write_verify,
 148};
 149
 150
 151/*
 152 * This routine is called to map an inode to the buffer containing the on-disk
 153 * version of the inode.  It returns a pointer to the buffer containing the
 154 * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
 155 * pointer to the on-disk inode within that buffer.
 156 *
 157 * If a non-zero error is returned, then the contents of bpp and dipp are
 158 * undefined.
 159 */
 160int
 161xfs_imap_to_bp(
 162        struct xfs_mount        *mp,
 163        struct xfs_trans        *tp,
 164        struct xfs_imap         *imap,
 165        struct xfs_dinode       **dipp,
 166        struct xfs_buf          **bpp,
 167        uint                    buf_flags,
 168        uint                    iget_flags)
 169{
 170        struct xfs_buf          *bp;
 171        int                     error;
 172
 173        buf_flags |= XBF_UNMAPPED;
 174        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
 175                                   (int)imap->im_len, buf_flags, &bp,
 176                                   &xfs_inode_buf_ops);
 177        if (error) {
 178                if (error == EAGAIN) {
 179                        ASSERT(buf_flags & XBF_TRYLOCK);
 180                        return error;
 181                }
 182
 183                if (error == EFSCORRUPTED &&
 184                    (iget_flags & XFS_IGET_UNTRUSTED))
 185                        return XFS_ERROR(EINVAL);
 186
 187                xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
 188                        __func__, error);
 189                return error;
 190        }
 191
 192        *bpp = bp;
 193        *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
 194        return 0;
 195}
 196
 197void
 198xfs_dinode_from_disk(
 199        xfs_icdinode_t          *to,
 200        xfs_dinode_t            *from)
 201{
 202        to->di_magic = be16_to_cpu(from->di_magic);
 203        to->di_mode = be16_to_cpu(from->di_mode);
 204        to->di_version = from ->di_version;
 205        to->di_format = from->di_format;
 206        to->di_onlink = be16_to_cpu(from->di_onlink);
 207        to->di_uid = be32_to_cpu(from->di_uid);
 208        to->di_gid = be32_to_cpu(from->di_gid);
 209        to->di_nlink = be32_to_cpu(from->di_nlink);
 210        to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
 211        to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
 212        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 213        to->di_flushiter = be16_to_cpu(from->di_flushiter);
 214        to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
 215        to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
 216        to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
 217        to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
 218        to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
 219        to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
 220        to->di_size = be64_to_cpu(from->di_size);
 221        to->di_nblocks = be64_to_cpu(from->di_nblocks);
 222        to->di_extsize = be32_to_cpu(from->di_extsize);
 223        to->di_nextents = be32_to_cpu(from->di_nextents);
 224        to->di_anextents = be16_to_cpu(from->di_anextents);
 225        to->di_forkoff = from->di_forkoff;
 226        to->di_aformat  = from->di_aformat;
 227        to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
 228        to->di_dmstate  = be16_to_cpu(from->di_dmstate);
 229        to->di_flags    = be16_to_cpu(from->di_flags);
 230        to->di_gen      = be32_to_cpu(from->di_gen);
 231
 232        if (to->di_version == 3) {
 233                to->di_changecount = be64_to_cpu(from->di_changecount);
 234                to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
 235                to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
 236                to->di_flags2 = be64_to_cpu(from->di_flags2);
 237                to->di_ino = be64_to_cpu(from->di_ino);
 238                to->di_lsn = be64_to_cpu(from->di_lsn);
 239                memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
 240                uuid_copy(&to->di_uuid, &from->di_uuid);
 241        }
 242}
 243
 244void
 245xfs_dinode_to_disk(
 246        xfs_dinode_t            *to,
 247        xfs_icdinode_t          *from)
 248{
 249        to->di_magic = cpu_to_be16(from->di_magic);
 250        to->di_mode = cpu_to_be16(from->di_mode);
 251        to->di_version = from ->di_version;
 252        to->di_format = from->di_format;
 253        to->di_onlink = cpu_to_be16(from->di_onlink);
 254        to->di_uid = cpu_to_be32(from->di_uid);
 255        to->di_gid = cpu_to_be32(from->di_gid);
 256        to->di_nlink = cpu_to_be32(from->di_nlink);
 257        to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
 258        to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
 259        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 260        to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
 261        to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
 262        to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
 263        to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
 264        to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
 265        to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
 266        to->di_size = cpu_to_be64(from->di_size);
 267        to->di_nblocks = cpu_to_be64(from->di_nblocks);
 268        to->di_extsize = cpu_to_be32(from->di_extsize);
 269        to->di_nextents = cpu_to_be32(from->di_nextents);
 270        to->di_anextents = cpu_to_be16(from->di_anextents);
 271        to->di_forkoff = from->di_forkoff;
 272        to->di_aformat = from->di_aformat;
 273        to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
 274        to->di_dmstate = cpu_to_be16(from->di_dmstate);
 275        to->di_flags = cpu_to_be16(from->di_flags);
 276        to->di_gen = cpu_to_be32(from->di_gen);
 277
 278        if (from->di_version == 3) {
 279                to->di_changecount = cpu_to_be64(from->di_changecount);
 280                to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
 281                to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
 282                to->di_flags2 = cpu_to_be64(from->di_flags2);
 283                to->di_ino = cpu_to_be64(from->di_ino);
 284                to->di_lsn = cpu_to_be64(from->di_lsn);
 285                memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
 286                uuid_copy(&to->di_uuid, &from->di_uuid);
 287                to->di_flushiter = 0;
 288        } else {
 289                to->di_flushiter = cpu_to_be16(from->di_flushiter);
 290        }
 291}
 292
 293static bool
 294xfs_dinode_verify(
 295        struct xfs_mount        *mp,
 296        struct xfs_inode        *ip,
 297        struct xfs_dinode       *dip)
 298{
 299        if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
 300                return false;
 301
 302        /* only version 3 or greater inodes are extensively verified here */
 303        if (dip->di_version < 3)
 304                return true;
 305
 306        if (!xfs_sb_version_hascrc(&mp->m_sb))
 307                return false;
 308        if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
 309                              offsetof(struct xfs_dinode, di_crc)))
 310                return false;
 311        if (be64_to_cpu(dip->di_ino) != ip->i_ino)
 312                return false;
 313        if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_uuid))
 314                return false;
 315        return true;
 316}
 317
 318void
 319xfs_dinode_calc_crc(
 320        struct xfs_mount        *mp,
 321        struct xfs_dinode       *dip)
 322{
 323        __uint32_t              crc;
 324
 325        if (dip->di_version < 3)
 326                return;
 327
 328        ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
 329        crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
 330                              offsetof(struct xfs_dinode, di_crc));
 331        dip->di_crc = xfs_end_cksum(crc);
 332}
 333
 334/*
 335 * Read the disk inode attributes into the in-core inode structure.
 336 *
 337 * For version 5 superblocks, if we are initialising a new inode and we are not
 338 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
 339 * inode core with a random generation number. If we are keeping inodes around,
 340 * we need to read the inode cluster to get the existing generation number off
 341 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
 342 * format) then log recovery is dependent on the di_flushiter field being
 343 * initialised from the current on-disk value and hence we must also read the
 344 * inode off disk.
 345 */
 346int
 347xfs_iread(
 348        xfs_mount_t     *mp,
 349        xfs_trans_t     *tp,
 350        xfs_inode_t     *ip,
 351        uint            iget_flags)
 352{
 353        xfs_buf_t       *bp;
 354        xfs_dinode_t    *dip;
 355        int             error;
 356
 357        /*
 358         * Fill in the location information in the in-core inode.
 359         */
 360        error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
 361        if (error)
 362                return error;
 363
 364        /* shortcut IO on inode allocation if possible */
 365        if ((iget_flags & XFS_IGET_CREATE) &&
 366            xfs_sb_version_hascrc(&mp->m_sb) &&
 367            !(mp->m_flags & XFS_MOUNT_IKEEP)) {
 368                /* initialise the on-disk inode core */
 369                memset(&ip->i_d, 0, sizeof(ip->i_d));
 370                ip->i_d.di_magic = XFS_DINODE_MAGIC;
 371                ip->i_d.di_gen = prandom_u32();
 372                if (xfs_sb_version_hascrc(&mp->m_sb)) {
 373                        ip->i_d.di_version = 3;
 374                        ip->i_d.di_ino = ip->i_ino;
 375                        uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid);
 376                } else
 377                        ip->i_d.di_version = 2;
 378                return 0;
 379        }
 380
 381        /*
 382         * Get pointers to the on-disk inode and the buffer containing it.
 383         */
 384        error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
 385        if (error)
 386                return error;
 387
 388        /* even unallocated inodes are verified */
 389        if (!xfs_dinode_verify(mp, ip, dip)) {
 390                xfs_alert(mp, "%s: validation failed for inode %lld failed",
 391                                __func__, ip->i_ino);
 392
 393                XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
 394                error = XFS_ERROR(EFSCORRUPTED);
 395                goto out_brelse;
 396        }
 397
 398        /*
 399         * If the on-disk inode is already linked to a directory
 400         * entry, copy all of the inode into the in-core inode.
 401         * xfs_iformat_fork() handles copying in the inode format
 402         * specific information.
 403         * Otherwise, just get the truly permanent information.
 404         */
 405        if (dip->di_mode) {
 406                xfs_dinode_from_disk(&ip->i_d, dip);
 407                error = xfs_iformat_fork(ip, dip);
 408                if (error)  {
 409#ifdef DEBUG
 410                        xfs_alert(mp, "%s: xfs_iformat() returned error %d",
 411                                __func__, error);
 412#endif /* DEBUG */
 413                        goto out_brelse;
 414                }
 415        } else {
 416                /*
 417                 * Partial initialisation of the in-core inode. Just the bits
 418                 * that xfs_ialloc won't overwrite or relies on being correct.
 419                 */
 420                ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
 421                ip->i_d.di_version = dip->di_version;
 422                ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
 423                ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
 424
 425                if (dip->di_version == 3) {
 426                        ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
 427                        uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
 428                }
 429
 430                /*
 431                 * Make sure to pull in the mode here as well in
 432                 * case the inode is released without being used.
 433                 * This ensures that xfs_inactive() will see that
 434                 * the inode is already free and not try to mess
 435                 * with the uninitialized part of it.
 436                 */
 437                ip->i_d.di_mode = 0;
 438        }
 439
 440        /*
 441         * The inode format changed when we moved the link count and
 442         * made it 32 bits long.  If this is an old format inode,
 443         * convert it in memory to look like a new one.  If it gets
 444         * flushed to disk we will convert back before flushing or
 445         * logging it.  We zero out the new projid field and the old link
 446         * count field.  We'll handle clearing the pad field (the remains
 447         * of the old uuid field) when we actually convert the inode to
 448         * the new format. We don't change the version number so that we
 449         * can distinguish this from a real new format inode.
 450         */
 451        if (ip->i_d.di_version == 1) {
 452                ip->i_d.di_nlink = ip->i_d.di_onlink;
 453                ip->i_d.di_onlink = 0;
 454                xfs_set_projid(ip, 0);
 455        }
 456
 457        ip->i_delayed_blks = 0;
 458
 459        /*
 460         * Mark the buffer containing the inode as something to keep
 461         * around for a while.  This helps to keep recently accessed
 462         * meta-data in-core longer.
 463         */
 464        xfs_buf_set_ref(bp, XFS_INO_REF);
 465
 466        /*
 467         * Use xfs_trans_brelse() to release the buffer containing the on-disk
 468         * inode, because it was acquired with xfs_trans_read_buf() in
 469         * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
 470         * brelse().  If we're within a transaction, then xfs_trans_brelse()
 471         * will only release the buffer if it is not dirty within the
 472         * transaction.  It will be OK to release the buffer in this case,
 473         * because inodes on disk are never destroyed and we will be locking the
 474         * new in-core inode before putting it in the cache where other
 475         * processes can find it.  Thus we don't have to worry about the inode
 476         * being changed just because we released the buffer.
 477         */
 478 out_brelse:
 479        xfs_trans_brelse(tp, bp);
 480        return error;
 481}
 482