linux/fs/xfs/libxfs/xfs_inode_buf.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_shared.h"
  21#include "xfs_format.h"
  22#include "xfs_log_format.h"
  23#include "xfs_trans_resv.h"
  24#include "xfs_mount.h"
  25#include "xfs_inode.h"
  26#include "xfs_error.h"
  27#include "xfs_cksum.h"
  28#include "xfs_icache.h"
  29#include "xfs_trans.h"
  30#include "xfs_ialloc.h"
  31
  32/*
  33 * Check that none of the inode's in the buffer have a next
  34 * unlinked field of 0.
  35 */
  36#if defined(DEBUG)
  37void
  38xfs_inobp_check(
  39        xfs_mount_t     *mp,
  40        xfs_buf_t       *bp)
  41{
  42        int             i;
  43        int             j;
  44        xfs_dinode_t    *dip;
  45
  46        j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
  47
  48        for (i = 0; i < j; i++) {
  49                dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize);
  50                if (!dip->di_next_unlinked)  {
  51                        xfs_alert(mp,
  52        "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
  53                                i, (long long)bp->b_bn);
  54                }
  55        }
  56}
  57#endif
  58
  59/*
  60 * If we are doing readahead on an inode buffer, we might be in log recovery
  61 * reading an inode allocation buffer that hasn't yet been replayed, and hence
  62 * has not had the inode cores stamped into it. Hence for readahead, the buffer
  63 * may be potentially invalid.
  64 *
  65 * If the readahead buffer is invalid, we need to mark it with an error and
  66 * clear the DONE status of the buffer so that a followup read will re-read it
  67 * from disk. We don't report the error otherwise to avoid warnings during log
  68 * recovery and we don't get unnecssary panics on debug kernels. We use EIO here
  69 * because all we want to do is say readahead failed; there is no-one to report
  70 * the error to, so this will distinguish it from a non-ra verifier failure.
  71 * Changes to this readahead error behavour also need to be reflected in
  72 * xfs_dquot_buf_readahead_verify().
  73 */
  74static void
  75xfs_inode_buf_verify(
  76        struct xfs_buf  *bp,
  77        bool            readahead)
  78{
  79        struct xfs_mount *mp = bp->b_target->bt_mount;
  80        int             i;
  81        int             ni;
  82
  83        /*
  84         * Validate the magic number and version of every inode in the buffer
  85         */
  86        ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
  87        for (i = 0; i < ni; i++) {
  88                int             di_ok;
  89                xfs_dinode_t    *dip;
  90
  91                dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
  92                di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
  93                            XFS_DINODE_GOOD_VERSION(dip->di_version);
  94                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
  95                                                XFS_ERRTAG_ITOBP_INOTOBP,
  96                                                XFS_RANDOM_ITOBP_INOTOBP))) {
  97                        if (readahead) {
  98                                bp->b_flags &= ~XBF_DONE;
  99                                xfs_buf_ioerror(bp, -EIO);
 100                                return;
 101                        }
 102
 103                        xfs_buf_ioerror(bp, -EFSCORRUPTED);
 104                        xfs_verifier_error(bp);
 105#ifdef DEBUG
 106                        xfs_alert(mp,
 107                                "bad inode magic/vsn daddr %lld #%d (magic=%x)",
 108                                (unsigned long long)bp->b_bn, i,
 109                                be16_to_cpu(dip->di_magic));
 110#endif
 111                }
 112        }
 113        xfs_inobp_check(mp, bp);
 114}
 115
 116
 117static void
 118xfs_inode_buf_read_verify(
 119        struct xfs_buf  *bp)
 120{
 121        xfs_inode_buf_verify(bp, false);
 122}
 123
 124static void
 125xfs_inode_buf_readahead_verify(
 126        struct xfs_buf  *bp)
 127{
 128        xfs_inode_buf_verify(bp, true);
 129}
 130
 131static void
 132xfs_inode_buf_write_verify(
 133        struct xfs_buf  *bp)
 134{
 135        xfs_inode_buf_verify(bp, false);
 136}
 137
 138const struct xfs_buf_ops xfs_inode_buf_ops = {
 139        .name = "xfs_inode",
 140        .verify_read = xfs_inode_buf_read_verify,
 141        .verify_write = xfs_inode_buf_write_verify,
 142};
 143
 144const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
 145        .name = "xxfs_inode_ra",
 146        .verify_read = xfs_inode_buf_readahead_verify,
 147        .verify_write = xfs_inode_buf_write_verify,
 148};
 149
 150
 151/*
 152 * This routine is called to map an inode to the buffer containing the on-disk
 153 * version of the inode.  It returns a pointer to the buffer containing the
 154 * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
 155 * pointer to the on-disk inode within that buffer.
 156 *
 157 * If a non-zero error is returned, then the contents of bpp and dipp are
 158 * undefined.
 159 */
 160int
 161xfs_imap_to_bp(
 162        struct xfs_mount        *mp,
 163        struct xfs_trans        *tp,
 164        struct xfs_imap         *imap,
 165        struct xfs_dinode       **dipp,
 166        struct xfs_buf          **bpp,
 167        uint                    buf_flags,
 168        uint                    iget_flags)
 169{
 170        struct xfs_buf          *bp;
 171        int                     error;
 172
 173        buf_flags |= XBF_UNMAPPED;
 174        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
 175                                   (int)imap->im_len, buf_flags, &bp,
 176                                   &xfs_inode_buf_ops);
 177        if (error) {
 178                if (error == -EAGAIN) {
 179                        ASSERT(buf_flags & XBF_TRYLOCK);
 180                        return error;
 181                }
 182
 183                if (error == -EFSCORRUPTED &&
 184                    (iget_flags & XFS_IGET_UNTRUSTED))
 185                        return -EINVAL;
 186
 187                xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
 188                        __func__, error);
 189                return error;
 190        }
 191
 192        *bpp = bp;
 193        *dipp = xfs_buf_offset(bp, imap->im_boffset);
 194        return 0;
 195}
 196
 197void
 198xfs_dinode_from_disk(
 199        xfs_icdinode_t          *to,
 200        xfs_dinode_t            *from)
 201{
 202        to->di_magic = be16_to_cpu(from->di_magic);
 203        to->di_mode = be16_to_cpu(from->di_mode);
 204        to->di_version = from ->di_version;
 205        to->di_format = from->di_format;
 206        to->di_onlink = be16_to_cpu(from->di_onlink);
 207        to->di_uid = be32_to_cpu(from->di_uid);
 208        to->di_gid = be32_to_cpu(from->di_gid);
 209        to->di_nlink = be32_to_cpu(from->di_nlink);
 210        to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
 211        to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
 212        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 213        to->di_flushiter = be16_to_cpu(from->di_flushiter);
 214        to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
 215        to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
 216        to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
 217        to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
 218        to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
 219        to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
 220        to->di_size = be64_to_cpu(from->di_size);
 221        to->di_nblocks = be64_to_cpu(from->di_nblocks);
 222        to->di_extsize = be32_to_cpu(from->di_extsize);
 223        to->di_nextents = be32_to_cpu(from->di_nextents);
 224        to->di_anextents = be16_to_cpu(from->di_anextents);
 225        to->di_forkoff = from->di_forkoff;
 226        to->di_aformat  = from->di_aformat;
 227        to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
 228        to->di_dmstate  = be16_to_cpu(from->di_dmstate);
 229        to->di_flags    = be16_to_cpu(from->di_flags);
 230        to->di_gen      = be32_to_cpu(from->di_gen);
 231
 232        if (to->di_version == 3) {
 233                to->di_changecount = be64_to_cpu(from->di_changecount);
 234                to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
 235                to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
 236                to->di_flags2 = be64_to_cpu(from->di_flags2);
 237                to->di_ino = be64_to_cpu(from->di_ino);
 238                to->di_lsn = be64_to_cpu(from->di_lsn);
 239                memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
 240                uuid_copy(&to->di_uuid, &from->di_uuid);
 241        }
 242}
 243
 244void
 245xfs_dinode_to_disk(
 246        xfs_dinode_t            *to,
 247        xfs_icdinode_t          *from)
 248{
 249        to->di_magic = cpu_to_be16(from->di_magic);
 250        to->di_mode = cpu_to_be16(from->di_mode);
 251        to->di_version = from ->di_version;
 252        to->di_format = from->di_format;
 253        to->di_onlink = cpu_to_be16(from->di_onlink);
 254        to->di_uid = cpu_to_be32(from->di_uid);
 255        to->di_gid = cpu_to_be32(from->di_gid);
 256        to->di_nlink = cpu_to_be32(from->di_nlink);
 257        to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
 258        to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
 259        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 260        to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
 261        to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
 262        to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
 263        to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
 264        to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
 265        to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
 266        to->di_size = cpu_to_be64(from->di_size);
 267        to->di_nblocks = cpu_to_be64(from->di_nblocks);
 268        to->di_extsize = cpu_to_be32(from->di_extsize);
 269        to->di_nextents = cpu_to_be32(from->di_nextents);
 270        to->di_anextents = cpu_to_be16(from->di_anextents);
 271        to->di_forkoff = from->di_forkoff;
 272        to->di_aformat = from->di_aformat;
 273        to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
 274        to->di_dmstate = cpu_to_be16(from->di_dmstate);
 275        to->di_flags = cpu_to_be16(from->di_flags);
 276        to->di_gen = cpu_to_be32(from->di_gen);
 277
 278        if (from->di_version == 3) {
 279                to->di_changecount = cpu_to_be64(from->di_changecount);
 280                to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
 281                to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
 282                to->di_flags2 = cpu_to_be64(from->di_flags2);
 283                to->di_ino = cpu_to_be64(from->di_ino);
 284                to->di_lsn = cpu_to_be64(from->di_lsn);
 285                memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
 286                uuid_copy(&to->di_uuid, &from->di_uuid);
 287                to->di_flushiter = 0;
 288        } else {
 289                to->di_flushiter = cpu_to_be16(from->di_flushiter);
 290        }
 291}
 292
 293static bool
 294xfs_dinode_verify(
 295        struct xfs_mount        *mp,
 296        struct xfs_inode        *ip,
 297        struct xfs_dinode       *dip)
 298{
 299        if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
 300                return false;
 301
 302        /* only version 3 or greater inodes are extensively verified here */
 303        if (dip->di_version < 3)
 304                return true;
 305
 306        if (!xfs_sb_version_hascrc(&mp->m_sb))
 307                return false;
 308        if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
 309                              XFS_DINODE_CRC_OFF))
 310                return false;
 311        if (be64_to_cpu(dip->di_ino) != ip->i_ino)
 312                return false;
 313        if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
 314                return false;
 315        return true;
 316}
 317
 318void
 319xfs_dinode_calc_crc(
 320        struct xfs_mount        *mp,
 321        struct xfs_dinode       *dip)
 322{
 323        __uint32_t              crc;
 324
 325        if (dip->di_version < 3)
 326                return;
 327
 328        ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
 329        crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
 330                              XFS_DINODE_CRC_OFF);
 331        dip->di_crc = xfs_end_cksum(crc);
 332}
 333
 334/*
 335 * Read the disk inode attributes into the in-core inode structure.
 336 *
 337 * For version 5 superblocks, if we are initialising a new inode and we are not
 338 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
 339 * inode core with a random generation number. If we are keeping inodes around,
 340 * we need to read the inode cluster to get the existing generation number off
 341 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
 342 * format) then log recovery is dependent on the di_flushiter field being
 343 * initialised from the current on-disk value and hence we must also read the
 344 * inode off disk.
 345 */
 346int
 347xfs_iread(
 348        xfs_mount_t     *mp,
 349        xfs_trans_t     *tp,
 350        xfs_inode_t     *ip,
 351        uint            iget_flags)
 352{
 353        xfs_buf_t       *bp;
 354        xfs_dinode_t    *dip;
 355        int             error;
 356
 357        /*
 358         * Fill in the location information in the in-core inode.
 359         */
 360        error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
 361        if (error)
 362                return error;
 363
 364        /* shortcut IO on inode allocation if possible */
 365        if ((iget_flags & XFS_IGET_CREATE) &&
 366            xfs_sb_version_hascrc(&mp->m_sb) &&
 367            !(mp->m_flags & XFS_MOUNT_IKEEP)) {
 368                /* initialise the on-disk inode core */
 369                memset(&ip->i_d, 0, sizeof(ip->i_d));
 370                ip->i_d.di_magic = XFS_DINODE_MAGIC;
 371                ip->i_d.di_gen = prandom_u32();
 372                if (xfs_sb_version_hascrc(&mp->m_sb)) {
 373                        ip->i_d.di_version = 3;
 374                        ip->i_d.di_ino = ip->i_ino;
 375                        uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid);
 376                } else
 377                        ip->i_d.di_version = 2;
 378                return 0;
 379        }
 380
 381        /*
 382         * Get pointers to the on-disk inode and the buffer containing it.
 383         */
 384        error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
 385        if (error)
 386                return error;
 387
 388        /* even unallocated inodes are verified */
 389        if (!xfs_dinode_verify(mp, ip, dip)) {
 390                xfs_alert(mp, "%s: validation failed for inode %lld failed",
 391                                __func__, ip->i_ino);
 392
 393                XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
 394                error = -EFSCORRUPTED;
 395                goto out_brelse;
 396        }
 397
 398        /*
 399         * If the on-disk inode is already linked to a directory
 400         * entry, copy all of the inode into the in-core inode.
 401         * xfs_iformat_fork() handles copying in the inode format
 402         * specific information.
 403         * Otherwise, just get the truly permanent information.
 404         */
 405        if (dip->di_mode) {
 406                xfs_dinode_from_disk(&ip->i_d, dip);
 407                error = xfs_iformat_fork(ip, dip);
 408                if (error)  {
 409#ifdef DEBUG
 410                        xfs_alert(mp, "%s: xfs_iformat() returned error %d",
 411                                __func__, error);
 412#endif /* DEBUG */
 413                        goto out_brelse;
 414                }
 415        } else {
 416                /*
 417                 * Partial initialisation of the in-core inode. Just the bits
 418                 * that xfs_ialloc won't overwrite or relies on being correct.
 419                 */
 420                ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
 421                ip->i_d.di_version = dip->di_version;
 422                ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
 423                ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
 424
 425                if (dip->di_version == 3) {
 426                        ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
 427                        uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
 428                }
 429
 430                /*
 431                 * Make sure to pull in the mode here as well in
 432                 * case the inode is released without being used.
 433                 * This ensures that xfs_inactive() will see that
 434                 * the inode is already free and not try to mess
 435                 * with the uninitialized part of it.
 436                 */
 437                ip->i_d.di_mode = 0;
 438        }
 439
 440        /*
 441         * Automatically convert version 1 inode formats in memory to version 2
 442         * inode format. If the inode is modified, it will get logged and
 443         * rewritten as a version 2 inode. We can do this because we set the
 444         * superblock feature bit for v2 inodes unconditionally during mount
 445         * and it means the reast of the code can assume the inode version is 2
 446         * or higher.
 447         */
 448        if (ip->i_d.di_version == 1) {
 449                ip->i_d.di_version = 2;
 450                memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 451                ip->i_d.di_nlink = ip->i_d.di_onlink;
 452                ip->i_d.di_onlink = 0;
 453                xfs_set_projid(ip, 0);
 454        }
 455
 456        ip->i_delayed_blks = 0;
 457
 458        /*
 459         * Mark the buffer containing the inode as something to keep
 460         * around for a while.  This helps to keep recently accessed
 461         * meta-data in-core longer.
 462         */
 463        xfs_buf_set_ref(bp, XFS_INO_REF);
 464
 465        /*
 466         * Use xfs_trans_brelse() to release the buffer containing the on-disk
 467         * inode, because it was acquired with xfs_trans_read_buf() in
 468         * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
 469         * brelse().  If we're within a transaction, then xfs_trans_brelse()
 470         * will only release the buffer if it is not dirty within the
 471         * transaction.  It will be OK to release the buffer in this case,
 472         * because inodes on disk are never destroyed and we will be locking the
 473         * new in-core inode before putting it in the cache where other
 474         * processes can find it.  Thus we don't have to worry about the inode
 475         * being changed just because we released the buffer.
 476         */
 477 out_brelse:
 478        xfs_trans_brelse(tp, bp);
 479        return error;
 480}
 481