linux/fs/xfs/libxfs/xfs_inode_buf.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_shared.h"
  21#include "xfs_format.h"
  22#include "xfs_log_format.h"
  23#include "xfs_trans_resv.h"
  24#include "xfs_mount.h"
  25#include "xfs_inode.h"
  26#include "xfs_error.h"
  27#include "xfs_cksum.h"
  28#include "xfs_icache.h"
  29#include "xfs_trans.h"
  30#include "xfs_ialloc.h"
  31
  32/*
  33 * Check that none of the inode's in the buffer have a next
  34 * unlinked field of 0.
  35 */
  36#if defined(DEBUG)
  37void
  38xfs_inobp_check(
  39        xfs_mount_t     *mp,
  40        xfs_buf_t       *bp)
  41{
  42        int             i;
  43        int             j;
  44        xfs_dinode_t    *dip;
  45
  46        j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
  47
  48        for (i = 0; i < j; i++) {
  49                dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize);
  50                if (!dip->di_next_unlinked)  {
  51                        xfs_alert(mp,
  52        "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
  53                                i, (long long)bp->b_bn);
  54                }
  55        }
  56}
  57#endif
  58
  59/*
  60 * If we are doing readahead on an inode buffer, we might be in log recovery
  61 * reading an inode allocation buffer that hasn't yet been replayed, and hence
  62 * has not had the inode cores stamped into it. Hence for readahead, the buffer
  63 * may be potentially invalid.
  64 *
  65 * If the readahead buffer is invalid, we need to mark it with an error and
  66 * clear the DONE status of the buffer so that a followup read will re-read it
  67 * from disk. We don't report the error otherwise to avoid warnings during log
  68 * recovery and we don't get unnecssary panics on debug kernels. We use EIO here
  69 * because all we want to do is say readahead failed; there is no-one to report
  70 * the error to, so this will distinguish it from a non-ra verifier failure.
  71 * Changes to this readahead error behavour also need to be reflected in
  72 * xfs_dquot_buf_readahead_verify().
  73 */
  74static void
  75xfs_inode_buf_verify(
  76        struct xfs_buf  *bp,
  77        bool            readahead)
  78{
  79        struct xfs_mount *mp = bp->b_target->bt_mount;
  80        int             i;
  81        int             ni;
  82
  83        /*
  84         * Validate the magic number and version of every inode in the buffer
  85         */
  86        ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
  87        for (i = 0; i < ni; i++) {
  88                int             di_ok;
  89                xfs_dinode_t    *dip;
  90
  91                dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
  92                di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
  93                            XFS_DINODE_GOOD_VERSION(dip->di_version);
  94                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
  95                                                XFS_ERRTAG_ITOBP_INOTOBP,
  96                                                XFS_RANDOM_ITOBP_INOTOBP))) {
  97                        if (readahead) {
  98                                bp->b_flags &= ~XBF_DONE;
  99                                xfs_buf_ioerror(bp, -EIO);
 100                                return;
 101                        }
 102
 103                        xfs_buf_ioerror(bp, -EFSCORRUPTED);
 104                        xfs_verifier_error(bp);
 105#ifdef DEBUG
 106                        xfs_alert(mp,
 107                                "bad inode magic/vsn daddr %lld #%d (magic=%x)",
 108                                (unsigned long long)bp->b_bn, i,
 109                                be16_to_cpu(dip->di_magic));
 110#endif
 111                }
 112        }
 113        xfs_inobp_check(mp, bp);
 114}
 115
 116
 117static void
 118xfs_inode_buf_read_verify(
 119        struct xfs_buf  *bp)
 120{
 121        xfs_inode_buf_verify(bp, false);
 122}
 123
 124static void
 125xfs_inode_buf_readahead_verify(
 126        struct xfs_buf  *bp)
 127{
 128        xfs_inode_buf_verify(bp, true);
 129}
 130
 131static void
 132xfs_inode_buf_write_verify(
 133        struct xfs_buf  *bp)
 134{
 135        xfs_inode_buf_verify(bp, false);
 136}
 137
 138const struct xfs_buf_ops xfs_inode_buf_ops = {
 139        .name = "xfs_inode",
 140        .verify_read = xfs_inode_buf_read_verify,
 141        .verify_write = xfs_inode_buf_write_verify,
 142};
 143
 144const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
 145        .name = "xxfs_inode_ra",
 146        .verify_read = xfs_inode_buf_readahead_verify,
 147        .verify_write = xfs_inode_buf_write_verify,
 148};
 149
 150
 151/*
 152 * This routine is called to map an inode to the buffer containing the on-disk
 153 * version of the inode.  It returns a pointer to the buffer containing the
 154 * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
 155 * pointer to the on-disk inode within that buffer.
 156 *
 157 * If a non-zero error is returned, then the contents of bpp and dipp are
 158 * undefined.
 159 */
 160int
 161xfs_imap_to_bp(
 162        struct xfs_mount        *mp,
 163        struct xfs_trans        *tp,
 164        struct xfs_imap         *imap,
 165        struct xfs_dinode       **dipp,
 166        struct xfs_buf          **bpp,
 167        uint                    buf_flags,
 168        uint                    iget_flags)
 169{
 170        struct xfs_buf          *bp;
 171        int                     error;
 172
 173        buf_flags |= XBF_UNMAPPED;
 174        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
 175                                   (int)imap->im_len, buf_flags, &bp,
 176                                   &xfs_inode_buf_ops);
 177        if (error) {
 178                if (error == -EAGAIN) {
 179                        ASSERT(buf_flags & XBF_TRYLOCK);
 180                        return error;
 181                }
 182
 183                if (error == -EFSCORRUPTED &&
 184                    (iget_flags & XFS_IGET_UNTRUSTED))
 185                        return -EINVAL;
 186
 187                xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
 188                        __func__, error);
 189                return error;
 190        }
 191
 192        *bpp = bp;
 193        *dipp = xfs_buf_offset(bp, imap->im_boffset);
 194        return 0;
 195}
 196
 197void
 198xfs_inode_from_disk(
 199        struct xfs_inode        *ip,
 200        struct xfs_dinode       *from)
 201{
 202        struct xfs_icdinode     *to = &ip->i_d;
 203        struct inode            *inode = VFS_I(ip);
 204
 205
 206        /*
 207         * Convert v1 inodes immediately to v2 inode format as this is the
 208         * minimum inode version format we support in the rest of the code.
 209         */
 210        to->di_version = from->di_version;
 211        if (to->di_version == 1) {
 212                set_nlink(inode, be16_to_cpu(from->di_onlink));
 213                to->di_projid_lo = 0;
 214                to->di_projid_hi = 0;
 215                to->di_version = 2;
 216        } else {
 217                set_nlink(inode, be32_to_cpu(from->di_nlink));
 218                to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
 219                to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
 220        }
 221
 222        to->di_format = from->di_format;
 223        to->di_uid = be32_to_cpu(from->di_uid);
 224        to->di_gid = be32_to_cpu(from->di_gid);
 225        to->di_flushiter = be16_to_cpu(from->di_flushiter);
 226
 227        /*
 228         * Time is signed, so need to convert to signed 32 bit before
 229         * storing in inode timestamp which may be 64 bit. Otherwise
 230         * a time before epoch is converted to a time long after epoch
 231         * on 64 bit systems.
 232         */
 233        inode->i_atime.tv_sec = (int)be32_to_cpu(from->di_atime.t_sec);
 234        inode->i_atime.tv_nsec = (int)be32_to_cpu(from->di_atime.t_nsec);
 235        inode->i_mtime.tv_sec = (int)be32_to_cpu(from->di_mtime.t_sec);
 236        inode->i_mtime.tv_nsec = (int)be32_to_cpu(from->di_mtime.t_nsec);
 237        inode->i_ctime.tv_sec = (int)be32_to_cpu(from->di_ctime.t_sec);
 238        inode->i_ctime.tv_nsec = (int)be32_to_cpu(from->di_ctime.t_nsec);
 239        inode->i_generation = be32_to_cpu(from->di_gen);
 240        inode->i_mode = be16_to_cpu(from->di_mode);
 241
 242        to->di_size = be64_to_cpu(from->di_size);
 243        to->di_nblocks = be64_to_cpu(from->di_nblocks);
 244        to->di_extsize = be32_to_cpu(from->di_extsize);
 245        to->di_nextents = be32_to_cpu(from->di_nextents);
 246        to->di_anextents = be16_to_cpu(from->di_anextents);
 247        to->di_forkoff = from->di_forkoff;
 248        to->di_aformat  = from->di_aformat;
 249        to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
 250        to->di_dmstate  = be16_to_cpu(from->di_dmstate);
 251        to->di_flags    = be16_to_cpu(from->di_flags);
 252
 253        if (to->di_version == 3) {
 254                inode->i_version = be64_to_cpu(from->di_changecount);
 255                to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
 256                to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
 257                to->di_flags2 = be64_to_cpu(from->di_flags2);
 258        }
 259}
 260
 261void
 262xfs_inode_to_disk(
 263        struct xfs_inode        *ip,
 264        struct xfs_dinode       *to,
 265        xfs_lsn_t               lsn)
 266{
 267        struct xfs_icdinode     *from = &ip->i_d;
 268        struct inode            *inode = VFS_I(ip);
 269
 270        to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
 271        to->di_onlink = 0;
 272
 273        to->di_version = from->di_version;
 274        to->di_format = from->di_format;
 275        to->di_uid = cpu_to_be32(from->di_uid);
 276        to->di_gid = cpu_to_be32(from->di_gid);
 277        to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
 278        to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
 279
 280        memset(to->di_pad, 0, sizeof(to->di_pad));
 281        to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec);
 282        to->di_atime.t_nsec = cpu_to_be32(inode->i_atime.tv_nsec);
 283        to->di_mtime.t_sec = cpu_to_be32(inode->i_mtime.tv_sec);
 284        to->di_mtime.t_nsec = cpu_to_be32(inode->i_mtime.tv_nsec);
 285        to->di_ctime.t_sec = cpu_to_be32(inode->i_ctime.tv_sec);
 286        to->di_ctime.t_nsec = cpu_to_be32(inode->i_ctime.tv_nsec);
 287        to->di_nlink = cpu_to_be32(inode->i_nlink);
 288        to->di_gen = cpu_to_be32(inode->i_generation);
 289        to->di_mode = cpu_to_be16(inode->i_mode);
 290
 291        to->di_size = cpu_to_be64(from->di_size);
 292        to->di_nblocks = cpu_to_be64(from->di_nblocks);
 293        to->di_extsize = cpu_to_be32(from->di_extsize);
 294        to->di_nextents = cpu_to_be32(from->di_nextents);
 295        to->di_anextents = cpu_to_be16(from->di_anextents);
 296        to->di_forkoff = from->di_forkoff;
 297        to->di_aformat = from->di_aformat;
 298        to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
 299        to->di_dmstate = cpu_to_be16(from->di_dmstate);
 300        to->di_flags = cpu_to_be16(from->di_flags);
 301
 302        if (from->di_version == 3) {
 303                to->di_changecount = cpu_to_be64(inode->i_version);
 304                to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
 305                to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
 306                to->di_flags2 = cpu_to_be64(from->di_flags2);
 307
 308                to->di_ino = cpu_to_be64(ip->i_ino);
 309                to->di_lsn = cpu_to_be64(lsn);
 310                memset(to->di_pad2, 0, sizeof(to->di_pad2));
 311                uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
 312                to->di_flushiter = 0;
 313        } else {
 314                to->di_flushiter = cpu_to_be16(from->di_flushiter);
 315        }
 316}
 317
 318void
 319xfs_log_dinode_to_disk(
 320        struct xfs_log_dinode   *from,
 321        struct xfs_dinode       *to)
 322{
 323        to->di_magic = cpu_to_be16(from->di_magic);
 324        to->di_mode = cpu_to_be16(from->di_mode);
 325        to->di_version = from->di_version;
 326        to->di_format = from->di_format;
 327        to->di_onlink = 0;
 328        to->di_uid = cpu_to_be32(from->di_uid);
 329        to->di_gid = cpu_to_be32(from->di_gid);
 330        to->di_nlink = cpu_to_be32(from->di_nlink);
 331        to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
 332        to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
 333        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 334
 335        to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
 336        to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
 337        to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
 338        to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
 339        to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
 340        to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
 341
 342        to->di_size = cpu_to_be64(from->di_size);
 343        to->di_nblocks = cpu_to_be64(from->di_nblocks);
 344        to->di_extsize = cpu_to_be32(from->di_extsize);
 345        to->di_nextents = cpu_to_be32(from->di_nextents);
 346        to->di_anextents = cpu_to_be16(from->di_anextents);
 347        to->di_forkoff = from->di_forkoff;
 348        to->di_aformat = from->di_aformat;
 349        to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
 350        to->di_dmstate = cpu_to_be16(from->di_dmstate);
 351        to->di_flags = cpu_to_be16(from->di_flags);
 352        to->di_gen = cpu_to_be32(from->di_gen);
 353
 354        if (from->di_version == 3) {
 355                to->di_changecount = cpu_to_be64(from->di_changecount);
 356                to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
 357                to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
 358                to->di_flags2 = cpu_to_be64(from->di_flags2);
 359                to->di_ino = cpu_to_be64(from->di_ino);
 360                to->di_lsn = cpu_to_be64(from->di_lsn);
 361                memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
 362                uuid_copy(&to->di_uuid, &from->di_uuid);
 363                to->di_flushiter = 0;
 364        } else {
 365                to->di_flushiter = cpu_to_be16(from->di_flushiter);
 366        }
 367}
 368
 369static bool
 370xfs_dinode_verify(
 371        struct xfs_mount        *mp,
 372        struct xfs_inode        *ip,
 373        struct xfs_dinode       *dip)
 374{
 375        if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
 376                return false;
 377
 378        /* only version 3 or greater inodes are extensively verified here */
 379        if (dip->di_version < 3)
 380                return true;
 381
 382        if (!xfs_sb_version_hascrc(&mp->m_sb))
 383                return false;
 384        if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
 385                              XFS_DINODE_CRC_OFF))
 386                return false;
 387        if (be64_to_cpu(dip->di_ino) != ip->i_ino)
 388                return false;
 389        if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
 390                return false;
 391        return true;
 392}
 393
 394void
 395xfs_dinode_calc_crc(
 396        struct xfs_mount        *mp,
 397        struct xfs_dinode       *dip)
 398{
 399        __uint32_t              crc;
 400
 401        if (dip->di_version < 3)
 402                return;
 403
 404        ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
 405        crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
 406                              XFS_DINODE_CRC_OFF);
 407        dip->di_crc = xfs_end_cksum(crc);
 408}
 409
 410/*
 411 * Read the disk inode attributes into the in-core inode structure.
 412 *
 413 * For version 5 superblocks, if we are initialising a new inode and we are not
 414 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
 415 * inode core with a random generation number. If we are keeping inodes around,
 416 * we need to read the inode cluster to get the existing generation number off
 417 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
 418 * format) then log recovery is dependent on the di_flushiter field being
 419 * initialised from the current on-disk value and hence we must also read the
 420 * inode off disk.
 421 */
 422int
 423xfs_iread(
 424        xfs_mount_t     *mp,
 425        xfs_trans_t     *tp,
 426        xfs_inode_t     *ip,
 427        uint            iget_flags)
 428{
 429        xfs_buf_t       *bp;
 430        xfs_dinode_t    *dip;
 431        int             error;
 432
 433        /*
 434         * Fill in the location information in the in-core inode.
 435         */
 436        error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
 437        if (error)
 438                return error;
 439
 440        /* shortcut IO on inode allocation if possible */
 441        if ((iget_flags & XFS_IGET_CREATE) &&
 442            xfs_sb_version_hascrc(&mp->m_sb) &&
 443            !(mp->m_flags & XFS_MOUNT_IKEEP)) {
 444                /* initialise the on-disk inode core */
 445                memset(&ip->i_d, 0, sizeof(ip->i_d));
 446                VFS_I(ip)->i_generation = prandom_u32();
 447                if (xfs_sb_version_hascrc(&mp->m_sb))
 448                        ip->i_d.di_version = 3;
 449                else
 450                        ip->i_d.di_version = 2;
 451                return 0;
 452        }
 453
 454        /*
 455         * Get pointers to the on-disk inode and the buffer containing it.
 456         */
 457        error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
 458        if (error)
 459                return error;
 460
 461        /* even unallocated inodes are verified */
 462        if (!xfs_dinode_verify(mp, ip, dip)) {
 463                xfs_alert(mp, "%s: validation failed for inode %lld failed",
 464                                __func__, ip->i_ino);
 465
 466                XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
 467                error = -EFSCORRUPTED;
 468                goto out_brelse;
 469        }
 470
 471        /*
 472         * If the on-disk inode is already linked to a directory
 473         * entry, copy all of the inode into the in-core inode.
 474         * xfs_iformat_fork() handles copying in the inode format
 475         * specific information.
 476         * Otherwise, just get the truly permanent information.
 477         */
 478        if (dip->di_mode) {
 479                xfs_inode_from_disk(ip, dip);
 480                error = xfs_iformat_fork(ip, dip);
 481                if (error)  {
 482#ifdef DEBUG
 483                        xfs_alert(mp, "%s: xfs_iformat() returned error %d",
 484                                __func__, error);
 485#endif /* DEBUG */
 486                        goto out_brelse;
 487                }
 488        } else {
 489                /*
 490                 * Partial initialisation of the in-core inode. Just the bits
 491                 * that xfs_ialloc won't overwrite or relies on being correct.
 492                 */
 493                ip->i_d.di_version = dip->di_version;
 494                VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen);
 495                ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
 496
 497                /*
 498                 * Make sure to pull in the mode here as well in
 499                 * case the inode is released without being used.
 500                 * This ensures that xfs_inactive() will see that
 501                 * the inode is already free and not try to mess
 502                 * with the uninitialized part of it.
 503                 */
 504                VFS_I(ip)->i_mode = 0;
 505        }
 506
 507        ASSERT(ip->i_d.di_version >= 2);
 508        ip->i_delayed_blks = 0;
 509
 510        /*
 511         * Mark the buffer containing the inode as something to keep
 512         * around for a while.  This helps to keep recently accessed
 513         * meta-data in-core longer.
 514         */
 515        xfs_buf_set_ref(bp, XFS_INO_REF);
 516
 517        /*
 518         * Use xfs_trans_brelse() to release the buffer containing the on-disk
 519         * inode, because it was acquired with xfs_trans_read_buf() in
 520         * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
 521         * brelse().  If we're within a transaction, then xfs_trans_brelse()
 522         * will only release the buffer if it is not dirty within the
 523         * transaction.  It will be OK to release the buffer in this case,
 524         * because inodes on disk are never destroyed and we will be locking the
 525         * new in-core inode before putting it in the cache where other
 526         * processes can find it.  Thus we don't have to worry about the inode
 527         * being changed just because we released the buffer.
 528         */
 529 out_brelse:
 530        xfs_trans_brelse(tp, bp);
 531        return error;
 532}
 533