linux/fs/xfs/libxfs/xfs_inode_buf.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_shared.h"
  21#include "xfs_format.h"
  22#include "xfs_log_format.h"
  23#include "xfs_trans_resv.h"
  24#include "xfs_mount.h"
  25#include "xfs_inode.h"
  26#include "xfs_error.h"
  27#include "xfs_cksum.h"
  28#include "xfs_icache.h"
  29#include "xfs_trans.h"
  30#include "xfs_ialloc.h"
  31
  32/*
  33 * Check that none of the inode's in the buffer have a next
  34 * unlinked field of 0.
  35 */
  36#if defined(DEBUG)
  37void
  38xfs_inobp_check(
  39        xfs_mount_t     *mp,
  40        xfs_buf_t       *bp)
  41{
  42        int             i;
  43        int             j;
  44        xfs_dinode_t    *dip;
  45
  46        j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
  47
  48        for (i = 0; i < j; i++) {
  49                dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize);
  50                if (!dip->di_next_unlinked)  {
  51                        xfs_alert(mp,
  52        "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
  53                                i, (long long)bp->b_bn);
  54                }
  55        }
  56}
  57#endif
  58
  59/*
  60 * If we are doing readahead on an inode buffer, we might be in log recovery
  61 * reading an inode allocation buffer that hasn't yet been replayed, and hence
  62 * has not had the inode cores stamped into it. Hence for readahead, the buffer
  63 * may be potentially invalid.
  64 *
  65 * If the readahead buffer is invalid, we don't want to mark it with an error,
  66 * but we do want to clear the DONE status of the buffer so that a followup read
  67 * will re-read it from disk. This will ensure that we don't get an unnecessary
  68 * warnings during log recovery and we don't get unnecssary panics on debug
  69 * kernels.
  70 */
  71static void
  72xfs_inode_buf_verify(
  73        struct xfs_buf  *bp,
  74        bool            readahead)
  75{
  76        struct xfs_mount *mp = bp->b_target->bt_mount;
  77        int             i;
  78        int             ni;
  79
  80        /*
  81         * Validate the magic number and version of every inode in the buffer
  82         */
  83        ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
  84        for (i = 0; i < ni; i++) {
  85                int             di_ok;
  86                xfs_dinode_t    *dip;
  87
  88                dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
  89                di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
  90                            XFS_DINODE_GOOD_VERSION(dip->di_version);
  91                if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
  92                                                XFS_ERRTAG_ITOBP_INOTOBP,
  93                                                XFS_RANDOM_ITOBP_INOTOBP))) {
  94                        if (readahead) {
  95                                bp->b_flags &= ~XBF_DONE;
  96                                return;
  97                        }
  98
  99                        xfs_buf_ioerror(bp, -EFSCORRUPTED);
 100                        xfs_verifier_error(bp);
 101#ifdef DEBUG
 102                        xfs_alert(mp,
 103                                "bad inode magic/vsn daddr %lld #%d (magic=%x)",
 104                                (unsigned long long)bp->b_bn, i,
 105                                be16_to_cpu(dip->di_magic));
 106#endif
 107                }
 108        }
 109        xfs_inobp_check(mp, bp);
 110}
 111
 112
 113static void
 114xfs_inode_buf_read_verify(
 115        struct xfs_buf  *bp)
 116{
 117        xfs_inode_buf_verify(bp, false);
 118}
 119
 120static void
 121xfs_inode_buf_readahead_verify(
 122        struct xfs_buf  *bp)
 123{
 124        xfs_inode_buf_verify(bp, true);
 125}
 126
 127static void
 128xfs_inode_buf_write_verify(
 129        struct xfs_buf  *bp)
 130{
 131        xfs_inode_buf_verify(bp, false);
 132}
 133
 134const struct xfs_buf_ops xfs_inode_buf_ops = {
 135        .verify_read = xfs_inode_buf_read_verify,
 136        .verify_write = xfs_inode_buf_write_verify,
 137};
 138
 139const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
 140        .verify_read = xfs_inode_buf_readahead_verify,
 141        .verify_write = xfs_inode_buf_write_verify,
 142};
 143
 144
 145/*
 146 * This routine is called to map an inode to the buffer containing the on-disk
 147 * version of the inode.  It returns a pointer to the buffer containing the
 148 * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
 149 * pointer to the on-disk inode within that buffer.
 150 *
 151 * If a non-zero error is returned, then the contents of bpp and dipp are
 152 * undefined.
 153 */
 154int
 155xfs_imap_to_bp(
 156        struct xfs_mount        *mp,
 157        struct xfs_trans        *tp,
 158        struct xfs_imap         *imap,
 159        struct xfs_dinode       **dipp,
 160        struct xfs_buf          **bpp,
 161        uint                    buf_flags,
 162        uint                    iget_flags)
 163{
 164        struct xfs_buf          *bp;
 165        int                     error;
 166
 167        buf_flags |= XBF_UNMAPPED;
 168        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
 169                                   (int)imap->im_len, buf_flags, &bp,
 170                                   &xfs_inode_buf_ops);
 171        if (error) {
 172                if (error == -EAGAIN) {
 173                        ASSERT(buf_flags & XBF_TRYLOCK);
 174                        return error;
 175                }
 176
 177                if (error == -EFSCORRUPTED &&
 178                    (iget_flags & XFS_IGET_UNTRUSTED))
 179                        return -EINVAL;
 180
 181                xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
 182                        __func__, error);
 183                return error;
 184        }
 185
 186        *bpp = bp;
 187        *dipp = xfs_buf_offset(bp, imap->im_boffset);
 188        return 0;
 189}
 190
 191void
 192xfs_dinode_from_disk(
 193        xfs_icdinode_t          *to,
 194        xfs_dinode_t            *from)
 195{
 196        to->di_magic = be16_to_cpu(from->di_magic);
 197        to->di_mode = be16_to_cpu(from->di_mode);
 198        to->di_version = from ->di_version;
 199        to->di_format = from->di_format;
 200        to->di_onlink = be16_to_cpu(from->di_onlink);
 201        to->di_uid = be32_to_cpu(from->di_uid);
 202        to->di_gid = be32_to_cpu(from->di_gid);
 203        to->di_nlink = be32_to_cpu(from->di_nlink);
 204        to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
 205        to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
 206        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 207        to->di_flushiter = be16_to_cpu(from->di_flushiter);
 208        to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
 209        to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
 210        to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
 211        to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
 212        to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
 213        to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
 214        to->di_size = be64_to_cpu(from->di_size);
 215        to->di_nblocks = be64_to_cpu(from->di_nblocks);
 216        to->di_extsize = be32_to_cpu(from->di_extsize);
 217        to->di_nextents = be32_to_cpu(from->di_nextents);
 218        to->di_anextents = be16_to_cpu(from->di_anextents);
 219        to->di_forkoff = from->di_forkoff;
 220        to->di_aformat  = from->di_aformat;
 221        to->di_dmevmask = be32_to_cpu(from->di_dmevmask);
 222        to->di_dmstate  = be16_to_cpu(from->di_dmstate);
 223        to->di_flags    = be16_to_cpu(from->di_flags);
 224        to->di_gen      = be32_to_cpu(from->di_gen);
 225
 226        if (to->di_version == 3) {
 227                to->di_changecount = be64_to_cpu(from->di_changecount);
 228                to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
 229                to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
 230                to->di_flags2 = be64_to_cpu(from->di_flags2);
 231                to->di_ino = be64_to_cpu(from->di_ino);
 232                to->di_lsn = be64_to_cpu(from->di_lsn);
 233                memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
 234                uuid_copy(&to->di_uuid, &from->di_uuid);
 235        }
 236}
 237
 238void
 239xfs_dinode_to_disk(
 240        xfs_dinode_t            *to,
 241        xfs_icdinode_t          *from)
 242{
 243        to->di_magic = cpu_to_be16(from->di_magic);
 244        to->di_mode = cpu_to_be16(from->di_mode);
 245        to->di_version = from ->di_version;
 246        to->di_format = from->di_format;
 247        to->di_onlink = cpu_to_be16(from->di_onlink);
 248        to->di_uid = cpu_to_be32(from->di_uid);
 249        to->di_gid = cpu_to_be32(from->di_gid);
 250        to->di_nlink = cpu_to_be32(from->di_nlink);
 251        to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
 252        to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
 253        memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
 254        to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
 255        to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
 256        to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
 257        to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
 258        to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
 259        to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
 260        to->di_size = cpu_to_be64(from->di_size);
 261        to->di_nblocks = cpu_to_be64(from->di_nblocks);
 262        to->di_extsize = cpu_to_be32(from->di_extsize);
 263        to->di_nextents = cpu_to_be32(from->di_nextents);
 264        to->di_anextents = cpu_to_be16(from->di_anextents);
 265        to->di_forkoff = from->di_forkoff;
 266        to->di_aformat = from->di_aformat;
 267        to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
 268        to->di_dmstate = cpu_to_be16(from->di_dmstate);
 269        to->di_flags = cpu_to_be16(from->di_flags);
 270        to->di_gen = cpu_to_be32(from->di_gen);
 271
 272        if (from->di_version == 3) {
 273                to->di_changecount = cpu_to_be64(from->di_changecount);
 274                to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
 275                to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
 276                to->di_flags2 = cpu_to_be64(from->di_flags2);
 277                to->di_ino = cpu_to_be64(from->di_ino);
 278                to->di_lsn = cpu_to_be64(from->di_lsn);
 279                memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
 280                uuid_copy(&to->di_uuid, &from->di_uuid);
 281                to->di_flushiter = 0;
 282        } else {
 283                to->di_flushiter = cpu_to_be16(from->di_flushiter);
 284        }
 285}
 286
 287static bool
 288xfs_dinode_verify(
 289        struct xfs_mount        *mp,
 290        struct xfs_inode        *ip,
 291        struct xfs_dinode       *dip)
 292{
 293        if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
 294                return false;
 295
 296        /* only version 3 or greater inodes are extensively verified here */
 297        if (dip->di_version < 3)
 298                return true;
 299
 300        if (!xfs_sb_version_hascrc(&mp->m_sb))
 301                return false;
 302        if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
 303                              XFS_DINODE_CRC_OFF))
 304                return false;
 305        if (be64_to_cpu(dip->di_ino) != ip->i_ino)
 306                return false;
 307        if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
 308                return false;
 309        return true;
 310}
 311
 312void
 313xfs_dinode_calc_crc(
 314        struct xfs_mount        *mp,
 315        struct xfs_dinode       *dip)
 316{
 317        __uint32_t              crc;
 318
 319        if (dip->di_version < 3)
 320                return;
 321
 322        ASSERT(xfs_sb_version_hascrc(&mp->m_sb));
 323        crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize,
 324                              XFS_DINODE_CRC_OFF);
 325        dip->di_crc = xfs_end_cksum(crc);
 326}
 327
 328/*
 329 * Read the disk inode attributes into the in-core inode structure.
 330 *
 331 * For version 5 superblocks, if we are initialising a new inode and we are not
 332 * utilising the XFS_MOUNT_IKEEP inode cluster mode, we can simple build the new
 333 * inode core with a random generation number. If we are keeping inodes around,
 334 * we need to read the inode cluster to get the existing generation number off
 335 * disk. Further, if we are using version 4 superblocks (i.e. v1/v2 inode
 336 * format) then log recovery is dependent on the di_flushiter field being
 337 * initialised from the current on-disk value and hence we must also read the
 338 * inode off disk.
 339 */
 340int
 341xfs_iread(
 342        xfs_mount_t     *mp,
 343        xfs_trans_t     *tp,
 344        xfs_inode_t     *ip,
 345        uint            iget_flags)
 346{
 347        xfs_buf_t       *bp;
 348        xfs_dinode_t    *dip;
 349        int             error;
 350
 351        /*
 352         * Fill in the location information in the in-core inode.
 353         */
 354        error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
 355        if (error)
 356                return error;
 357
 358        /* shortcut IO on inode allocation if possible */
 359        if ((iget_flags & XFS_IGET_CREATE) &&
 360            xfs_sb_version_hascrc(&mp->m_sb) &&
 361            !(mp->m_flags & XFS_MOUNT_IKEEP)) {
 362                /* initialise the on-disk inode core */
 363                memset(&ip->i_d, 0, sizeof(ip->i_d));
 364                ip->i_d.di_magic = XFS_DINODE_MAGIC;
 365                ip->i_d.di_gen = prandom_u32();
 366                if (xfs_sb_version_hascrc(&mp->m_sb)) {
 367                        ip->i_d.di_version = 3;
 368                        ip->i_d.di_ino = ip->i_ino;
 369                        uuid_copy(&ip->i_d.di_uuid, &mp->m_sb.sb_meta_uuid);
 370                } else
 371                        ip->i_d.di_version = 2;
 372                return 0;
 373        }
 374
 375        /*
 376         * Get pointers to the on-disk inode and the buffer containing it.
 377         */
 378        error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
 379        if (error)
 380                return error;
 381
 382        /* even unallocated inodes are verified */
 383        if (!xfs_dinode_verify(mp, ip, dip)) {
 384                xfs_alert(mp, "%s: validation failed for inode %lld failed",
 385                                __func__, ip->i_ino);
 386
 387                XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip);
 388                error = -EFSCORRUPTED;
 389                goto out_brelse;
 390        }
 391
 392        /*
 393         * If the on-disk inode is already linked to a directory
 394         * entry, copy all of the inode into the in-core inode.
 395         * xfs_iformat_fork() handles copying in the inode format
 396         * specific information.
 397         * Otherwise, just get the truly permanent information.
 398         */
 399        if (dip->di_mode) {
 400                xfs_dinode_from_disk(&ip->i_d, dip);
 401                error = xfs_iformat_fork(ip, dip);
 402                if (error)  {
 403#ifdef DEBUG
 404                        xfs_alert(mp, "%s: xfs_iformat() returned error %d",
 405                                __func__, error);
 406#endif /* DEBUG */
 407                        goto out_brelse;
 408                }
 409        } else {
 410                /*
 411                 * Partial initialisation of the in-core inode. Just the bits
 412                 * that xfs_ialloc won't overwrite or relies on being correct.
 413                 */
 414                ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
 415                ip->i_d.di_version = dip->di_version;
 416                ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
 417                ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
 418
 419                if (dip->di_version == 3) {
 420                        ip->i_d.di_ino = be64_to_cpu(dip->di_ino);
 421                        uuid_copy(&ip->i_d.di_uuid, &dip->di_uuid);
 422                }
 423
 424                /*
 425                 * Make sure to pull in the mode here as well in
 426                 * case the inode is released without being used.
 427                 * This ensures that xfs_inactive() will see that
 428                 * the inode is already free and not try to mess
 429                 * with the uninitialized part of it.
 430                 */
 431                ip->i_d.di_mode = 0;
 432        }
 433
 434        /*
 435         * Automatically convert version 1 inode formats in memory to version 2
 436         * inode format. If the inode is modified, it will get logged and
 437         * rewritten as a version 2 inode. We can do this because we set the
 438         * superblock feature bit for v2 inodes unconditionally during mount
 439         * and it means the reast of the code can assume the inode version is 2
 440         * or higher.
 441         */
 442        if (ip->i_d.di_version == 1) {
 443                ip->i_d.di_version = 2;
 444                memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
 445                ip->i_d.di_nlink = ip->i_d.di_onlink;
 446                ip->i_d.di_onlink = 0;
 447                xfs_set_projid(ip, 0);
 448        }
 449
 450        ip->i_delayed_blks = 0;
 451
 452        /*
 453         * Mark the buffer containing the inode as something to keep
 454         * around for a while.  This helps to keep recently accessed
 455         * meta-data in-core longer.
 456         */
 457        xfs_buf_set_ref(bp, XFS_INO_REF);
 458
 459        /*
 460         * Use xfs_trans_brelse() to release the buffer containing the on-disk
 461         * inode, because it was acquired with xfs_trans_read_buf() in
 462         * xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
 463         * brelse().  If we're within a transaction, then xfs_trans_brelse()
 464         * will only release the buffer if it is not dirty within the
 465         * transaction.  It will be OK to release the buffer in this case,
 466         * because inodes on disk are never destroyed and we will be locking the
 467         * new in-core inode before putting it in the cache where other
 468         * processes can find it.  Thus we don't have to worry about the inode
 469         * being changed just because we released the buffer.
 470         */
 471 out_brelse:
 472        xfs_trans_brelse(tp, bp);
 473        return error;
 474}
 475