linux/fs/xfs/xfs_log_recover.c
   1/*
   2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_shared.h"
  21#include "xfs_format.h"
  22#include "xfs_log_format.h"
  23#include "xfs_trans_resv.h"
  24#include "xfs_bit.h"
  25#include "xfs_sb.h"
  26#include "xfs_mount.h"
  27#include "xfs_da_format.h"
  28#include "xfs_da_btree.h"
  29#include "xfs_inode.h"
  30#include "xfs_trans.h"
  31#include "xfs_log.h"
  32#include "xfs_log_priv.h"
  33#include "xfs_log_recover.h"
  34#include "xfs_inode_item.h"
  35#include "xfs_extfree_item.h"
  36#include "xfs_trans_priv.h"
  37#include "xfs_alloc.h"
  38#include "xfs_ialloc.h"
  39#include "xfs_quota.h"
  40#include "xfs_cksum.h"
  41#include "xfs_trace.h"
  42#include "xfs_icache.h"
  43#include "xfs_bmap_btree.h"
  44#include "xfs_error.h"
  45#include "xfs_dir2.h"
  46#include "xfs_buf_item.h"
  47
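/*
 * Midpoint of two block numbers, used when binary searching the log
 * (e.g. xlog_find_cycle_start() below).
 */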
  48#define BLK_AVG(blk1, blk2)     ((blk1+blk2) >> 1)
  49
  50STATIC int
  51xlog_find_zeroed(
  52        struct xlog     *,
  53        xfs_daddr_t     *);
  54STATIC int
  55xlog_clear_stale_blocks(
  56        struct xlog     *,
  57        xfs_lsn_t);
  58#if defined(DEBUG)
  59STATIC void
  60xlog_recover_check_summary(
  61        struct xlog *);
  62#else
  63#define xlog_recover_check_summary(log)
  64#endif
  65STATIC int
  66xlog_do_recovery_pass(
  67        struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
  68
  69/*
  70 * This structure is used during recovery to record the buf log items which
  71 * have been canceled and should not be replayed.
  72 */
  73struct xfs_buf_cancel {
  74        xfs_daddr_t             bc_blkno;
  75        uint                    bc_len;
  76        int                     bc_refcount;
  77        struct list_head        bc_list;
  78};
  79
  80/*
  81 * Sector aligned buffer routines for buffer create/read/write/access
  82 */
  83
  84/*
  85 * Verify the given count of basic blocks is a valid number of blocks
  86 * to specify for an operation involving the given XFS log buffer.
  87 * Returns nonzero if the count is valid, 0 otherwise.
  88 */
  89
  90static inline int
  91xlog_buf_bbcount_valid(
  92        struct xlog     *log,
  93        int             bbcount)
  94{
  95        return bbcount > 0 && bbcount <= log->l_logBBsize;
  96}
  97
  98/*
  99 * Allocate a buffer to hold log data.  The buffer needs to be able
 100 * to map to a range of nbblks basic blocks at any valid (basic
 101 * block) offset within the log.
 102 */
 103STATIC xfs_buf_t *
 104xlog_get_bp(
 105        struct xlog     *log,
 106        int             nbblks)
 107{
 108        struct xfs_buf  *bp;
 109
 110        if (!xlog_buf_bbcount_valid(log, nbblks)) {
 111                xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 112                        nbblks);
 113                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 114                return NULL;
 115        }
 116
 117        /*
 118         * We do log I/O in units of log sectors (a power-of-2
 119         * multiple of the basic block size), so we round up the
 120         * requested size to accommodate the basic blocks required
 121         * for complete log sectors.
 122         *
 123         * In addition, the buffer may be used for a non-sector-
 124         * aligned block offset, in which case an I/O of the
 125         * requested size could extend beyond the end of the
 126         * buffer.  If the requested size is only 1 basic block it
 127         * will never straddle a sector boundary, so this won't be
 128         * an issue.  Nor will this be a problem if the log I/O is
 129         * done in basic blocks (sector size 1).  But otherwise we
 130         * extend the buffer by one extra log sector to ensure
 131         * there's space to accommodate this possibility.
 132         */
 133        if (nbblks > 1 && log->l_sectBBsize > 1)
 134                nbblks += log->l_sectBBsize;
 135        nbblks = round_up(nbblks, log->l_sectBBsize);
 136
 137        bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
 138        if (bp)
 139                xfs_buf_unlock(bp);
 140        return bp;
 141}
 142
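/*
 * Release a buffer obtained from xlog_get_bp().
 */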
 143STATIC void
 144xlog_put_bp(
 145        xfs_buf_t       *bp)
 146{
 147        xfs_buf_free(bp);
 148}
 149
 150/*
 151 * Return the address of the start of the given block number's data
 152 * in a log buffer.  The buffer covers a log sector-aligned region.
 153 */
 154STATIC char *
 155xlog_align(
 156        struct xlog     *log,
 157        xfs_daddr_t     blk_no,
 158        int             nbblks,
 159        struct xfs_buf  *bp)
 160{
 161        xfs_daddr_t     offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
 162
 163        ASSERT(offset + nbblks <= bp->b_length);
 164        return bp->b_addr + BBTOB(offset);
 165}
 166
 167
 168/*
 169 * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
 170 */
 171STATIC int
 172xlog_bread_noalign(
 173        struct xlog     *log,
 174        xfs_daddr_t     blk_no,
 175        int             nbblks,
 176        struct xfs_buf  *bp)
 177{
 178        int             error;
 179
 180        if (!xlog_buf_bbcount_valid(log, nbblks)) {
 181                xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 182                        nbblks);
 183                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 184                return -EFSCORRUPTED;
 185        }
 186
 187        blk_no = round_down(blk_no, log->l_sectBBsize);
 188        nbblks = round_up(nbblks, log->l_sectBBsize);
 189
 190        ASSERT(nbblks > 0);
 191        ASSERT(nbblks <= bp->b_length);
 192
 193        XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 194        bp->b_flags |= XBF_READ;
 195        bp->b_io_length = nbblks;
 196        bp->b_error = 0;
 197
 198        error = xfs_buf_submit_wait(bp);
 199        if (error && !XFS_FORCED_SHUTDOWN(log->l_mp))
 200                xfs_buf_ioerror_alert(bp, __func__);
 201        return error;
 202}
 203
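/*
 * Read nbblks basic blocks at blk_no into the supplied buffer and return
 * the address of the requested block's data via *offset.
 */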
 204STATIC int
 205xlog_bread(
 206        struct xlog     *log,
 207        xfs_daddr_t     blk_no,
 208        int             nbblks,
 209        struct xfs_buf  *bp,
 210        char            **offset)
 211{
 212        int             error;
 213
 214        error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 215        if (error)
 216                return error;
 217
 218        *offset = xlog_align(log, blk_no, nbblks, bp);
 219        return 0;
 220}
 221
 222/*
 223 * Read at an offset into the buffer. Returns with the buffer in its original
 224 * state regardless of the result of the read.
 225 */
 226STATIC int
 227xlog_bread_offset(
 228        struct xlog     *log,
 229        xfs_daddr_t     blk_no,         /* block to read from */
 230        int             nbblks,         /* blocks to read */
 231        struct xfs_buf  *bp,
 232        char            *offset)
 233{
 234        char            *orig_offset = bp->b_addr;
 235        int             orig_len = BBTOB(bp->b_length);
 236        int             error, error2;
 237
 238        error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
 239        if (error)
 240                return error;
 241
 242        error = xlog_bread_noalign(log, blk_no, nbblks, bp);
 243
 244        /* must reset buffer pointer even on error */
 245        error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
 246        if (error)
 247                return error;
 248        return error2;
 249}
 250
 251/*
 252 * Write out the buffer at the given block for the given number of blocks.
 253 * The buffer is kept locked across the write and is returned locked.
 254 * This can only be used for synchronous log writes.
 255 */
 256STATIC int
 257xlog_bwrite(
 258        struct xlog     *log,
 259        xfs_daddr_t     blk_no,
 260        int             nbblks,
 261        struct xfs_buf  *bp)
 262{
 263        int             error;
 264
 265        if (!xlog_buf_bbcount_valid(log, nbblks)) {
 266                xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
 267                        nbblks);
 268                XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
 269                return -EFSCORRUPTED;
 270        }
 271
 272        blk_no = round_down(blk_no, log->l_sectBBsize);
 273        nbblks = round_up(nbblks, log->l_sectBBsize);
 274
 275        ASSERT(nbblks > 0);
 276        ASSERT(nbblks <= bp->b_length);
 277
 278        XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
 279        xfs_buf_hold(bp);
 280        xfs_buf_lock(bp);
 281        bp->b_io_length = nbblks;
 282        bp->b_error = 0;
 283
 284        error = xfs_bwrite(bp);
 285        if (error)
 286                xfs_buf_ioerror_alert(bp, __func__);
 287        xfs_buf_relse(bp);
 288        return error;
 289}
 290
 291#ifdef DEBUG
 292/*
 293 * dump debug superblock and log record information
 294 */
 295STATIC void
 296xlog_header_check_dump(
 297        xfs_mount_t             *mp,
 298        xlog_rec_header_t       *head)
 299{
 300        xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d",
 301                __func__, &mp->m_sb.sb_uuid, XLOG_FMT);
 302        xfs_debug(mp, "    log : uuid = %pU, fmt = %d",
 303                &head->h_fs_uuid, be32_to_cpu(head->h_fmt));
 304}
 305#else
 306#define xlog_header_check_dump(mp, head)
 307#endif
 308
 309/*
 310 * check log record header for recovery
 311 */
 312STATIC int
 313xlog_header_check_recover(
 314        xfs_mount_t             *mp,
 315        xlog_rec_header_t       *head)
 316{
 317        ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
 318
 319        /*
 320         * IRIX doesn't write the h_fmt field and leaves it zeroed
 321         * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
 322         * a dirty log created in IRIX.
 323         */
 324        if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
 325                xfs_warn(mp,
 326        "dirty log written in incompatible format - can't recover");
 327                xlog_header_check_dump(mp, head);
 328                XFS_ERROR_REPORT("xlog_header_check_recover(1)",
 329                                 XFS_ERRLEVEL_HIGH, mp);
 330                return -EFSCORRUPTED;
 331        } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
 332                xfs_warn(mp,
 333        "dirty log entry has mismatched uuid - can't recover");
 334                xlog_header_check_dump(mp, head);
 335                XFS_ERROR_REPORT("xlog_header_check_recover(2)",
 336                                 XFS_ERRLEVEL_HIGH, mp);
 337                return -EFSCORRUPTED;
 338        }
 339        return 0;
 340}
 341
 342/*
 343 * check the log record header against the filesystem being mounted
 344 */
 345STATIC int
 346xlog_header_check_mount(
 347        xfs_mount_t             *mp,
 348        xlog_rec_header_t       *head)
 349{
 350        ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
 351
 352        if (uuid_is_nil(&head->h_fs_uuid)) {
 353                /*
 354                 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
 355                 * h_fs_uuid is nil, we assume this log was last mounted
 356                 * by IRIX and continue.
 357                 */
 358                xfs_warn(mp, "nil uuid in log - IRIX style log");
 359        } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
 360                xfs_warn(mp, "log has mismatched uuid - can't recover");
 361                xlog_header_check_dump(mp, head);
 362                XFS_ERROR_REPORT("xlog_header_check_mount",
 363                                 XFS_ERRLEVEL_HIGH, mp);
 364                return -EFSCORRUPTED;
 365        }
 366        return 0;
 367}
 368
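/*
 * I/O completion callback for buffers written during recovery: shut the
 * filesystem down on error, release any attached buf log item, then
 * finish the I/O.
 */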
 369STATIC void
 370xlog_recover_iodone(
 371        struct xfs_buf  *bp)
 372{
 373        if (bp->b_error) {
 374                /*
 375                 * We're not going to bother about retrying
 376                 * this during recovery. One strike!
 377                 */
 378                if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
 379                        xfs_buf_ioerror_alert(bp, __func__);
 380                        xfs_force_shutdown(bp->b_target->bt_mount,
 381                                                SHUTDOWN_META_IO_ERROR);
 382                }
 383        }
 384
 385        /*
 386         * On v5 supers, a bli could be attached to update the metadata LSN.
 387         * Clean it up.
 388         */
 389        if (bp->b_fspriv)
 390                xfs_buf_item_relse(bp);
 391        ASSERT(bp->b_fspriv == NULL);
 392
 393        bp->b_iodone = NULL;
 394        xfs_buf_ioend(bp);
 395}
 396
 397/*
 398 * This routine finds (to an approximation) the first block in the physical
 399 * log which contains the given cycle.  It uses a binary search algorithm.
 400 * Note that the algorithm can not be perfect because the disk will not
 401 * necessarily be perfect.
 402 */
 403STATIC int
 404xlog_find_cycle_start(
 405        struct xlog     *log,
 406        struct xfs_buf  *bp,
 407        xfs_daddr_t     first_blk,
 408        xfs_daddr_t     *last_blk,
 409        uint            cycle)
 410{
 411        char            *offset;
 412        xfs_daddr_t     mid_blk;
 413        xfs_daddr_t     end_blk;
 414        uint            mid_cycle;
 415        int             error;
 416
 417        end_blk = *last_blk;
 418        mid_blk = BLK_AVG(first_blk, end_blk);
 419        while (mid_blk != first_blk && mid_blk != end_blk) {
 420                error = xlog_bread(log, mid_blk, 1, bp, &offset);
 421                if (error)
 422                        return error;
 423                mid_cycle = xlog_get_cycle(offset);
 424                if (mid_cycle == cycle)
 425                        end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
 426                else
 427                        first_blk = mid_blk; /* first_half_cycle == mid_cycle */
 428                mid_blk = BLK_AVG(first_blk, end_blk);
 429        }
 430        ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
 431               (mid_blk == end_blk && mid_blk-1 == first_blk));
 432
 433        *last_blk = end_blk;
 434
 435        return 0;
 436}
 437
 438/*
 439 * Check that a range of blocks does not contain stop_on_cycle_no.
 440 * Fill in *new_blk with the block offset where such a block is
 441 * found, or with -1 (an invalid block number) if there is no such
 442 * block in the range.  The scan needs to occur from front to back
 443 * and the pointer into the region must be updated since a later
 444 * routine will need to perform another test.
 445 */
 446STATIC int
 447xlog_find_verify_cycle(
 448        struct xlog     *log,
 449        xfs_daddr_t     start_blk,
 450        int             nbblks,
 451        uint            stop_on_cycle_no,
 452        xfs_daddr_t     *new_blk)
 453{
 454        xfs_daddr_t     i, j;
 455        uint            cycle;
 456        xfs_buf_t       *bp;
 457        xfs_daddr_t     bufblks;
 458        char            *buf = NULL;
 459        int             error = 0;
 460
 461        /*
 462         * Greedily allocate a buffer big enough to handle the full
 463         * range of basic blocks we'll be examining.  If that fails,
 464         * try a smaller size.  We need to be able to read at least
 465         * a log sector, or we're out of luck.
 466         */
 467        bufblks = 1 << ffs(nbblks);
 468        while (bufblks > log->l_logBBsize)
 469                bufblks >>= 1;
 470        while (!(bp = xlog_get_bp(log, bufblks))) {
 471                bufblks >>= 1;
 472                if (bufblks < log->l_sectBBsize)
 473                        return -ENOMEM;
 474        }
 475
 476        for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
 477                int     bcount;
 478
 479                bcount = min(bufblks, (start_blk + nbblks - i));
 480
 481                error = xlog_bread(log, i, bcount, bp, &buf);
 482                if (error)
 483                        goto out;
 484
 485                for (j = 0; j < bcount; j++) {
 486                        cycle = xlog_get_cycle(buf);
 487                        if (cycle == stop_on_cycle_no) {
 488                                *new_blk = i+j;
 489                                goto out;
 490                        }
 491
 492                        buf += BBSIZE;
 493                }
 494        }
 495
 496        *new_blk = -1;
 497
 498out:
 499        xlog_put_bp(bp);
 500        return error;
 501}
 502
 503/*
 504 * Potentially back up over a partial log record write.
 505 *
 506 * In the typical case, last_blk is the number of the block directly after
 507 * a good log record.  Therefore, we subtract one to get the block number
 508 * of the last block in the given buffer.  extra_bblks contains the number
 509 * of blocks we would have read on a previous read.  This happens when the
 510 * last log record is split over the end of the physical log.
 511 *
 512 * extra_bblks is the number of blocks potentially verified on a previous
 513 * call to this routine.
 514 */
 515STATIC int
 516xlog_find_verify_log_record(
 517        struct xlog             *log,
 518        xfs_daddr_t             start_blk,
 519        xfs_daddr_t             *last_blk,
 520        int                     extra_bblks)
 521{
 522        xfs_daddr_t             i;
 523        xfs_buf_t               *bp;
 524        char                    *offset = NULL;
 525        xlog_rec_header_t       *head = NULL;
 526        int                     error = 0;
 527        int                     smallmem = 0;
 528        int                     num_blks = *last_blk - start_blk;
 529        int                     xhdrs;
 530
 531        ASSERT(start_blk != 0 || *last_blk != start_blk);
 532
 533        if (!(bp = xlog_get_bp(log, num_blks))) {
 534                if (!(bp = xlog_get_bp(log, 1)))
 535                        return -ENOMEM;
 536                smallmem = 1;
 537        } else {
 538                error = xlog_bread(log, start_blk, num_blks, bp, &offset);
 539                if (error)
 540                        goto out;
 541                offset += ((num_blks - 1) << BBSHIFT);
 542        }
 543
 544        for (i = (*last_blk) - 1; i >= 0; i--) {
 545                if (i < start_blk) {
 546                        /* valid log record not found */
 547                        xfs_warn(log->l_mp,
 548                "Log inconsistent (didn't find previous header)");
 549                        ASSERT(0);
 550                        error = -EIO;
 551                        goto out;
 552                }
 553
 554                if (smallmem) {
 555                        error = xlog_bread(log, i, 1, bp, &offset);
 556                        if (error)
 557                                goto out;
 558                }
 559
 560                head = (xlog_rec_header_t *)offset;
 561
 562                if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
 563                        break;
 564
 565                if (!smallmem)
 566                        offset -= BBSIZE;
 567        }
 568
 569        /*
 570         * We hit the beginning of the physical log & still no header.  Return
 571 * to caller.  If the caller can handle a return of 1, then this routine
 572         * will be called again for the end of the physical log.
 573         */
 574        if (i == -1) {
 575                error = 1;
 576                goto out;
 577        }
 578
 579        /*
 580         * We have the final block of the good log (the first block
 581 * of the log record _before_ the head). So we check the uuid.
 582         */
 583        if ((error = xlog_header_check_mount(log->l_mp, head)))
 584                goto out;
 585
 586        /*
 587         * We may have found a log record header before we expected one.
 588         * last_blk will be the 1st block # with a given cycle #.  We may end
 589         * up reading an entire log record.  In this case, we don't want to
 590         * reset last_blk.  Only when last_blk points in the middle of a log
 591         * record do we update last_blk.
 592         */
 593        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
 594                uint    h_size = be32_to_cpu(head->h_size);
 595
 596                xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
 597                if (h_size % XLOG_HEADER_CYCLE_SIZE)
 598                        xhdrs++;
 599        } else {
 600                xhdrs = 1;
 601        }
 602
 603        if (*last_blk - i + extra_bblks !=
 604            BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
 605                *last_blk = i;
 606
 607out:
 608        xlog_put_bp(bp);
 609        return error;
 610}
 611
 612/*
 613 * Head is defined to be the point of the log where the next log write
 614 * could go.  This means that incomplete LR writes at the end are
 615 * eliminated when calculating the head.  We aren't guaranteed that previous
 616 * LRs have complete transactions.  We only know that blocks with a cycle
 617 * number of the current cycle number - 1 won't be present in the log if we
 618 * start writing from our current block number.
 619 *
 620 * last_blk contains the block number of the first block with a given
 621 * cycle number.
 622 *
 623 * Return: zero if normal, non-zero if error.
 624 */
 625STATIC int
 626xlog_find_head(
 627        struct xlog     *log,
 628        xfs_daddr_t     *return_head_blk)
 629{
 630        xfs_buf_t       *bp;
 631        char            *offset;
 632        xfs_daddr_t     new_blk, first_blk, start_blk, last_blk, head_blk;
 633        int             num_scan_bblks;
 634        uint            first_half_cycle, last_half_cycle;
 635        uint            stop_on_cycle;
 636        int             error, log_bbnum = log->l_logBBsize;
 637
 638        /* Is the end of the log device zeroed? */
 639        error = xlog_find_zeroed(log, &first_blk);
 640        if (error < 0) {
 641                xfs_warn(log->l_mp, "empty log check failed");
 642                return error;
 643        }
 644        if (error == 1) {
 645                *return_head_blk = first_blk;
 646
 647                /* Is the whole lot zeroed? */
 648                if (!first_blk) {
 649                        /* Linux XFS shouldn't generate totally zeroed logs -
 650                         * mkfs etc write a dummy unmount record to a fresh
 651                         * log so we can store the uuid in there
 652                         */
 653                        xfs_warn(log->l_mp, "totally zeroed log");
 654                }
 655
 656                return 0;
 657        }
 658
 659        first_blk = 0;                  /* get cycle # of 1st block */
 660        bp = xlog_get_bp(log, 1);
 661        if (!bp)
 662                return -ENOMEM;
 663
 664        error = xlog_bread(log, 0, 1, bp, &offset);
 665        if (error)
 666                goto bp_err;
 667
 668        first_half_cycle = xlog_get_cycle(offset);
 669
 670        last_blk = head_blk = log_bbnum - 1;    /* get cycle # of last block */
 671        error = xlog_bread(log, last_blk, 1, bp, &offset);
 672        if (error)
 673                goto bp_err;
 674
 675        last_half_cycle = xlog_get_cycle(offset);
 676        ASSERT(last_half_cycle != 0);
 677
 678        /*
 679         * If the 1st half cycle number is equal to the last half cycle number,
 680         * then the entire log is stamped with the same cycle number.  In this
 681         * case, head_blk can't be set to zero (which makes sense).  The below
 682         * math doesn't work out properly with head_blk equal to zero.  Instead,
 683         * we set it to log_bbnum which is an invalid block number, but this
 684 * value makes the math correct.  If head_blk doesn't change through
 685         * all the tests below, *head_blk is set to zero at the very end rather
 686         * than log_bbnum.  In a sense, log_bbnum and zero are the same block
 687         * in a circular file.
 688         */
 689        if (first_half_cycle == last_half_cycle) {
 690                /*
 691                 * In this case we believe that the entire log should have
 692                 * cycle number last_half_cycle.  We need to scan backwards
 693                 * from the end verifying that there are no holes still
 694                 * containing last_half_cycle - 1.  If we find such a hole,
 695                 * then the start of that hole will be the new head.  The
 696                 * simple case looks like
 697                 *        x | x ... | x - 1 | x
 698                 * Another case that fits this picture would be
 699                 *        x | x + 1 | x ... | x
 700                 * In this case the head really is somewhere at the end of the
 701                 * log, as one of the latest writes at the beginning was
 702                 * incomplete.
 703                 * One more case is
 704                 *        x | x + 1 | x ... | x - 1 | x
 705                 * This is really the combination of the above two cases, and
 706                 * the head has to end up at the start of the x-1 hole at the
 707                 * end of the log.
 708                 *
 709                 * In the 256k log case, we will read from the beginning to the
 710                 * end of the log and search for cycle numbers equal to x-1.
 711                 * We don't worry about the x+1 blocks that we encounter,
 712                 * because we know that they cannot be the head since the log
 713                 * started with x.
 714                 */
 715                head_blk = log_bbnum;
 716                stop_on_cycle = last_half_cycle - 1;
 717        } else {
 718                /*
 719                 * In this case we want to find the first block with cycle
 720                 * number matching last_half_cycle.  We expect the log to be
 721                 * some variation on
 722                 *        x + 1 ... | x ... | x
 723                 * The first block with cycle number x (last_half_cycle) will
 724                 * be where the new head belongs.  First we do a binary search
 725                 * for the first occurrence of last_half_cycle.  The binary
 726                 * search may not be totally accurate, so then we scan back
 727                 * from there looking for occurrences of last_half_cycle before
 728                 * us.  If that backwards scan wraps around the beginning of
 729                 * the log, then we look for occurrences of last_half_cycle - 1
 730                 * at the end of the log.  The cases we're looking for look
 731                 * like
 732                 *                               v binary search stopped here
 733                 *        x + 1 ... | x | x + 1 | x ... | x
 734                 *                   ^ but we want to locate this spot
 735                 * or
 736                 *        <---------> less than scan distance
 737                 *        x + 1 ... | x ... | x - 1 | x
 738                 *                           ^ we want to locate this spot
 739                 */
 740                stop_on_cycle = last_half_cycle;
 741                if ((error = xlog_find_cycle_start(log, bp, first_blk,
 742                                                &head_blk, last_half_cycle)))
 743                        goto bp_err;
 744        }
 745
 746        /*
 747         * Now validate the answer.  Scan back some number of maximum possible
 748         * blocks and make sure each one has the expected cycle number.  The
 749         * maximum is determined by the total possible amount of buffering
 750         * in the in-core log.  The following number can be made tighter if
 751         * we actually look at the block size of the filesystem.
 752         */
 753        num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
 754        if (head_blk >= num_scan_bblks) {
 755                /*
 756                 * We are guaranteed that the entire check can be performed
 757                 * in one buffer.
 758                 */
 759                start_blk = head_blk - num_scan_bblks;
 760                if ((error = xlog_find_verify_cycle(log,
 761                                                start_blk, num_scan_bblks,
 762                                                stop_on_cycle, &new_blk)))
 763                        goto bp_err;
 764                if (new_blk != -1)
 765                        head_blk = new_blk;
 766        } else {                /* need to read 2 parts of log */
 767                /*
 768                 * We are going to scan backwards in the log in two parts.
 769                 * First we scan the physical end of the log.  In this part
 770                 * of the log, we are looking for blocks with cycle number
 771                 * last_half_cycle - 1.
 772                 * If we find one, then we know that the log starts there, as
 773                 * we've found a hole that didn't get written in going around
 774                 * the end of the physical log.  The simple case for this is
 775                 *        x + 1 ... | x ... | x - 1 | x
 776                 *        <---------> less than scan distance
 777                 * If all of the blocks at the end of the log have cycle number
 778                 * last_half_cycle, then we check the blocks at the start of
 779                 * the log looking for occurrences of last_half_cycle.  If we
 780                 * find one, then our current estimate for the location of the
 781                 * first occurrence of last_half_cycle is wrong and we move
 782                 * back to the hole we've found.  This case looks like
 783                 *        x + 1 ... | x | x + 1 | x ...
 784                 *                               ^ binary search stopped here
 785                 * Another case we need to handle that only occurs in 256k
 786                 * logs is
 787                 *        x + 1 ... | x ... | x+1 | x ...
 788                 *                   ^ binary search stops here
 789                 * In a 256k log, the scan at the end of the log will see the
 790                 * x + 1 blocks.  We need to skip past those since that is
 791                 * certainly not the head of the log.  By searching for
 792                 * last_half_cycle-1 we accomplish that.
 793                 */
 794                ASSERT(head_blk <= INT_MAX &&
 795                        (xfs_daddr_t) num_scan_bblks >= head_blk);
 796                start_blk = log_bbnum - (num_scan_bblks - head_blk);
 797                if ((error = xlog_find_verify_cycle(log, start_blk,
 798                                        num_scan_bblks - (int)head_blk,
 799                                        (stop_on_cycle - 1), &new_blk)))
 800                        goto bp_err;
 801                if (new_blk != -1) {
 802                        head_blk = new_blk;
 803                        goto validate_head;
 804                }
 805
 806                /*
 807                 * Scan beginning of log now.  The last part of the physical
 808                 * log is good.  This scan needs to verify that it doesn't find
 809                 * the last_half_cycle.
 810                 */
 811                start_blk = 0;
 812                ASSERT(head_blk <= INT_MAX);
 813                if ((error = xlog_find_verify_cycle(log,
 814                                        start_blk, (int)head_blk,
 815                                        stop_on_cycle, &new_blk)))
 816                        goto bp_err;
 817                if (new_blk != -1)
 818                        head_blk = new_blk;
 819        }
 820
 821validate_head:
 822        /*
 823         * Now we need to make sure head_blk is not pointing to a block in
 824         * the middle of a log record.
 825         */
 826        num_scan_bblks = XLOG_REC_SHIFT(log);
 827        if (head_blk >= num_scan_bblks) {
 828                start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
 829
 830                /* start ptr at last block ptr before head_blk */
 831                error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
 832                if (error == 1)
 833                        error = -EIO;
 834                if (error)
 835                        goto bp_err;
 836        } else {
 837                start_blk = 0;
 838                ASSERT(head_blk <= INT_MAX);
 839                error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
 840                if (error < 0)
 841                        goto bp_err;
 842                if (error == 1) {
 843                        /* We hit the beginning of the log during our search */
 844                        start_blk = log_bbnum - (num_scan_bblks - head_blk);
 845                        new_blk = log_bbnum;
 846                        ASSERT(start_blk <= INT_MAX &&
 847                                (xfs_daddr_t) log_bbnum-start_blk >= 0);
 848                        ASSERT(head_blk <= INT_MAX);
 849                        error = xlog_find_verify_log_record(log, start_blk,
 850                                                        &new_blk, (int)head_blk);
 851                        if (error == 1)
 852                                error = -EIO;
 853                        if (error)
 854                                goto bp_err;
 855                        if (new_blk != log_bbnum)
 856                                head_blk = new_blk;
 857                } else if (error)
 858                        goto bp_err;
 859        }
 860
 861        xlog_put_bp(bp);
 862        if (head_blk == log_bbnum)
 863                *return_head_blk = 0;
 864        else
 865                *return_head_blk = head_blk;
 866        /*
 867         * When returning here, we have a good block number.  Bad block
 868         * means that during a previous crash, we didn't have a clean break
 869         * from cycle number N to cycle number N-1.  In this case, we need
 870         * to find the first block with cycle number N-1.
 871         */
 872        return 0;
 873
 874 bp_err:
 875        xlog_put_bp(bp);
 876
 877        if (error)
 878                xfs_warn(log->l_mp, "failed to find log head");
 879        return error;
 880}
 881
 882/*
 883 * Seek backwards in the log for log record headers.
 884 *
 885 * Given a starting log block, walk backwards until we find the provided number
 886 * of records or hit the provided tail block. The return value is the number of
 887 * records encountered or a negative error code. The log block and buffer
 888 * pointer of the last record seen are returned in rblk and rhead respectively.
 889 */
 890STATIC int
 891xlog_rseek_logrec_hdr(
 892        struct xlog             *log,
 893        xfs_daddr_t             head_blk,
 894        xfs_daddr_t             tail_blk,
 895        int                     count,
 896        struct xfs_buf          *bp,
 897        xfs_daddr_t             *rblk,
 898        struct xlog_rec_header  **rhead,
 899        bool                    *wrapped)
 900{
 901        int                     i;
 902        int                     error;
 903        int                     found = 0;
 904        char                    *offset = NULL;
 905        xfs_daddr_t             end_blk;
 906
 907        *wrapped = false;
 908
 909        /*
 910         * Walk backwards from the head block until we hit the tail or the first
 911         * block in the log.
 912         */
 913        end_blk = head_blk > tail_blk ? tail_blk : 0;
 914        for (i = (int) head_blk - 1; i >= end_blk; i--) {
 915                error = xlog_bread(log, i, 1, bp, &offset);
 916                if (error)
 917                        goto out_error;
 918
 919                if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 920                        *rblk = i;
 921                        *rhead = (struct xlog_rec_header *) offset;
 922                        if (++found == count)
 923                                break;
 924                }
 925        }
 926
 927        /*
 928         * If we haven't hit the tail block or the log record header count,
 929         * start looking again from the end of the physical log. Note that
 930         * callers can pass head == tail if the tail is not yet known.
 931         */
 932        if (tail_blk >= head_blk && found != count) {
 933                for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
 934                        error = xlog_bread(log, i, 1, bp, &offset);
 935                        if (error)
 936                                goto out_error;
 937
 938                        if (*(__be32 *)offset ==
 939                            cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 940                                *wrapped = true;
 941                                *rblk = i;
 942                                *rhead = (struct xlog_rec_header *) offset;
 943                                if (++found == count)
 944                                        break;
 945                        }
 946                }
 947        }
 948
 949        return found;
 950
 951out_error:
 952        return error;
 953}
 954
 955/*
 956 * Seek forward in the log for log record headers.
 957 *
 958 * Given head and tail blocks, walk forward from the tail block until we find
 959 * the provided number of records or hit the head block. The return value is the
 960 * number of records encountered or a negative error code. The log block and
 961 * buffer pointer of the last record seen are returned in rblk and rhead
 962 * respectively.
 963 */
 964STATIC int
 965xlog_seek_logrec_hdr(
 966        struct xlog             *log,
 967        xfs_daddr_t             head_blk,
 968        xfs_daddr_t             tail_blk,
 969        int                     count,
 970        struct xfs_buf          *bp,
 971        xfs_daddr_t             *rblk,
 972        struct xlog_rec_header  **rhead,
 973        bool                    *wrapped)
 974{
 975        int                     i;
 976        int                     error;
 977        int                     found = 0;
 978        char                    *offset = NULL;
 979        xfs_daddr_t             end_blk;
 980
 981        *wrapped = false;
 982
 983        /*
 984         * Walk forward from the tail block until we hit the head or the last
 985         * block in the log.
 986         */
 987        end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
 988        for (i = (int) tail_blk; i <= end_blk; i++) {
 989                error = xlog_bread(log, i, 1, bp, &offset);
 990                if (error)
 991                        goto out_error;
 992
 993                if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
 994                        *rblk = i;
 995                        *rhead = (struct xlog_rec_header *) offset;
 996                        if (++found == count)
 997                                break;
 998                }
 999        }
1000
1001        /*
1002         * If we haven't hit the head block or the log record header count,
1003         * start looking again from the start of the physical log.
1004         */
1005        if (tail_blk > head_blk && found != count) {
1006                for (i = 0; i < (int) head_blk; i++) {
1007                        error = xlog_bread(log, i, 1, bp, &offset);
1008                        if (error)
1009                                goto out_error;
1010
1011                        if (*(__be32 *)offset ==
1012                            cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
1013                                *wrapped = true;
1014                                *rblk = i;
1015                                *rhead = (struct xlog_rec_header *) offset;
1016                                if (++found == count)
1017                                        break;
1018                        }
1019                }
1020        }
1021
1022        return found;
1023
1024out_error:
1025        return error;
1026}
1027
1028/*
1029 * Check the log tail for torn writes. This is required when torn writes are
1030 * detected at the head and the head had to be walked back to a previous record.
1031 * The tail of the previous record must now be verified to ensure the torn
1032 * writes didn't corrupt the previous tail.
1033 *
1034 * Return an error if CRC verification fails as recovery cannot proceed.
1035 */
1036STATIC int
1037xlog_verify_tail(
1038        struct xlog             *log,
1039        xfs_daddr_t             head_blk,
1040        xfs_daddr_t             tail_blk)
1041{
1042        struct xlog_rec_header  *thead;
1043        struct xfs_buf          *bp;
1044        xfs_daddr_t             first_bad;
1045        int                     count;
1046        int                     error = 0;
1047        bool                    wrapped;
1048        xfs_daddr_t             tmp_head;
1049
1050        bp = xlog_get_bp(log, 1);
1051        if (!bp)
1052                return -ENOMEM;
1053
1054        /*
1055         * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
1056         * a temporary head block that points after the last possible
1057         * concurrently written record of the tail.
1058         */
1059        count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
1060                                     XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
1061                                     &wrapped);
1062        if (count < 0) {
1063                error = count;
1064                goto out;
1065        }
1066
1067        /*
1068         * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
1069         * into the actual log head. tmp_head points to the start of the record
1070         * so update it to the actual head block.
1071         */
1072        if (count < XLOG_MAX_ICLOGS + 1)
1073                tmp_head = head_blk;
1074
1075        /*
1076         * We now have a tail and temporary head block that covers at least
1077         * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
1078         * records were completely written. Run a CRC verification pass from
1079         * tail to head and return the result.
1080         */
1081        error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
1082                                      XLOG_RECOVER_CRCPASS, &first_bad);
1083
1084out:
1085        xlog_put_bp(bp);
1086        return error;
1087}
1088
1089/*
1090 * Detect and trim torn writes from the head of the log.
1091 *
1092 * Storage without sector atomicity guarantees can result in torn writes in the
1093 * log in the event of a crash. Our only means to detect this scenario is via
1094 * CRC verification. While we can't always be certain that CRC verification
1095 * failure is due to a torn write vs. an unrelated corruption, we do know that
1096 * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
1097 * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
1098 * the log and treat failures in this range as torn writes as a matter of
1099 * policy. In the event of CRC failure, the head is walked back to the last good
1100 * record in the log and the tail is updated from that record and verified.
1101 */
1102STATIC int
1103xlog_verify_head(
1104        struct xlog             *log,
1105        xfs_daddr_t             *head_blk,      /* in/out: unverified head */
1106        xfs_daddr_t             *tail_blk,      /* out: tail block */
1107        struct xfs_buf          *bp,
1108        xfs_daddr_t             *rhead_blk,     /* start blk of last record */
1109        struct xlog_rec_header  **rhead,        /* ptr to last record */
1110        bool                    *wrapped)       /* last rec. wraps phys. log */
1111{
1112        struct xlog_rec_header  *tmp_rhead;
1113        struct xfs_buf          *tmp_bp;
1114        xfs_daddr_t             first_bad;
1115        xfs_daddr_t             tmp_rhead_blk;
1116        int                     found;
1117        int                     error;
1118        bool                    tmp_wrapped;
1119
1120        /*
1121         * Check the head of the log for torn writes. Search backwards from the
1122         * head until we hit the tail or the maximum number of log record I/Os
1123         * that could have been in flight at one time. Use a temporary buffer so
1124         * we don't trash the rhead/bp pointers from the caller.
1125         */
1126        tmp_bp = xlog_get_bp(log, 1);
1127        if (!tmp_bp)
1128                return -ENOMEM;
1129        error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
1130                                      XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
1131                                      &tmp_rhead, &tmp_wrapped);
1132        xlog_put_bp(tmp_bp);
1133        if (error < 0)
1134                return error;
1135
1136        /*
1137         * Now run a CRC verification pass over the records starting at the
1138         * block found above to the current head. If a CRC failure occurs, the
1139         * log block of the first bad record is saved in first_bad.
1140         */
1141        error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
1142                                      XLOG_RECOVER_CRCPASS, &first_bad);
1143        if (error == -EFSBADCRC) {
1144                /*
1145                 * We've hit a potential torn write. Reset the error and warn
1146                 * about it.
1147                 */
1148                error = 0;
1149                xfs_warn(log->l_mp,
1150"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
1151                         first_bad, *head_blk);
1152
1153                /*
1154                 * Get the header block and buffer pointer for the last good
1155                 * record before the bad record.
1156                 *
1157                 * Note that xlog_find_tail() clears the blocks at the new head
1158                 * (i.e., the records with invalid CRC) if the cycle number
1159 * matches the current cycle.
1160                 */
1161                found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp,
1162                                              rhead_blk, rhead, wrapped);
1163                if (found < 0)
1164                        return found;
1165                if (found == 0)         /* XXX: right thing to do here? */
1166                        return -EIO;
1167
1168                /*
1169                 * Reset the head block to the starting block of the first bad
1170                 * log record and set the tail block based on the last good
1171                 * record.
1172                 *
1173                 * Bail out if the updated head/tail match as this indicates
1174                 * possible corruption outside of the acceptable
1175                 * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
1176                 */
1177                *head_blk = first_bad;
1178                *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
1179                if (*head_blk == *tail_blk) {
1180                        ASSERT(0);
1181                        return 0;
1182                }
1183
1184                /*
1185                 * Now verify the tail based on the updated head. This is
1186                 * required because the torn writes trimmed from the head could
1187                 * have been written over the tail of a previous record. Return
1188                 * any errors since recovery cannot proceed if the tail is
1189                 * corrupt.
1190                 *
1191                 * XXX: This leaves a gap in truly robust protection from torn
1192 * writes in the log. If the head is behind the tail, the tail
1193 * pushes forward to create some space, and a crash then tears
1194 * the writes into the previous record's tail region, log
1195 * recovery isn't able to recover.
1196                 *
1197                 * How likely is this to occur? If possible, can we do something
1198                 * more intelligent here? Is it safe to push the tail forward if
1199                 * we can determine that the tail is within the range of the
1200                 * torn write (e.g., the kernel can only overwrite the tail if
1201                 * it has actually been pushed forward)? Alternatively, could we
1202                 * somehow prevent this condition at runtime?
1203                 */
1204                error = xlog_verify_tail(log, *head_blk, *tail_blk);
1205        }
1206
1207        return error;
1208}
1209
1210/*
1211 * Check whether the head of the log points to an unmount record. In other
1212 * words, determine whether the log is clean. If so, update the in-core state
1213 * appropriately.
1214 */
1215static int
1216xlog_check_unmount_rec(
1217        struct xlog             *log,
1218        xfs_daddr_t             *head_blk,
1219        xfs_daddr_t             *tail_blk,
1220        struct xlog_rec_header  *rhead,
1221        xfs_daddr_t             rhead_blk,
1222        struct xfs_buf          *bp,
1223        bool                    *clean)
1224{
1225        struct xlog_op_header   *op_head;
1226        xfs_daddr_t             umount_data_blk;
1227        xfs_daddr_t             after_umount_blk;
1228        int                     hblks;
1229        int                     error;
1230        char                    *offset;
1231
1232        *clean = false;
1233
1234        /*
1235         * Look for unmount record. If we find it, then we know there was a
1236 * clean unmount. Since the unmount record could be at the last block in the
1237 * physical log, we convert to a log block before comparing to the head_blk.
1238         *
1239         * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
1240         * below. We won't want to clear the unmount record if there is one, so
1241         * we pass the lsn of the unmount record rather than the block after it.
1242         */
1243        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
1244                int     h_size = be32_to_cpu(rhead->h_size);
1245                int     h_version = be32_to_cpu(rhead->h_version);
1246
1247                if ((h_version & XLOG_VERSION_2) &&
1248                    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
1249                        hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
1250                        if (h_size % XLOG_HEADER_CYCLE_SIZE)
1251                                hblks++;
1252                } else {
1253                        hblks = 1;
1254                }
1255        } else {
1256                hblks = 1;
1257        }
1258        after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
1259        after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
1260        if (*head_blk == after_umount_blk &&
1261            be32_to_cpu(rhead->h_num_logops) == 1) {
1262                umount_data_blk = rhead_blk + hblks;
1263                umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
1264                error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
1265                if (error)
1266                        return error;
1267
1268                op_head = (struct xlog_op_header *)offset;
1269                if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1270                        /*
1271                         * Set tail and last sync so that newly written log
1272                         * records will point recovery to after the current
1273                         * unmount record.
1274                         */
1275                        xlog_assign_atomic_lsn(&log->l_tail_lsn,
1276                                        log->l_curr_cycle, after_umount_blk);
1277                        xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1278                                        log->l_curr_cycle, after_umount_blk);
1279                        *tail_blk = after_umount_blk;
1280
1281                        *clean = true;
1282                }
1283        }
1284
1285        return 0;
1286}
1287
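/*
 * Update the in-core log state (current/previous block, cycle, tail and
 * grant heads) from the given head block and its record header.
 */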
1288static void
1289xlog_set_state(
1290        struct xlog             *log,
1291        xfs_daddr_t             head_blk,
1292        struct xlog_rec_header  *rhead,
1293        xfs_daddr_t             rhead_blk,
1294        bool                    bump_cycle)
1295{
1296        /*
1297         * Reset log values according to the state of the log when we
1298         * crashed.  In the case where head_blk == 0, we bump curr_cycle
1299         * one because the next write starts a new cycle rather than
1300         * continuing the cycle of the last good log record.  At this
1301         * point we have guaranteed that all partial log records have been
1302         * accounted for.  Therefore, we know that the last good log record
1303         * written was complete and ended exactly on the end boundary
1304         * of the physical log.
1305         */
1306        log->l_prev_block = rhead_blk;
1307        log->l_curr_block = (int)head_blk;
1308        log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
1309        if (bump_cycle)
1310                log->l_curr_cycle++;
1311        atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
1312        atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
1313        xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
1314                                        BBTOB(log->l_curr_block));
1315        xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
1316                                        BBTOB(log->l_curr_block));
1317}
1318
1319/*
1320 * Find the sync block number or the tail of the log.
1321 *
1322 * This will be the block number of the last record to have its
1323 * associated buffers synced to disk.  Every log record header has
1324 * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
1325 * to get a sync block number.  The only concern is to figure out which
1326 * log record header to believe.
1327 *
1328 * The following algorithm uses the log record header with the largest
1329 * lsn.  The entire log record does not need to be valid.  We only care
1330 * that the header is valid.
1331 *
1332 * We could speed up the search by using the current head_blk buffer, but it
1333 * is not available.
1334 */
1335STATIC int
1336xlog_find_tail(
1337        struct xlog             *log,
1338        xfs_daddr_t             *head_blk,
1339        xfs_daddr_t             *tail_blk)
1340{
1341        xlog_rec_header_t       *rhead;
1342        char                    *offset = NULL;
1343        xfs_buf_t               *bp;
1344        int                     error;
1345        xfs_daddr_t             rhead_blk;
1346        xfs_lsn_t               tail_lsn;
1347        bool                    wrapped = false;
1348        bool                    clean = false;
1349
1350        /*
1351         * Find previous log record
1352         */
1353        if ((error = xlog_find_head(log, head_blk)))
1354                return error;
1355        ASSERT(*head_blk < INT_MAX);
1356
1357        bp = xlog_get_bp(log, 1);
1358        if (!bp)
1359                return -ENOMEM;
1360        if (*head_blk == 0) {                           /* special case */
1361                error = xlog_bread(log, 0, 1, bp, &offset);
1362                if (error)
1363                        goto done;
1364
1365                if (xlog_get_cycle(offset) == 0) {
1366                        *tail_blk = 0;
1367                        /* leave all other log fields at their initialised values */
1368                        goto done;
1369                }
1370        }
1371
1372        /*
1373         * Search backwards through the log looking for the log record header
1374         * block. This wraps all the way back around to the head so something is
1375         * seriously wrong if we can't find it.
1376         */
1377        error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
1378                                      &rhead_blk, &rhead, &wrapped);
1379        if (error < 0)
1380                goto done;
1381        if (!error) {
1382                xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
1383                error = -EIO;
                    goto done;
1384        }
1385        *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
1386
1387        /*
1388         * Set the log state based on the current head record.
1389         */
1390        xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
1391        tail_lsn = atomic64_read(&log->l_tail_lsn);
1392
1393        /*
1394         * Look for an unmount record at the head of the log. This sets the log
1395         * state to determine whether recovery is necessary.
1396         */
1397        error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
1398                                       rhead_blk, bp, &clean);
1399        if (error)
1400                goto done;
1401
1402        /*
1403         * Verify the log head if the log is not clean (e.g., we have anything
1404         * but an unmount record at the head). This uses CRC verification to
1405         * detect and trim torn writes. If discovered, CRC failures are
1406         * considered torn writes and the log head is trimmed accordingly.
1407         *
1408         * Note that we can only run CRC verification when the log is dirty
1409         * because there's no guarantee that the log data behind an unmount
1410         * record is compatible with the current architecture.
1411         */
1412        if (!clean) {
1413                xfs_daddr_t     orig_head = *head_blk;
1414
1415                error = xlog_verify_head(log, head_blk, tail_blk, bp,
1416                                         &rhead_blk, &rhead, &wrapped);
1417                if (error)
1418                        goto done;
1419
1420                /* update in-core state again if the head changed */
1421                if (*head_blk != orig_head) {
1422                        xlog_set_state(log, *head_blk, rhead, rhead_blk,
1423                                       wrapped);
1424                        tail_lsn = atomic64_read(&log->l_tail_lsn);
1425                        error = xlog_check_unmount_rec(log, head_blk, tail_blk,
1426                                                       rhead, rhead_blk, bp,
1427                                                       &clean);
1428                        if (error)
1429                                goto done;
1430                }
1431        }
1432
1433        /*
1434         * Note that the unmount was clean. If the unmount was not clean, we
1435         * need to know this to rebuild the superblock counters from the perag
1436         * headers if we have a filesystem using non-persistent counters.
1437         */
1438        if (clean)
1439                log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
1440
1441        /*
1442         * Make sure that there are no blocks in front of the head
1443         * with the same cycle number as the head.  This can happen
1444         * because we allow multiple outstanding log writes concurrently,
1445         * and the later writes might make it out before earlier ones.
1446         *
1447         * We use the lsn from before modifying it so that we'll never
1448         * overwrite the unmount record after a clean unmount.
1449         *
1450         * Do this only if we are going to recover the filesystem
1451         *
1452         * NOTE: This used to say "if (!readonly)"
1453         * However on Linux, we can & do recover a read-only filesystem.
1454         * We only skip recovery if NORECOVERY is specified on mount,
1455         * in which case we would not be here.
1456         *
1457         * But... if the -device- itself is readonly, just skip this.
1458         * We can't recover this device anyway, so it won't matter.
1459         */
1460        if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
1461                error = xlog_clear_stale_blocks(log, tail_lsn);
1462
1463done:
1464        xlog_put_bp(bp);
1465
1466        if (error)
1467                xfs_warn(log->l_mp, "failed to locate log tail");
1468        return error;
1469}
1470
1471/*
1472 * Is the log zeroed at all?
1473 *
1474 * The last binary search should be changed to perform an X block read
1475 * once X becomes small enough.  You can then search linearly through
1476 * the X blocks.  This will cut down on the number of reads we need to do.
1477 *
1478 * If the log is partially zeroed, this routine will pass back the blkno
1479 * of the first block with cycle number 0.  It won't have a complete LR
1480 * preceding it.
1481 *
1482 * Return:
1483 *      0  => the log is completely written to
1484 *      1 => use *blk_no as the first block of the log
1485 *      <0 => error has occurred
1486 */
1487STATIC int
1488xlog_find_zeroed(
1489        struct xlog     *log,
1490        xfs_daddr_t     *blk_no)
1491{
1492        xfs_buf_t       *bp;
1493        char            *offset;
1494        uint            first_cycle, last_cycle;
1495        xfs_daddr_t     new_blk, last_blk, start_blk;
1496        xfs_daddr_t     num_scan_bblks;
1497        int             error, log_bbnum = log->l_logBBsize;
1498
1499        *blk_no = 0;
1500
1501        /* check totally zeroed log */
1502        bp = xlog_get_bp(log, 1);
1503        if (!bp)
1504                return -ENOMEM;
1505        error = xlog_bread(log, 0, 1, bp, &offset);
1506        if (error)
1507                goto bp_err;
1508
1509        first_cycle = xlog_get_cycle(offset);
1510        if (first_cycle == 0) {         /* completely zeroed log */
1511                *blk_no = 0;
1512                xlog_put_bp(bp);
1513                return 1;
1514        }
1515
1516        /* check partially zeroed log */
1517        error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1518        if (error)
1519                goto bp_err;
1520
1521        last_cycle = xlog_get_cycle(offset);
1522        if (last_cycle != 0) {          /* log completely written to */
1523                xlog_put_bp(bp);
1524                return 0;
1525        } else if (first_cycle != 1) {
1526                /*
1527                 * If the cycle of the last block is zero, the cycle of
1528                 * the first block must be 1. If it's not, maybe we're
1529                 * not looking at a log... Bail out.
1530                 */
1531                xfs_warn(log->l_mp,
1532                        "Log inconsistent or not a log (last==0, first!=1)");
1533                error = -EINVAL;
1534                goto bp_err;
1535        }
1536
1537        /* we have a partially zeroed log */
1538        last_blk = log_bbnum-1;
1539        if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1540                goto bp_err;
1541
1542        /*
1543         * Validate the answer.  Because there is no way to guarantee that
1544         * the entire log is made up of log records which are the same size,
1545         * we scan over the defined maximum blocks.  At this point, the maximum
1546         * is not chosen to mean anything special.   XXXmiken
1547         */
1548        num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1549        ASSERT(num_scan_bblks <= INT_MAX);
1550
1551        if (last_blk < num_scan_bblks)
1552                num_scan_bblks = last_blk;
1553        start_blk = last_blk - num_scan_bblks;
1554
1555        /*
1556         * We search for any instances of cycle number 0 that occur before
1557         * our current estimate of the head.  What we're trying to detect is
1558         *        1 ... | 0 | 1 | 0...
1559         *                       ^ binary search ends here
1560         */
1561        if ((error = xlog_find_verify_cycle(log, start_blk,
1562                                         (int)num_scan_bblks, 0, &new_blk)))
1563                goto bp_err;
1564        if (new_blk != -1)
1565                last_blk = new_blk;
1566
1567        /*
1568         * Potentially backup over partial log record write.  We don't need
1569         * to search the end of the log because we know it is zero.
1570         */
1571        error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
1572        if (error == 1)
1573                error = -EIO;
1574        if (error)
1575                goto bp_err;
1576
1577        *blk_no = last_blk;
1578bp_err:
1579        xlog_put_bp(bp);
1580        if (error)
1581                return error;
1582        return 1;
1583}
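
/*
 * An illustrative sketch of the search idea above, with made-up names: given
 * a run of blocks whose cycle numbers look like 1, 1, ..., 1, 0, ..., 0 (the
 * partially zeroed case validated above), binary search for the first block
 * whose cycle is zero.  The real code reads cycle numbers from disk through
 * xlog_find_cycle_start() and xlog_bread() rather than an in-memory array,
 * and then re-checks a window of blocks with xlog_find_verify_cycle() because
 * torn writes can leave stray cycle values behind.
 */
static int
example_first_zero_cycle(
        const unsigned int      *cycles,        /* cycle number per block */
        int                     nblocks)        /* cycles[0] != 0, last == 0 */
{
        int     lo = 0;                 /* known to be written (cycle != 0) */
        int     hi = nblocks - 1;       /* known to be zeroed */

        while (hi - lo > 1) {
                int     mid = lo + (hi - lo) / 2;

                if (cycles[mid] == 0)
                        hi = mid;       /* zeroed region starts at or before mid */
                else
                        lo = mid;       /* still inside the written region */
        }
        return hi;                      /* first block with cycle == 0 */
}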
1584
1585/*
1586 * These are simple subroutines used by xlog_clear_stale_blocks() below
1587 * to initialize a buffer full of empty log record headers and write
1588 * them into the log.
1589 */
1590STATIC void
1591xlog_add_record(
1592        struct xlog             *log,
1593        char                    *buf,
1594        int                     cycle,
1595        int                     block,
1596        int                     tail_cycle,
1597        int                     tail_block)
1598{
1599        xlog_rec_header_t       *recp = (xlog_rec_header_t *)buf;
1600
1601        memset(buf, 0, BBSIZE);
1602        recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1603        recp->h_cycle = cpu_to_be32(cycle);
1604        recp->h_version = cpu_to_be32(
1605                        xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1606        recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1607        recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1608        recp->h_fmt = cpu_to_be32(XLOG_FMT);
1609        memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1610}
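
/*
 * For reference, a minimal sketch of how an LSN combines a cycle number and a
 * basic block number, mirroring what xlog_assign_lsn(), CYCLE_LSN() and
 * BLOCK_LSN() do.  The example_* names below are local to this sketch.
 */
static unsigned long long
example_assign_lsn(unsigned int cycle, unsigned int block)
{
        /* cycle number in the high 32 bits, block number in the low 32 bits */
        return ((unsigned long long)cycle << 32) | block;
}

static unsigned int
example_cycle_lsn(unsigned long long lsn)
{
        return (unsigned int)(lsn >> 32);
}

static unsigned int
example_block_lsn(unsigned long long lsn)
{
        return (unsigned int)lsn;
}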
1611
1612STATIC int
1613xlog_write_log_records(
1614        struct xlog     *log,
1615        int             cycle,
1616        int             start_block,
1617        int             blocks,
1618        int             tail_cycle,
1619        int             tail_block)
1620{
1621        char            *offset;
1622        xfs_buf_t       *bp;
1623        int             balign, ealign;
1624        int             sectbb = log->l_sectBBsize;
1625        int             end_block = start_block + blocks;
1626        int             bufblks;
1627        int             error = 0;
1628        int             i, j = 0;
1629
1630        /*
1631         * Greedily allocate a buffer big enough to handle the full
1632         * range of basic blocks to be written.  If that fails, try
1633         * a smaller size.  We need to be able to write at least a
1634         * log sector, or we're out of luck.
1635         */
1636        bufblks = 1 << ffs(blocks);
1637        while (bufblks > log->l_logBBsize)
1638                bufblks >>= 1;
1639        while (!(bp = xlog_get_bp(log, bufblks))) {
1640                bufblks >>= 1;
1641                if (bufblks < sectbb)
1642                        return -ENOMEM;
1643        }
1644
1645        /* We may need to do a read at the start to fill in part of
1646         * the buffer in the starting sector not covered by the first
1647         * write below.
1648         */
1649        balign = round_down(start_block, sectbb);
1650        if (balign != start_block) {
1651                error = xlog_bread_noalign(log, start_block, 1, bp);
1652                if (error)
1653                        goto out_put_bp;
1654
1655                j = start_block - balign;
1656        }
1657
1658        for (i = start_block; i < end_block; i += bufblks) {
1659                int             bcount, endcount;
1660
1661                bcount = min(bufblks, end_block - start_block);
1662                endcount = bcount - j;
1663
1664                /* We may need to do a read at the end to fill in part of
1665                 * the buffer in the final sector not covered by the write.
1666                 * If this is the same sector as the above read, skip it.
1667                 */
1668                ealign = round_down(end_block, sectbb);
1669                if (j == 0 && (start_block + endcount > ealign)) {
1670                        offset = bp->b_addr + BBTOB(ealign - start_block);
1671                        error = xlog_bread_offset(log, ealign, sectbb,
1672                                                        bp, offset);
1673                        if (error)
1674                                break;
1675
1676                }
1677
1678                offset = xlog_align(log, start_block, endcount, bp);
1679                for (; j < endcount; j++) {
1680                        xlog_add_record(log, offset, cycle, i+j,
1681                                        tail_cycle, tail_block);
1682                        offset += BBSIZE;
1683                }
1684                error = xlog_bwrite(log, start_block, endcount, bp);
1685                if (error)
1686                        break;
1687                start_block += endcount;
1688                j = 0;
1689        }
1690
1691 out_put_bp:
1692        xlog_put_bp(bp);
1693        return error;
1694}
1695
1696/*
1697 * This routine is called to blow away any incomplete log writes out
1698 * in front of the log head.  We do this so that we won't become confused
1699 * if we come up, write only a little bit more, and then crash again.
1700 * If we leave the partial log records out there, this situation could
1701 * cause us to think those partial writes are valid blocks since they
1702 * have the current cycle number.  We get rid of them by overwriting them
1703 * with empty log records with the old cycle number rather than the
1704 * current one.
1705 *
1706 * The tail lsn is passed in rather than taken from
1707 * the log so that we will not write over the unmount record after a
1708 * clean unmount in a 512 block log.  Doing so would leave the log without
1709 * any valid log records in it until a new one was written.  If we crashed
1710 * during that time we would not be able to recover.
1711 */
1712STATIC int
1713xlog_clear_stale_blocks(
1714        struct xlog     *log,
1715        xfs_lsn_t       tail_lsn)
1716{
1717        int             tail_cycle, head_cycle;
1718        int             tail_block, head_block;
1719        int             tail_distance, max_distance;
1720        int             distance;
1721        int             error;
1722
1723        tail_cycle = CYCLE_LSN(tail_lsn);
1724        tail_block = BLOCK_LSN(tail_lsn);
1725        head_cycle = log->l_curr_cycle;
1726        head_block = log->l_curr_block;
1727
1728        /*
1729         * Figure out the distance between the new head of the log
1730         * and the tail.  We want to write over any blocks beyond the
1731         * head that we may have written just before the crash, but
1732         * we don't want to overwrite the tail of the log.
1733         */
1734        if (head_cycle == tail_cycle) {
1735                /*
1736                 * The tail is behind the head in the physical log,
1737                 * so the distance from the head to the tail is the
1738                 * distance from the head to the end of the log plus
1739                 * the distance from the beginning of the log to the
1740                 * tail.
1741                 */
1742                if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1743                        XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1744                                         XFS_ERRLEVEL_LOW, log->l_mp);
1745                        return -EFSCORRUPTED;
1746                }
1747                tail_distance = tail_block + (log->l_logBBsize - head_block);
1748        } else {
1749                /*
1750                 * The head is behind the tail in the physical log,
1751                 * so the distance from the head to the tail is just
1752                 * the tail block minus the head block.
1753                 */
1754                if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1755                        XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1756                                         XFS_ERRLEVEL_LOW, log->l_mp);
1757                        return -EFSCORRUPTED;
1758                }
1759                tail_distance = tail_block - head_block;
1760        }
1761
1762        /*
1763         * If the head is right up against the tail, we can't clear
1764         * anything.
1765         */
1766        if (tail_distance <= 0) {
1767                ASSERT(tail_distance == 0);
1768                return 0;
1769        }
1770
1771        max_distance = XLOG_TOTAL_REC_SHIFT(log);
1772        /*
1773         * Take the smaller of the maximum amount of outstanding I/O
1774         * we could have and the distance to the tail to clear out.
1775         * We take the smaller so that we don't overwrite the tail and
1776         * we don't waste all day writing from the head to the tail
1777         * for no reason.
1778         */
1779        max_distance = MIN(max_distance, tail_distance);
1780
1781        if ((head_block + max_distance) <= log->l_logBBsize) {
1782                /*
1783                 * We can stomp all the blocks we need to without
1784                 * wrapping around the end of the log.  Just do it
1785                 * in a single write.  Use the cycle number of the
1786                 * current cycle minus one so that the log will look like:
1787                 *     n ... | n - 1 ...
1788                 */
1789                error = xlog_write_log_records(log, (head_cycle - 1),
1790                                head_block, max_distance, tail_cycle,
1791                                tail_block);
1792                if (error)
1793                        return error;
1794        } else {
1795                /*
1796                 * We need to wrap around the end of the physical log in
1797                 * order to clear all the blocks.  Do it in two separate
1798                 * I/Os.  The first write should be from the head to the
1799                 * end of the physical log, and it should use the current
1800                 * cycle number minus one just like above.
1801                 */
1802                distance = log->l_logBBsize - head_block;
1803                error = xlog_write_log_records(log, (head_cycle - 1),
1804                                head_block, distance, tail_cycle,
1805                                tail_block);
1806
1807                if (error)
1808                        return error;
1809
1810                /*
1811                 * Now write the blocks at the start of the physical log.
1812                 * This writes the remainder of the blocks we want to clear.
1813                 * It uses the current cycle number since we're now on the
1814                 * same cycle as the head so that we get:
1815                 *    n ... n ... | n - 1 ...
1816                 *    ^^^^^ blocks we're writing
1817                 */
1818                distance = max_distance - (log->l_logBBsize - head_block);
1819                error = xlog_write_log_records(log, head_cycle, 0, distance,
1820                                tail_cycle, tail_block);
1821                if (error)
1822                        return error;
1823        }
1824
1825        return 0;
1826}
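
/*
 * A condensed sketch of the head-to-tail distance calculation above, assuming
 * a log of 'logbbs' basic blocks and the cycle relationships already verified
 * by the corruption checks.  The helper name is illustrative only.
 */
static int
example_tail_distance(
        int     head_cycle,
        int     head_block,
        int     tail_cycle,
        int     tail_block,
        int     logbbs)
{
        if (head_cycle == tail_cycle) {
                /* tail is physically behind the head: wrap past the log end */
                return tail_block + (logbbs - head_block);
        }
        /* head_cycle == tail_cycle + 1: head is physically behind the tail */
        return tail_block - head_block;
}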
1827
1828/******************************************************************************
1829 *
1830 *              Log recover routines
1831 *
1832 ******************************************************************************
1833 */
1834
1835/*
1836 * Sort the log items in the transaction.
1837 *
1838 * The ordering constraints are defined by the inode allocation and unlink
1839 * behaviour. The rules are:
1840 *
1841 *      1. Every item is only logged once in a given transaction. Hence it
1842 *         represents the last logged state of the item. Hence ordering is
1843 *         dependent on the order in which operations need to be performed so
1844 *         required initial conditions are always met.
1845 *
1846 *      2. Cancelled buffers are recorded in pass 1 in a separate table and
1847 *         there's nothing to replay from them so we can simply cull them
1848 *         from the transaction. However, we can't do that until after we've
1849 *         replayed all the other items because they may be dependent on the
1850 *         cancelled buffer and replaying the cancelled buffer can remove it
1851 *         from the cancelled buffer table. Hence they have to be done last.
1852 *
1853 *      3. Inode allocation buffers must be replayed before inode items that
1854 *         read the buffer and replay changes into it. For filesystems using the
1855 *         ICREATE transactions, this means XFS_LI_ICREATE objects need to get
1856 *         treated the same as inode allocation buffers as they create and
1857 *         initialise the buffers directly.
1858 *
1859 *      4. Inode unlink buffers must be replayed after inode items are replayed.
1860 *         This ensures that inodes are completely flushed to the inode buffer
1861 *         in a "free" state before we remove the unlinked inode list pointer.
1862 *
1863 * Hence the ordering needs to be inode allocation buffers first, inode items
1864 * second, inode unlink buffers third and cancelled buffers last.
1865 *
1866 * But there's a problem with that - we can't tell an inode allocation buffer
1867 * apart from a regular buffer, so we can't separate them. We can, however,
1868 * tell an inode unlink buffer from the others, and so we can separate them out
1869 * from all the other buffers and move them to last.
1870 *
1871 * Hence, 4 lists, in order from head to tail:
1872 *      - buffer_list for all buffers except cancelled/inode unlink buffers
1873 *      - item_list for all non-buffer items
1874 *      - inode_buffer_list for inode unlink buffers
1875 *      - cancel_list for the cancelled buffers
1876 *
1877 * Note that we add objects to the tail of the lists so that first-to-last
1878 * ordering is preserved within the lists. Adding objects to the head of the
1879 * list means when we traverse from the head we walk them in last-to-first
1880 * order. For cancelled buffers and inode unlink buffers this doesn't matter,
1881 * but for all other items there may be specific ordering that we need to
1882 * preserve.
1883 */
1884STATIC int
1885xlog_recover_reorder_trans(
1886        struct xlog             *log,
1887        struct xlog_recover     *trans,
1888        int                     pass)
1889{
1890        xlog_recover_item_t     *item, *n;
1891        int                     error = 0;
1892        LIST_HEAD(sort_list);
1893        LIST_HEAD(cancel_list);
1894        LIST_HEAD(buffer_list);
1895        LIST_HEAD(inode_buffer_list);
1896        LIST_HEAD(inode_list);
1897
1898        list_splice_init(&trans->r_itemq, &sort_list);
1899        list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1900                xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
1901
1902                switch (ITEM_TYPE(item)) {
1903                case XFS_LI_ICREATE:
1904                        list_move_tail(&item->ri_list, &buffer_list);
1905                        break;
1906                case XFS_LI_BUF:
1907                        if (buf_f->blf_flags & XFS_BLF_CANCEL) {
1908                                trace_xfs_log_recover_item_reorder_head(log,
1909                                                        trans, item, pass);
1910                                list_move(&item->ri_list, &cancel_list);
1911                                break;
1912                        }
1913                        if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
1914                                list_move(&item->ri_list, &inode_buffer_list);
1915                                break;
1916                        }
1917                        list_move_tail(&item->ri_list, &buffer_list);
1918                        break;
1919                case XFS_LI_INODE:
1920                case XFS_LI_DQUOT:
1921                case XFS_LI_QUOTAOFF:
1922                case XFS_LI_EFD:
1923                case XFS_LI_EFI:
1924                        trace_xfs_log_recover_item_reorder_tail(log,
1925                                                        trans, item, pass);
1926                        list_move_tail(&item->ri_list, &inode_list);
1927                        break;
1928                default:
1929                        xfs_warn(log->l_mp,
1930                                "%s: unrecognized type of log operation",
1931                                __func__);
1932                        ASSERT(0);
1933                        /*
1934                         * return the remaining items back to the transaction
1935                         * item list so they can be freed in caller.
1936                         */
1937                        if (!list_empty(&sort_list))
1938                                list_splice_init(&sort_list, &trans->r_itemq);
1939                        error = -EIO;
1940                        goto out;
1941                }
1942        }
1943out:
1944        ASSERT(list_empty(&sort_list));
1945        if (!list_empty(&buffer_list))
1946                list_splice(&buffer_list, &trans->r_itemq);
1947        if (!list_empty(&inode_list))
1948                list_splice_tail(&inode_list, &trans->r_itemq);
1949        if (!list_empty(&inode_buffer_list))
1950                list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1951        if (!list_empty(&cancel_list))
1952                list_splice_tail(&cancel_list, &trans->r_itemq);
1953        return error;
1954}
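
/*
 * The sort above can be summarised as a simple classification, shown here as
 * an illustrative sketch (the enum and helper are not part of recovery).  The
 * list an item lands on determines its replay order: plain buffers first,
 * non-buffer items second, inode unlink buffers third, cancelled buffers last.
 */
enum example_reorder_class {
        EXAMPLE_BUFFER_LIST,            /* ordinary and icreate buffers */
        EXAMPLE_ITEM_LIST,              /* inodes, dquots, quotaoff, EFI/EFD */
        EXAMPLE_INODE_BUFFER_LIST,      /* inode unlink buffers */
        EXAMPLE_CANCEL_LIST,            /* cancelled buffers */
};

static enum example_reorder_class
example_classify(
        int     item_type,              /* ITEM_TYPE(item) */
        uint    blf_flags)              /* xfs_buf_log_format flags, buf items only */
{
        if (item_type == XFS_LI_ICREATE)
                return EXAMPLE_BUFFER_LIST;
        if (item_type != XFS_LI_BUF)
                return EXAMPLE_ITEM_LIST;
        if (blf_flags & XFS_BLF_CANCEL)
                return EXAMPLE_CANCEL_LIST;
        if (blf_flags & XFS_BLF_INODE_BUF)
                return EXAMPLE_INODE_BUFFER_LIST;
        return EXAMPLE_BUFFER_LIST;
}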
1955
1956/*
1957 * Build up the table of buf cancel records so that we don't replay
1958 * cancelled data in the second pass.  For buffer records that are
1959 * not cancel records, there is nothing to do here so we just return.
1960 *
1961 * If we get a cancel record which is already in the table, this indicates
1962 * that the buffer was cancelled multiple times.  In order to ensure
1963 * that during pass 2 we keep the record in the table until we reach its
1964 * last occurrence in the log, we keep a reference count in the cancel
1965 * record in the table to tell us how many times we expect to see this
1966 * record during the second pass.
1967 */
1968STATIC int
1969xlog_recover_buffer_pass1(
1970        struct xlog                     *log,
1971        struct xlog_recover_item        *item)
1972{
1973        xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
1974        struct list_head        *bucket;
1975        struct xfs_buf_cancel   *bcp;
1976
1977        /*
1978         * If this isn't a cancel buffer item, then just return.
1979         */
1980        if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1981                trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1982                return 0;
1983        }
1984
1985        /*
1986         * Insert an xfs_buf_cancel record into the hash table of them.
1987         * If there is already an identical record, bump its reference count.
1988         */
1989        bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1990        list_for_each_entry(bcp, bucket, bc_list) {
1991                if (bcp->bc_blkno == buf_f->blf_blkno &&
1992                    bcp->bc_len == buf_f->blf_len) {
1993                        bcp->bc_refcount++;
1994                        trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1995                        return 0;
1996                }
1997        }
1998
1999        bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
2000        bcp->bc_blkno = buf_f->blf_blkno;
2001        bcp->bc_len = buf_f->blf_len;
2002        bcp->bc_refcount = 1;
2003        list_add_tail(&bcp->bc_list, bucket);
2004
2005        trace_xfs_log_recover_buf_cancel_add(log, buf_f);
2006        return 0;
2007}
2008
2009/*
2010 * Check to see whether the buffer being recovered has a corresponding
2011 * entry in the buffer cancel record table. If it is, return the cancel
2012 * buffer structure to the caller.
2013 */
2014STATIC struct xfs_buf_cancel *
2015xlog_peek_buffer_cancelled(
2016        struct xlog             *log,
2017        xfs_daddr_t             blkno,
2018        uint                    len,
2019        ushort                  flags)
2020{
2021        struct list_head        *bucket;
2022        struct xfs_buf_cancel   *bcp;
2023
2024        if (!log->l_buf_cancel_table) {
2025                /* empty table means no cancelled buffers in the log */
2026                ASSERT(!(flags & XFS_BLF_CANCEL));
2027                return NULL;
2028        }
2029
2030        bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
2031        list_for_each_entry(bcp, bucket, bc_list) {
2032                if (bcp->bc_blkno == blkno && bcp->bc_len == len)
2033                        return bcp;
2034        }
2035
2036        /*
2037         * We didn't find a corresponding entry in the table, so return NULL
2038         * so that the buffer is NOT cancelled.
2039         */
2040        ASSERT(!(flags & XFS_BLF_CANCEL));
2041        return NULL;
2042}
2043
2044/*
2045 * If the buffer is being cancelled then return 1 so that it will be cancelled,
2046 * otherwise return 0.  If the buffer is actually a buffer cancel item
2047 * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
2048 * table and remove it from the table if this is the last reference.
2049 *
2050 * We remove the cancel record from the table when we encounter its last
2051 * occurrence in the log so that if the same buffer is re-used again after its
2052 * last cancellation we actually replay the changes made at that point.
2053 */
2054STATIC int
2055xlog_check_buffer_cancelled(
2056        struct xlog             *log,
2057        xfs_daddr_t             blkno,
2058        uint                    len,
2059        ushort                  flags)
2060{
2061        struct xfs_buf_cancel   *bcp;
2062
2063        bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
2064        if (!bcp)
2065                return 0;
2066
2067        /*
2068         * We've got a match, so return 1 so that the recovery of this buffer
2069         * is cancelled.  If this buffer is actually a buffer cancel log
2070         * item, then decrement the refcount on the one in the table and
2071         * remove it if this is the last reference.
2072         */
2073        if (flags & XFS_BLF_CANCEL) {
2074                if (--bcp->bc_refcount == 0) {
2075                        list_del(&bcp->bc_list);
2076                        kmem_free(bcp);
2077                }
2078        }
2079        return 1;
2080}
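
/*
 * A condensed sketch of the cancel-table life cycle across the two passes,
 * using a single illustrative entry in place of the real hash table.  Pass 1
 * bumps a refcount for every cancel record seen for a given (blkno, len);
 * pass 2 decrements it each time the cancel record itself is re-encountered
 * and drops the entry on the last reference, so later reuse of the same
 * blocks is replayed normally.
 */
struct example_cancel_entry {
        xfs_daddr_t     blkno;
        uint            len;
        int             refcount;       /* 0 means "not in the table" */
};

/* pass 1: record or re-reference a cancelled buffer */
static void
example_pass1_add(struct example_cancel_entry *e, xfs_daddr_t blkno, uint len)
{
        if (e->refcount && e->blkno == blkno && e->len == len) {
                e->refcount++;          /* same buffer cancelled again */
                return;
        }
        e->blkno = blkno;
        e->len = len;
        e->refcount = 1;
}

/* pass 2: returns 1 if recovery of this buffer should be skipped */
static int
example_pass2_cancelled(struct example_cancel_entry *e, xfs_daddr_t blkno,
                        uint len, int item_is_cancel)
{
        if (!e->refcount || e->blkno != blkno || e->len != len)
                return 0;               /* not cancelled: replay it */
        if (item_is_cancel && --e->refcount == 0)
                e->blkno = 0;           /* last reference: drop the entry */
        return 1;
}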
2081
2082/*
2083 * Perform recovery for a buffer full of inodes.  In these buffers, the only
2084 * data which should be recovered is that which corresponds to the
2085 * di_next_unlinked pointers in the on disk inode structures.  The rest of the
2086 * data for the inodes is always logged through the inodes themselves rather
2087 * than the inode buffer and is recovered in xlog_recover_inode_pass2().
2088 *
2089 * The only time when buffers full of inodes are fully recovered is when the
2090 * buffer is full of newly allocated inodes.  In this case the buffer will
2091 * not be marked as an inode buffer and so will be sent to
2092 * xlog_recover_do_reg_buffer() below during recovery.
2093 */
2094STATIC int
2095xlog_recover_do_inode_buffer(
2096        struct xfs_mount        *mp,
2097        xlog_recover_item_t     *item,
2098        struct xfs_buf          *bp,
2099        xfs_buf_log_format_t    *buf_f)
2100{
2101        int                     i;
2102        int                     item_index = 0;
2103        int                     bit = 0;
2104        int                     nbits = 0;
2105        int                     reg_buf_offset = 0;
2106        int                     reg_buf_bytes = 0;
2107        int                     next_unlinked_offset;
2108        int                     inodes_per_buf;
2109        xfs_agino_t             *logged_nextp;
2110        xfs_agino_t             *buffer_nextp;
2111
2112        trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
2113
2114        /*
2115         * Post recovery validation only works properly on CRC enabled
2116         * filesystems.
2117         */
2118        if (xfs_sb_version_hascrc(&mp->m_sb))
2119                bp->b_ops = &xfs_inode_buf_ops;
2120
2121        inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
2122        for (i = 0; i < inodes_per_buf; i++) {
2123                next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
2124                        offsetof(xfs_dinode_t, di_next_unlinked);
2125
2126                while (next_unlinked_offset >=
2127                       (reg_buf_offset + reg_buf_bytes)) {
2128                        /*
2129                         * The next di_next_unlinked field is beyond
2130                         * the current logged region.  Find the next
2131                         * logged region that contains or is beyond
2132                         * the current di_next_unlinked field.
2133                         */
2134                        bit += nbits;
2135                        bit = xfs_next_bit(buf_f->blf_data_map,
2136                                           buf_f->blf_map_size, bit);
2137
2138                        /*
2139                         * If there are no more logged regions in the
2140                         * buffer, then we're done.
2141                         */
2142                        if (bit == -1)
2143                                return 0;
2144
2145                        nbits = xfs_contig_bits(buf_f->blf_data_map,
2146                                                buf_f->blf_map_size, bit);
2147                        ASSERT(nbits > 0);
2148                        reg_buf_offset = bit << XFS_BLF_SHIFT;
2149                        reg_buf_bytes = nbits << XFS_BLF_SHIFT;
2150                        item_index++;
2151                }
2152
2153                /*
2154                 * If the current logged region starts after the current
2155                 * di_next_unlinked field, then move on to the next
2156                 * di_next_unlinked field.
2157                 */
2158                if (next_unlinked_offset < reg_buf_offset)
2159                        continue;
2160
2161                ASSERT(item->ri_buf[item_index].i_addr != NULL);
2162                ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
2163                ASSERT((reg_buf_offset + reg_buf_bytes) <=
2164                                                        BBTOB(bp->b_io_length));
2165
2166                /*
2167                 * The current logged region contains a copy of the
2168                 * current di_next_unlinked field.  Extract its value
2169                 * and copy it to the buffer copy.
2170                 */
2171                logged_nextp = item->ri_buf[item_index].i_addr +
2172                                next_unlinked_offset - reg_buf_offset;
2173                if (unlikely(*logged_nextp == 0)) {
2174                        xfs_alert(mp,
2175                "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
2176                "Trying to replay bad (0) inode di_next_unlinked field.",
2177                                item, bp);
2178                        XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
2179                                         XFS_ERRLEVEL_LOW, mp);
2180                        return -EFSCORRUPTED;
2181                }
2182
2183                buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset);
2184                *buffer_nextp = *logged_nextp;
2185
2186                /*
2187                 * If necessary, recalculate the CRC in the on-disk inode. We
2188                 * have to leave the inode in a consistent state for whoever
2189                 * reads it next....
2190                 */
2191                xfs_dinode_calc_crc(mp,
2192                                xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
2193
2194        }
2195
2196        return 0;
2197}
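
/*
 * The offset arithmetic above, restated as a tiny sketch with illustrative
 * names: inode 'i' in the buffer has its di_next_unlinked field at byte
 * offset i * inode_size + field_offset, and the field is only copied from the
 * log item when that offset falls inside a logged region
 * [reg_off, reg_off + reg_bytes).
 */
static int
example_next_unlinked_is_logged(
        int     i,              /* inode index within the buffer */
        int     inode_size,     /* mp->m_sb.sb_inodesize */
        int     field_offset,   /* offsetof(xfs_dinode_t, di_next_unlinked) */
        int     reg_off,        /* start of the logged region, in bytes */
        int     reg_bytes)      /* length of the logged region, in bytes */
{
        int     off = i * inode_size + field_offset;

        return off >= reg_off && off < reg_off + reg_bytes;
}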
2198
2199/*
2200 * V5 filesystems know the age of the buffer on disk being recovered. We can
2201 * have newer objects on disk than we are replaying, and so for these cases we
2202 * don't want to replay the current change as that will make the buffer contents
2203 * temporarily invalid on disk.
2204 *
2205 * The magic number might not match the buffer type we are going to recover
2206 * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags.  Hence
2207 * extract the LSN of the existing object in the buffer based on its current
2208 * magic number.  If we don't recognise the magic number in the buffer, then
2209 * return an LSN of -1 so that the caller knows it was an unrecognised block and
2210 * so can recover the buffer.
2211 *
2212 * Note: we cannot rely solely on magic number matches to determine that the
2213 * buffer has a valid LSN - we also need to verify that it belongs to this
2214 * filesystem, so we need to extract the object's LSN and compare it to that
2215 * which we read from the superblock. If the UUIDs don't match, then we've got a
2216 * stale metadata block from an old filesystem instance that we need to recover
2217 * over the top of.
2218 */
2219static xfs_lsn_t
2220xlog_recover_get_buf_lsn(
2221        struct xfs_mount        *mp,
2222        struct xfs_buf          *bp)
2223{
2224        __uint32_t              magic32;
2225        __uint16_t              magic16;
2226        __uint16_t              magicda;
2227        void                    *blk = bp->b_addr;
2228        uuid_t                  *uuid;
2229        xfs_lsn_t               lsn = -1;
2230
2231        /* v4 filesystems always recover immediately */
2232        if (!xfs_sb_version_hascrc(&mp->m_sb))
2233                goto recover_immediately;
2234
2235        magic32 = be32_to_cpu(*(__be32 *)blk);
2236        switch (magic32) {
2237        case XFS_ABTB_CRC_MAGIC:
2238        case XFS_ABTC_CRC_MAGIC:
2239        case XFS_ABTB_MAGIC:
2240        case XFS_ABTC_MAGIC:
2241        case XFS_IBT_CRC_MAGIC:
2242        case XFS_IBT_MAGIC: {
2243                struct xfs_btree_block *btb = blk;
2244
2245                lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
2246                uuid = &btb->bb_u.s.bb_uuid;
2247                break;
2248        }
2249        case XFS_BMAP_CRC_MAGIC:
2250        case XFS_BMAP_MAGIC: {
2251                struct xfs_btree_block *btb = blk;
2252
2253                lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
2254                uuid = &btb->bb_u.l.bb_uuid;
2255                break;
2256        }
2257        case XFS_AGF_MAGIC:
2258                lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
2259                uuid = &((struct xfs_agf *)blk)->agf_uuid;
2260                break;
2261        case XFS_AGFL_MAGIC:
2262                lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
2263                uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
2264                break;
2265        case XFS_AGI_MAGIC:
2266                lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
2267                uuid = &((struct xfs_agi *)blk)->agi_uuid;
2268                break;
2269        case XFS_SYMLINK_MAGIC:
2270                lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
2271                uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
2272                break;
2273        case XFS_DIR3_BLOCK_MAGIC:
2274        case XFS_DIR3_DATA_MAGIC:
2275        case XFS_DIR3_FREE_MAGIC:
2276                lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
2277                uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
2278                break;
2279        case XFS_ATTR3_RMT_MAGIC:
2280                /*
2281                 * Remote attr blocks are written synchronously, rather than
2282                 * being logged. That means they do not contain a valid LSN
2283                 * (i.e. transactionally ordered) in them, and hence any time we
2284                 * see a buffer to replay over the top of a remote attribute
2285                 * block we should simply do so.
2286                 */
2287                goto recover_immediately;
2288        case XFS_SB_MAGIC:
2289                /*
2290                 * superblock uuids are magic. We may or may not have a
2291                 * sb_meta_uuid on disk, but it will be set in the in-core
2292                 * superblock. We set the uuid pointer for verification
2293                 * according to the superblock feature mask to ensure we check
2294                 * the relevant UUID in the superblock.
2295                 */
2296                lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
2297                if (xfs_sb_version_hasmetauuid(&mp->m_sb))
2298                        uuid = &((struct xfs_dsb *)blk)->sb_meta_uuid;
2299                else
2300                        uuid = &((struct xfs_dsb *)blk)->sb_uuid;
2301                break;
2302        default:
2303                break;
2304        }
2305
2306        if (lsn != (xfs_lsn_t)-1) {
2307                if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
2308                        goto recover_immediately;
2309                return lsn;
2310        }
2311
2312        magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
2313        switch (magicda) {
2314        case XFS_DIR3_LEAF1_MAGIC:
2315        case XFS_DIR3_LEAFN_MAGIC:
2316        case XFS_DA3_NODE_MAGIC:
2317                lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
2318                uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
2319                break;
2320        default:
2321                break;
2322        }
2323
2324        if (lsn != (xfs_lsn_t)-1) {
2325                if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
2326                        goto recover_immediately;
2327                return lsn;
2328        }
2329
2330        /*
2331         * We do individual object checks on dquot and inode buffers as they
2332         * have their own individual LSN records. Also, we could have a stale
2333         * buffer here, so we have to at least recognise these buffer types.
2334         *
2335         * A noted complexity here is inode unlinked list processing - it logs
2336         * the inode directly in the buffer, but we don't know which inodes have
2337         * been modified, and there is no global buffer LSN. Hence we need to
2338         * recover all inode buffer types immediately. This problem will be
2339         * fixed by logical logging of the unlinked list modifications.
2340         */
2341        magic16 = be16_to_cpu(*(__be16 *)blk);
2342        switch (magic16) {
2343        case XFS_DQUOT_MAGIC:
2344        case XFS_DINODE_MAGIC:
2345                goto recover_immediately;
2346        default:
2347                break;
2348        }
2349
2350        /* unknown buffer contents, recover immediately */
2351
2352recover_immediately:
2353        return (xfs_lsn_t)-1;
2354
2355}
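
/*
 * How the returned LSN is typically consumed by the buffer replay path (a
 * hedged sketch; see xlog_recover_buffer_pass2() for the real check): if the
 * LSN stamped in the on-disk object is at or beyond the LSN of the change
 * being replayed, the buffer on disk is already as new as or newer than the
 * log item, so the change is skipped.  Comparing the packed 64-bit values
 * directly works here because the cycle occupies the high bits.
 */
static int
example_should_skip_replay(xfs_lsn_t disk_lsn, xfs_lsn_t current_lsn)
{
        if (disk_lsn == 0 || disk_lsn == (xfs_lsn_t)-1)
                return 0;       /* unstamped, unrecognised or v4 block: replay */
        return disk_lsn >= current_lsn;
}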
2356
2357/*
2358 * Validate the recovered buffer is of the correct type and attach the
2359 * appropriate buffer operations to them for writeback. Magic numbers are in a
2360 * few places:
2361 *      the first 16 bits of the buffer (inode buffer, dquot buffer),
2362 *      the first 32 bits of the buffer (most blocks),
2363 *      inside a struct xfs_da_blkinfo at the start of the buffer.
2364 */
2365static void
2366xlog_recover_validate_buf_type(
2367        struct xfs_mount        *mp,
2368        struct xfs_buf          *bp,
2369        xfs_buf_log_format_t    *buf_f,
2370        xfs_lsn_t               current_lsn)
2371{
2372        struct xfs_da_blkinfo   *info = bp->b_addr;
2373        __uint32_t              magic32;
2374        __uint16_t              magic16;
2375        __uint16_t              magicda;
2376        char                    *warnmsg = NULL;
2377
2378        /*
2379         * We can only do post recovery validation on items on CRC enabled
2380         * filesystems as we need to know when the buffer was written to be able
2381         * to determine if we should have replayed the item. If we replay old
2382         * metadata over a newer buffer, then it will enter a temporarily
2383         * inconsistent state resulting in verification failures. Hence for now
2384         * just avoid the verification stage for non-crc filesystems
2385         */
2386        if (!xfs_sb_version_hascrc(&mp->m_sb))
2387                return;
2388
2389        magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
2390        magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
2391        magicda = be16_to_cpu(info->magic);
2392        switch (xfs_blft_from_flags(buf_f)) {
2393        case XFS_BLFT_BTREE_BUF:
2394                switch (magic32) {
2395                case XFS_ABTB_CRC_MAGIC:
2396                case XFS_ABTC_CRC_MAGIC:
2397                case XFS_ABTB_MAGIC:
2398                case XFS_ABTC_MAGIC:
2399                        bp->b_ops = &xfs_allocbt_buf_ops;
2400                        break;
2401                case XFS_IBT_CRC_MAGIC:
2402                case XFS_FIBT_CRC_MAGIC:
2403                case XFS_IBT_MAGIC:
2404                case XFS_FIBT_MAGIC:
2405                        bp->b_ops = &xfs_inobt_buf_ops;
2406                        break;
2407                case XFS_BMAP_CRC_MAGIC:
2408                case XFS_BMAP_MAGIC:
2409                        bp->b_ops = &xfs_bmbt_buf_ops;
2410                        break;
2411                default:
2412                        warnmsg = "Bad btree block magic!";
2413                        break;
2414                }
2415                break;
2416        case XFS_BLFT_AGF_BUF:
2417                if (magic32 != XFS_AGF_MAGIC) {
2418                        warnmsg = "Bad AGF block magic!";
2419                        break;
2420                }
2421                bp->b_ops = &xfs_agf_buf_ops;
2422                break;
2423        case XFS_BLFT_AGFL_BUF:
2424                if (magic32 != XFS_AGFL_MAGIC) {
2425                        warnmsg = "Bad AGFL block magic!";
2426                        break;
2427                }
2428                bp->b_ops = &xfs_agfl_buf_ops;
2429                break;
2430        case XFS_BLFT_AGI_BUF:
2431                if (magic32 != XFS_AGI_MAGIC) {
2432                        warnmsg = "Bad AGI block magic!";
2433                        break;
2434                }
2435                bp->b_ops = &xfs_agi_buf_ops;
2436                break;
2437        case XFS_BLFT_UDQUOT_BUF:
2438        case XFS_BLFT_PDQUOT_BUF:
2439        case XFS_BLFT_GDQUOT_BUF:
2440#ifdef CONFIG_XFS_QUOTA
2441                if (magic16 != XFS_DQUOT_MAGIC) {
2442                        warnmsg = "Bad DQUOT block magic!";
2443                        break;
2444                }
2445                bp->b_ops = &xfs_dquot_buf_ops;
2446#else
2447                xfs_alert(mp,
2448        "Trying to recover dquots without QUOTA support built in!");
2449                ASSERT(0);
2450#endif
2451                break;
2452        case XFS_BLFT_DINO_BUF:
2453                if (magic16 != XFS_DINODE_MAGIC) {
2454                        warnmsg = "Bad INODE block magic!";
2455                        break;
2456                }
2457                bp->b_ops = &xfs_inode_buf_ops;
2458                break;
2459        case XFS_BLFT_SYMLINK_BUF:
2460                if (magic32 != XFS_SYMLINK_MAGIC) {
2461                        warnmsg = "Bad symlink block magic!";
2462                        break;
2463                }
2464                bp->b_ops = &xfs_symlink_buf_ops;
2465                break;
2466        case XFS_BLFT_DIR_BLOCK_BUF:
2467                if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
2468                    magic32 != XFS_DIR3_BLOCK_MAGIC) {
2469                        warnmsg = "Bad dir block magic!";
2470                        break;
2471                }
2472                bp->b_ops = &xfs_dir3_block_buf_ops;
2473                break;
2474        case XFS_BLFT_DIR_DATA_BUF:
2475                if (magic32 != XFS_DIR2_DATA_MAGIC &&
2476                    magic32 != XFS_DIR3_DATA_MAGIC) {
2477                        warnmsg = "Bad dir data magic!";
2478                        break;
2479                }
2480                bp->b_ops = &xfs_dir3_data_buf_ops;
2481                break;
2482        case XFS_BLFT_DIR_FREE_BUF:
2483                if (magic32 != XFS_DIR2_FREE_MAGIC &&
2484                    magic32 != XFS_DIR3_FREE_MAGIC) {
2485                        warnmsg = "Bad dir3 free magic!";
2486                        break;
2487                }
2488                bp->b_ops = &xfs_dir3_free_buf_ops;
2489                break;
2490        case XFS_BLFT_DIR_LEAF1_BUF:
2491                if (magicda != XFS_DIR2_LEAF1_MAGIC &&
2492                    magicda != XFS_DIR3_LEAF1_MAGIC) {
2493                        warnmsg = "Bad dir leaf1 magic!";
2494                        break;
2495                }
2496                bp->b_ops = &xfs_dir3_leaf1_buf_ops;
2497                break;
2498        case XFS_BLFT_DIR_LEAFN_BUF:
2499                if (magicda != XFS_DIR2_LEAFN_MAGIC &&
2500                    magicda != XFS_DIR3_LEAFN_MAGIC) {
2501                        warnmsg = "Bad dir leafn magic!";
2502                        break;
2503                }
2504                bp->b_ops = &xfs_dir3_leafn_buf_ops;
2505                break;
2506        case XFS_BLFT_DA_NODE_BUF:
2507                if (magicda != XFS_DA_NODE_MAGIC &&
2508                    magicda != XFS_DA3_NODE_MAGIC) {
2509                        warnmsg = "Bad da node magic!";
2510                        break;
2511                }
2512                bp->b_ops = &xfs_da3_node_buf_ops;
2513                break;
2514        case XFS_BLFT_ATTR_LEAF_BUF:
2515                if (magicda != XFS_ATTR_LEAF_MAGIC &&
2516                    magicda != XFS_ATTR3_LEAF_MAGIC) {
2517                        warnmsg = "Bad attr leaf magic!";
2518                        break;
2519                }
2520                bp->b_ops = &xfs_attr3_leaf_buf_ops;
2521                break;
2522        case XFS_BLFT_ATTR_RMT_BUF:
2523                if (magic32 != XFS_ATTR3_RMT_MAGIC) {
2524                        warnmsg = "Bad attr remote magic!";
2525                        break;
2526                }
2527                bp->b_ops = &xfs_attr3_rmt_buf_ops;
2528                break;
2529        case XFS_BLFT_SB_BUF:
2530                if (magic32 != XFS_SB_MAGIC) {
2531                        warnmsg = "Bad SB block magic!";
2532                        break;
2533                }
2534                bp->b_ops = &xfs_sb_buf_ops;
2535                break;
2536#ifdef CONFIG_XFS_RT
2537        case XFS_BLFT_RTBITMAP_BUF:
2538        case XFS_BLFT_RTSUMMARY_BUF:
2539                /* no magic numbers for verification of RT buffers */
2540                bp->b_ops = &xfs_rtbuf_ops;
2541                break;
2542#endif /* CONFIG_XFS_RT */
2543        default:
2544                xfs_warn(mp, "Unknown buffer type %d!",
2545                         xfs_blft_from_flags(buf_f));
2546                break;
2547        }
2548
2549        /*
2550         * Nothing else to do in the case of a NULL current LSN as this means
2551         * the buffer is more recent than the change in the log and will be
2552         * skipped.
2553         */
2554        if (current_lsn == NULLCOMMITLSN)
2555                return;
2556
2557        if (warnmsg) {
2558                xfs_warn(mp, "%s", warnmsg);
2559                ASSERT(0);
2560        }
2561
2562        /*
2563         * We must update the metadata LSN of the buffer as it is written out to
2564         * ensure that older transactions never replay over this one and corrupt
2565         * the buffer. This can occur if log recovery is interrupted at some
2566         * point after the current transaction completes, at which point a
2567         * subsequent mount starts recovery from the beginning.
2568         *
2569         * Write verifiers update the metadata LSN from log items attached to
2570         * the buffer. Therefore, initialize a bli purely to carry the LSN to
2571         * the verifier. We'll clean it up in our ->iodone() callback.
2572         */
2573        if (bp->b_ops) {
2574                struct xfs_buf_log_item *bip;
2575
2576                ASSERT(!bp->b_iodone || bp->b_iodone == xlog_recover_iodone);
2577                bp->b_iodone = xlog_recover_iodone;
2578                xfs_buf_item_init(bp, mp);
2579                bip = bp->b_fspriv;
2580                bip->bli_item.li_lsn = current_lsn;
2581        }
2582}
2583
2584/*
2585 * Perform a 'normal' buffer recovery.  Each logged region of the
2586 * buffer should be copied over the corresponding region in the
2587 * given buffer.  The bitmap in the buf log format structure indicates
2588 * where to place the logged data.
2589 */
2590STATIC void
2591xlog_recover_do_reg_buffer(
2592        struct xfs_mount        *mp,
2593        xlog_recover_item_t     *item,
2594        struct xfs_buf          *bp,
2595        xfs_buf_log_format_t    *buf_f,
2596        xfs_lsn_t               current_lsn)
2597{
2598        int                     i;
2599        int                     bit;
2600        int                     nbits;
2601        int                     error;
2602
2603        trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
2604
2605        bit = 0;
2606        i = 1;  /* 0 is the buf format structure */
2607        while (1) {
2608                bit = xfs_next_bit(buf_f->blf_data_map,
2609                                   buf_f->blf_map_size, bit);
2610                if (bit == -1)
2611                        break;
2612                nbits = xfs_contig_bits(buf_f->blf_data_map,
2613                                        buf_f->blf_map_size, bit);
2614                ASSERT(nbits > 0);
2615                ASSERT(item->ri_buf[i].i_addr != NULL);
2616                ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
2617                ASSERT(BBTOB(bp->b_io_length) >=
2618                       ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
2619
2620                /*
2621                 * The dirty regions logged in the buffer, even though
2622                 * contiguous, may span multiple chunks. This is because the
2623                 * dirty region may span a physical page boundary in a buffer
2624                 * and hence be split into two separate vectors for writing into
2625                 * the log. Hence we need to trim nbits back to the length of
2626                 * the current region being copied out of the log.
2627                 */
2628                if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
2629                        nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
2630
2631                /*
2632                 * Do a sanity check if this is a dquot buffer. Just checking
2633                 * the first dquot in the buffer should do. XXX: This is
2634                 * probably a good thing to do for other buf types also.
2635                 */
2636                error = 0;
2637                if (buf_f->blf_flags &
2638                   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2639                        if (item->ri_buf[i].i_addr == NULL) {
2640                                xfs_alert(mp,
2641                                        "XFS: NULL dquot in %s.", __func__);
2642                                goto next;
2643                        }
2644                        if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
2645                                xfs_alert(mp,
2646                                        "XFS: dquot too small (%d) in %s.",
2647                                        item->ri_buf[i].i_len, __func__);
2648                                goto next;
2649                        }
2650                        error = xfs_dqcheck(mp, item->ri_buf[i].i_addr,
2651                                               -1, 0, XFS_QMOPT_DOWARN,
2652                                               "dquot_buf_recover");
2653                        if (error)
2654                                goto next;
2655                }
2656
2657                memcpy(xfs_buf_offset(bp,
2658                        (uint)bit << XFS_BLF_SHIFT),    /* dest */
2659                        item->ri_buf[i].i_addr,         /* source */
2660                        nbits<<XFS_BLF_SHIFT);          /* length */
2661 next:
2662                i++;
2663                bit += nbits;
2664        }
2665
2666        /* Shouldn't be any more regions */
2667        ASSERT(i == item->ri_total);
2668
2669        xlog_recover_validate_buf_type(mp, bp, buf_f, current_lsn);
2670}
2671
2672/*
2673 * Perform a dquot buffer recovery.
2674 * Simple algorithm: if we have found a QUOTAOFF log item of the same type
2675 * (i.e. USR or GRP), then just toss this buffer away; don't recover it.
2676 * Else, treat it as a regular buffer and do recovery.
2677 *
2678 * Return false if the buffer was tossed and true if we recovered the buffer to
2679 * indicate to the caller whether the buffer needs writing.
2680 */
2681STATIC bool
2682xlog_recover_do_dquot_buffer(
2683        struct xfs_mount                *mp,
2684        struct xlog                     *log,
2685        struct xlog_recover_item        *item,
2686        struct xfs_buf                  *bp,
2687        struct xfs_buf_log_format       *buf_f)
2688{
2689        uint                    type;
2690
2691        trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2692
2693        /*
2694         * Filesystems are required to send in quota flags at mount time.
2695         */
2696        if (!mp->m_qflags)
2697                return false;
2698
2699        type = 0;
2700        if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2701                type |= XFS_DQ_USER;
2702        if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2703                type |= XFS_DQ_PROJ;
2704        if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2705                type |= XFS_DQ_GROUP;
2706        /*
2707         * This type of quota was turned off, so ignore this buffer.
2708         */
2709        if (log->l_quotaoffs_flag & type)
2710                return false;
2711
2712        xlog_recover_do_reg_buffer(mp, item, bp, buf_f, NULLCOMMITLSN);
2713        return true;
2714}
2715
2716/*
2717 * This routine replays a modification made to a buffer at runtime.
2718 * There are actually two types of buffer, regular and inode, which
2719 * are handled differently.  Inode buffers differ in that we only
2720 * recover a specific set of data from them, namely
2721 * the inode di_next_unlinked fields.  This is because all other inode
2722 * data is actually logged via inode records and any data we replay
2723 * here which overlaps that may be stale.
2724 *
2725 * When meta-data buffers are freed at run time we log a buffer item
2726 * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2727 * of the buffer in the log should not be replayed at recovery time.
2728 * This is so that if the blocks covered by the buffer are reused for
2729 * file data before we crash we don't end up replaying old, freed
2730 * meta-data into a user's file.
2731 *
2732 * To handle the cancellation of buffer log items, we make two passes
2733 * over the log during recovery.  During the first we build a table of
2734 * those buffers which have been cancelled, and during the second we
2735 * only replay those buffers which do not have corresponding cancel
2736 * records in the table.  See xlog_recover_buffer_pass[1,2] above
2737 * for more details on the implementation of the table of cancel records.
2738 */
2739STATIC int
2740xlog_recover_buffer_pass2(
2741        struct xlog                     *log,
2742        struct list_head                *buffer_list,
2743        struct xlog_recover_item        *item,
2744        xfs_lsn_t                       current_lsn)
2745{
2746        xfs_buf_log_format_t    *buf_f = item->ri_buf[0].i_addr;
2747        xfs_mount_t             *mp = log->l_mp;
2748        xfs_buf_t               *bp;
2749        int                     error;
2750        uint                    buf_flags;
2751        xfs_lsn_t               lsn;
2752
2753        /*
2754         * In this pass we want to recover only those buffers which have
2755         * not been cancelled and are not cancellation buffers themselves.
2756         */
2757        if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2758                        buf_f->blf_len, buf_f->blf_flags)) {
2759                trace_xfs_log_recover_buf_cancel(log, buf_f);
2760                return 0;
2761        }
2762
2763        trace_xfs_log_recover_buf_recover(log, buf_f);
2764
2765        buf_flags = 0;
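            /*
             * Inode cluster buffers are normally cached unmapped, and
             * xlog_recover_do_inode_buffer() only modifies the unlinked
             * pointers through xfs_buf_offset(), so there is no need to map
             * the buffer data here.
             */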
2766        if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
2767                buf_flags |= XBF_UNMAPPED;
2768
2769        bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2770                          buf_flags, NULL);
2771        if (!bp)
2772                return -ENOMEM;
2773        error = bp->b_error;
2774        if (error) {
2775                xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2776                goto out_release;
2777        }
2778
2779        /*
2780         * Recover the buffer only if we get an LSN from it and it's less than
2781         * the lsn of the transaction we are replaying.
2782         *
2783         * Note that we have to be extremely careful of readahead here.
2784         * Readahead does not attach verifiers to the buffers, so if we don't
2785         * actually do any replay after readahead because the LSN we found in
2786         * the buffer is more recent than the current transaction, then we
2787         * need to attach the verifier directly. Failure to do so means that
2788         * future recovery actions (e.g. EFI and unlinked list recovery) can
2789         * operate on the buffers without the verifier attached. This
2790         * can lead to blocks on disk having the correct content but a stale
2791         * CRC.
2792         *
2793         * It is safe to assume these clean buffers are currently up to date.
2794         * If the buffer is dirtied by a later transaction being replayed, then
2795         * the verifier will be reset to match whatever recover turns that
2796         * buffer into.
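             *
             * A zero or -1 LSN returned below means we did not get a usable
             * LSN from the buffer, in which case the buffer is always
             * replayed.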
2797         */
2798        lsn = xlog_recover_get_buf_lsn(mp, bp);
2799        if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
2800                trace_xfs_log_recover_buf_skip(log, buf_f);
2801                xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
2802                goto out_release;
2803        }
2804
2805        if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2806                error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2807                if (error)
2808                        goto out_release;
2809        } else if (buf_f->blf_flags &
2810                  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2811                bool    dirty;
2812
2813                dirty = xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2814                if (!dirty)
2815                        goto out_release;
2816        } else {
2817                xlog_recover_do_reg_buffer(mp, item, bp, buf_f, current_lsn);
2818        }
2819
2820        /*
2821         * Perform delayed write on the buffer.  Asynchronous writes will be
2822         * slower when taking into account all the buffers to be flushed.
2823         *
2824         * Also make sure that only inode buffers with good sizes stay in
2825         * the buffer cache.  The kernel moves inodes in buffers of 1 block
2826         * or mp->m_inode_cluster_size bytes, whichever is bigger.  The inode
2827         * buffers in the log can be a different size if the log was generated
2828         * by an older kernel using unclustered inode buffers or a newer kernel
2829         * running with a different inode cluster size.  Regardless, if
2830         * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size)
2831         * for *our* value of mp->m_inode_cluster_size, then we need to keep
2832         * the buffer out of the buffer cache so that the buffer won't
2833         * overlap with future reads of those inodes.
2834         */
2835        if (XFS_DINODE_MAGIC ==
2836            be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2837            (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
2838                        (__uint32_t)log->l_mp->m_inode_cluster_size))) {
2839                xfs_buf_stale(bp);
2840                error = xfs_bwrite(bp);
2841        } else {
2842                ASSERT(bp->b_target->bt_mount == mp);
2843                bp->b_iodone = xlog_recover_iodone;
2844                xfs_buf_delwri_queue(bp, buffer_list);
2845        }
2846
2847out_release:
2848        xfs_buf_relse(bp);
2849        return error;
2850}
2851
2852/*
2853 * Inode fork owner changes
2854 *
2855 * If we have been told that we have to reparent the inode fork, it's because an
2856 * extent swap operation on a CRC enabled filesystem has been done and we are
2857 * replaying it. We need to walk the BMBT of the appropriate fork and change the
2858 * owners of it.
2859 *
2860 * The complexity here is that we don't have an inode context to work with, so
2861 * after we've replayed the inode we need to instantiate one.  This is where the
2862 * fun begins.
2863 *
2864 * We are in the middle of log recovery, so we can't run transactions. That
2865 * means we cannot use cache coherent inode instantiation via xfs_iget(), as
2866 * that will result in the corresponding iput() running the inode through
2867 * xfs_inactive(). If we've just replayed an inode core that changes the link
2868 * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
2869 * transactions (bad!).
2870 *
2871 * So, to avoid this, we instantiate an inode directly from the inode core we've
2872 * just recovered. We have the buffer still locked, and all we really need to
2873 * instantiate is the inode core and the forks being modified. We can do this
2874 * manually, then run the inode btree owner change, and then tear down the
2875 * xfs_inode without having to run any transactions at all.
2876 *
2877 * Also, because we don't have a transaction context available here but still
2878 * need to gather all the buffers we modify for writeback, we pass the
2879 * buffer_list to the operation to use instead.
2880 */
2881
2882STATIC int
2883xfs_recover_inode_owner_change(
2884        struct xfs_mount        *mp,
2885        struct xfs_dinode       *dip,
2886        struct xfs_inode_log_format *in_f,
2887        struct list_head        *buffer_list)
2888{
2889        struct xfs_inode        *ip;
2890        int                     error;
2891
2892        ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
2893
2894        ip = xfs_inode_alloc(mp, in_f->ilf_ino);
2895        if (!ip)
2896                return -ENOMEM;
2897
2898        /* instantiate the inode */
2899        xfs_inode_from_disk(ip, dip);
2900        ASSERT(ip->i_d.di_version >= 3);
2901
2902        error = xfs_iformat_fork(ip, dip);
2903        if (error)
2904                goto out_free_ip;
2905
2906
2907        if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
2908                ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
2909                error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
2910                                              ip->i_ino, buffer_list);
2911                if (error)
2912                        goto out_free_ip;
2913        }
2914
2915        if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
2916                ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
2917                error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
2918                                              ip->i_ino, buffer_list);
2919                if (error)
2920                        goto out_free_ip;
2921        }
2922
2923out_free_ip:
2924        xfs_inode_free(ip);
2925        return error;
2926}
2927
2928STATIC int
2929xlog_recover_inode_pass2(
2930        struct xlog                     *log,
2931        struct list_head                *buffer_list,
2932        struct xlog_recover_item        *item,
2933        xfs_lsn_t                       current_lsn)
2934{
2935        xfs_inode_log_format_t  *in_f;
2936        xfs_mount_t             *mp = log->l_mp;
2937        xfs_buf_t               *bp;
2938        xfs_dinode_t            *dip;
2939        int                     len;
2940        char                    *src;
2941        char                    *dest;
2942        int                     error;
2943        int                     attr_index;
2944        uint                    fields;
2945        struct xfs_log_dinode   *ldip;
2946        uint                    isize;
2947        int                     need_free = 0;
2948
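            /*
             * The inode log format in the log may have been written with a
             * different size or padding (e.g. by a 32 bit kernel). If it is
             * not the native structure size, convert it before use.
             */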
2949        if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2950                in_f = item->ri_buf[0].i_addr;
2951        } else {
2952                in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
2953                need_free = 1;
2954                error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2955                if (error)
2956                        goto error;
2957        }
2958
2959        /*
2960         * Inode buffers can be freed; look out for that case
2961         * and do not replay the inode.
2962         */
2963        if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2964                                        in_f->ilf_len, 0)) {
2965                error = 0;
2966                trace_xfs_log_recover_inode_cancel(log, in_f);
2967                goto error;
2968        }
2969        trace_xfs_log_recover_inode_recover(log, in_f);
2970
2971        bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
2972                          &xfs_inode_buf_ops);
2973        if (!bp) {
2974                error = -ENOMEM;
2975                goto error;
2976        }
2977        error = bp->b_error;
2978        if (error) {
2979                xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2980                goto out_release;
2981        }
2982        ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2983        dip = xfs_buf_offset(bp, in_f->ilf_boffset);
2984
2985        /*
2986         * Make sure the place we're flushing out to really looks
2987         * like an inode!
2988         */
2989        if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
2990                xfs_alert(mp,
2991        "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2992                        __func__, dip, bp, in_f->ilf_ino);
2993                XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2994                                 XFS_ERRLEVEL_LOW, mp);
2995                error = -EFSCORRUPTED;
2996                goto out_release;
2997        }
2998        ldip = item->ri_buf[1].i_addr;
2999        if (unlikely(ldip->di_magic != XFS_DINODE_MAGIC)) {
3000                xfs_alert(mp,
3001                        "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
3002                        __func__, item, in_f->ilf_ino);
3003                XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
3004                                 XFS_ERRLEVEL_LOW, mp);
3005                error = -EFSCORRUPTED;
3006                goto out_release;
3007        }
3008
3009        /*
3010         * If the inode has an LSN in it, recover the inode only if it's less
3011         * than the lsn of the transaction we are replaying. Note: we still
3012         * need to replay an owner change even though the inode is more recent
3013         * than the transaction as there is no guarantee that all the btree
3014         * blocks are more recent than this transaction, too.
3015         */
3016        if (dip->di_version >= 3) {
3017                xfs_lsn_t       lsn = be64_to_cpu(dip->di_lsn);
3018
3019                if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
3020                        trace_xfs_log_recover_inode_skip(log, in_f);
3021                        error = 0;
3022                        goto out_owner_change;
3023                }
3024        }
3025
3026        /*
3027         * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
3028         * are transactional and if ordering is necessary we can determine that
3029         * more accurately by the LSN field in the V3 inode core. Don't trust
3030         * the inode versions as we might be changing them here - use the
3031         * superblock flag to determine whether we need to look at di_flushiter
3032         * to skip replay when the on disk inode is newer than the log one.
3033         */
3034        if (!xfs_sb_version_hascrc(&mp->m_sb) &&
3035            ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
3036                /*
3037                 * Deal with the counter wrapping: DI_MAX_FLUSH on disk with a
3038                 * much smaller log value means the log copy is actually newer.
3039                 */
3040                if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
3041                    ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
3042                        /* do nothing */
3043                } else {
3044                        trace_xfs_log_recover_inode_skip(log, in_f);
3045                        error = 0;
3046                        goto out_release;
3047                }
3048        }
3049
3050        /* Take the opportunity to reset the flush iteration count */
3051        ldip->di_flushiter = 0;
3052
3053        if (unlikely(S_ISREG(ldip->di_mode))) {
3054                if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
3055                    (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
3056                        XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
3057                                         XFS_ERRLEVEL_LOW, mp, ldip);
3058                        xfs_alert(mp,
3059                "%s: Bad regular inode log record, rec ptr 0x%p, "
3060                "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
3061                                __func__, item, dip, bp, in_f->ilf_ino);
3062                        error = -EFSCORRUPTED;
3063                        goto out_release;
3064                }
3065        } else if (unlikely(S_ISDIR(ldip->di_mode))) {
3066                if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
3067                    (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
3068                    (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
3069                        XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
3070                                             XFS_ERRLEVEL_LOW, mp, ldip);
3071                        xfs_alert(mp,
3072                "%s: Bad dir inode log record, rec ptr 0x%p, "
3073                "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
3074                                __func__, item, dip, bp, in_f->ilf_ino);
3075                        error = -EFSCORRUPTED;
3076                        goto out_release;
3077                }
3078        }
3079        if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
3080                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
3081                                     XFS_ERRLEVEL_LOW, mp, ldip);
3082                xfs_alert(mp,
3083        "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
3084        "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
3085                        __func__, item, dip, bp, in_f->ilf_ino,
3086                        ldip->di_nextents + ldip->di_anextents,
3087                        ldip->di_nblocks);
3088                error = -EFSCORRUPTED;
3089                goto out_release;
3090        }
3091        if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
3092                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
3093                                     XFS_ERRLEVEL_LOW, mp, ldip);
3094                xfs_alert(mp,
3095        "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
3096        "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
3097                        item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
3098                error = -EFSCORRUPTED;
3099                goto out_release;
3100        }
3101        isize = xfs_log_dinode_size(ldip->di_version);
3102        if (unlikely(item->ri_buf[1].i_len > isize)) {
3103                XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
3104                                     XFS_ERRLEVEL_LOW, mp, ldip);
3105                xfs_alert(mp,
3106                        "%s: Bad inode log record length %d, rec ptr 0x%p",
3107                        __func__, item->ri_buf[1].i_len, item);
3108                error = -EFSCORRUPTED;
3109                goto out_release;
3110        }
3111
3112        /* recover the log dinode into the on-disk inode */
3113        xfs_log_dinode_to_disk(ldip, dip);
3114
3115        /* the rest is in on-disk format */
3116        if (item->ri_buf[1].i_len > isize) {
3117                memcpy((char *)dip + isize,
3118                        item->ri_buf[1].i_addr + isize,
3119                        item->ri_buf[1].i_len - isize);
3120        }
3121
3122        fields = in_f->ilf_fields;
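            /*
             * The dev and UUID cases share the ilf_u union in the inode log
             * format, so at most one of these two flags can be set; restore
             * whichever one was logged into the on-disk inode.
             */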
3123        switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
3124        case XFS_ILOG_DEV:
3125                xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
3126                break;
3127        case XFS_ILOG_UUID:
3128                memcpy(XFS_DFORK_DPTR(dip),
3129                       &in_f->ilf_u.ilfu_uuid,
3130                       sizeof(uuid_t));
3131                break;
3132        }
3133
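            /*
             * If only the inode log format and the inode core were logged
             * (ilf_size == 2), there are no fork data regions to copy, so
             * skip straight to the owner change and writeback path.
             */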
3134        if (in_f->ilf_size == 2)
3135                goto out_owner_change;
3136        len = item->ri_buf[2].i_len;
3137        src = item->ri_buf[2].i_addr;
3138        ASSERT(in_f->ilf_size <= 4);
3139        ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
3140        ASSERT(!(fields & XFS_ILOG_DFORK) ||
3141               (len == in_f->ilf_dsize));
3142
3143        switch (fields & XFS_ILOG_DFORK) {
3144        case XFS_ILOG_DDATA:
3145        case XFS_ILOG_DEXT:
3146                memcpy(XFS_DFORK_DPTR(dip), src, len);
3147                break;
3148
3149        case XFS_ILOG_DBROOT:
3150                xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
3151                                 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
3152                                 XFS_DFORK_DSIZE(dip, mp));
3153                break;
3154
3155        default:
3156                /*
3157                 * There are no data fork flags set.
3158                 */
3159                ASSERT((fields & XFS_ILOG_DFORK) == 0);
3160                break;
3161        }
3162
3163        /*
3164         * If we logged any attribute data, recover it.  There may or
3165         * may not have been any other non-core data logged in this
3166         * transaction.
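             *
             * When both the data and attr forks were logged, the attr fork
             * region follows the data fork region, hence region index 3
             * rather than 2 below.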
3167         */
3168        if (in_f->ilf_fields & XFS_ILOG_AFORK) {
3169                if (in_f->ilf_fields & XFS_ILOG_DFORK) {
3170                        attr_index = 3;
3171                } else {
3172                        attr_index = 2;
3173                }
3174                len = item->ri_buf[attr_index].i_len;
3175                src = item->ri_buf[attr_index].i_addr;
3176                ASSERT(len == in_f->ilf_asize);
3177
3178                switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
3179                case XFS_ILOG_ADATA:
3180                case XFS_ILOG_AEXT:
3181                        dest = XFS_DFORK_APTR(dip);
3182                        ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
3183                        memcpy(dest, src, len);
3184                        break;
3185
3186                case XFS_ILOG_ABROOT:
3187                        dest = XFS_DFORK_APTR(dip);
3188                        xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
3189                                         len, (xfs_bmdr_block_t*)dest,
3190                                         XFS_DFORK_ASIZE(dip, mp));
3191                        break;
3192
3193                default:
3194                        xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
3195                        ASSERT(0);
3196                        error = -EIO;
3197                        goto out_release;
3198                }
3199        }
3200
3201out_owner_change:
3202        if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
3203                error = xfs_recover_inode_owner_change(mp, dip, in_f,
3204                                                       buffer_list);
3205        /* re-generate the checksum. */
3206        xfs_dinode_calc_crc(log->l_mp, dip);
3207
3208        ASSERT(bp->b_target->bt_mount == mp);
3209        bp->b_iodone = xlog_recover_iodone;
3210        xfs_buf_delwri_queue(bp, buffer_list);
3211
3212out_release:
3213        xfs_buf_relse(bp);
3214error:
3215        if (need_free)
3216                kmem_free(in_f);
3217        return error;
3218}
3219
3220/*
3221 * Recover QUOTAOFF records. We simply make a note of it in the xlog
3222 * structure, so that we know not to do any dquot item or dquot buffer
3223 * recovery of that type.
3224 */
3225STATIC int
3226xlog_recover_quotaoff_pass1(
3227        struct xlog                     *log,
3228        struct xlog_recover_item        *item)
3229{
3230        xfs_qoff_logformat_t    *qoff_f = item->ri_buf[0].i_addr;
3231        ASSERT(qoff_f);
3232
3233        /*
3234         * The logitem format's flag tells us if this was user quotaoff,
3235         * group/project quotaoff or both.
3236         */
3237        if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
3238                log->l_quotaoffs_flag |= XFS_DQ_USER;
3239        if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
3240                log->l_quotaoffs_flag |= XFS_DQ_PROJ;
3241        if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
3242                log->l_quotaoffs_flag |= XFS_DQ_GROUP;
3243
3244        return 0;
3245}
3246
3247/*
3248 * Recover a dquot record
3249 */
3250STATIC int
3251xlog_recover_dquot_pass2(
3252        struct xlog                     *log,
3253        struct list_head                *buffer_list,
3254        struct xlog_recover_item        *item,
3255        xfs_lsn_t                       current_lsn)
3256{
3257        xfs_mount_t             *mp = log->l_mp;
3258        xfs_buf_t               *bp;
3259        struct xfs_disk_dquot   *ddq, *recddq;
3260        int                     error;
3261        xfs_dq_logformat_t      *dq_f;
3262        uint                    type;
3263
3264
3265        /*
3266         * Filesystems are required to send in quota flags at mount time.
3267         */
3268        if (mp->m_qflags == 0)
3269                return 0;
3270
3271        recddq = item->ri_buf[1].i_addr;
3272        if (recddq == NULL) {
3273                xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
3274                return -EIO;
3275        }
3276        if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
3277                xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
3278                        item->ri_buf[1].i_len, __func__);
3279                return -EIO;
3280        }
3281
3282        /*
3283         * This type of quota was turned off, so ignore this record.
3284         */
3285        type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
3286        ASSERT(type);
3287        if (log->l_quotaoffs_flag & type)
3288                return 0;
3289
3290        /*
3291         * At this point we know that quota was _not_ turned off.
3292         * Since the mount flags are not indicating to us otherwise, this
3293         * must mean that quota is on, and the dquot needs to be replayed.
3294         * Remember that we may not have fully recovered the superblock yet,
3295         * so we can't do the usual trick of looking at the SB quota bits.
3296         *
3297         * The other possibility, of course, is that the quota subsystem was
3298         * removed since the last mount - ENOSYS.
3299         */
3300        dq_f = item->ri_buf[0].i_addr;
3301        ASSERT(dq_f);
3302        error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
3303                           "xlog_recover_dquot_pass2 (log copy)");
3304        if (error)
3305                return -EIO;
3306        ASSERT(dq_f->qlf_len == 1);
3307
3308        /*
3309         * At this point we are assuming that the dquots have been allocated
3310         * and hence the buffer has valid dquots stamped in it. It should,
3311         * therefore, pass verifier validation. If the dquot is bad, then
3312         * we'll return an error here, so we don't need to specifically check
3313         * the dquot in the buffer after the verifier has run.
3314         */
3315        error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
3316                                   XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
3317                                   &xfs_dquot_buf_ops);
3318        if (error)
3319                return error;
3320
3321        ASSERT(bp);
3322        ddq = xfs_buf_offset(bp, dq_f->qlf_boffset);
3323
3324        /*
3325         * If the dquot has an LSN in it, recover the dquot only if it's less
3326         * than the lsn of the transaction we are replaying.
3327         */
3328        if (xfs_sb_version_hascrc(&mp->m_sb)) {
3329                struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
3330                xfs_lsn_t       lsn = be64_to_cpu(dqb->dd_lsn);
3331
3332                if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
3333                        goto out_release;
3334                }
3335        }
3336
3337        memcpy(ddq, recddq, item->ri_buf[1].i_len);
3338        if (xfs_sb_version_hascrc(&mp->m_sb)) {
3339                xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
3340                                 XFS_DQUOT_CRC_OFF);
3341        }
3342
3343        ASSERT(dq_f->qlf_size == 2);
3344        ASSERT(bp->b_target->bt_mount == mp);
3345        bp->b_iodone = xlog_recover_iodone;
3346        xfs_buf_delwri_queue(bp, buffer_list);
3347
3348out_release:
3349        xfs_buf_relse(bp);
3350        return 0;
3351}
3352
3353/*
3354 * This routine is called to create an in-core extent free intent
3355 * item from the efi format structure which was logged on disk.
3356 * It allocates an in-core efi, copies the extents from the format
3357 * structure into it, and adds the efi to the AIL with the given
3358 * LSN.
3359 */
3360STATIC int
3361xlog_recover_efi_pass2(
3362        struct xlog                     *log,
3363        struct xlog_recover_item        *item,
3364        xfs_lsn_t                       lsn)
3365{
3366        int                             error;
3367        struct xfs_mount                *mp = log->l_mp;
3368        struct xfs_efi_log_item         *efip;
3369        struct xfs_efi_log_format       *efi_formatp;
3370
3371        efi_formatp = item->ri_buf[0].i_addr;
3372
3373        efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
3374        error = xfs_efi_copy_format(&item->ri_buf[0], &efip->efi_format);
3375        if (error) {
3376                xfs_efi_item_free(efip);
3377                return error;
3378        }
3379        atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
3380
3381        spin_lock(&log->l_ailp->xa_lock);
3382        /*
3383         * The EFI has two references: one for the EFD and one for the EFI to ensure
3384         * it makes it into the AIL. Insert the EFI into the AIL directly and
3385         * drop the EFI reference. Note that xfs_trans_ail_update() drops the
3386         * AIL lock.
3387         */
3388        xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
3389        xfs_efi_release(efip);
3390        return 0;
3391}
3392
3393
3394/*
3395 * This routine is called when an EFD format structure is found in a committed
3396 * transaction in the log. Its purpose is to cancel the corresponding EFI if it
3397 * was still in the log. To do this it searches the AIL for the EFI with an id
3398 * equal to that in the EFD format structure. If we find it we drop the EFD
3399 * reference, which removes the EFI from the AIL and frees it.
3400 */
3401STATIC int
3402xlog_recover_efd_pass2(
3403        struct xlog                     *log,
3404        struct xlog_recover_item        *item)
3405{
3406        xfs_efd_log_format_t    *efd_formatp;
3407        xfs_efi_log_item_t      *efip = NULL;
3408        xfs_log_item_t          *lip;
3409        __uint64_t              efi_id;
3410        struct xfs_ail_cursor   cur;
3411        struct xfs_ail          *ailp = log->l_ailp;
3412
3413        efd_formatp = item->ri_buf[0].i_addr;
3414        ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
3415                ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
3416               (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
3417                ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
3418        efi_id = efd_formatp->efd_efi_id;
3419
3420        /*
3421         * Search for the EFI with the id in the EFD format structure in the
3422         * AIL.
3423         */
3424        spin_lock(&ailp->xa_lock);
3425        lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
3426        while (lip != NULL) {
3427                if (lip->li_type == XFS_LI_EFI) {
3428                        efip = (xfs_efi_log_item_t *)lip;
3429                        if (efip->efi_format.efi_id == efi_id) {
3430                                /*
3431                                 * Drop the EFD reference to the EFI. This
3432                                 * removes the EFI from the AIL and frees it.
3433                                 */
3434                                spin_unlock(&ailp->xa_lock);
3435                                xfs_efi_release(efip);
3436                                spin_lock(&ailp->xa_lock);
3437                                break;
3438                        }
3439                }
3440                lip = xfs_trans_ail_cursor_next(ailp, &cur);
3441        }
3442
3443        xfs_trans_ail_cursor_done(&cur);
3444        spin_unlock(&ailp->xa_lock);
3445
3446        return 0;
3447}
3448
3449/*
3450 * This routine is called when an inode create format structure is found in a
3451 * committed transaction in the log.  Its purpose is to initialise the inodes
3452 * being allocated on disk. This requires us to get inode cluster buffers that
3453 * match the range to be initialised, stamped with inode templates and written
3454 * by delayed write so that subsequent modifications will hit the cached buffer
3455 * and only need writing out at the end of recovery.
3456 */
3457STATIC int
3458xlog_recover_do_icreate_pass2(
3459        struct xlog             *log,
3460        struct list_head        *buffer_list,
3461        xlog_recover_item_t     *item)
3462{
3463        struct xfs_mount        *mp = log->l_mp;
3464        struct xfs_icreate_log  *icl;
3465        xfs_agnumber_t          agno;
3466        xfs_agblock_t           agbno;
3467        unsigned int            count;
3468        unsigned int            isize;
3469        xfs_agblock_t           length;
3470        int                     blks_per_cluster;
3471        int                     bb_per_cluster;
3472        int                     cancel_count;
3473        int                     nbufs;
3474        int                     i;
3475
3476        icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
3477        if (icl->icl_type != XFS_LI_ICREATE) {
3478                xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
3479                return -EINVAL;
3480        }
3481
3482        if (icl->icl_size != 1) {
3483                xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
3484                return -EINVAL;
3485        }
3486
3487        agno = be32_to_cpu(icl->icl_ag);
3488        if (agno >= mp->m_sb.sb_agcount) {
3489                xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
3490                return -EINVAL;
3491        }
3492        agbno = be32_to_cpu(icl->icl_agbno);
3493        if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
3494                xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
3495                return -EINVAL;
3496        }
3497        isize = be32_to_cpu(icl->icl_isize);
3498        if (isize != mp->m_sb.sb_inodesize) {
3499                xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
3500                return -EINVAL;
3501        }
3502        count = be32_to_cpu(icl->icl_count);
3503        if (!count) {
3504                xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
3505                return -EINVAL;
3506        }
3507        length = be32_to_cpu(icl->icl_length);
3508        if (!length || length >= mp->m_sb.sb_agblocks) {
3509                xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
3510                return -EINVAL;
3511        }
3512
3513        /*
3514         * The inode chunk is either full or sparse and we only support
3515         * m_ialloc_min_blks sized sparse allocations at this time.
3516         */
3517        if (length != mp->m_ialloc_blks &&
3518            length != mp->m_ialloc_min_blks) {
3519                xfs_warn(log->l_mp,
3520                         "%s: unsupported chunk length", __FUNCTION__);
3521                return -EINVAL;
3522        }
3523
3524        /* verify inode count is consistent with extent length */
3525        if ((count >> mp->m_sb.sb_inopblog) != length) {
3526                xfs_warn(log->l_mp,
3527                         "%s: inconsistent inode count and chunk length",
3528                         __FUNCTION__);
3529                return -EINVAL;
3530        }
3531
3532        /*
3533         * The icreate transaction can cover multiple cluster buffers and these
3534         * buffers could have been freed and reused. Check the individual
3535         * buffers for cancellation so we don't overwrite anything written after
3536         * a cancellation.
3537         */
3538        blks_per_cluster = xfs_icluster_size_fsb(mp);
3539        bb_per_cluster = XFS_FSB_TO_BB(mp, blks_per_cluster);
3540        nbufs = length / blks_per_cluster;
3541        for (i = 0, cancel_count = 0; i < nbufs; i++) {
3542                xfs_daddr_t     daddr;
3543
3544                daddr = XFS_AGB_TO_DADDR(mp, agno,
3545                                         agbno + i * blks_per_cluster);
3546                if (xlog_check_buffer_cancelled(log, daddr, bb_per_cluster, 0))
3547                        cancel_count++;
3548        }
3549
3550        /*
3551         * We currently only use icreate for a single allocation at a time. This
3552         * means we should expect either all or none of the buffers to be
3553         * cancelled. Be conservative and skip replay if at least one buffer is
3554         * cancelled, but warn the user that something is awry if the buffers
3555         * are not consistent.
3556         *
3557         * XXX: This must be refined to only skip cancelled clusters once we use
3558         * icreate for multiple chunk allocations.
3559         */
3560        ASSERT(!cancel_count || cancel_count == nbufs);
3561        if (cancel_count) {
3562                if (cancel_count != nbufs)
3563                        xfs_warn(mp,
3564        "WARNING: partial inode chunk cancellation, skipped icreate.");
3565                trace_xfs_log_recover_icreate_cancel(log, icl);
3566                return 0;
3567        }
3568
3569        trace_xfs_log_recover_icreate_recover(log, icl);
3570        return xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno,
3571                                     length, be32_to_cpu(icl->icl_gen));
3572}
3573
3574STATIC void
3575xlog_recover_buffer_ra_pass2(
3576        struct xlog                     *log,
3577        struct xlog_recover_item        *item)
3578{
3579        struct xfs_buf_log_format       *buf_f = item->ri_buf[0].i_addr;
3580        struct xfs_mount                *mp = log->l_mp;
3581
3582        if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
3583                        buf_f->blf_len, buf_f->blf_flags)) {
3584                return;
3585        }
3586
3587        xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
3588                                buf_f->blf_len, NULL);
3589}
3590
3591STATIC void
3592xlog_recover_inode_ra_pass2(
3593        struct xlog                     *log,
3594        struct xlog_recover_item        *item)
3595{
3596        struct xfs_inode_log_format     ilf_buf;
3597        struct xfs_inode_log_format     *ilfp;
3598        struct xfs_mount                *mp = log->l_mp;
3599        int                     error;
3600
3601        if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
3602                ilfp = item->ri_buf[0].i_addr;
3603        } else {
3604                ilfp = &ilf_buf;
3605                memset(ilfp, 0, sizeof(*ilfp));
3606                error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
3607                if (error)
3608                        return;
3609        }
3610
3611        if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
3612                return;
3613
3614        xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
3615                                ilfp->ilf_len, &xfs_inode_buf_ra_ops);
3616}
3617
3618STATIC void
3619xlog_recover_dquot_ra_pass2(
3620        struct xlog                     *log,
3621        struct xlog_recover_item        *item)
3622{
3623        struct xfs_mount        *mp = log->l_mp;
3624        struct xfs_disk_dquot   *recddq;
3625        struct xfs_dq_logformat *dq_f;
3626        uint                    type;
3627        int                     len;
3628
3629
3630        if (mp->m_qflags == 0)
3631                return;
3632
3633        recddq = item->ri_buf[1].i_addr;
3634        if (recddq == NULL)
3635                return;
3636        if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
3637                return;
3638
3639        type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
3640        ASSERT(type);
3641        if (log->l_quotaoffs_flag & type)
3642                return;
3643
3644        dq_f = item->ri_buf[0].i_addr;
3645        ASSERT(dq_f);
3646        ASSERT(dq_f->qlf_len == 1);
3647
3648        len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
3649        if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
3650                return;
3651
3652        xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
3653                          &xfs_dquot_buf_ra_ops);
3654}
3655
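    /*
     * Issue readahead for the buffer a pass 2 item will modify so that the
     * later replay of that item does not stall on a synchronous read.
     * Buffers already known to be cancelled are skipped.
     */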
3656STATIC void
3657xlog_recover_ra_pass2(
3658        struct xlog                     *log,
3659        struct xlog_recover_item        *item)
3660{
3661        switch (ITEM_TYPE(item)) {
3662        case XFS_LI_BUF:
3663                xlog_recover_buffer_ra_pass2(log, item);
3664                break;
3665        case XFS_LI_INODE:
3666                xlog_recover_inode_ra_pass2(log, item);
3667                break;
3668        case XFS_LI_DQUOT:
3669                xlog_recover_dquot_ra_pass2(log, item);
3670                break;
3671        case XFS_LI_EFI:
3672        case XFS_LI_EFD:
3673        case XFS_LI_QUOTAOFF:
3674        default:
3675                break;
3676        }
3677}
3678
3679STATIC int
3680xlog_recover_commit_pass1(
3681        struct xlog                     *log,
3682        struct xlog_recover             *trans,
3683        struct xlog_recover_item        *item)
3684{
3685        trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
3686
3687        switch (ITEM_TYPE(item)) {
3688        case XFS_LI_BUF:
3689                return xlog_recover_buffer_pass1(log, item);
3690        case XFS_LI_QUOTAOFF:
3691                return xlog_recover_quotaoff_pass1(log, item);
3692        case XFS_LI_INODE:
3693        case XFS_LI_EFI:
3694        case XFS_LI_EFD:
3695        case XFS_LI_DQUOT:
3696        case XFS_LI_ICREATE:
3697                /* nothing to do in pass 1 */
3698                return 0;
3699        default:
3700                xfs_warn(log->l_mp, "%s: invalid item type (%d)",
3701                        __func__, ITEM_TYPE(item));
3702                ASSERT(0);
3703                return -EIO;
3704        }
3705}
3706
3707STATIC int
3708xlog_recover_commit_pass2(
3709        struct xlog                     *log,
3710        struct xlog_recover             *trans,
3711        struct list_head                *buffer_list,
3712        struct xlog_recover_item        *item)
3713{
3714        trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
3715
3716        switch (ITEM_TYPE(item)) {
3717        case XFS_LI_BUF:
3718                return xlog_recover_buffer_pass2(log, buffer_list, item,
3719                                                 trans->r_lsn);
3720        case XFS_LI_INODE:
3721                return xlog_recover_inode_pass2(log, buffer_list, item,
3722                                                 trans->r_lsn);
3723        case XFS_LI_EFI:
3724                return xlog_recover_efi_pass2(log, item, trans->r_lsn);
3725        case XFS_LI_EFD:
3726                return xlog_recover_efd_pass2(log, item);
3727        case XFS_LI_DQUOT:
3728                return xlog_recover_dquot_pass2(log, buffer_list, item,
3729                                                trans->r_lsn);
3730        case XFS_LI_ICREATE:
3731                return xlog_recover_do_icreate_pass2(log, buffer_list, item);
3732        case XFS_LI_QUOTAOFF:
3733                /* nothing to do in pass2 */
3734                return 0;
3735        default:
3736                xfs_warn(log->l_mp, "%s: invalid item type (%d)",
3737                        __func__, ITEM_TYPE(item));
3738                ASSERT(0);
3739                return -EIO;
3740        }
3741}
3742
3743STATIC int
3744xlog_recover_items_pass2(
3745        struct xlog                     *log,
3746        struct xlog_recover             *trans,
3747        struct list_head                *buffer_list,
3748        struct list_head                *item_list)
3749{
3750        struct xlog_recover_item        *item;
3751        int                             error = 0;
3752
3753        list_for_each_entry(item, item_list, ri_list) {
3754                error = xlog_recover_commit_pass2(log, trans,
3755                                          buffer_list, item);
3756                if (error)
3757                        return error;
3758        }
3759
3760        return error;
3761}
3762
3763/*
3764 * Perform the transaction.
3765 *
3766 * If the transaction modifies a buffer or inode, do it now.  Otherwise,
3767 * EFIs and EFDs get queued up by adding entries into the AIL for them.
3768 */
3769STATIC int
3770xlog_recover_commit_trans(
3771        struct xlog             *log,
3772        struct xlog_recover     *trans,
3773        int                     pass,
3774        struct list_head        *buffer_list)
3775{
3776        int                             error = 0;
3777        int                             items_queued = 0;
3778        struct xlog_recover_item        *item;
3779        struct xlog_recover_item        *next;
3780        LIST_HEAD                       (ra_list);
3781        LIST_HEAD                       (done_list);
3782
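            /*
             * In pass 2, items are queued on ra_list and readahead is issued
             * for the buffers they will touch; once
             * XLOG_RECOVER_COMMIT_QUEUE_MAX items are queued they are
             * replayed as a batch, giving the readahead I/O time to complete
             * before the buffers are needed.
             */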
3783        #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
3784
3785        hlist_del(&trans->r_list);
3786
3787        error = xlog_recover_reorder_trans(log, trans, pass);
3788        if (error)
3789                return error;
3790
3791        list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
3792                switch (pass) {
3793                case XLOG_RECOVER_PASS1:
3794                        error = xlog_recover_commit_pass1(log, trans, item);
3795                        break;
3796                case XLOG_RECOVER_PASS2:
3797                        xlog_recover_ra_pass2(log, item);
3798                        list_move_tail(&item->ri_list, &ra_list);
3799                        items_queued++;
3800                        if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
3801                                error = xlog_recover_items_pass2(log, trans,
3802                                                buffer_list, &ra_list);
3803                                list_splice_tail_init(&ra_list, &done_list);
3804                                items_queued = 0;
3805                        }
3806
3807                        break;
3808                default:
3809                        ASSERT(0);
3810                }
3811
3812                if (error)
3813                        goto out;
3814        }
3815
3816out:
3817        if (!list_empty(&ra_list)) {
3818                if (!error)
3819                        error = xlog_recover_items_pass2(log, trans,
3820                                        buffer_list, &ra_list);
3821                list_splice_tail_init(&ra_list, &done_list);
3822        }
3823
3824        if (!list_empty(&done_list))
3825                list_splice_init(&done_list, &trans->r_itemq);
3826
3827        return error;
3828}
3829
3830STATIC void
3831xlog_recover_add_item(
3832        struct list_head        *head)
3833{
3834        xlog_recover_item_t     *item;
3835
3836        item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
3837        INIT_LIST_HEAD(&item->ri_list);
3838        list_add_tail(&item->ri_list, head);
3839}
3840
3841STATIC int
3842xlog_recover_add_to_cont_trans(
3843        struct xlog             *log,
3844        struct xlog_recover     *trans,
3845        char                    *dp,
3846        int                     len)
3847{
3848        xlog_recover_item_t     *item;
3849        char                    *ptr, *old_ptr;
3850        int                     old_len;
3851
3852        /*
3853         * If the transaction is empty, the header was split across this and the
3854         * previous record. Copy the rest of the header.
3855         */
3856        if (list_empty(&trans->r_itemq)) {
3857                ASSERT(len <= sizeof(struct xfs_trans_header));
3858                if (len > sizeof(struct xfs_trans_header)) {
3859                        xfs_warn(log->l_mp, "%s: bad header length", __func__);
3860                        return -EIO;
3861                }
3862
3863                xlog_recover_add_item(&trans->r_itemq);
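                    /*
                     * The first part of the header was copied into r_theader
                     * by xlog_recover_add_to_trans() when the previous record
                     * was processed; copy the remaining len bytes to the
                     * matching offset.
                     */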
3864                ptr = (char *)&trans->r_theader +
3865                                sizeof(struct xfs_trans_header) - len;
3866                memcpy(ptr, dp, len);
3867                return 0;
3868        }
3869
3870        /* take the tail entry */
3871        item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
3872
3873        old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
3874        old_len = item->ri_buf[item->ri_cnt-1].i_len;
3875
3876        ptr = kmem_realloc(old_ptr, len + old_len, KM_SLEEP);
3877        memcpy(&ptr[old_len], dp, len);
3878        item->ri_buf[item->ri_cnt-1].i_len += len;
3879        item->ri_buf[item->ri_cnt-1].i_addr = ptr;
3880        trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
3881        return 0;
3882}
3883
3884/*
3885 * The next region to add is the start of a new region.  It could be
3886 * a whole region or it could be the first part of a new region.  Because
3887 * of this, the assumption here is that the type and size fields of all
3888 * format structures fit into the first 32 bits of the structure.
3889 *
3890 * This works because all regions must be 32 bit aligned.  Therefore, we
3891 * either have both fields or we have neither field.  In the case we have
3892 * neither field, the data part of the region is zero length.  We only have
3893 * a log_op_header and can throw away the header since a new one will appear
3894 * later.  If we have at least 4 bytes, then we can determine how many regions
3895 * will appear in the current log item.
3896 */
3897STATIC int
3898xlog_recover_add_to_trans(
3899        struct xlog             *log,
3900        struct xlog_recover     *trans,
3901        char                    *dp,
3902        int                     len)
3903{
3904        xfs_inode_log_format_t  *in_f;                  /* any will do */
3905        xlog_recover_item_t     *item;
3906        char                    *ptr;
3907
3908        if (!len)
3909                return 0;
3910        if (list_empty(&trans->r_itemq)) {
3911                /* we need to catch log corruptions here */
3912                if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
3913                        xfs_warn(log->l_mp, "%s: bad header magic number",
3914                                __func__);
3915                        ASSERT(0);
3916                        return -EIO;
3917                }
3918
3919                if (len > sizeof(struct xfs_trans_header)) {
3920                        xfs_warn(log->l_mp, "%s: bad header length", __func__);
3921                        ASSERT(0);
3922                        return -EIO;
3923                }
3924
3925                /*
3926                 * The transaction header can be arbitrarily split across op
3927                 * records. If we don't have the whole thing here, copy what we
3928                 * do have and handle the rest in the next record.
3929                 */
3930                if (len == sizeof(struct xfs_trans_header))
3931                        xlog_recover_add_item(&trans->r_itemq);
3932                memcpy(&trans->r_theader, dp, len);
3933                return 0;
3934        }
3935
3936        ptr = kmem_alloc(len, KM_SLEEP);
3937        memcpy(ptr, dp, len);
3938        in_f = (xfs_inode_log_format_t *)ptr;
3939
3940        /* take the tail entry */
3941        item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
3942        if (item->ri_total != 0 &&
3943             item->ri_total == item->ri_cnt) {
3944                /* tail item is in use, get a new one */
3945                xlog_recover_add_item(&trans->r_itemq);
3946                item = list_entry(trans->r_itemq.prev,
3947                                        xlog_recover_item_t, ri_list);
3948        }
3949
3950        if (item->ri_total == 0) {              /* first region to be added */
3951                if (in_f->ilf_size == 0 ||
3952                    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
3953                        xfs_warn(log->l_mp,
3954                "bad number of regions (%d) in inode log format",
3955                                  in_f->ilf_size);
3956                        ASSERT(0);
3957                        kmem_free(ptr);
3958                        return -EIO;
3959                }
3960
3961                item->ri_total = in_f->ilf_size;
3962                item->ri_buf =
3963                        kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
3964                                    KM_SLEEP);
3965        }
3966        ASSERT(item->ri_total > item->ri_cnt);
3967        /* Description region is ri_buf[0] */
3968        item->ri_buf[item->ri_cnt].i_addr = ptr;
3969        item->ri_buf[item->ri_cnt].i_len  = len;
3970        item->ri_cnt++;
3971        trace_xfs_log_recover_item_add(log, trans, item, 0);
3972        return 0;
3973}
3974
3975/*
3976 * Free up any resources allocated by the transaction
3977 *
3978 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
3979 */
3980STATIC void
3981xlog_recover_free_trans(
3982        struct xlog_recover     *trans)
3983{
3984        xlog_recover_item_t     *item, *n;
3985        int                     i;
3986
3987        list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
3988                /* Free the regions in the item. */
3989                list_del(&item->ri_list);
3990                for (i = 0; i < item->ri_cnt; i++)
3991                        kmem_free(item->ri_buf[i].i_addr);
3992                /* Free the item itself */
3993                kmem_free(item->ri_buf);
3994                kmem_free(item);
3995        }
3996        /* Free the transaction recover structure */
3997        kmem_free(trans);
3998}
3999
4000/*
4001 * On error or completion, trans is freed.
4002 */
4003STATIC int
4004xlog_recovery_process_trans(
4005        struct xlog             *log,
4006        struct xlog_recover     *trans,
4007        char                    *dp,
4008        unsigned int            len,
4009        unsigned int            flags,
4010        int                     pass,
4011        struct list_head        *buffer_list)
4012{
4013        int                     error = 0;
4014        bool                    freeit = false;
4015
4016        /* mask off ophdr transaction container flags */
4017        flags &= ~XLOG_END_TRANS;
4018        if (flags & XLOG_WAS_CONT_TRANS)
4019                flags &= ~XLOG_CONTINUE_TRANS;
4020
4021        /*
4022         * Callees must not free the trans structure. We'll decide if we need to
4023         * free it or not based on the operation being done and its result.
4024         */
4025        switch (flags) {
4026        /* expected flag values */
4027        case 0:
4028        case XLOG_CONTINUE_TRANS:
4029                error = xlog_recover_add_to_trans(log, trans, dp, len);
4030                break;
4031        case XLOG_WAS_CONT_TRANS:
4032                error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
4033                break;
4034        case XLOG_COMMIT_TRANS:
4035                error = xlog_recover_commit_trans(log, trans, pass,
4036                                                  buffer_list);
4037                /* success or fail, we are now done with this transaction. */
4038                freeit = true;
4039                break;
4040
4041        /* unexpected flag values */
4042        case XLOG_UNMOUNT_TRANS:
4043                /* just skip trans */
4044                xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
4045                freeit = true;
4046                break;
4047        case XLOG_START_TRANS:
4048        default:
4049                xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
4050                ASSERT(0);
4051                error = -EIO;
4052                break;
4053        }
4054        if (error || freeit)
4055                xlog_recover_free_trans(trans);
4056        return error;
4057}
4058
4059/*
4060 * Look up the transaction recovery structure associated with the ID in the
4061 * current ophdr. If the transaction doesn't exist and the start flag is set in
4062 * the ophdr, then allocate a new transaction for future ID matches to find.
4063 * Either way, return what we found during the lookup - an existing transaction
4064 * or nothing.
4065 */
4066STATIC struct xlog_recover *
4067xlog_recover_ophdr_to_trans(
4068        struct hlist_head       rhash[],
4069        struct xlog_rec_header  *rhead,
4070        struct xlog_op_header   *ohead)
4071{
4072        struct xlog_recover     *trans;
4073        xlog_tid_t              tid;
4074        struct hlist_head       *rhp;
4075
4076        tid = be32_to_cpu(ohead->oh_tid);
4077        rhp = &rhash[XLOG_RHASH(tid)];
4078        hlist_for_each_entry(trans, rhp, r_list) {
4079                if (trans->r_log_tid == tid)
4080                        return trans;
4081        }
4082
4083        /*
4084         * skip over non-start transaction headers - we could be
4085         * processing slack space before the next transaction starts
4086         */
4087        if (!(ohead->oh_flags & XLOG_START_TRANS))
4088                return NULL;
4089
4090        ASSERT(be32_to_cpu(ohead->oh_len) == 0);
4091
4092        /*
4093         * This is a new transaction so allocate a new recovery container to
4094         * hold the recovery ops that will follow.
4095         */
4096        trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP);
4097        trans->r_log_tid = tid;
4098        trans->r_lsn = be64_to_cpu(rhead->h_lsn);
4099        INIT_LIST_HEAD(&trans->r_itemq);
4100        INIT_HLIST_NODE(&trans->r_list);
4101        hlist_add_head(&trans->r_list, rhp);
4102
4103        /*
4104         * Nothing more to do for this ophdr. Items to be added to this new
4105         * transaction will be in subsequent ophdr containers.
4106         */
4107        return NULL;
4108}
4109
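    /*
     * Process a single log operation header: validate the client id and the
     * ophdr payload length, look up (or allocate) the transaction this op
     * belongs to, drain the delwri buffer list whenever the commit record
     * LSN changes, and hand the payload off to xlog_recovery_process_trans().
     */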
4110STATIC int
4111xlog_recover_process_ophdr(
4112        struct xlog             *log,
4113        struct hlist_head       rhash[],
4114        struct xlog_rec_header  *rhead,
4115        struct xlog_op_header   *ohead,
4116        char                    *dp,
4117        char                    *end,
4118        int                     pass,
4119        struct list_head        *buffer_list)
4120{
4121        struct xlog_recover     *trans;
4122        unsigned int            len;
4123        int                     error;
4124
4125        /* Do we understand who wrote this op? */
4126        if (ohead->oh_clientid != XFS_TRANSACTION &&
4127            ohead->oh_clientid != XFS_LOG) {
4128                xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
4129                        __func__, ohead->oh_clientid);
4130                ASSERT(0);
4131                return -EIO;
4132        }
4133
4134        /*
4135         * Check the ophdr contains all the data it is supposed to contain.
4136         */
4137        len = be32_to_cpu(ohead->oh_len);
4138        if (dp + len > end) {
4139                xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
4140                WARN_ON(1);
4141                return -EIO;
4142        }
4143
4144        trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
4145        if (!trans) {
4146                /* nothing to do, so skip over this ophdr */
4147                return 0;
4148        }
4149
4150        /*
4151         * The recovered buffer queue is drained only once we know that all
4152         * recovery items for the current LSN have been processed. This is
4153         * required because:
4154         *
4155         * - Buffer write submission updates the metadata LSN of the buffer.
4156         * - Log recovery skips items with a metadata LSN >= the current LSN of
4157         *   the recovery item.
4158         * - Separate recovery items against the same metadata buffer can share
4159         *   a current LSN. I.e., consider that the LSN of a recovery item is
4160         *   defined as the starting LSN of the first record in which its
4161         *   transaction appears, that a record can hold multiple transactions,
4162         *   and/or that a transaction can span multiple records.
4163         *
4164         * In other words, we are allowed to submit a buffer from log recovery
4165         * once per current LSN. Otherwise, we may incorrectly skip recovery
4166         * items and cause corruption.
4167         *
4168         * We don't know up front whether buffers are updated multiple times per
4169         * LSN. Therefore, track the current LSN of each commit log record as it
4170         * is processed and drain the queue when it changes. Use commit records
4171         * because they are ordered correctly by the logging code.
4172         */
4173        if (log->l_recovery_lsn != trans->r_lsn &&
4174            ohead->oh_flags & XLOG_COMMIT_TRANS) {
4175                error = xfs_buf_delwri_submit(buffer_list);
4176                if (error)
4177                        return error;
4178                log->l_recovery_lsn = trans->r_lsn;
4179        }
4180
4181        return xlog_recovery_process_trans(log, trans, dp, len,
4182                                           ohead->oh_flags, pass, buffer_list);
4183}
4184
4185/*
4186 * There are two valid states of the r_state field.  0 indicates that the
4187 * transaction structure is in a normal state.  We have either seen the
4188 * start of the transaction or the last operation we added was not a partial
4189 * operation.  If the last operation we added to the transaction was a
4190 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
4191 *
4192 * NOTE: skip LRs with 0 data length.
4193 */
4194STATIC int
4195xlog_recover_process_data(
4196        struct xlog             *log,
4197        struct hlist_head       rhash[],
4198        struct xlog_rec_header  *rhead,
4199        char                    *dp,
4200        int                     pass,
4201        struct list_head        *buffer_list)
4202{
4203        struct xlog_op_header   *ohead;
4204        char                    *end;
4205        int                     num_logops;
4206        int                     error;
4207
4208        end = dp + be32_to_cpu(rhead->h_len);
4209        num_logops = be32_to_cpu(rhead->h_num_logops);
4210
4211        /* check the log format matches our own - else we can't recover */
4212        if (xlog_header_check_recover(log->l_mp, rhead))
4213                return -EIO;
4214
4215        trace_xfs_log_recover_record(log, rhead, pass);
4216        while ((dp < end) && num_logops) {
4217
4218                ohead = (struct xlog_op_header *)dp;
4219                dp += sizeof(*ohead);
4220                ASSERT(dp <= end);
4221
4222                /* errors will abort recovery */
4223                error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
4224                                                   dp, end, pass, buffer_list);
4225                if (error)
4226                        return error;
4227
4228                dp += be32_to_cpu(ohead->oh_len);
4229                num_logops--;
4230        }
4231        return 0;
4232}
4233
4234/* Recover the EFI if necessary. */
4235STATIC int
4236xlog_recover_process_efi(
4237        struct xfs_mount                *mp,
4238        struct xfs_ail                  *ailp,
4239        struct xfs_log_item             *lip)
4240{
4241        struct xfs_efi_log_item         *efip;
4242        int                             error;
4243
4244        /*
4245         * Skip EFIs that we've already processed.
4246         */
4247        efip = container_of(lip, struct xfs_efi_log_item, efi_item);
4248        if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags))
4249                return 0;
4250
4251        spin_unlock(&ailp->xa_lock);
4252        error = xfs_efi_recover(mp, efip);
4253        spin_lock(&ailp->xa_lock);
4254
4255        return error;
4256}
4257
4258/* Release the EFI since we're cancelling everything. */
4259STATIC void
4260xlog_recover_cancel_efi(
4261        struct xfs_mount                *mp,
4262        struct xfs_ail                  *ailp,
4263        struct xfs_log_item             *lip)
4264{
4265        struct xfs_efi_log_item         *efip;
4266
4267        efip = container_of(lip, struct xfs_efi_log_item, efi_item);
4268
4269        spin_unlock(&ailp->xa_lock);
4270        xfs_efi_release(efip);
4271        spin_lock(&ailp->xa_lock);
4272}
4273
4274/* Is this log item a deferred action intent? */
4275static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
4276{
4277        switch (lip->li_type) {
4278        case XFS_LI_EFI:
4279                return true;
4280        default:
4281                return false;
4282        }
4283}
4284
4285/*
4286 * When this is called, all of the log intent items which did not have
4287 * corresponding log done items should be in the AIL.  What we do now
4288 * is update the data structures associated with each one.
4289 *
4290 * Since we process the log intent items in normal transactions, they
4291 * will be removed at some point after the commit.  This prevents us
4292 * from just walking down the list processing each one.  We'll use a
4293 * flag in the intent item to skip those that we've already processed
4294 * and use the AIL iteration mechanism's generation count to try to
4295 * speed this up at least a bit.
4296 *
4297 * When we start, we know that the intents are the only things in the
4298 * AIL.  As we process them, however, other items are added to the
4299 * AIL.
4300 */
4301STATIC int
4302xlog_recover_process_intents(
4303        struct xlog             *log)
4304{
4305        struct xfs_log_item     *lip;
4306        int                     error = 0;
4307        struct xfs_ail_cursor   cur;
4308        struct xfs_ail          *ailp;
4309        xfs_lsn_t               last_lsn;
4310
4311        ailp = log->l_ailp;
4312        spin_lock(&ailp->xa_lock);
4313        lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
4314        last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
4315        while (lip != NULL) {
4316                /*
4317                 * We're done when we see something other than an intent.
4318                 * There should be no intents left in the AIL now.
4319                 */
4320                if (!xlog_item_is_intent(lip)) {
4321#ifdef DEBUG
4322                        for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
4323                                ASSERT(!xlog_item_is_intent(lip));
4324#endif
4325                        break;
4326                }
4327
4328                /*
4329                 * We should never see a redo item with a LSN higher than
4330                 * the last transaction we found in the log at the start
4331                 * of recovery.
4332                 */
4333                ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
4334
4335                switch (lip->li_type) {
4336                case XFS_LI_EFI:
4337                        error = xlog_recover_process_efi(log->l_mp, ailp, lip);
4338                        break;
4339                }
4340                if (error)
4341                        goto out;
4342                lip = xfs_trans_ail_cursor_next(ailp, &cur);
4343        }
4344out:
4345        xfs_trans_ail_cursor_done(&cur);
4346        spin_unlock(&ailp->xa_lock);
4347        return error;
4348}
4349
4350/*
4351 * A cancel occurs when the mount has failed and we're bailing out.
4352 * Release all pending log intent items so they don't pin the AIL.
4353 */
4354STATIC int
4355xlog_recover_cancel_intents(
4356        struct xlog             *log)
4357{
4358        struct xfs_log_item     *lip;
4359        int                     error = 0;
4360        struct xfs_ail_cursor   cur;
4361        struct xfs_ail          *ailp;
4362
4363        ailp = log->l_ailp;
4364        spin_lock(&ailp->xa_lock);
4365        lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
4366        while (lip != NULL) {
4367                /*
4368                 * We're done when we see something other than an intent.
4369                 * There should be no intents left in the AIL now.
4370                 */
4371                if (!xlog_item_is_intent(lip)) {
4372#ifdef DEBUG
4373                        for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
4374                                ASSERT(!xlog_item_is_intent(lip));
4375#endif
4376                        break;
4377                }
4378
4379                switch (lip->li_type) {
4380                case XFS_LI_EFI:
4381                        xlog_recover_cancel_efi(log->l_mp, ailp, lip);
4382                        break;
4383                }
4384
4385                lip = xfs_trans_ail_cursor_next(ailp, &cur);
4386        }
4387
4388        xfs_trans_ail_cursor_done(&cur);
4389        spin_unlock(&ailp->xa_lock);
4390        return error;
4391}
4392
4393/*
4394 * This routine performs a transaction to null out a bad inode pointer
4395 * in an agi unlinked inode hash bucket.
4396 */
4397STATIC void
4398xlog_recover_clear_agi_bucket(
4399        xfs_mount_t     *mp,
4400        xfs_agnumber_t  agno,
4401        int             bucket)
4402{
4403        xfs_trans_t     *tp;
4404        xfs_agi_t       *agi;
4405        xfs_buf_t       *agibp;
4406        int             offset;
4407        int             error;
4408
4409        error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
4410        if (error)
4411                goto out_error;
4412
4413        error = xfs_read_agi(mp, tp, agno, &agibp);
4414        if (error)
4415                goto out_abort;
4416
4417        agi = XFS_BUF_TO_AGI(agibp);
4418        agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
4419        offset = offsetof(xfs_agi_t, agi_unlinked) +
4420                 (sizeof(xfs_agino_t) * bucket);
4421        xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
4422        xfs_trans_log_buf(tp, agibp, offset,
4423                          (offset + sizeof(xfs_agino_t) - 1));
4424
4425        error = xfs_trans_commit(tp);
4426        if (error)
4427                goto out_error;
4428        return;
4429
4430out_abort:
4431        xfs_trans_cancel(tp);
4432out_error:
4433        xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
4434        return;
4435}
4436
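    /*
     * Process one inode from an AGI unlinked bucket: read the inode in, pull
     * the next agino in the chain from the on-disk di_next_unlinked field and
     * return it, then drop our reference so the unlinked inode can be
     * truncated and freed.  If anything goes wrong, clear the whole bucket
     * and return NULLAGINO to terminate the walk.
     */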
4437STATIC xfs_agino_t
4438xlog_recover_process_one_iunlink(
4439        struct xfs_mount                *mp,
4440        xfs_agnumber_t                  agno,
4441        xfs_agino_t                     agino,
4442        int                             bucket)
4443{
4444        struct xfs_buf                  *ibp;
4445        struct xfs_dinode               *dip;
4446        struct xfs_inode                *ip;
4447        xfs_ino_t                       ino;
4448        int                             error;
4449
4450        ino = XFS_AGINO_TO_INO(mp, agno, agino);
4451        error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
4452        if (error)
4453                goto fail;
4454
4455        /*
4456         * Get the on disk inode to find the next inode in the bucket.
4457         */
4458        error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
4459        if (error)
4460                goto fail_iput;
4461
4462        ASSERT(VFS_I(ip)->i_nlink == 0);
4463        ASSERT(VFS_I(ip)->i_mode != 0);
4464
4465        /* setup for the next pass */
4466        agino = be32_to_cpu(dip->di_next_unlinked);
4467        xfs_buf_relse(ibp);
4468
4469        /*
4470         * Prevent any DMAPI event from being sent when the reference on
4471         * the inode is dropped.
4472         */
4473        ip->i_d.di_dmevmask = 0;
4474
4475        IRELE(ip);
4476        return agino;
4477
4478 fail_iput:
4479        IRELE(ip);
4480 fail:
4481        /*
4482         * We can't read in the inode this bucket points to, or this inode
4483         * is messed up.  Just ditch this bucket of inodes.  We will lose
4484         * some inodes and space, but at least we won't hang.
4485         *
4486         * Call xlog_recover_clear_agi_bucket() to perform a transaction to
4487         * clear the inode pointer in the bucket.
4488         */
4489        xlog_recover_clear_agi_bucket(mp, agno, bucket);
4490        return NULLAGINO;
4491}
4492
4493/*
4494 * xlog_iunlink_recover
4495 *
4496 * This is called during recovery to process any inodes which
4497 * we unlinked but not freed when the system crashed.  These
4498 * inodes will be on the lists in the AGI blocks.  What we do
4499 * here is scan all the AGIs and fully truncate and free any
4500 * inodes found on the lists.  Each inode is removed from the
4501 * lists when it has been fully truncated and is freed.  The
4502 * freeing of the inode and its removal from the list must be
4503 * atomic.
4504 */
4505STATIC void
4506xlog_recover_process_iunlinks(
4507        struct xlog     *log)
4508{
4509        xfs_mount_t     *mp;
4510        xfs_agnumber_t  agno;
4511        xfs_agi_t       *agi;
4512        xfs_buf_t       *agibp;
4513        xfs_agino_t     agino;
4514        int             bucket;
4515        int             error;
4516        uint            mp_dmevmask;
4517
4518        mp = log->l_mp;
4519
4520        /*
4521         * Prevent any DMAPI event from being sent while in this function.
4522         */
4523        mp_dmevmask = mp->m_dmevmask;
4524        mp->m_dmevmask = 0;
4525
4526        for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
4527                /*
4528                 * Find the agi for this ag.
4529                 */
4530                error = xfs_read_agi(mp, NULL, agno, &agibp);
4531                if (error) {
4532                        /*
4533                         * AGI is b0rked. Don't process it.
4534                         *
4535                         * We should probably mark the filesystem as corrupt
4536                         * after we've recovered all the ag's we can....
4537                         */
4538                        continue;
4539                }
4540                /*
4541                 * Unlock the buffer so that it can be acquired in the normal
4542                 * course of the transaction to truncate and free each inode.
4543                 * Because we are not racing with anyone else here for the AGI
4544                 * buffer, we don't even need to hold it locked to read the
4545                 * initial unlinked bucket entries out of the buffer. We keep the
4546                 * buffer reference, though, so that it stays pinned in memory
4547                 * while we need it.
4548                 */
4549                agi = XFS_BUF_TO_AGI(agibp);
4550                xfs_buf_unlock(agibp);
4551
4552                for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
4553                        agino = be32_to_cpu(agi->agi_unlinked[bucket]);
4554                        while (agino != NULLAGINO) {
4555                                agino = xlog_recover_process_one_iunlink(mp,
4556                                                        agno, agino, bucket);
4557                        }
4558                }
4559                xfs_buf_rele(agibp);
4560        }
4561
4562        mp->m_dmevmask = mp_dmevmask;
4563}
4564
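    /*
     * Undo the cycle stamping applied when this record was written: the first
     * word of every 512-byte basic block in the record body was overwritten
     * with the cycle number, and the original words were stashed in
     * h_cycle_data[] (and, on v2 logs, in the extended headers).  Copy the
     * saved words back into place.  For example, a 64k record needs 128 words
     * restored: the first 64 come from h_cycle_data[] in the main header, the
     * remaining 64 from xh_cycle_data[] in the first extended header.
     */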
4565STATIC int
4566xlog_unpack_data(
4567        struct xlog_rec_header  *rhead,
4568        char                    *dp,
4569        struct xlog             *log)
4570{
4571        int                     i, j, k;
4572
4573        for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
4574                  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
4575                *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
4576                dp += BBSIZE;
4577        }
4578
4579        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
4580                xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
4581                for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
4582                        j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
4583                        k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
4584                        *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
4585                        dp += BBSIZE;
4586                }
4587        }
4588
4589        return 0;
4590}
4591
4592/*
4593 * CRC check, unpack and process a log record.
4594 */
4595STATIC int
4596xlog_recover_process(
4597        struct xlog             *log,
4598        struct hlist_head       rhash[],
4599        struct xlog_rec_header  *rhead,
4600        char                    *dp,
4601        int                     pass,
4602        struct list_head        *buffer_list)
4603{
4604        int                     error;
4605        __le32                  crc;
4606
4607        crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
4608
4609        /*
4610         * Nothing else to do if this is a CRC verification pass. Just return
4611         * if this is a record with a non-zero crc. Unfortunately, mkfs always
4612         * sets h_crc to 0 so we must consider this valid even on v5 supers.
4613         * Otherwise, return EFSBADCRC on failure so the callers up the stack
4614         * know precisely what failed.
4615         */
4616        if (pass == XLOG_RECOVER_CRCPASS) {
4617                if (rhead->h_crc && crc != rhead->h_crc)
4618                        return -EFSBADCRC;
4619                return 0;
4620        }
4621
4622        /*
4623         * We're in the normal recovery path. On a CRC mismatch, warn if the header
4624         * CRC is non-zero or the filesystem has CRCs enabled. The warning is
4625         * advisory on non-CRC filesystems; the zero CRC check avoids warnings
4626         * when upgrading from a kernel that did not add CRCs by default.
4627         */
4628        if (crc != rhead->h_crc) {
4629                if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
4630                        xfs_alert(log->l_mp,
4631                "log record CRC mismatch: found 0x%x, expected 0x%x.",
4632                                        le32_to_cpu(rhead->h_crc),
4633                                        le32_to_cpu(crc));
4634                        xfs_hex_dump(dp, 32);
4635                }
4636
4637                /*
4638                 * If the filesystem is CRC enabled, this mismatch becomes a
4639                 * fatal log corruption failure.
4640                 */
4641                if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
4642                        return -EFSCORRUPTED;
4643        }
4644
4645        error = xlog_unpack_data(rhead, dp, log);
4646        if (error)
4647                return error;
4648
4649        return xlog_recover_process_data(log, rhash, rhead, dp, pass,
4650                                         buffer_list);
4651}
4652
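    /*
     * Sanity check a record header just read from disk: the magic number, log
     * version, record length and block number must all look plausible before
     * we trust h_len to size any further reads.
     */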
4653STATIC int
4654xlog_valid_rec_header(
4655        struct xlog             *log,
4656        struct xlog_rec_header  *rhead,
4657        xfs_daddr_t             blkno)
4658{
4659        int                     hlen;
4660
4661        if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
4662                XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
4663                                XFS_ERRLEVEL_LOW, log->l_mp);
4664                return -EFSCORRUPTED;
4665        }
4666        if (unlikely(
4667            (!rhead->h_version ||
4668            (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
4669                xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
4670                        __func__, be32_to_cpu(rhead->h_version));
4671                return -EIO;
4672        }
4673
4674        /* LR body must have data or it wouldn't have been written */
4675        hlen = be32_to_cpu(rhead->h_len);
4676        if (unlikely(hlen <= 0 || hlen > INT_MAX)) {
4677                XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
4678                                XFS_ERRLEVEL_LOW, log->l_mp);
4679                return -EFSCORRUPTED;
4680        }
4681        if (unlikely(blkno > log->l_logBBsize || blkno > INT_MAX)) {
4682                XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
4683                                XFS_ERRLEVEL_LOW, log->l_mp);
4684                return -EFSCORRUPTED;
4685        }
4686        return 0;
4687}
4688
4689/*
4690 * Read the log from tail to head and process the log records found.
4691 * Handle the two cases where the tail and head are in the same cycle
4692 * and where the active portion of the log wraps around the end of
4693 * the physical log separately.  The pass parameter is passed through
4694 * to the routines called to process the data and is not looked at
4695 * here.
4696 */
4697STATIC int
4698xlog_do_recovery_pass(
4699        struct xlog             *log,
4700        xfs_daddr_t             head_blk,
4701        xfs_daddr_t             tail_blk,
4702        int                     pass,
4703        xfs_daddr_t             *first_bad)     /* out: first bad log rec */
4704{
4705        xlog_rec_header_t       *rhead;
4706        xfs_daddr_t             blk_no;
4707        xfs_daddr_t             rhead_blk;
4708        char                    *offset;
4709        xfs_buf_t               *hbp, *dbp;
4710        int                     error = 0, h_size, h_len;
4711        int                     error2 = 0;
4712        int                     bblks, split_bblks;
4713        int                     hblks, split_hblks, wrapped_hblks;
4714        struct hlist_head       rhash[XLOG_RHASH_SIZE];
4715        LIST_HEAD               (buffer_list);
4716
4717        ASSERT(head_blk != tail_blk);
4718        rhead_blk = 0;
4719
4720        /*
4721         * Read the header of the tail block and get the iclog buffer size from
4722         * h_size.  Use this to tell how many sectors make up the log header.
4723         */
4724        if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
4725                /*
4726                 * When using variable length iclogs, read first sector of
4727                 * iclog header and extract the header size from it.  Get a
4728                 * new hbp that is the correct size.
4729                 */
4730                hbp = xlog_get_bp(log, 1);
4731                if (!hbp)
4732                        return -ENOMEM;
4733
4734                error = xlog_bread(log, tail_blk, 1, hbp, &offset);
4735                if (error)
4736                        goto bread_err1;
4737
4738                rhead = (xlog_rec_header_t *)offset;
4739                error = xlog_valid_rec_header(log, rhead, tail_blk);
4740                if (error)
4741                        goto bread_err1;
4742
4743                /*
4744                 * xfsprogs has a bug where record length is based on lsunit but
4745                 * h_size (iclog size) is hardcoded to 32k. Now that we
4746                 * unconditionally CRC verify the unmount record, this means the
4747                 * log buffer can be too small for the record and cause an
4748                 * overrun.
4749                 *
4750                 * Detect this condition here. Use lsunit for the buffer size as
4751                 * long as this looks like the mkfs case. Otherwise, return an
4752                 * error to avoid a buffer overrun.
4753                 */
4754                h_size = be32_to_cpu(rhead->h_size);
4755                h_len = be32_to_cpu(rhead->h_len);
4756                if (h_len > h_size) {
4757                        if (h_len <= log->l_mp->m_logbsize &&
4758                            be32_to_cpu(rhead->h_num_logops) == 1) {
4759                                xfs_warn(log->l_mp,
4760                "invalid iclog size (%d bytes), using lsunit (%d bytes)",
4761                                         h_size, log->l_mp->m_logbsize);
4762                                h_size = log->l_mp->m_logbsize;
4763                        } else {
4764                                error = -EFSCORRUPTED;
                                    goto bread_err1;
                            }
4765                }
4766
4767                if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
4768                    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
4769                        hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
4770                        if (h_size % XLOG_HEADER_CYCLE_SIZE)
4771                                hblks++;
4772                        xlog_put_bp(hbp);
4773                        hbp = xlog_get_bp(log, hblks);
4774                } else {
4775                        hblks = 1;
4776                }
4777        } else {
4778                ASSERT(log->l_sectBBsize == 1);
4779                hblks = 1;
4780                hbp = xlog_get_bp(log, 1);
4781                h_size = XLOG_BIG_RECORD_BSIZE;
4782        }
4783
4784        if (!hbp)
4785                return -ENOMEM;
4786        dbp = xlog_get_bp(log, BTOBB(h_size));
4787        if (!dbp) {
4788                xlog_put_bp(hbp);
4789                return -ENOMEM;
4790        }
4791
4792        memset(rhash, 0, sizeof(rhash));
4793        blk_no = rhead_blk = tail_blk;
4794        if (tail_blk > head_blk) {
4795                /*
4796                 * Perform recovery around the end of the physical log.
4797                 * When the head is not on the same cycle number as the tail,
4798                 * we can't do a sequential recovery.
4799                 */
4800                while (blk_no < log->l_logBBsize) {
4801                        /*
4802                         * Check for header wrapping around physical end-of-log
4803                         */
4804                        offset = hbp->b_addr;
4805                        split_hblks = 0;
4806                        wrapped_hblks = 0;
4807                        if (blk_no + hblks <= log->l_logBBsize) {
4808                                /* Read header in one read */
4809                                error = xlog_bread(log, blk_no, hblks, hbp,
4810                                                   &offset);
4811                                if (error)
4812                                        goto bread_err2;
4813                        } else {
4814                                /* This LR is split across physical log end */
4815                                if (blk_no != log->l_logBBsize) {
4816                                        /* some data before physical log end */
4817                                        ASSERT(blk_no <= INT_MAX);
4818                                        split_hblks = log->l_logBBsize - (int)blk_no;
4819                                        ASSERT(split_hblks > 0);
4820                                        error = xlog_bread(log, blk_no,
4821                                                           split_hblks, hbp,
4822                                                           &offset);
4823                                        if (error)
4824                                                goto bread_err2;
4825                                }
4826
4827                                /*
4828                                 * Note: this black magic still works with
4829                                 * large sector sizes (non-512) only because:
4830                                 * - we increased the buffer size originally
4831                                 *   by 1 sector giving us enough extra space
4832                                 *   for the second read;
4833                                 * - the log start is guaranteed to be sector
4834                                 *   aligned;
4835                                 * - we read the log end (LR header start)
4836                                 *   _first_, then the log start (LR header end)
4837                                 *   - order is important.
4838                                 */
4839                                wrapped_hblks = hblks - split_hblks;
4840                                error = xlog_bread_offset(log, 0,
4841                                                wrapped_hblks, hbp,
4842                                                offset + BBTOB(split_hblks));
4843                                if (error)
4844                                        goto bread_err2;
4845                        }
4846                        rhead = (xlog_rec_header_t *)offset;
4847                        error = xlog_valid_rec_header(log, rhead,
4848                                                split_hblks ? blk_no : 0);
4849                        if (error)
4850                                goto bread_err2;
4851
4852                        bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
4853                        blk_no += hblks;
4854
4855                        /* Read in data for log record */
4856                        if (blk_no + bblks <= log->l_logBBsize) {
4857                                error = xlog_bread(log, blk_no, bblks, dbp,
4858                                                   &offset);
4859                                if (error)
4860                                        goto bread_err2;
4861                        } else {
4862                                /* This log record is split across the
4863                                 * physical end of log */
4864                                offset = dbp->b_addr;
4865                                split_bblks = 0;
4866                                if (blk_no != log->l_logBBsize) {
4867                                        /* some data is before the physical
4868                                         * end of log */
4869                                        ASSERT(!wrapped_hblks);
4870                                        ASSERT(blk_no <= INT_MAX);
4871                                        split_bblks =
4872                                                log->l_logBBsize - (int)blk_no;
4873                                        ASSERT(split_bblks > 0);
4874                                        error = xlog_bread(log, blk_no,
4875                                                        split_bblks, dbp,
4876                                                        &offset);
4877                                        if (error)
4878                                                goto bread_err2;
4879                                }
4880
4881                                /*
4882                                 * Note: this black magic still works with
4883                                 * large sector sizes (non-512) only because:
4884                                 * - we increased the buffer size originally
4885                                 *   by 1 sector giving us enough extra space
4886                                 *   for the second read;
4887                                 * - the log start is guaranteed to be sector
4888                                 *   aligned;
4889                                 * - we read the log end (LR header start)
4890                                 *   _first_, then the log start (LR header end)
4891                                 *   - order is important.
4892                                 */
4893                                error = xlog_bread_offset(log, 0,
4894                                                bblks - split_bblks, dbp,
4895                                                offset + BBTOB(split_bblks));
4896                                if (error)
4897                                        goto bread_err2;
4898                        }
4899
4900                        error = xlog_recover_process(log, rhash, rhead, offset,
4901                                                     pass, &buffer_list);
4902                        if (error)
4903                                goto bread_err2;
4904
4905                        blk_no += bblks;
4906                        rhead_blk = blk_no;
4907                }
4908
4909                ASSERT(blk_no >= log->l_logBBsize);
4910                blk_no -= log->l_logBBsize;
4911                rhead_blk = blk_no;
4912        }
4913
4914        /* read first part of physical log */
4915        while (blk_no < head_blk) {
4916                error = xlog_bread(log, blk_no, hblks, hbp, &offset);
4917                if (error)
4918                        goto bread_err2;
4919
4920                rhead = (xlog_rec_header_t *)offset;
4921                error = xlog_valid_rec_header(log, rhead, blk_no);
4922                if (error)
4923                        goto bread_err2;
4924
4925                /* blocks in data section */
4926                bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
4927                error = xlog_bread(log, blk_no+hblks, bblks, dbp,
4928                                   &offset);
4929                if (error)
4930                        goto bread_err2;
4931
4932                error = xlog_recover_process(log, rhash, rhead, offset, pass,
4933                                             &buffer_list);
4934                if (error)
4935                        goto bread_err2;
4936
4937                blk_no += bblks + hblks;
4938                rhead_blk = blk_no;
4939        }
4940
4941 bread_err2:
4942        xlog_put_bp(dbp);
4943 bread_err1:
4944        xlog_put_bp(hbp);
4945
4946        /*
4947         * Submit buffers that have been added from the last record processed,
4948         * regardless of error status.
4949         */
4950        if (!list_empty(&buffer_list))
4951                error2 = xfs_buf_delwri_submit(&buffer_list);
4952
4953        if (error && first_bad)
4954                *first_bad = rhead_blk;
4955
4956        return error ? error : error2;
4957}
4958
4959/*
4960 * Do the recovery of the log.  We actually do this in two phases.
4961 * The two passes are necessary in order to implement the function
4962 * of cancelling a record written into the log.  The first pass
4963 * determines those things which have been cancelled, and the
4964 * second pass replays log items normally except for those which
4965 * have been cancelled.  The handling of the replay and cancellations
4966 * takes place in the log item type specific routines.
4967 *
4968 * The table of items which have cancel records in the log is allocated
4969 * and freed at this level, since only here do we know when all of
4970 * the log recovery has been completed.
4971 */
4972STATIC int
4973xlog_do_log_recovery(
4974        struct xlog     *log,
4975        xfs_daddr_t     head_blk,
4976        xfs_daddr_t     tail_blk)
4977{
4978        int             error, i;
4979
4980        ASSERT(head_blk != tail_blk);
4981
4982        /*
4983         * First do a pass to find all of the cancelled buf log items.
4984         * Store them in the buf_cancel_table for use in the second pass.
4985         */
4986        log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
4987                                                 sizeof(struct list_head),
4988                                                 KM_SLEEP);
4989        for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
4990                INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
4991
4992        error = xlog_do_recovery_pass(log, head_blk, tail_blk,
4993                                      XLOG_RECOVER_PASS1, NULL);
4994        if (error != 0) {
4995                kmem_free(log->l_buf_cancel_table);
4996                log->l_buf_cancel_table = NULL;
4997                return error;
4998        }
4999        /*
5000         * Then do a second pass to actually recover the items in the log.
5001         * When it is complete, free the table of buf cancel items.
5002         */
5003        error = xlog_do_recovery_pass(log, head_blk, tail_blk,
5004                                      XLOG_RECOVER_PASS2, NULL);
5005#ifdef DEBUG
5006        if (!error) {
5007                int     i;
5008
5009                for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
5010                        ASSERT(list_empty(&log->l_buf_cancel_table[i]));
5011        }
5012#endif  /* DEBUG */
5013
5014        kmem_free(log->l_buf_cancel_table);
5015        log->l_buf_cancel_table = NULL;
5016
5017        return error;
5018}
5019
5020/*
5021 * Do the actual recovery
5022 */
5023STATIC int
5024xlog_do_recover(
5025        struct xlog     *log,
5026        xfs_daddr_t     head_blk,
5027        xfs_daddr_t     tail_blk)
5028{
5029        struct xfs_mount *mp = log->l_mp;
5030        int             error;
5031        xfs_buf_t       *bp;
5032        xfs_sb_t        *sbp;
5033
5034        /*
5035         * First replay the images in the log.
5036         */
5037        error = xlog_do_log_recovery(log, head_blk, tail_blk);
5038        if (error)
5039                return error;
5040
5041        /*
5042         * If IO errors happened during recovery, bail out.
5043         */
5044        if (XFS_FORCED_SHUTDOWN(mp)) {
5045                return -EIO;
5046        }
5047
5048        /*
5049         * We now update the tail_lsn since much of the recovery has completed
5050         * and there may be space available to use.  If there were no extent
5051         * frees or iunlinks, we can free up the entire log and set the tail_lsn to
5052         * be the last_sync_lsn.  This was set in xlog_find_tail to be the
5053         * lsn of the last known good LR on disk.  If there are extent frees
5054         * or iunlinks they will have some entries in the AIL; so we look at
5055         * the AIL to determine how to set the tail_lsn.
5056         */
5057        xlog_assign_tail_lsn(mp);
5058
5059        /*
5060         * Now that we've finished replaying all buffer and inode
5061         * updates, re-read in the superblock and reverify it.
5062         */
5063        bp = xfs_getsb(mp, 0);
5064        bp->b_flags &= ~(XBF_DONE | XBF_ASYNC);
5065        ASSERT(!(bp->b_flags & XBF_WRITE));
5066        bp->b_flags |= XBF_READ;
5067        bp->b_ops = &xfs_sb_buf_ops;
5068
5069        error = xfs_buf_submit_wait(bp);
5070        if (error) {
5071                if (!XFS_FORCED_SHUTDOWN(mp)) {
5072                        xfs_buf_ioerror_alert(bp, __func__);
5073                        ASSERT(0);
5074                }
5075                xfs_buf_relse(bp);
5076                return error;
5077        }
5078
5079        /* Convert superblock from on-disk format */
5080        sbp = &mp->m_sb;
5081        xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
5082        xfs_buf_relse(bp);
5083
5084        /* re-initialise in-core superblock and geometry structures */
5085        xfs_reinit_percpu_counters(mp);
5086        error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
5087        if (error) {
5088                xfs_warn(mp, "Failed post-recovery per-ag init: %d", error);
5089                return error;
5090        }
5091
5092        xlog_recover_check_summary(log);
5093
5094        /* Normal transactions can now occur */
5095        log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
5096        return 0;
5097}
5098
5099/*
5100 * Perform recovery and re-initialize some log variables in xlog_find_tail.
5101 *
5102 * Return error or zero.
5103 */
5104int
5105xlog_recover(
5106        struct xlog     *log)
5107{
5108        xfs_daddr_t     head_blk, tail_blk;
5109        int             error;
5110
5111        /* find the tail of the log */
5112        error = xlog_find_tail(log, &head_blk, &tail_blk);
5113        if (error)
5114                return error;
5115
5116        /*
5117         * The superblock was read before the log was available and thus the LSN
5118         * could not be verified. Check the superblock LSN against the current
5119         * LSN now that it's known.
5120         */
5121        if (xfs_sb_version_hascrc(&log->l_mp->m_sb) &&
5122            !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
5123                return -EINVAL;
5124
5125        if (tail_blk != head_blk) {
5126                /* There used to be a comment here:
5127                 *
5128                 * disallow recovery on read-only mounts.  note -- mount
5129                 * checks for ENOSPC and turns it into an intelligent
5130                 * error message.
5131                 * ...but this is no longer true.  Now, unless you specify
5132                 * NORECOVERY (in which case this function would never be
5133                 * called), we just go ahead and recover.  We do this all
5134                 * under the vfs layer, so we can get away with it unless
5135                 * the device itself is read-only, in which case we fail.
5136                 */
5137                if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
5138                        return error;
5139                }
5140
5141                /*
5142                 * Version 5 superblock log feature mask validation. We know the
5143                 * log is dirty so check if there are any unknown log features
5144                 * in what we need to recover. If there are unknown features
5145         * (e.g. unsupported transactions), then simply reject the
5146                 * attempt at recovery before touching anything.
5147                 */
5148                if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
5149                    xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
5150                                        XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
5151                        xfs_warn(log->l_mp,
5152"Superblock has unknown incompatible log features (0x%x) enabled.",
5153                                (log->l_mp->m_sb.sb_features_log_incompat &
5154                                        XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
5155                        xfs_warn(log->l_mp,
5156"The log can not be fully and/or safely recovered by this kernel.");
5157                        xfs_warn(log->l_mp,
5158"Please recover the log on a kernel that supports the unknown features.");
5159                        return -EINVAL;
5160                }
5161
5162                /*
5163                 * Delay log recovery if the debug hook is set. This is debug
5164         * instrumentation to coordinate simulation of I/O failures with
5165                 * log recovery.
5166                 */
5167                if (xfs_globals.log_recovery_delay) {
5168                        xfs_notice(log->l_mp,
5169                                "Delaying log recovery for %d seconds.",
5170                                xfs_globals.log_recovery_delay);
5171                        msleep(xfs_globals.log_recovery_delay * 1000);
5172                }
5173
5174                xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
5175                                log->l_mp->m_logname ? log->l_mp->m_logname
5176                                                     : "internal");
5177
5178                error = xlog_do_recover(log, head_blk, tail_blk);
5179                log->l_flags |= XLOG_RECOVERY_NEEDED;
5180        }
5181        return error;
5182}
5183
5184/*
5185 * In the first part of recovery we replay inodes and buffers and build
5186 * up the list of extent free items which need to be processed.  Here
5187 * we process the extent free items and clean up the on disk unlinked
5188 * inode lists.  This is separated from the first part of recovery so
5189 * that the root and real-time bitmap inodes can be read in from disk in
5190 * between the two stages.  This is necessary so that we can free space
5191 * in the real-time portion of the file system.
5192 */
5193int
5194xlog_recover_finish(
5195        struct xlog     *log)
5196{
5197        /*
5198         * Now we're ready to do the transactions needed for the
5199         * rest of recovery.  Start with completing all the extent
5200         * free intent records and then process the unlinked inode
5201         * lists.  At this point, we essentially run in normal mode
5202         * except that we're still performing recovery actions
5203         * rather than accepting new requests.
5204         */
5205        if (log->l_flags & XLOG_RECOVERY_NEEDED) {
5206                int     error;
5207                error = xlog_recover_process_intents(log);
5208                if (error) {
5209                        xfs_alert(log->l_mp, "Failed to recover intents");
5210                        return error;
5211                }
5212                /*
5213                 * Sync the log to get all the intents out of the AIL.
5214                 * This isn't absolutely necessary, but it helps in
5215                 * case the unlink transactions would have problems
5216                 * pushing the intents out of the way.
5217                 */
5218                xfs_log_force(log->l_mp, XFS_LOG_SYNC);
5219
5220                xlog_recover_process_iunlinks(log);
5221
5222                xlog_recover_check_summary(log);
5223
5224                xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
5225                                log->l_mp->m_logname ? log->l_mp->m_logname
5226                                                     : "internal");
5227                log->l_flags &= ~XLOG_RECOVERY_NEEDED;
5228        } else {
5229                xfs_info(log->l_mp, "Ending clean mount");
5230        }
5231        return 0;
5232}
5233
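    /*
     * Mount failure path: if log recovery was needed, release any intents it
     * left sitting in the AIL so they don't pin the AIL.
     */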
5234int
5235xlog_recover_cancel(
5236        struct xlog     *log)
5237{
5238        int             error = 0;
5239
5240        if (log->l_flags & XLOG_RECOVERY_NEEDED)
5241                error = xlog_recover_cancel_intents(log);
5242
5243        return error;
5244}
5245
5246#if defined(DEBUG)
5247/*
5248 * Read all of the agf and agi counters and check that they
5249 * are consistent with the superblock counters.
5250 */
5251void
5252xlog_recover_check_summary(
5253        struct xlog     *log)
5254{
5255        xfs_mount_t     *mp;
5256        xfs_agf_t       *agfp;
5257        xfs_buf_t       *agfbp;
5258        xfs_buf_t       *agibp;
5259        xfs_agnumber_t  agno;
5260        __uint64_t      freeblks;
5261        __uint64_t      itotal;
5262        __uint64_t      ifree;
5263        int             error;
5264
5265        mp = log->l_mp;
5266
5267        freeblks = 0LL;
5268        itotal = 0LL;
5269        ifree = 0LL;
5270        for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
5271                error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
5272                if (error) {
5273                        xfs_alert(mp, "%s agf read failed agno %d error %d",
5274                                                __func__, agno, error);
5275                } else {
5276                        agfp = XFS_BUF_TO_AGF(agfbp);
5277                        freeblks += be32_to_cpu(agfp->agf_freeblks) +
5278                                    be32_to_cpu(agfp->agf_flcount);
5279                        xfs_buf_relse(agfbp);
5280                }
5281
5282                error = xfs_read_agi(mp, NULL, agno, &agibp);
5283                if (error) {
5284                        xfs_alert(mp, "%s agi read failed agno %d error %d",
5285                                                __func__, agno, error);
5286                } else {
5287                        struct xfs_agi  *agi = XFS_BUF_TO_AGI(agibp);
5288
5289                        itotal += be32_to_cpu(agi->agi_count);
5290                        ifree += be32_to_cpu(agi->agi_freecount);
5291                        xfs_buf_relse(agibp);
5292                }
5293        }
5294}
5295#endif /* DEBUG */
5296