linux/fs/xfs/xfs_buf_item.c
<<
>>
Prefs
   1/*
   2 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include "xfs_fs.h"
  20#include "xfs_types.h"
  21#include "xfs_bit.h"
  22#include "xfs_log.h"
  23#include "xfs_inum.h"
  24#include "xfs_trans.h"
  25#include "xfs_sb.h"
  26#include "xfs_ag.h"
  27#include "xfs_dmapi.h"
  28#include "xfs_mount.h"
  29#include "xfs_buf_item.h"
  30#include "xfs_trans_priv.h"
  31#include "xfs_error.h"
  32
  33
/*
 * Zone from which buf log items are allocated; see the
 * kmem_zone_zalloc() call in xfs_buf_item_init().
 */
kmem_zone_t     *xfs_buf_item_zone;
  35
  36#ifdef XFS_TRANS_DEBUG
  37/*
  38 * This function uses an alternate strategy for tracking the bytes
  39 * that the user requests to be logged.  This can then be used
  40 * in conjunction with the bli_orig array in the buf log item to
  41 * catch bugs in our callers' code.
  42 *
  43 * We also double check the bits set in xfs_buf_item_log using a
  44 * simple algorithm to check that every byte is accounted for.
  45 */
  46STATIC void
  47xfs_buf_item_log_debug(
  48        xfs_buf_log_item_t      *bip,
  49        uint                    first,
  50        uint                    last)
  51{
  52        uint    x;
  53        uint    byte;
  54        uint    nbytes;
  55        uint    chunk_num;
  56        uint    word_num;
  57        uint    bit_num;
  58        uint    bit_set;
  59        uint    *wordp;
  60
  61        ASSERT(bip->bli_logged != NULL);
  62        byte = first;
  63        nbytes = last - first + 1;
  64        bfset(bip->bli_logged, first, nbytes);
  65        for (x = 0; x < nbytes; x++) {
  66                chunk_num = byte >> XFS_BLI_SHIFT;
  67                word_num = chunk_num >> BIT_TO_WORD_SHIFT;
  68                bit_num = chunk_num & (NBWORD - 1);
  69                wordp = &(bip->bli_format.blf_data_map[word_num]);
  70                bit_set = *wordp & (1 << bit_num);
  71                ASSERT(bit_set);
  72                byte++;
  73        }
  74}
  75
  76/*
  77 * This function is called when we flush something into a buffer without
  78 * logging it.  This happens for things like inodes which are logged
  79 * separately from the buffer.
  80 */
  81void
  82xfs_buf_item_flush_log_debug(
  83        xfs_buf_t       *bp,
  84        uint            first,
  85        uint            last)
  86{
  87        xfs_buf_log_item_t      *bip;
  88        uint                    nbytes;
  89
  90        bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
  91        if ((bip == NULL) || (bip->bli_item.li_type != XFS_LI_BUF)) {
  92                return;
  93        }
  94
  95        ASSERT(bip->bli_logged != NULL);
  96        nbytes = last - first + 1;
  97        bfset(bip->bli_logged, first, nbytes);
  98}
  99
 100/*
 101 * This function is called to verify that our callers have logged
 102 * all the bytes that they changed.
 103 *
 104 * It does this by comparing the original copy of the buffer stored in
 105 * the buf log item's bli_orig array to the current copy of the buffer
 106 * and ensuring that all bytes which mismatch are set in the bli_logged
 107 * array of the buf log item.
 108 */
 109STATIC void
 110xfs_buf_item_log_check(
 111        xfs_buf_log_item_t      *bip)
 112{
 113        char            *orig;
 114        char            *buffer;
 115        int             x;
 116        xfs_buf_t       *bp;
 117
 118        ASSERT(bip->bli_orig != NULL);
 119        ASSERT(bip->bli_logged != NULL);
 120
 121        bp = bip->bli_buf;
 122        ASSERT(XFS_BUF_COUNT(bp) > 0);
 123        ASSERT(XFS_BUF_PTR(bp) != NULL);
 124        orig = bip->bli_orig;
 125        buffer = XFS_BUF_PTR(bp);
 126        for (x = 0; x < XFS_BUF_COUNT(bp); x++) {
 127                if (orig[x] != buffer[x] && !btst(bip->bli_logged, x))
 128                        cmn_err(CE_PANIC,
 129        "xfs_buf_item_log_check bip %x buffer %x orig %x index %d",
 130                                bip, bp, orig, x);
 131        }
 132}
 133#else
/*
 * XFS_TRANS_DEBUG is not defined: the debug hooks expand to nothing.
 */
#define         xfs_buf_item_log_debug(x,y,z)
#define         xfs_buf_item_log_check(x)
 136#endif
 137
/*
 * Forward declarations.  xfs_buf_do_callbacks() is called from
 * xfs_buf_item_unpin() below, ahead of its definition.
 */
STATIC void     xfs_buf_error_relse(xfs_buf_t *bp);
STATIC void     xfs_buf_do_callbacks(xfs_buf_t *bp, xfs_log_item_t *lip);
 140
 141/*
 142 * This returns the number of log iovecs needed to log the
 143 * given buf log item.
 144 *
 145 * It calculates this as 1 iovec for the buf log format structure
 146 * and 1 for each stretch of non-contiguous chunks to be logged.
 147 * Contiguous chunks are logged in a single iovec.
 148 *
 149 * If the XFS_BLI_STALE flag has been set, then log nothing.
 150 */
 151STATIC uint
 152xfs_buf_item_size(
 153        xfs_buf_log_item_t      *bip)
 154{
 155        uint            nvecs;
 156        int             next_bit;
 157        int             last_bit;
 158        xfs_buf_t       *bp;
 159
 160        ASSERT(atomic_read(&bip->bli_refcount) > 0);
 161        if (bip->bli_flags & XFS_BLI_STALE) {
 162                /*
 163                 * The buffer is stale, so all we need to log
 164                 * is the buf log format structure with the
 165                 * cancel flag in it.
 166                 */
 167                xfs_buf_item_trace("SIZE STALE", bip);
 168                ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
 169                return 1;
 170        }
 171
 172        bp = bip->bli_buf;
 173        ASSERT(bip->bli_flags & XFS_BLI_LOGGED);
 174        nvecs = 1;
 175        last_bit = xfs_next_bit(bip->bli_format.blf_data_map,
 176                                         bip->bli_format.blf_map_size, 0);
 177        ASSERT(last_bit != -1);
 178        nvecs++;
 179        while (last_bit != -1) {
 180                /*
 181                 * This takes the bit number to start looking from and
 182                 * returns the next set bit from there.  It returns -1
 183                 * if there are no more bits set or the start bit is
 184                 * beyond the end of the bitmap.
 185                 */
 186                next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
 187                                                 bip->bli_format.blf_map_size,
 188                                                 last_bit + 1);
 189                /*
 190                 * If we run out of bits, leave the loop,
 191                 * else if we find a new set of bits bump the number of vecs,
 192                 * else keep scanning the current set of bits.
 193                 */
 194                if (next_bit == -1) {
 195                        last_bit = -1;
 196                } else if (next_bit != last_bit + 1) {
 197                        last_bit = next_bit;
 198                        nvecs++;
 199                } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) !=
 200                           (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) +
 201                            XFS_BLI_CHUNK)) {
 202                        last_bit = next_bit;
 203                        nvecs++;
 204                } else {
 205                        last_bit++;
 206                }
 207        }
 208
 209        xfs_buf_item_trace("SIZE NORM", bip);
 210        return nvecs;
 211}
 212
 213/*
 214 * This is called to fill in the vector of log iovecs for the
 215 * given log buf item.  It fills the first entry with a buf log
 216 * format structure, and the rest point to contiguous chunks
 217 * within the buffer.
 218 */
 219STATIC void
 220xfs_buf_item_format(
 221        xfs_buf_log_item_t      *bip,
 222        xfs_log_iovec_t         *log_vector)
 223{
 224        uint            base_size;
 225        uint            nvecs;
 226        xfs_log_iovec_t *vecp;
 227        xfs_buf_t       *bp;
 228        int             first_bit;
 229        int             last_bit;
 230        int             next_bit;
 231        uint            nbits;
 232        uint            buffer_offset;
 233
 234        ASSERT(atomic_read(&bip->bli_refcount) > 0);
 235        ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
 236               (bip->bli_flags & XFS_BLI_STALE));
 237        bp = bip->bli_buf;
 238        vecp = log_vector;
 239
 240        /*
 241         * The size of the base structure is the size of the
 242         * declared structure plus the space for the extra words
 243         * of the bitmap.  We subtract one from the map size, because
 244         * the first element of the bitmap is accounted for in the
 245         * size of the base structure.
 246         */
 247        base_size =
 248                (uint)(sizeof(xfs_buf_log_format_t) +
 249                       ((bip->bli_format.blf_map_size - 1) * sizeof(uint)));
 250        vecp->i_addr = (xfs_caddr_t)&bip->bli_format;
 251        vecp->i_len = base_size;
 252        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BFORMAT);
 253        vecp++;
 254        nvecs = 1;
 255
 256        if (bip->bli_flags & XFS_BLI_STALE) {
 257                /*
 258                 * The buffer is stale, so all we need to log
 259                 * is the buf log format structure with the
 260                 * cancel flag in it.
 261                 */
 262                xfs_buf_item_trace("FORMAT STALE", bip);
 263                ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
 264                bip->bli_format.blf_size = nvecs;
 265                return;
 266        }
 267
 268        /*
 269         * Fill in an iovec for each set of contiguous chunks.
 270         */
 271        first_bit = xfs_next_bit(bip->bli_format.blf_data_map,
 272                                         bip->bli_format.blf_map_size, 0);
 273        ASSERT(first_bit != -1);
 274        last_bit = first_bit;
 275        nbits = 1;
 276        for (;;) {
 277                /*
 278                 * This takes the bit number to start looking from and
 279                 * returns the next set bit from there.  It returns -1
 280                 * if there are no more bits set or the start bit is
 281                 * beyond the end of the bitmap.
 282                 */
 283                next_bit = xfs_next_bit(bip->bli_format.blf_data_map,
 284                                                 bip->bli_format.blf_map_size,
 285                                                 (uint)last_bit + 1);
 286                /*
 287                 * If we run out of bits fill in the last iovec and get
 288                 * out of the loop.
 289                 * Else if we start a new set of bits then fill in the
 290                 * iovec for the series we were looking at and start
 291                 * counting the bits in the new one.
 292                 * Else we're still in the same set of bits so just
 293                 * keep counting and scanning.
 294                 */
 295                if (next_bit == -1) {
 296                        buffer_offset = first_bit * XFS_BLI_CHUNK;
 297                        vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 298                        vecp->i_len = nbits * XFS_BLI_CHUNK;
 299                        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
 300                        nvecs++;
 301                        break;
 302                } else if (next_bit != last_bit + 1) {
 303                        buffer_offset = first_bit * XFS_BLI_CHUNK;
 304                        vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 305                        vecp->i_len = nbits * XFS_BLI_CHUNK;
 306                        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
 307                        nvecs++;
 308                        vecp++;
 309                        first_bit = next_bit;
 310                        last_bit = next_bit;
 311                        nbits = 1;
 312                } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) !=
 313                           (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) +
 314                            XFS_BLI_CHUNK)) {
 315                        buffer_offset = first_bit * XFS_BLI_CHUNK;
 316                        vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
 317                        vecp->i_len = nbits * XFS_BLI_CHUNK;
 318                        XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_BCHUNK);
 319/* You would think we need to bump the nvecs here too, but we do not
 320 * this number is used by recovery, and it gets confused by the boundary
 321 * split here
 322 *                      nvecs++;
 323 */
 324                        vecp++;
 325                        first_bit = next_bit;
 326                        last_bit = next_bit;
 327                        nbits = 1;
 328                } else {
 329                        last_bit++;
 330                        nbits++;
 331                }
 332        }
 333        bip->bli_format.blf_size = nvecs;
 334
 335        /*
 336         * Check to make sure everything is consistent.
 337         */
 338        xfs_buf_item_trace("FORMAT NORM", bip);
 339        xfs_buf_item_log_check(bip);
 340}
 341
 342/*
 343 * This is called to pin the buffer associated with the buf log
 344 * item in memory so it cannot be written out.  Simply call bpin()
 345 * on the buffer to do this.
 346 */
 347STATIC void
 348xfs_buf_item_pin(
 349        xfs_buf_log_item_t      *bip)
 350{
 351        xfs_buf_t       *bp;
 352
 353        bp = bip->bli_buf;
 354        ASSERT(XFS_BUF_ISBUSY(bp));
 355        ASSERT(atomic_read(&bip->bli_refcount) > 0);
 356        ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
 357               (bip->bli_flags & XFS_BLI_STALE));
 358        xfs_buf_item_trace("PIN", bip);
 359        xfs_buftrace("XFS_PIN", bp);
 360        xfs_bpin(bp);
 361}
 362
 363
 364/*
 365 * This is called to unpin the buffer associated with the buf log
 366 * item which was previously pinned with a call to xfs_buf_item_pin().
 367 * Just call bunpin() on the buffer to do this.
 368 *
 369 * Also drop the reference to the buf item for the current transaction.
 370 * If the XFS_BLI_STALE flag is set and we are the last reference,
 371 * then free up the buf log item and unlock the buffer.
 372 */
 373STATIC void
 374xfs_buf_item_unpin(
 375        xfs_buf_log_item_t      *bip,
 376        int                     stale)
 377{
 378        struct xfs_ail  *ailp;
 379        xfs_buf_t       *bp;
 380        int             freed;
 381
 382        bp = bip->bli_buf;
 383        ASSERT(bp != NULL);
 384        ASSERT(XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *) == bip);
 385        ASSERT(atomic_read(&bip->bli_refcount) > 0);
 386        xfs_buf_item_trace("UNPIN", bip);
 387        xfs_buftrace("XFS_UNPIN", bp);
 388
 389        freed = atomic_dec_and_test(&bip->bli_refcount);
 390        ailp = bip->bli_item.li_ailp;
 391        xfs_bunpin(bp);
 392        if (freed && stale) {
 393                ASSERT(bip->bli_flags & XFS_BLI_STALE);
 394                ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 395                ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
 396                ASSERT(XFS_BUF_ISSTALE(bp));
 397                ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
 398                xfs_buf_item_trace("UNPIN STALE", bip);
 399                xfs_buftrace("XFS_UNPIN STALE", bp);
 400                /*
 401                 * If we get called here because of an IO error, we may
 402                 * or may not have the item on the AIL. xfs_trans_ail_delete()
 403                 * will take care of that situation.
 404                 * xfs_trans_ail_delete() drops the AIL lock.
 405                 */
 406                if (bip->bli_flags & XFS_BLI_STALE_INODE) {
 407                        xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
 408                        XFS_BUF_SET_FSPRIVATE(bp, NULL);
 409                        XFS_BUF_CLR_IODONE_FUNC(bp);
 410                } else {
 411                        spin_lock(&ailp->xa_lock);
 412                        xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
 413                        xfs_buf_item_relse(bp);
 414                        ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
 415                }
 416                xfs_buf_relse(bp);
 417        }
 418}
 419
 420/*
 421 * this is called from uncommit in the forced-shutdown path.
 422 * we need to check to see if the reference count on the log item
 423 * is going to drop to zero.  If so, unpin will free the log item
 424 * so we need to free the item's descriptor (that points to the item)
 425 * in the transaction.
 426 */
 427STATIC void
 428xfs_buf_item_unpin_remove(
 429        xfs_buf_log_item_t      *bip,
 430        xfs_trans_t             *tp)
 431{
 432        xfs_buf_t               *bp;
 433        xfs_log_item_desc_t     *lidp;
 434        int                     stale = 0;
 435
 436        bp = bip->bli_buf;
 437        /*
 438         * will xfs_buf_item_unpin() call xfs_buf_item_relse()?
 439         */
 440        if ((atomic_read(&bip->bli_refcount) == 1) &&
 441            (bip->bli_flags & XFS_BLI_STALE)) {
 442                ASSERT(XFS_BUF_VALUSEMA(bip->bli_buf) <= 0);
 443                xfs_buf_item_trace("UNPIN REMOVE", bip);
 444                xfs_buftrace("XFS_UNPIN_REMOVE", bp);
 445                /*
 446                 * yes -- clear the xaction descriptor in-use flag
 447                 * and free the chunk if required.  We can safely
 448                 * do some work here and then call buf_item_unpin
 449                 * to do the rest because if the if is true, then
 450                 * we are holding the buffer locked so no one else
 451                 * will be able to bump up the refcount.
 452                 */
 453                lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) bip);
 454                stale = lidp->lid_flags & XFS_LID_BUF_STALE;
 455                xfs_trans_free_item(tp, lidp);
 456                /*
 457                 * Since the transaction no longer refers to the buffer,
 458                 * the buffer should no longer refer to the transaction.
 459                 */
 460                XFS_BUF_SET_FSPRIVATE2(bp, NULL);
 461        }
 462
 463        xfs_buf_item_unpin(bip, stale);
 464
 465        return;
 466}
 467
 468/*
 469 * This is called to attempt to lock the buffer associated with this
 470 * buf log item.  Don't sleep on the buffer lock.  If we can't get
 471 * the lock right away, return 0.  If we can get the lock, pull the
 472 * buffer from the free list, mark it busy, and return 1.
 473 */
 474STATIC uint
 475xfs_buf_item_trylock(
 476        xfs_buf_log_item_t      *bip)
 477{
 478        xfs_buf_t       *bp;
 479
 480        bp = bip->bli_buf;
 481
 482        if (XFS_BUF_ISPINNED(bp)) {
 483                return XFS_ITEM_PINNED;
 484        }
 485
 486        if (!XFS_BUF_CPSEMA(bp)) {
 487                return XFS_ITEM_LOCKED;
 488        }
 489
 490        /*
 491         * Remove the buffer from the free list.  Only do this
 492         * if it's on the free list.  Private buffers like the
 493         * superblock buffer are not.
 494         */
 495        XFS_BUF_HOLD(bp);
 496
 497        ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
 498        xfs_buf_item_trace("TRYLOCK SUCCESS", bip);
 499        return XFS_ITEM_SUCCESS;
 500}
 501
 502/*
 503 * Release the buffer associated with the buf log item.
 504 * If there is no dirty logged data associated with the
 505 * buffer recorded in the buf log item, then free the
 506 * buf log item and remove the reference to it in the
 507 * buffer.
 508 *
 509 * This call ignores the recursion count.  It is only called
 510 * when the buffer should REALLY be unlocked, regardless
 511 * of the recursion count.
 512 *
 513 * If the XFS_BLI_HOLD flag is set in the buf log item, then
 514 * free the log item if necessary but do not unlock the buffer.
 515 * This is for support of xfs_trans_bhold(). Make sure the
 516 * XFS_BLI_HOLD field is cleared if we don't free the item.
 517 */
 518STATIC void
 519xfs_buf_item_unlock(
 520        xfs_buf_log_item_t      *bip)
 521{
 522        int             aborted;
 523        xfs_buf_t       *bp;
 524        uint            hold;
 525
 526        bp = bip->bli_buf;
 527        xfs_buftrace("XFS_UNLOCK", bp);
 528
 529        /*
 530         * Clear the buffer's association with this transaction.
 531         */
 532        XFS_BUF_SET_FSPRIVATE2(bp, NULL);
 533
 534        /*
 535         * If this is a transaction abort, don't return early.
 536         * Instead, allow the brelse to happen.
 537         * Normally it would be done for stale (cancelled) buffers
 538         * at unpin time, but we'll never go through the pin/unpin
 539         * cycle if we abort inside commit.
 540         */
 541        aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
 542
 543        /*
 544         * If the buf item is marked stale, then don't do anything.
 545         * We'll unlock the buffer and free the buf item when the
 546         * buffer is unpinned for the last time.
 547         */
 548        if (bip->bli_flags & XFS_BLI_STALE) {
 549                bip->bli_flags &= ~XFS_BLI_LOGGED;
 550                xfs_buf_item_trace("UNLOCK STALE", bip);
 551                ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
 552                if (!aborted)
 553                        return;
 554        }
 555
 556        /*
 557         * Drop the transaction's reference to the log item if
 558         * it was not logged as part of the transaction.  Otherwise
 559         * we'll drop the reference in xfs_buf_item_unpin() when
 560         * the transaction is really through with the buffer.
 561         */
 562        if (!(bip->bli_flags & XFS_BLI_LOGGED)) {
 563                atomic_dec(&bip->bli_refcount);
 564        } else {
 565                /*
 566                 * Clear the logged flag since this is per
 567                 * transaction state.
 568                 */
 569                bip->bli_flags &= ~XFS_BLI_LOGGED;
 570        }
 571
 572        /*
 573         * Before possibly freeing the buf item, determine if we should
 574         * release the buffer at the end of this routine.
 575         */
 576        hold = bip->bli_flags & XFS_BLI_HOLD;
 577        xfs_buf_item_trace("UNLOCK", bip);
 578
 579        /*
 580         * If the buf item isn't tracking any data, free it.
 581         * Otherwise, if XFS_BLI_HOLD is set clear it.
 582         */
 583        if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
 584                             bip->bli_format.blf_map_size)) {
 585                xfs_buf_item_relse(bp);
 586        } else if (hold) {
 587                bip->bli_flags &= ~XFS_BLI_HOLD;
 588        }
 589
 590        /*
 591         * Release the buffer if XFS_BLI_HOLD was not set.
 592         */
 593        if (!hold) {
 594                xfs_buf_relse(bp);
 595        }
 596}
 597
 598/*
 599 * This is called to find out where the oldest active copy of the
 600 * buf log item in the on disk log resides now that the last log
 601 * write of it completed at the given lsn.
 602 * We always re-log all the dirty data in a buffer, so usually the
 603 * latest copy in the on disk log is the only one that matters.  For
 604 * those cases we simply return the given lsn.
 605 *
 606 * The one exception to this is for buffers full of newly allocated
 607 * inodes.  These buffers are only relogged with the XFS_BLI_INODE_BUF
 608 * flag set, indicating that only the di_next_unlinked fields from the
 609 * inodes in the buffers will be replayed during recovery.  If the
 610 * original newly allocated inode images have not yet been flushed
 611 * when the buffer is so relogged, then we need to make sure that we
 612 * keep the old images in the 'active' portion of the log.  We do this
 613 * by returning the original lsn of that transaction here rather than
 614 * the current one.
 615 */
 616STATIC xfs_lsn_t
 617xfs_buf_item_committed(
 618        xfs_buf_log_item_t      *bip,
 619        xfs_lsn_t               lsn)
 620{
 621        xfs_buf_item_trace("COMMITTED", bip);
 622        if ((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
 623            (bip->bli_item.li_lsn != 0)) {
 624                return bip->bli_item.li_lsn;
 625        }
 626        return (lsn);
 627}
 628
 629/*
 630 * This is called to asynchronously write the buffer associated with this
 631 * buf log item out to disk. The buffer will already have been locked by
 632 * a successful call to xfs_buf_item_trylock().  If the buffer still has
 633 * B_DELWRI set, then get it going out to disk with a call to bawrite().
 634 * If not, then just release the buffer.
 635 */
 636STATIC void
 637xfs_buf_item_push(
 638        xfs_buf_log_item_t      *bip)
 639{
 640        xfs_buf_t       *bp;
 641
 642        ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
 643        xfs_buf_item_trace("PUSH", bip);
 644
 645        bp = bip->bli_buf;
 646
 647        if (XFS_BUF_ISDELAYWRITE(bp)) {
 648                int     error;
 649                error = xfs_bawrite(bip->bli_item.li_mountp, bp);
 650                if (error)
 651                        xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp,
 652                        "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p",
 653                                        error, bip, bp);
 654        } else {
 655                xfs_buf_relse(bp);
 656        }
 657}
 658
 659/* ARGSUSED */
 660STATIC void
 661xfs_buf_item_committing(xfs_buf_log_item_t *bip, xfs_lsn_t commit_lsn)
 662{
 663}
 664
/*
 * This is the ops vector shared by all buf log items.
 *
 * The handlers above take an xfs_buf_log_item_t *, so each one is cast
 * to the generic signature taking an xfs_log_item_t *.  There is no
 * pushbuf handler for buf items, so iop_pushbuf is left NULL.
 */
static struct xfs_item_ops xfs_buf_item_ops = {
        .iop_size       = (uint(*)(xfs_log_item_t*))xfs_buf_item_size,
        .iop_format     = (void(*)(xfs_log_item_t*, xfs_log_iovec_t*))
                                        xfs_buf_item_format,
        .iop_pin        = (void(*)(xfs_log_item_t*))xfs_buf_item_pin,
        .iop_unpin      = (void(*)(xfs_log_item_t*, int))xfs_buf_item_unpin,
        .iop_unpin_remove = (void(*)(xfs_log_item_t*, xfs_trans_t *))
                                        xfs_buf_item_unpin_remove,
        .iop_trylock    = (uint(*)(xfs_log_item_t*))xfs_buf_item_trylock,
        .iop_unlock     = (void(*)(xfs_log_item_t*))xfs_buf_item_unlock,
        .iop_committed  = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
                                        xfs_buf_item_committed,
        .iop_push       = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
        .iop_pushbuf    = NULL,
        .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
                                        xfs_buf_item_committing
};
 685
 686
 687/*
 688 * Allocate a new buf log item to go with the given buffer.
 689 * Set the buffer's b_fsprivate field to point to the new
 690 * buf log item.  If there are other item's attached to the
 691 * buffer (see xfs_buf_attach_iodone() below), then put the
 692 * buf log item at the front.
 693 */
 694void
 695xfs_buf_item_init(
 696        xfs_buf_t       *bp,
 697        xfs_mount_t     *mp)
 698{
 699        xfs_log_item_t          *lip;
 700        xfs_buf_log_item_t      *bip;
 701        int                     chunks;
 702        int                     map_size;
 703
 704        /*
 705         * Check to see if there is already a buf log item for
 706         * this buffer.  If there is, it is guaranteed to be
 707         * the first.  If we do already have one, there is
 708         * nothing to do here so return.
 709         */
 710        if (bp->b_mount != mp)
 711                bp->b_mount = mp;
 712        XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
 713        if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
 714                lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
 715                if (lip->li_type == XFS_LI_BUF) {
 716                        return;
 717                }
 718        }
 719
 720        /*
 721         * chunks is the number of XFS_BLI_CHUNK size pieces
 722         * the buffer can be divided into. Make sure not to
 723         * truncate any pieces.  map_size is the size of the
 724         * bitmap needed to describe the chunks of the buffer.
 725         */
 726        chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT);
 727        map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
 728
 729        bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
 730                                                    KM_SLEEP);
 731        bip->bli_item.li_type = XFS_LI_BUF;
 732        bip->bli_item.li_ops = &xfs_buf_item_ops;
 733        bip->bli_item.li_mountp = mp;
 734        bip->bli_item.li_ailp = mp->m_ail;
 735        bip->bli_buf = bp;
 736        xfs_buf_hold(bp);
 737        bip->bli_format.blf_type = XFS_LI_BUF;
 738        bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp);
 739        bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp));
 740        bip->bli_format.blf_map_size = map_size;
 741#ifdef XFS_BLI_TRACE
 742        bip->bli_trace = ktrace_alloc(XFS_BLI_TRACE_SIZE, KM_NOFS);
 743#endif
 744
 745#ifdef XFS_TRANS_DEBUG
 746        /*
 747         * Allocate the arrays for tracking what needs to be logged
 748         * and what our callers request to be logged.  bli_orig
 749         * holds a copy of the original, clean buffer for comparison
 750         * against, and bli_logged keeps a 1 bit flag per byte in
 751         * the buffer to indicate which bytes the callers have asked
 752         * to have logged.
 753         */
 754        bip->bli_orig = (char *)kmem_alloc(XFS_BUF_COUNT(bp), KM_SLEEP);
 755        memcpy(bip->bli_orig, XFS_BUF_PTR(bp), XFS_BUF_COUNT(bp));
 756        bip->bli_logged = (char *)kmem_zalloc(XFS_BUF_COUNT(bp) / NBBY, KM_SLEEP);
 757#endif
 758
 759        /*
 760         * Put the buf item into the list of items attached to the
 761         * buffer at the front.
 762         */
 763        if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
 764                bip->bli_item.li_bio_list =
 765                                XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
 766        }
 767        XFS_BUF_SET_FSPRIVATE(bp, bip);
 768}
 769
 770
 771/*
 772 * Mark bytes first through last inclusive as dirty in the buf
 773 * item's bitmap.
 774 */
 775void
 776xfs_buf_item_log(
 777        xfs_buf_log_item_t      *bip,
 778        uint                    first,
 779        uint                    last)
 780{
 781        uint            first_bit;
 782        uint            last_bit;
 783        uint            bits_to_set;
 784        uint            bits_set;
 785        uint            word_num;
 786        uint            *wordp;
 787        uint            bit;
 788        uint            end_bit;
 789        uint            mask;
 790
 791        /*
 792         * Mark the item as having some dirty data for
 793         * quick reference in xfs_buf_item_dirty.
 794         */
 795        bip->bli_flags |= XFS_BLI_DIRTY;
 796
 797        /*
 798         * Convert byte offsets to bit numbers.
 799         */
 800        first_bit = first >> XFS_BLI_SHIFT;
 801        last_bit = last >> XFS_BLI_SHIFT;
 802
 803        /*
 804         * Calculate the total number of bits to be set.
 805         */
 806        bits_to_set = last_bit - first_bit + 1;
 807
 808        /*
 809         * Get a pointer to the first word in the bitmap
 810         * to set a bit in.
 811         */
 812        word_num = first_bit >> BIT_TO_WORD_SHIFT;
 813        wordp = &(bip->bli_format.blf_data_map[word_num]);
 814
 815        /*
 816         * Calculate the starting bit in the first word.
 817         */
 818        bit = first_bit & (uint)(NBWORD - 1);
 819
 820        /*
 821         * First set any bits in the first word of our range.
 822         * If it starts at bit 0 of the word, it will be
 823         * set below rather than here.  That is what the variable
 824         * bit tells us. The variable bits_set tracks the number
 825         * of bits that have been set so far.  End_bit is the number
 826         * of the last bit to be set in this word plus one.
 827         */
 828        if (bit) {
 829                end_bit = MIN(bit + bits_to_set, (uint)NBWORD);
 830                mask = ((1 << (end_bit - bit)) - 1) << bit;
 831                *wordp |= mask;
 832                wordp++;
 833                bits_set = end_bit - bit;
 834        } else {
 835                bits_set = 0;
 836        }
 837
 838        /*
 839         * Now set bits a whole word at a time that are between
 840         * first_bit and last_bit.
 841         */
 842        while ((bits_to_set - bits_set) >= NBWORD) {
 843                *wordp |= 0xffffffff;
 844                bits_set += NBWORD;
 845                wordp++;
 846        }
 847
 848        /*
 849         * Finally, set any bits left to be set in one last partial word.
 850         */
 851        end_bit = bits_to_set - bits_set;
 852        if (end_bit) {
 853                mask = (1 << end_bit) - 1;
 854                *wordp |= mask;
 855        }
 856
 857        xfs_buf_item_log_debug(bip, first, last);
 858}
 859
 860
 861/*
 862 * Return 1 if the buffer has some data that has been logged (at any
 863 * point, not just the current transaction) and 0 if not.
 864 */
 865uint
 866xfs_buf_item_dirty(
 867        xfs_buf_log_item_t      *bip)
 868{
 869        return (bip->bli_flags & XFS_BLI_DIRTY);
 870}
 871
 872STATIC void
 873xfs_buf_item_free(
 874        xfs_buf_log_item_t      *bip)
 875{
 876#ifdef XFS_TRANS_DEBUG
 877        kmem_free(bip->bli_orig);
 878        kmem_free(bip->bli_logged);
 879#endif /* XFS_TRANS_DEBUG */
 880
 881#ifdef XFS_BLI_TRACE
 882        ktrace_free(bip->bli_trace);
 883#endif
 884        kmem_zone_free(xfs_buf_item_zone, bip);
 885}
 886
 887/*
 888 * This is called when the buf log item is no longer needed.  It should
 889 * free the buf log item associated with the given buffer and clear
 890 * the buffer's pointer to the buf log item.  If there are no more
 891 * items in the list, clear the b_iodone field of the buffer (see
 892 * xfs_buf_attach_iodone() below).
 893 */
 894void
 895xfs_buf_item_relse(
 896        xfs_buf_t       *bp)
 897{
 898        xfs_buf_log_item_t      *bip;
 899
 900        xfs_buftrace("XFS_RELSE", bp);
 901        bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t*);
 902        XFS_BUF_SET_FSPRIVATE(bp, bip->bli_item.li_bio_list);
 903        if ((XFS_BUF_FSPRIVATE(bp, void *) == NULL) &&
 904            (XFS_BUF_IODONE_FUNC(bp) != NULL)) {
 905                XFS_BUF_CLR_IODONE_FUNC(bp);
 906        }
 907        xfs_buf_rele(bp);
 908        xfs_buf_item_free(bip);
 909}
 910
 911
 912/*
 913 * Add the given log item with its callback to the list of callbacks
 914 * to be called when the buffer's I/O completes.  If it is not set
 915 * already, set the buffer's b_iodone() routine to be
 916 * xfs_buf_iodone_callbacks() and link the log item into the list of
 917 * items rooted at b_fsprivate.  Items are always added as the second
 918 * entry in the list if there is a first, because the buf item code
 919 * assumes that the buf log item is first.
 920 */
 921void
 922xfs_buf_attach_iodone(
 923        xfs_buf_t       *bp,
 924        void            (*cb)(xfs_buf_t *, xfs_log_item_t *),
 925        xfs_log_item_t  *lip)
 926{
 927        xfs_log_item_t  *head_lip;
 928
 929        ASSERT(XFS_BUF_ISBUSY(bp));
 930        ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
 931
 932        lip->li_cb = cb;
 933        if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
 934                head_lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
 935                lip->li_bio_list = head_lip->li_bio_list;
 936                head_lip->li_bio_list = lip;
 937        } else {
 938                XFS_BUF_SET_FSPRIVATE(bp, lip);
 939        }
 940
 941        ASSERT((XFS_BUF_IODONE_FUNC(bp) == xfs_buf_iodone_callbacks) ||
 942               (XFS_BUF_IODONE_FUNC(bp) == NULL));
 943        XFS_BUF_SET_IODONE_FUNC(bp, xfs_buf_iodone_callbacks);
 944}
 945
 946STATIC void
 947xfs_buf_do_callbacks(
 948        xfs_buf_t       *bp,
 949        xfs_log_item_t  *lip)
 950{
 951        xfs_log_item_t  *nlip;
 952
 953        while (lip != NULL) {
 954                nlip = lip->li_bio_list;
 955                ASSERT(lip->li_cb != NULL);
 956                /*
 957                 * Clear the next pointer so we don't have any
 958                 * confusion if the item is added to another buf.
 959                 * Don't touch the log item after calling its
 960                 * callback, because it could have freed itself.
 961                 */
 962                lip->li_bio_list = NULL;
 963                lip->li_cb(bp, lip);
 964                lip = nlip;
 965        }
 966}
 967
 968/*
 969 * This is the iodone() function for buffers which have had callbacks
 970 * attached to them by xfs_buf_attach_iodone().  It should remove each
 971 * log item from the buffer's list and call the callback of each in turn.
 972 * When done, the buffer's fsprivate field is set to NULL and the buffer
 973 * is unlocked with a call to iodone().
 974 */
 975void
 976xfs_buf_iodone_callbacks(
 977        xfs_buf_t       *bp)
 978{
 979        xfs_log_item_t  *lip;
 980        static ulong    lasttime;
 981        static xfs_buftarg_t *lasttarg;
 982        xfs_mount_t     *mp;
 983
 984        ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
 985        lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
 986
 987        if (XFS_BUF_GETERROR(bp) != 0) {
 988                /*
 989                 * If we've already decided to shutdown the filesystem
 990                 * because of IO errors, there's no point in giving this
 991                 * a retry.
 992                 */
 993                mp = lip->li_mountp;
 994                if (XFS_FORCED_SHUTDOWN(mp)) {
 995                        ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
 996                        XFS_BUF_SUPER_STALE(bp);
 997                        xfs_buftrace("BUF_IODONE_CB", bp);
 998                        xfs_buf_do_callbacks(bp, lip);
 999                        XFS_BUF_SET_FSPRIVATE(bp, NULL);
1000                        XFS_BUF_CLR_IODONE_FUNC(bp);
1001                        xfs_biodone(bp);
1002                        return;
1003                }
1004
1005                if ((XFS_BUF_TARGET(bp) != lasttarg) ||
1006                    (time_after(jiffies, (lasttime + 5*HZ)))) {
1007                        lasttime = jiffies;
1008                        cmn_err(CE_ALERT, "Device %s, XFS metadata write error"
1009                                        " block 0x%llx in %s",
1010                                XFS_BUFTARG_NAME(XFS_BUF_TARGET(bp)),
1011                              (__uint64_t)XFS_BUF_ADDR(bp), mp->m_fsname);
1012                }
1013                lasttarg = XFS_BUF_TARGET(bp);
1014
1015                if (XFS_BUF_ISASYNC(bp)) {
1016                        /*
1017                         * If the write was asynchronous then noone will be
1018                         * looking for the error.  Clear the error state
1019                         * and write the buffer out again delayed write.
1020                         *
1021                         * XXXsup This is OK, so long as we catch these
1022                         * before we start the umount; we don't want these
1023                         * DELWRI metadata bufs to be hanging around.
1024                         */
1025                        XFS_BUF_ERROR(bp,0); /* errno of 0 unsets the flag */
1026
1027                        if (!(XFS_BUF_ISSTALE(bp))) {
1028                                XFS_BUF_DELAYWRITE(bp);
1029                                XFS_BUF_DONE(bp);
1030                                XFS_BUF_SET_START(bp);
1031                        }
1032                        ASSERT(XFS_BUF_IODONE_FUNC(bp));
1033                        xfs_buftrace("BUF_IODONE ASYNC", bp);
1034                        xfs_buf_relse(bp);
1035                } else {
1036                        /*
1037                         * If the write of the buffer was not asynchronous,
1038                         * then we want to make sure to return the error
1039                         * to the caller of bwrite().  Because of this we
1040                         * cannot clear the B_ERROR state at this point.
1041                         * Instead we install a callback function that
1042                         * will be called when the buffer is released, and
1043                         * that routine will clear the error state and
1044                         * set the buffer to be written out again after
1045                         * some delay.
1046                         */
1047                        /* We actually overwrite the existing b-relse
1048                           function at times, but we're gonna be shutting down
1049                           anyway. */
1050                        XFS_BUF_SET_BRELSE_FUNC(bp,xfs_buf_error_relse);
1051                        XFS_BUF_DONE(bp);
1052                        XFS_BUF_FINISH_IOWAIT(bp);
1053                }
1054                return;
1055        }
1056#ifdef XFSERRORDEBUG
1057        xfs_buftrace("XFS BUFCB NOERR", bp);
1058#endif
1059        xfs_buf_do_callbacks(bp, lip);
1060        XFS_BUF_SET_FSPRIVATE(bp, NULL);
1061        XFS_BUF_CLR_IODONE_FUNC(bp);
1062        xfs_biodone(bp);
1063}
1064
1065/*
1066 * This is a callback routine attached to a buffer which gets an error
1067 * when being written out synchronously.
1068 */
1069STATIC void
1070xfs_buf_error_relse(
1071        xfs_buf_t       *bp)
1072{
1073        xfs_log_item_t  *lip;
1074        xfs_mount_t     *mp;
1075
1076        lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1077        mp = (xfs_mount_t *)lip->li_mountp;
1078        ASSERT(XFS_BUF_TARGET(bp) == mp->m_ddev_targp);
1079
1080        XFS_BUF_STALE(bp);
1081        XFS_BUF_DONE(bp);
1082        XFS_BUF_UNDELAYWRITE(bp);
1083        XFS_BUF_ERROR(bp,0);
1084        xfs_buftrace("BUF_ERROR_RELSE", bp);
1085        if (! XFS_FORCED_SHUTDOWN(mp))
1086                xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1087        /*
1088         * We have to unpin the pinned buffers so do the
1089         * callbacks.
1090         */
1091        xfs_buf_do_callbacks(bp, lip);
1092        XFS_BUF_SET_FSPRIVATE(bp, NULL);
1093        XFS_BUF_CLR_IODONE_FUNC(bp);
1094        XFS_BUF_SET_BRELSE_FUNC(bp,NULL);
1095        xfs_buf_relse(bp);
1096}
1097
1098
1099/*
1100 * This is the iodone() function for buffers which have been
1101 * logged.  It is called when they are eventually flushed out.
1102 * It should remove the buf item from the AIL, and free the buf item.
1103 * It is called by xfs_buf_iodone_callbacks() above which will take
1104 * care of cleaning up the buffer itself.
1105 */
1106/* ARGSUSED */
1107void
1108xfs_buf_iodone(
1109        xfs_buf_t               *bp,
1110        xfs_buf_log_item_t      *bip)
1111{
1112        struct xfs_ail          *ailp = bip->bli_item.li_ailp;
1113
1114        ASSERT(bip->bli_buf == bp);
1115
1116        xfs_buf_rele(bp);
1117
1118        /*
1119         * If we are forcibly shutting down, this may well be
1120         * off the AIL already. That's because we simulate the
1121         * log-committed callbacks to unpin these buffers. Or we may never
1122         * have put this item on AIL because of the transaction was
1123         * aborted forcibly. xfs_trans_ail_delete() takes care of these.
1124         *
1125         * Either way, AIL is useless if we're forcing a shutdown.
1126         */
1127        spin_lock(&ailp->xa_lock);
1128        xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
1129        xfs_buf_item_free(bip);
1130}
1131
#if defined(XFS_BLI_TRACE)
/*
 * Add an entry to the buf log item's trace buffer.
 *
 * The entry records the caller-supplied id string together with a
 * snapshot of the item (flags, recursion count, reference count,
 * descriptor, log item flags) and of its buffer (address split into
 * upper and lower 32-bit halves, count, flags, private pointers,
 * pinned state, iodone function and semaphore value).
 */
void
xfs_buf_item_trace(
        char                    *id,
        xfs_buf_log_item_t      *bip)
{
        xfs_buf_t               *bp;
        unsigned long           addr_hi;
        unsigned long           addr_lo;

        ASSERT(bip->bli_trace != NULL);

        bp = bip->bli_buf;

        /* Split the buffer address so each half fits in a pointer slot. */
        addr_hi = (unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp) >> 32);
        addr_lo = (unsigned long)(0xFFFFFFFF & XFS_BUF_ADDR(bp));

        ktrace_enter(bip->bli_trace,
                     (void *)id,
                     (void *)bp,
                     (void *)((unsigned long)bip->bli_flags),
                     (void *)((unsigned long)bip->bli_recur),
                     (void *)((unsigned long)atomic_read(&bip->bli_refcount)),
                     (void *)addr_hi,
                     (void *)addr_lo,
                     (void *)((unsigned long)XFS_BUF_COUNT(bp)),
                     (void *)((unsigned long)XFS_BUF_BFLAGS(bp)),
                     XFS_BUF_FSPRIVATE(bp, void *),
                     XFS_BUF_FSPRIVATE2(bp, void *),
                     (void *)(unsigned long)XFS_BUF_ISPINNED(bp),
                     (void *)XFS_BUF_IODONE_FUNC(bp),
                     (void *)((unsigned long)(XFS_BUF_VALUSEMA(bp))),
                     (void *)bip->bli_item.li_desc,
                     (void *)((unsigned long)bip->bli_item.li_flags));
}
#endif /* XFS_BLI_TRACE */
1162