linux/fs/xfs/xfs_log_cil.c
/*
 * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_log_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_error.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_discard.h"

/*
 * Allocate a new ticket. Failing to get a new ticket makes it really hard to
 * recover, so we don't allow failure here. Also, we allocate in a context that
 * we don't want to be issuing transactions from, so we need to tell the
 * allocation code this as well.
 *
 * We don't reserve any space for the ticket - we are going to steal whatever
 * space we require from transactions as they commit. To ensure we reserve all
 * the space required, we need to set the current reservation of the ticket to
 * zero so that we know to steal the initial transaction overhead from the
 * first transaction commit.
 */
static struct xlog_ticket *
xlog_cil_ticket_alloc(
        struct xlog     *log)
{
        struct xlog_ticket *tic;

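        /*
         * KM_SLEEP means this allocation retries until it succeeds rather
         * than failing; KM_NOFS keeps memory reclaim from recursing back
         * into the filesystem, for the reason described above.
         */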
        tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
                                KM_SLEEP|KM_NOFS);
        tic->t_trans_type = XFS_TRANS_CHECKPOINT;

        /*
         * set the current reservation to zero so we know to steal the basic
         * transaction overhead reservation from the first transaction commit.
         */
        tic->t_curr_res = 0;
        return tic;
}

/*
 * After the first stage of log recovery is done, we know where the head and
 * tail of the log are. We need this log initialisation done before we can
 * initialise the first CIL checkpoint context.
 *
 * Here we allocate a log ticket to track space usage during a CIL push.  This
 * ticket is passed to xlog_write() directly so that we don't slowly leak log
 * space by failing to account for space used by log headers and additional
 * region headers for split regions.
 */
void
xlog_cil_init_post_recovery(
        struct xlog     *log)
{
        log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log);
        log->l_cilp->xc_ctx->sequence = 1;
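        /* Seed the first context's commit LSN from the current head of the log. */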
        log->l_cilp->xc_ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
                                                                log->l_curr_block);
}

/*
 * Format log items into flat buffers.
 *
 * For delayed logging, we need to hold a formatted buffer containing all the
 * changes on the log item. This enables us to relog the item in memory and
 * write it out asynchronously without needing to relock the object that was
 * modified at the time it gets written into the iclog.
 *
 * This function builds a vector for the changes in each log item in the
 * transaction. It then works out the length of the buffer needed for each log
 * item, allocates them and formats the vector for the item into the buffer.
 * The buffer is then attached to the log item, and the item is inserted into
 * the Committed Item List for tracking until the next checkpoint is written
 * out.
 *
 * We don't set up region headers during this process; we simply copy the
 * regions into the flat buffer. We can do this because we still have to do a
 * formatting step to write the regions into the iclog buffer.  Writing the
 * ophdrs during the iclog write means that we can support splitting large
 * regions across iclog boundaries without needing a change in the format of
 * the item/region encapsulation.
 *
 * Hence what we need to do now is rewrite the vector array to point to the
 * copied region inside the buffer we just allocated. This allows us to format
 * the regions into the iclog as though they are being formatted directly out
 * of the objects themselves.
 */
static struct xfs_log_vec *
xlog_cil_prepare_log_vecs(
        struct xfs_trans        *tp)
{
        struct xfs_log_item_desc *lidp;
        struct xfs_log_vec      *lv = NULL;
        struct xfs_log_vec      *ret_lv = NULL;

        /* Bail out if we didn't find a log item.  */
        if (list_empty(&tp->t_items)) {
                ASSERT(0);
                return NULL;
        }

        list_for_each_entry(lidp, &tp->t_items, lid_trans) {
                struct xfs_log_vec *new_lv;
                void    *ptr;
                int     index;
                int     len = 0;
                uint    niovecs;

                /* Skip items which aren't dirty in this transaction. */
                if (!(lidp->lid_flags & XFS_LID_DIRTY))
                        continue;

                /* Skip items that do not have any vectors for writing */
                niovecs = IOP_SIZE(lidp->lid_item);
                if (!niovecs)
                        continue;

                new_lv = kmem_zalloc(sizeof(*new_lv) +
                                niovecs * sizeof(struct xfs_log_iovec),
                                KM_SLEEP);

                /* The allocated iovec region lies beyond the log vector. */
                new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
                new_lv->lv_niovecs = niovecs;
                new_lv->lv_item = lidp->lid_item;

                /* build the vector array and calculate its length */
                IOP_FORMAT(new_lv->lv_item, new_lv->lv_iovecp);
                for (index = 0; index < new_lv->lv_niovecs; index++)
                        len += new_lv->lv_iovecp[index].i_len;

                new_lv->lv_buf_len = len;
                new_lv->lv_buf = kmem_alloc(new_lv->lv_buf_len,
                                KM_SLEEP|KM_NOFS);
                ptr = new_lv->lv_buf;

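                /*
                 * Copy each region into the flat buffer and repoint the
                 * vector at the copy, so the item can later be written to
                 * the iclog without touching the original object.
                 */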
                for (index = 0; index < new_lv->lv_niovecs; index++) {
                        struct xfs_log_iovec *vec = &new_lv->lv_iovecp[index];

                        memcpy(ptr, vec->i_addr, vec->i_len);
                        vec->i_addr = ptr;
                        ptr += vec->i_len;
                }
                ASSERT(ptr == new_lv->lv_buf + new_lv->lv_buf_len);

                if (!ret_lv)
                        ret_lv = new_lv;
                else
                        lv->lv_next = new_lv;
                lv = new_lv;
        }

        return ret_lv;
}

/*
 * Prepare the log item for insertion into the CIL. Calculate the difference in
 * log space and vectors it will consume, and if it is a new item pin it as
 * well.
 */
STATIC void
xfs_cil_prepare_item(
        struct xlog             *log,
        struct xfs_log_vec      *lv,
        int                     *len,
        int                     *diff_iovecs)
{
        struct xfs_log_vec      *old = lv->lv_item->li_lv;

        if (old) {
                /* existing lv on log item, space used is a delta */
                ASSERT(!list_empty(&lv->lv_item->li_cil));
                ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);

                *len += lv->lv_buf_len - old->lv_buf_len;
                *diff_iovecs += lv->lv_niovecs - old->lv_niovecs;
                kmem_free(old->lv_buf);
                kmem_free(old);
        } else {
                /* new lv, must pin the log item */
                ASSERT(!lv->lv_item->li_lv);
                ASSERT(list_empty(&lv->lv_item->li_cil));

                *len += lv->lv_buf_len;
                *diff_iovecs += lv->lv_niovecs;
                IOP_PIN(lv->lv_item);
        }

        /* attach new log vector to log item */
        lv->lv_item->li_lv = lv;

        /*
         * If this is the first time the item is being committed to the
         * CIL, store the sequence number on the log item so we can
         * tell in future commits whether this is the first checkpoint
         * the item is being committed into.
         */
        if (!lv->lv_item->li_seq)
                lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence;
}

/*
 * Insert the log items into the CIL and calculate the difference in space
 * consumed by the item. Add the space to the checkpoint ticket and calculate
 * if the change requires additional log metadata. If it does, take that space
 * as well. Remove the amount of space we added to the checkpoint ticket from
 * the current transaction ticket so that the accounting works out correctly.
 */
static void
xlog_cil_insert_items(
        struct xlog             *log,
        struct xfs_log_vec      *log_vector,
        struct xlog_ticket      *ticket)
{
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_cil_ctx      *ctx = cil->xc_ctx;
        struct xfs_log_vec      *lv;
        int                     len = 0;
        int                     diff_iovecs = 0;
        int                     iclog_space;

        ASSERT(log_vector);

        /*
         * Do all the accounting aggregation and switching of log vectors
         * around in a separate loop to the insertion of items into the CIL.
         * Then we can do a separate loop to update the CIL within a single
         * lock/unlock pair. This reduces the number of round trips on the CIL
         * lock from O(nr_logvectors) to O(1) and greatly reduces the overall
         * hold time for the transaction commit.
         *
         * If this is the first time the item is being placed into the CIL in
         * this context, pin it so it can't be written to disk until the CIL is
         * flushed to the iclog and the iclog written to disk.
         *
         * We can do this safely because the context can't checkpoint until we
         * are done so it doesn't matter exactly how we update the CIL.
         */
        for (lv = log_vector; lv; lv = lv->lv_next)
                xfs_cil_prepare_item(log, lv, &len, &diff_iovecs);

        /* account for space used by new iovec headers */
        len += diff_iovecs * sizeof(xlog_op_header_t);

        spin_lock(&cil->xc_cil_lock);

        /* move the items to the tail of the CIL */
        for (lv = log_vector; lv; lv = lv->lv_next)
                list_move_tail(&lv->lv_item->li_cil, &cil->xc_cil);

        ctx->nvecs += diff_iovecs;

        /*
         * Now transfer enough transaction reservation to the context ticket
         * for the checkpoint. The context ticket is special - the unit
         * reservation has to grow as well as the current reservation as we
         * steal from tickets so we can correctly determine the space used
         * during the transaction commit.
         */
        if (ctx->ticket->t_curr_res == 0) {
                /* first commit in checkpoint, steal the header reservation */
                ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
                ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
                ticket->t_curr_res -= ctx->ticket->t_unit_res;
        }

        /*
         * Do we need space for more log record headers? The quotient
         * comparison detects whether adding this commit pushes the
         * checkpoint across one or more iclog boundaries, each of which
         * requires another record header.
         */
        iclog_space = log->l_iclog_size - log->l_iclog_hsize;
        if (len > 0 && (ctx->space_used / iclog_space !=
                                (ctx->space_used + len) / iclog_space)) {
                int hdrs;

                hdrs = (len + iclog_space - 1) / iclog_space;
                /* need to take into account split region headers, too */
                hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
                ctx->ticket->t_unit_res += hdrs;
                ctx->ticket->t_curr_res += hdrs;
                ticket->t_curr_res -= hdrs;
                ASSERT(ticket->t_curr_res >= len);
        }
        ticket->t_curr_res -= len;
        ctx->space_used += len;

        spin_unlock(&cil->xc_cil_lock);
}

static void
xlog_cil_free_logvec(
        struct xfs_log_vec      *log_vector)
{
        struct xfs_log_vec      *lv;

        for (lv = log_vector; lv; ) {
                struct xfs_log_vec *next = lv->lv_next;
                kmem_free(lv->lv_buf);
                kmem_free(lv);
                lv = next;
        }
}

/*
 * Mark all items committed and clear busy extents. We free the log vector
 * chains in a separate pass so that we unpin the log items as quickly as
 * possible.
 */
static void
xlog_cil_committed(
        void    *args,
        int     abort)
{
        struct xfs_cil_ctx      *ctx = args;
        struct xfs_mount        *mp = ctx->cil->xc_log->l_mp;

        xfs_trans_committed_bulk(ctx->cil->xc_log->l_ailp, ctx->lv_chain,
                                        ctx->start_lsn, abort);

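        /*
         * Sort the busy extents into AG/block order so that clearing (and
         * discarding, if enabled) can process them efficiently in per-AG
         * batches.
         */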
        xfs_extent_busy_sort(&ctx->busy_extents);
        xfs_extent_busy_clear(mp, &ctx->busy_extents,
                             (mp->m_flags & XFS_MOUNT_DISCARD) && !abort);

        spin_lock(&ctx->cil->xc_cil_lock);
        list_del(&ctx->committing);
        spin_unlock(&ctx->cil->xc_cil_lock);

        xlog_cil_free_logvec(ctx->lv_chain);

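        /*
         * If discard is enabled, the first xfs_extent_busy_clear() call above
         * left the extents that need discarding on the list; issue the
         * discards now, then clear them for good.
         */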
        if (!list_empty(&ctx->busy_extents)) {
                ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);

                xfs_discard_extents(mp, &ctx->busy_extents);
                xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
        }

        kmem_free(ctx);
}

/*
 * Push the Committed Item List to the log. If @push_seq is zero, then it is
 * a background flush and so we can choose to ignore it. Otherwise, if the
 * current sequence is the same as @push_seq we need to do a flush. If
 * @push_seq is less than the current sequence, then it has already been
 * flushed and we don't need to do anything - the caller will wait for it to
 * complete if necessary.
 *
 * @push_seq is a value rather than a flag because that allows us to do an
 * unlocked check of the sequence number for a match. Hence we can allow log
 * forces to run racily and not issue pushes for the same sequence twice. If
 * we get a race between multiple pushes for the same sequence they will block
 * on the first one and then abort, hence avoiding needless pushes.
 */
STATIC int
xlog_cil_push(
        struct xlog             *log)
{
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_log_vec      *lv;
        struct xfs_cil_ctx      *ctx;
        struct xfs_cil_ctx      *new_ctx;
        struct xlog_in_core     *commit_iclog;
        struct xlog_ticket      *tic;
        int                     num_lv;
        int                     num_iovecs;
        int                     len;
        int                     error = 0;
        struct xfs_trans_header thdr;
        struct xfs_log_iovec    lhdr;
        struct xfs_log_vec      lvhdr = { NULL };
        xfs_lsn_t               commit_lsn;
        xfs_lsn_t               push_seq;

        if (!cil)
                return 0;

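        /*
         * Allocate the replacement context and its ticket up front, before
         * taking the context lock, so that transaction commits are not
         * stalled behind a memory allocation made under the lock.
         */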
        new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
        new_ctx->ticket = xlog_cil_ticket_alloc(log);

        down_write(&cil->xc_ctx_lock);
        ctx = cil->xc_ctx;

        spin_lock(&cil->xc_cil_lock);
        push_seq = cil->xc_push_seq;
        ASSERT(push_seq <= ctx->sequence);

        /*
         * Check if we've anything to push. If there is nothing, then we don't
         * move on to a new sequence number and so we have to be able to push
         * this sequence again later.
         */
        if (list_empty(&cil->xc_cil)) {
                cil->xc_push_seq = 0;
                spin_unlock(&cil->xc_cil_lock);
                goto out_skip;
        }
        spin_unlock(&cil->xc_cil_lock);

        /* check for a previously pushed sequence */
        if (push_seq < cil->xc_ctx->sequence)
                goto out_skip;

        /*
         * pull all the log vectors off the items in the CIL, and
         * remove the items from the CIL. We don't need the CIL lock
         * here because it's only needed on the transaction commit
         * side which is currently locked out by the flush lock.
         */
        lv = NULL;
        num_lv = 0;
        num_iovecs = 0;
        len = 0;
        while (!list_empty(&cil->xc_cil)) {
                struct xfs_log_item     *item;
                int                     i;

                item = list_first_entry(&cil->xc_cil,
                                        struct xfs_log_item, li_cil);
                list_del_init(&item->li_cil);
                if (!ctx->lv_chain)
                        ctx->lv_chain = item->li_lv;
                else
                        lv->lv_next = item->li_lv;
                lv = item->li_lv;
                item->li_lv = NULL;

                num_lv++;
                num_iovecs += lv->lv_niovecs;
                for (i = 0; i < lv->lv_niovecs; i++)
                        len += lv->lv_iovecp[i].i_len;
        }

        /*
         * initialise the new context and attach it to the CIL. Then attach
         * the current context to the CIL committing list so it can be found
         * during log forces to extract the commit lsn of the sequence that
         * needs to be forced.
         */
        INIT_LIST_HEAD(&new_ctx->committing);
        INIT_LIST_HEAD(&new_ctx->busy_extents);
        new_ctx->sequence = ctx->sequence + 1;
        new_ctx->cil = cil;
        cil->xc_ctx = new_ctx;

        /*
         * mirror the new sequence into the cil structure so that we can do
         * unlocked checks against the current sequence in log forces without
         * risking dereferencing a freed context pointer.
         */
        cil->xc_current_sequence = new_ctx->sequence;

        /*
         * The switch is now done, so we can drop the context lock and move out
         * of a shared context. We can't just go straight to the commit record,
         * though - we need to synchronise with previous and future commits so
         * that the commit records are correctly ordered in the log to ensure
         * that we process items during log IO completion in the correct order.
         *
         * For example, if we get an EFI in one checkpoint and the EFD in the
         * next (e.g. due to log forces), we do not want the checkpoint with
         * the EFD to be committed before the checkpoint with the EFI.  Hence
         * we must strictly order the commit records of the checkpoints so
         * that: a) the checkpoint callbacks are attached to the iclogs in the
         * correct order; and b) the checkpoints are replayed in correct order
         * in log recovery.
         *
         * Hence we need to add this context to the committing context list so
         * that higher sequences will wait for us to write out a commit record
         * before they do.
         */
        spin_lock(&cil->xc_cil_lock);
        list_add(&ctx->committing, &cil->xc_committing);
        spin_unlock(&cil->xc_cil_lock);
        up_write(&cil->xc_ctx_lock);

        /*
         * Build a checkpoint transaction header and write it to the log to
         * begin the transaction. We need to account for the space used by the
         * transaction header here as it is not accounted for in xlog_write().
         *
         * The LSN we need to pass to the log items on transaction commit is
         * the LSN reported by the first log vector write. If we use the commit
         * record lsn then we can move the tail beyond the grant write head.
         */
        tic = ctx->ticket;
        thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
        thdr.th_type = XFS_TRANS_CHECKPOINT;
        thdr.th_tid = tic->t_tid;
        thdr.th_num_items = num_iovecs;
        lhdr.i_addr = &thdr;
        lhdr.i_len = sizeof(xfs_trans_header_t);
        lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
        tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);

        lvhdr.lv_niovecs = 1;
        lvhdr.lv_iovecp = &lhdr;
        lvhdr.lv_next = ctx->lv_chain;

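        /*
         * Write the whole checkpoint - the transaction header vector chained
         * to all the item log vectors - in a single xlog_write() call.
         * ctx->start_lsn receives the LSN of the first write, as described
         * above.
         */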
        error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
        if (error)
                goto out_abort_free_ticket;

        /*
         * now that we've written the checkpoint into the log, strictly
         * order the commit records so replay will get them in the right order.
         */
restart:
        spin_lock(&cil->xc_cil_lock);
        list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
                /*
                 * Higher sequences will wait for this one so skip them.
                 * Don't wait for our own sequence, either.
                 */
                if (new_ctx->sequence >= ctx->sequence)
                        continue;
                if (!new_ctx->commit_lsn) {
                        /*
                         * It is still being pushed! Wait for the push to
                         * complete, then start again from the beginning.
                         */
                        xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
                        goto restart;
                }
        }
        spin_unlock(&cil->xc_cil_lock);

        /* xfs_log_done always frees the ticket on error. */
        commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
        if (commit_lsn == -1)
                goto out_abort;

        /* attach all the transactions w/ busy extents to iclog */
        ctx->log_cb.cb_func = xlog_cil_committed;
        ctx->log_cb.cb_arg = ctx;
        error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
        if (error)
                goto out_abort;

        /*
         * now the checkpoint commit is complete and we've attached the
         * callbacks to the iclog we can assign the commit LSN to the context
         * and wake up anyone who is waiting for the commit to complete.
         */
        spin_lock(&cil->xc_cil_lock);
        ctx->commit_lsn = commit_lsn;
        wake_up_all(&cil->xc_commit_wait);
        spin_unlock(&cil->xc_cil_lock);

        /* release the hounds! */
        return xfs_log_release_iclog(log->l_mp, commit_iclog);

out_skip:
        up_write(&cil->xc_ctx_lock);
        xfs_log_ticket_put(new_ctx->ticket);
        kmem_free(new_ctx);
        return 0;

out_abort_free_ticket:
        xfs_log_ticket_put(tic);
out_abort:
        xlog_cil_committed(ctx, XFS_LI_ABORTED);
        return XFS_ERROR(EIO);
}

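/*
 * Background push worker - runs xlog_cil_push() from the CIL workqueue so
 * that transaction commit can queue a push and continue without blocking.
 */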
static void
xlog_cil_push_work(
        struct work_struct      *work)
{
        struct xfs_cil          *cil = container_of(work, struct xfs_cil,
                                                        xc_push_work);
        xlog_cil_push(cil->xc_log);
}

/*
 * We need to push the CIL every so often so we don't cache more than we can
 * fit in the log. The limit really is that a checkpoint can't be more than
 * half the log (the current checkpoint is not allowed to overwrite the
 * previous checkpoint), but commit latency and memory usage limit this to a
 * smaller size.
 */
static void
xlog_cil_push_background(
        struct xlog     *log)
{
        struct xfs_cil  *cil = log->l_cilp;

        /*
         * The CIL won't be empty because we are called while holding the
         * context lock, so whatever we added to the CIL will still be there.
         */
        ASSERT(!list_empty(&cil->xc_cil));

        /*
         * don't do a background push if we haven't used up all the
         * space available yet.
         */
        if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
                return;

        spin_lock(&cil->xc_cil_lock);
        if (cil->xc_push_seq < cil->xc_current_sequence) {
                cil->xc_push_seq = cil->xc_current_sequence;
                queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
        }
        spin_unlock(&cil->xc_cil_lock);
}

static void
xlog_cil_push_foreground(
        struct xlog     *log,
        xfs_lsn_t       push_seq)
{
        struct xfs_cil  *cil = log->l_cilp;

        if (!cil)
                return;

        ASSERT(push_seq && push_seq <= cil->xc_current_sequence);

        /* start on any pending background push to minimise wait time on it */
        flush_work(&cil->xc_push_work);

        /*
         * If the CIL is empty or we've already pushed the sequence then
         * there's no work we need to do.
         */
        spin_lock(&cil->xc_cil_lock);
        if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
                spin_unlock(&cil->xc_cil_lock);
                return;
        }

        cil->xc_push_seq = push_seq;
        spin_unlock(&cil->xc_cil_lock);

        /* do the push now */
        xlog_cil_push(log);
}

/*
 * Commit a transaction with the given vector to the Committed Item List.
 *
 * To do this, we need to format the item, pin it in memory if required and
 * account for the space used by the transaction. Once we have done that we
 * need to release the unused reservation for the transaction, attach the
 * transaction to the checkpoint context so we carry the busy extents through
 * to checkpoint completion, and then unlock all the items in the transaction.
 *
 * For more specific information about the order of operations in
 * xfs_log_commit_cil() please refer to the comments in
 * xfs_trans_commit_iclog().
 *
 * Called with the context lock already held in read mode to lock out
 * background commit, returns without it held once background commits are
 * allowed again.
 */
int
xfs_log_commit_cil(
        struct xfs_mount        *mp,
        struct xfs_trans        *tp,
        xfs_lsn_t               *commit_lsn,
        int                     flags)
{
        struct xlog             *log = mp->m_log;
        int                     log_flags = 0;
        struct xfs_log_vec      *log_vector;

        if (flags & XFS_TRANS_RELEASE_LOG_RES)
                log_flags = XFS_LOG_REL_PERM_RESERV;

        /*
         * Do all the hard work of formatting items (including memory
         * allocation) outside the CIL context lock. This prevents stalling CIL
         * pushes when we are low on memory and a transaction commit spends a
         * lot of time in memory reclaim.
         */
        log_vector = xlog_cil_prepare_log_vecs(tp);
        if (!log_vector)
                return ENOMEM;

        /* lock out background commit */
        down_read(&log->l_cilp->xc_ctx_lock);
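        /*
         * Note: the "commit lsn" reported here is really the current CIL
         * checkpoint sequence number, not a physical LSN; log forces later
         * translate it back into a real commit lsn via xlog_cil_force_lsn().
         */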
        if (commit_lsn)
                *commit_lsn = log->l_cilp->xc_ctx->sequence;

        xlog_cil_insert_items(log, log_vector, tp->t_ticket);

        /* check we didn't blow the reservation */
        if (tp->t_ticket->t_curr_res < 0)
                xlog_print_tic_res(log->l_mp, tp->t_ticket);

        /* attach the transaction to the CIL if it has any busy extents */
        if (!list_empty(&tp->t_busy)) {
                spin_lock(&log->l_cilp->xc_cil_lock);
                list_splice_init(&tp->t_busy,
                                        &log->l_cilp->xc_ctx->busy_extents);
                spin_unlock(&log->l_cilp->xc_cil_lock);
        }

        tp->t_commit_lsn = *commit_lsn;
        xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
        xfs_trans_unreserve_and_mod_sb(tp);

        /*
         * Once all the items of the transaction have been copied to the CIL,
         * the items can be unlocked and freed.
         *
         * This needs to be done before we drop the CIL context lock because we
         * have to update state in the log items and unlock them before they go
         * to disk. If we don't, then the CIL checkpoint can race with us and
         * we can run checkpoint completion before we've updated and unlocked
         * the log items. This affects (at least) processing of stale buffers,
         * inodes and EFIs.
         */
        xfs_trans_free_items(tp, *commit_lsn, 0);

        xlog_cil_push_background(log);

        up_read(&log->l_cilp->xc_ctx_lock);
        return 0;
}


/*
 * Conditionally push the CIL based on the sequence passed in.
 *
 * We only need to push if we haven't already pushed the sequence
 * number given. Hence the only time we will trigger a push here is
 * if the push sequence is the same as the current context.
 *
 * We return the current commit lsn to allow the callers to determine if an
 * iclog flush is necessary following this call.
 */
xfs_lsn_t
xlog_cil_force_lsn(
        struct xlog     *log,
        xfs_lsn_t       sequence)
{
        struct xfs_cil          *cil = log->l_cilp;
        struct xfs_cil_ctx      *ctx;
        xfs_lsn_t               commit_lsn = NULLCOMMITLSN;

        ASSERT(sequence <= cil->xc_current_sequence);

        /*
         * check to see if we need to force out the current context.
         * xlog_cil_push() handles racing pushes for the same sequence,
         * so no need to deal with it here.
         */
        xlog_cil_push_foreground(log, sequence);

        /*
         * See if we can find a previous sequence still committing.
         * We need to wait for all previous sequence commits to complete
         * before allowing the force of push_seq to go ahead. Hence block
         * on commits for those as well.
         */
restart:
        spin_lock(&cil->xc_cil_lock);
        list_for_each_entry(ctx, &cil->xc_committing, committing) {
                if (ctx->sequence > sequence)
                        continue;
                if (!ctx->commit_lsn) {
                        /*
                         * It is still being pushed! Wait for the push to
                         * complete, then start again from the beginning.
                         */
                        xlog_wait(&cil->xc_commit_wait, &cil->xc_cil_lock);
                        goto restart;
                }
                if (ctx->sequence != sequence)
                        continue;
                /* found it! */
                commit_lsn = ctx->commit_lsn;
        }
        spin_unlock(&cil->xc_cil_lock);
        return commit_lsn;
}

/*
 * Check if the current log item was first committed in this sequence.
 * We can't rely on just the log item being in the CIL, we have to check
 * the recorded commit sequence number.
 *
 * Note: for this to be used in a non-racy manner, it has to be called with
 * CIL flushing locked out. As a result, it should only be used during the
 * transaction commit process when deciding what to format into the item.
 */
bool
xfs_log_item_in_current_chkpt(
        struct xfs_log_item *lip)
{
        struct xfs_cil_ctx *ctx;

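        /* Items not in the CIL cannot be in the current checkpoint. */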
        if (list_empty(&lip->li_cil))
                return false;

        ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;

        /*
         * li_seq is written on the first commit of a log item to record the
         * first checkpoint it is written to. Hence if it is different to the
         * current sequence, we're in a new checkpoint.
         */
        if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
                return false;
        return true;
}

/*
 * Perform initial CIL structure initialisation.
 */
int
xlog_cil_init(
        struct xlog     *log)
{
        struct xfs_cil  *cil;
        struct xfs_cil_ctx *ctx;

        cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
        if (!cil)
                return ENOMEM;

        ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
        if (!ctx) {
                kmem_free(cil);
                return ENOMEM;
        }

        INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
        INIT_LIST_HEAD(&cil->xc_cil);
        INIT_LIST_HEAD(&cil->xc_committing);
        spin_lock_init(&cil->xc_cil_lock);
        init_rwsem(&cil->xc_ctx_lock);
        init_waitqueue_head(&cil->xc_commit_wait);

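        /* Set up the first checkpoint context and link it to the CIL. */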
        INIT_LIST_HEAD(&ctx->committing);
        INIT_LIST_HEAD(&ctx->busy_extents);
        ctx->sequence = 1;
        ctx->cil = cil;
        cil->xc_ctx = ctx;
        cil->xc_current_sequence = ctx->sequence;

        cil->xc_log = log;
        log->l_cilp = cil;
        return 0;
}

void
xlog_cil_destroy(
        struct xlog     *log)
{
        if (log->l_cilp->xc_ctx) {
                if (log->l_cilp->xc_ctx->ticket)
                        xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
                kmem_free(log->l_cilp->xc_ctx);
        }

        ASSERT(list_empty(&log->l_cilp->xc_cil));
        kmem_free(log->l_cilp);
}