linux/fs/xfs/xfs_buf.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   4 * All Rights Reserved.
   5 */
   6#include "xfs.h"
   7#include <linux/backing-dev.h>
   8
   9#include "xfs_shared.h"
  10#include "xfs_format.h"
  11#include "xfs_log_format.h"
  12#include "xfs_trans_resv.h"
  13#include "xfs_sb.h"
  14#include "xfs_mount.h"
  15#include "xfs_trace.h"
  16#include "xfs_log.h"
  17#include "xfs_errortag.h"
  18#include "xfs_error.h"
  19
  20static kmem_zone_t *xfs_buf_zone;
  21
  22#define xb_to_gfp(flags) \
  23        ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
  24
  25/*
  26 * Locking orders
  27 *
  28 * xfs_buf_ioacct_inc:
  29 * xfs_buf_ioacct_dec:
  30 *      b_sema (caller holds)
  31 *        b_lock
  32 *
  33 * xfs_buf_stale:
  34 *      b_sema (caller holds)
  35 *        b_lock
  36 *          lru_lock
  37 *
  38 * xfs_buf_rele:
  39 *      b_lock
  40 *        pag_buf_lock
  41 *          lru_lock
  42 *
  43 * xfs_buftarg_wait_rele
  44 *      lru_lock
  45 *        b_lock (trylock due to inversion)
  46 *
  47 * xfs_buftarg_isolate
  48 *      lru_lock
  49 *        b_lock (trylock due to inversion)
  50 */
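/*
 * Illustrative sketch (not part of the original file): the "trylock due to
 * inversion" pattern named above. An LRU walker already holds lru_lock, so it
 * may only trylock b_lock and must skip the buffer on failure; sleeping on
 * b_lock here could deadlock against xfs_buf_rele(), which takes the locks in
 * the opposite order. The walker body below is hypothetical.
 */
static bool __maybe_unused
xfs_buf_lru_walk_example(
        struct xfs_buf  *bp)
{
        /* lru_lock is assumed held by the list_lru walker calling us */
        if (!spin_trylock(&bp->b_lock))
                return false;   /* lock inversion - skip this buffer */

        /* ... inspect or isolate the buffer under b_lock ... */
        spin_unlock(&bp->b_lock);
        return true;
}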
  51
  52static inline int
  53xfs_buf_is_vmapped(
  54        struct xfs_buf  *bp)
  55{
  56        /*
  57         * Return true if the buffer is vmapped.
  58         *
   59         * b_addr is null if the buffer is not mapped. However, single-page
   60         * buffers are never vmapped; their address comes straight from
   61         * page_address(), so check both b_addr and bp->b_page_count > 1.
  62         */
  63        return bp->b_addr && bp->b_page_count > 1;
  64}
  65
  66static inline int
  67xfs_buf_vmap_len(
  68        struct xfs_buf  *bp)
  69{
  70        return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
  71}
  72
  73/*
  74 * Bump the I/O in flight count on the buftarg if we haven't yet done so for
  75 * this buffer. The count is incremented once per buffer (per hold cycle)
  76 * because the corresponding decrement is deferred to buffer release. Buffers
  77 * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
   78 * tracking adds unnecessary overhead. This is used for synchronization purposes
  79 * with unmount (see xfs_wait_buftarg()), so all we really need is a count of
  80 * in-flight buffers.
  81 *
  82 * Buffers that are never released (e.g., superblock, iclog buffers) must set
  83 * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
  84 * never reaches zero and unmount hangs indefinitely.
  85 */
  86static inline void
  87xfs_buf_ioacct_inc(
  88        struct xfs_buf  *bp)
  89{
  90        if (bp->b_flags & XBF_NO_IOACCT)
  91                return;
  92
  93        ASSERT(bp->b_flags & XBF_ASYNC);
  94        spin_lock(&bp->b_lock);
  95        if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
  96                bp->b_state |= XFS_BSTATE_IN_FLIGHT;
  97                percpu_counter_inc(&bp->b_target->bt_io_count);
  98        }
  99        spin_unlock(&bp->b_lock);
 100}
 101
 102/*
 103 * Clear the in-flight state on a buffer about to be released to the LRU or
 104 * freed and unaccount from the buftarg.
 105 */
 106static inline void
 107__xfs_buf_ioacct_dec(
 108        struct xfs_buf  *bp)
 109{
 110        lockdep_assert_held(&bp->b_lock);
 111
 112        if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
 113                bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
 114                percpu_counter_dec(&bp->b_target->bt_io_count);
 115        }
 116}
 117
 118static inline void
 119xfs_buf_ioacct_dec(
 120        struct xfs_buf  *bp)
 121{
 122        spin_lock(&bp->b_lock);
 123        __xfs_buf_ioacct_dec(bp);
 124        spin_unlock(&bp->b_lock);
 125}
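/*
 * Illustrative sketch (not part of the original file): how the in-flight count
 * is consumed. Unmount-side code simply waits for the buftarg counter to drain
 * to zero before tearing down the LRU, much like xfs_wait_buftarg() later in
 * this file. The helper name is hypothetical.
 */
static void __maybe_unused
xfs_buf_drain_ioacct_example(
        struct xfs_buftarg      *btp)
{
        /* in-flight buffers drop the count in xfs_buf_rele()/xfs_buf_stale() */
        while (percpu_counter_sum(&btp->bt_io_count))
                delay(100);
}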
 126
 127/*
 128 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 129 * b_lru_ref count so that the buffer is freed immediately when the buffer
 130 * reference count falls to zero. If the buffer is already on the LRU, we need
 131 * to remove the reference that LRU holds on the buffer.
 132 *
 133 * This prevents build-up of stale buffers on the LRU.
 134 */
 135void
 136xfs_buf_stale(
 137        struct xfs_buf  *bp)
 138{
 139        ASSERT(xfs_buf_islocked(bp));
 140
 141        bp->b_flags |= XBF_STALE;
 142
 143        /*
 144         * Clear the delwri status so that a delwri queue walker will not
 145         * flush this buffer to disk now that it is stale. The delwri queue has
 146         * a reference to the buffer, so this is safe to do.
 147         */
 148        bp->b_flags &= ~_XBF_DELWRI_Q;
 149
 150        /*
 151         * Once the buffer is marked stale and unlocked, a subsequent lookup
 152         * could reset b_flags. There is no guarantee that the buffer is
 153         * unaccounted (released to LRU) before that occurs. Drop in-flight
 154         * status now to preserve accounting consistency.
 155         */
 156        spin_lock(&bp->b_lock);
 157        __xfs_buf_ioacct_dec(bp);
 158
 159        atomic_set(&bp->b_lru_ref, 0);
 160        if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
 161            (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
 162                atomic_dec(&bp->b_hold);
 163
 164        ASSERT(atomic_read(&bp->b_hold) >= 1);
 165        spin_unlock(&bp->b_lock);
 166}
 167
 168static int
 169xfs_buf_get_maps(
 170        struct xfs_buf          *bp,
 171        int                     map_count)
 172{
 173        ASSERT(bp->b_maps == NULL);
 174        bp->b_map_count = map_count;
 175
 176        if (map_count == 1) {
 177                bp->b_maps = &bp->__b_map;
 178                return 0;
 179        }
 180
 181        bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
 182                                KM_NOFS);
 183        if (!bp->b_maps)
 184                return -ENOMEM;
 185        return 0;
 186}
 187
 188/*
  189 *      Frees b_maps if it was allocated.
 190 */
 191static void
 192xfs_buf_free_maps(
 193        struct xfs_buf  *bp)
 194{
 195        if (bp->b_maps != &bp->__b_map) {
 196                kmem_free(bp->b_maps);
 197                bp->b_maps = NULL;
 198        }
 199}
 200
 201static int
 202_xfs_buf_alloc(
 203        struct xfs_buftarg      *target,
 204        struct xfs_buf_map      *map,
 205        int                     nmaps,
 206        xfs_buf_flags_t         flags,
 207        struct xfs_buf          **bpp)
 208{
 209        struct xfs_buf          *bp;
 210        int                     error;
 211        int                     i;
 212
 213        *bpp = NULL;
 214        bp = kmem_zone_zalloc(xfs_buf_zone, KM_NOFS);
 215        if (unlikely(!bp))
 216                return -ENOMEM;
 217
 218        /*
 219         * We don't want certain flags to appear in b_flags unless they are
 220         * specifically set by later operations on the buffer.
 221         */
 222        flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
 223
 224        atomic_set(&bp->b_hold, 1);
 225        atomic_set(&bp->b_lru_ref, 1);
 226        init_completion(&bp->b_iowait);
 227        INIT_LIST_HEAD(&bp->b_lru);
 228        INIT_LIST_HEAD(&bp->b_list);
 229        INIT_LIST_HEAD(&bp->b_li_list);
 230        sema_init(&bp->b_sema, 0); /* held, no waiters */
 231        spin_lock_init(&bp->b_lock);
 232        bp->b_target = target;
 233        bp->b_mount = target->bt_mount;
 234        bp->b_flags = flags;
 235
 236        /*
  237         * Set up the buffer maps. The buffer length is computed below as
  238         * the sum of the individual map lengths, and I/O is issued over
  239         * that full length.
 240         */
 241        error = xfs_buf_get_maps(bp, nmaps);
 242        if (error)  {
 243                kmem_cache_free(xfs_buf_zone, bp);
 244                return error;
 245        }
 246
 247        bp->b_bn = map[0].bm_bn;
 248        bp->b_length = 0;
 249        for (i = 0; i < nmaps; i++) {
 250                bp->b_maps[i].bm_bn = map[i].bm_bn;
 251                bp->b_maps[i].bm_len = map[i].bm_len;
 252                bp->b_length += map[i].bm_len;
 253        }
 254
 255        atomic_set(&bp->b_pin_count, 0);
 256        init_waitqueue_head(&bp->b_waiters);
 257
 258        XFS_STATS_INC(bp->b_mount, xb_create);
 259        trace_xfs_buf_init(bp, _RET_IP_);
 260
 261        *bpp = bp;
 262        return 0;
 263}
 264
 265/*
 266 *      Allocate a page array capable of holding a specified number
 267 *      of pages, and point the page buf at it.
 268 */
 269STATIC int
 270_xfs_buf_get_pages(
 271        xfs_buf_t               *bp,
 272        int                     page_count)
 273{
 274        /* Make sure that we have a page list */
 275        if (bp->b_pages == NULL) {
 276                bp->b_page_count = page_count;
 277                if (page_count <= XB_PAGES) {
 278                        bp->b_pages = bp->b_page_array;
 279                } else {
 280                        bp->b_pages = kmem_alloc(sizeof(struct page *) *
 281                                                 page_count, KM_NOFS);
 282                        if (bp->b_pages == NULL)
 283                                return -ENOMEM;
 284                }
 285                memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
 286        }
 287        return 0;
 288}
 289
 290/*
 291 *      Frees b_pages if it was allocated.
 292 */
 293STATIC void
 294_xfs_buf_free_pages(
 295        xfs_buf_t       *bp)
 296{
 297        if (bp->b_pages != bp->b_page_array) {
 298                kmem_free(bp->b_pages);
 299                bp->b_pages = NULL;
 300        }
 301}
 302
 303/*
 304 *      Releases the specified buffer.
 305 *
 306 *      The modification state of any associated pages is left unchanged.
 307 *      The buffer must not be on any hash - use xfs_buf_rele instead for
  308 *      hashed and refcounted buffers.
 309 */
 310static void
 311xfs_buf_free(
 312        xfs_buf_t               *bp)
 313{
 314        trace_xfs_buf_free(bp, _RET_IP_);
 315
 316        ASSERT(list_empty(&bp->b_lru));
 317
 318        if (bp->b_flags & _XBF_PAGES) {
 319                uint            i;
 320
 321                if (xfs_buf_is_vmapped(bp))
 322                        vm_unmap_ram(bp->b_addr - bp->b_offset,
 323                                        bp->b_page_count);
 324
 325                for (i = 0; i < bp->b_page_count; i++) {
 326                        struct page     *page = bp->b_pages[i];
 327
 328                        __free_page(page);
 329                }
 330                if (current->reclaim_state)
 331                        current->reclaim_state->reclaimed_slab +=
 332                                                        bp->b_page_count;
 333        } else if (bp->b_flags & _XBF_KMEM)
 334                kmem_free(bp->b_addr);
 335        _xfs_buf_free_pages(bp);
 336        xfs_buf_free_maps(bp);
 337        kmem_cache_free(xfs_buf_zone, bp);
 338}
 339
 340/*
  341 * Allocates all the pages for the buffer in question and builds its page list.
 342 */
 343STATIC int
 344xfs_buf_allocate_memory(
 345        xfs_buf_t               *bp,
 346        uint                    flags)
 347{
 348        size_t                  size;
 349        size_t                  nbytes, offset;
 350        gfp_t                   gfp_mask = xb_to_gfp(flags);
 351        unsigned short          page_count, i;
 352        xfs_off_t               start, end;
 353        int                     error;
 354        xfs_km_flags_t          kmflag_mask = 0;
 355
 356        /*
  357         * Ensure a zeroed buffer for the non-read cases.
 358         */
 359        if (!(flags & XBF_READ)) {
 360                kmflag_mask |= KM_ZERO;
 361                gfp_mask |= __GFP_ZERO;
 362        }
 363
 364        /*
 365         * for buffers that are contained within a single page, just allocate
 366         * the memory from the heap - there's no need for the complexity of
 367         * page arrays to keep allocation down to order 0.
 368         */
 369        size = BBTOB(bp->b_length);
 370        if (size < PAGE_SIZE) {
 371                int align_mask = xfs_buftarg_dma_alignment(bp->b_target);
 372                bp->b_addr = kmem_alloc_io(size, align_mask,
 373                                           KM_NOFS | kmflag_mask);
 374                if (!bp->b_addr) {
 375                        /* low memory - use alloc_page loop instead */
 376                        goto use_alloc_page;
 377                }
 378
 379                if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
 380                    ((unsigned long)bp->b_addr & PAGE_MASK)) {
 381                        /* b_addr spans two pages - use alloc_page instead */
 382                        kmem_free(bp->b_addr);
 383                        bp->b_addr = NULL;
 384                        goto use_alloc_page;
 385                }
 386                bp->b_offset = offset_in_page(bp->b_addr);
 387                bp->b_pages = bp->b_page_array;
 388                bp->b_pages[0] = kmem_to_page(bp->b_addr);
 389                bp->b_page_count = 1;
 390                bp->b_flags |= _XBF_KMEM;
 391                return 0;
 392        }
 393
 394use_alloc_page:
 395        start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT;
 396        end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1)
 397                                                                >> PAGE_SHIFT;
 398        page_count = end - start;
 399        error = _xfs_buf_get_pages(bp, page_count);
 400        if (unlikely(error))
 401                return error;
 402
 403        offset = bp->b_offset;
 404        bp->b_flags |= _XBF_PAGES;
 405
 406        for (i = 0; i < bp->b_page_count; i++) {
 407                struct page     *page;
 408                uint            retries = 0;
 409retry:
 410                page = alloc_page(gfp_mask);
 411                if (unlikely(page == NULL)) {
 412                        if (flags & XBF_READ_AHEAD) {
 413                                bp->b_page_count = i;
 414                                error = -ENOMEM;
 415                                goto out_free_pages;
 416                        }
 417
 418                        /*
 419                         * This could deadlock.
 420                         *
 421                         * But until all the XFS lowlevel code is revamped to
 422                         * handle buffer allocation failures we can't do much.
 423                         */
 424                        if (!(++retries % 100))
 425                                xfs_err(NULL,
 426                "%s(%u) possible memory allocation deadlock in %s (mode:0x%x)",
 427                                        current->comm, current->pid,
 428                                        __func__, gfp_mask);
 429
 430                        XFS_STATS_INC(bp->b_mount, xb_page_retries);
 431                        congestion_wait(BLK_RW_ASYNC, HZ/50);
 432                        goto retry;
 433                }
 434
 435                XFS_STATS_INC(bp->b_mount, xb_page_found);
 436
 437                nbytes = min_t(size_t, size, PAGE_SIZE - offset);
 438                size -= nbytes;
 439                bp->b_pages[i] = page;
 440                offset = 0;
 441        }
 442        return 0;
 443
 444out_free_pages:
 445        for (i = 0; i < bp->b_page_count; i++)
 446                __free_page(bp->b_pages[i]);
 447        bp->b_flags &= ~_XBF_PAGES;
 448        return error;
 449}
 450
 451/*
 452 *      Map buffer into kernel address-space if necessary.
 453 */
 454STATIC int
 455_xfs_buf_map_pages(
 456        xfs_buf_t               *bp,
 457        uint                    flags)
 458{
 459        ASSERT(bp->b_flags & _XBF_PAGES);
 460        if (bp->b_page_count == 1) {
 461                /* A single page buffer is always mappable */
 462                bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 463        } else if (flags & XBF_UNMAPPED) {
 464                bp->b_addr = NULL;
 465        } else {
 466                int retried = 0;
 467                unsigned nofs_flag;
 468
 469                /*
 470                 * vm_map_ram() will allocate auxiliary structures (e.g.
 471                 * pagetables) with GFP_KERNEL, yet we are likely to be under
 472                 * GFP_NOFS context here. Hence we need to tell memory reclaim
 473                 * that we are in such a context via PF_MEMALLOC_NOFS to prevent
 474                 * memory reclaim re-entering the filesystem here and
 475                 * potentially deadlocking.
 476                 */
 477                nofs_flag = memalloc_nofs_save();
 478                do {
 479                        bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
 480                                                -1);
 481                        if (bp->b_addr)
 482                                break;
 483                        vm_unmap_aliases();
 484                } while (retried++ <= 1);
 485                memalloc_nofs_restore(nofs_flag);
 486
 487                if (!bp->b_addr)
 488                        return -ENOMEM;
 489                bp->b_addr += bp->b_offset;
 490        }
 491
 492        return 0;
 493}
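/*
 * Illustrative sketch (not part of the original file): the PF_MEMALLOC_NOFS
 * bracketing used above. Any allocation that could recurse into filesystem
 * reclaim while we hold fs locks can be wrapped the same way; with the flag
 * saved, a GFP_KERNEL allocation behaves like GFP_NOFS. The helper below is
 * hypothetical.
 */
static void * __maybe_unused
xfs_buf_nofs_alloc_example(
        size_t          size)
{
        unsigned int    nofs_flag;
        void            *p;

        nofs_flag = memalloc_nofs_save();
        p = kmalloc(size, GFP_KERNEL);  /* implicitly degraded to GFP_NOFS */
        memalloc_nofs_restore(nofs_flag);
        return p;
}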
 494
 495/*
 496 *      Finding and Reading Buffers
 497 */
 498static int
 499_xfs_buf_obj_cmp(
 500        struct rhashtable_compare_arg   *arg,
 501        const void                      *obj)
 502{
 503        const struct xfs_buf_map        *map = arg->key;
 504        const struct xfs_buf            *bp = obj;
 505
 506        /*
 507         * The key hashing in the lookup path depends on the key being the
  508         * first element of the compare_arg, so make sure to assert this.
 509         */
 510        BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
 511
 512        if (bp->b_bn != map->bm_bn)
 513                return 1;
 514
 515        if (unlikely(bp->b_length != map->bm_len)) {
 516                /*
 517                 * found a block number match. If the range doesn't
 518                 * match, the only way this is allowed is if the buffer
 519                 * in the cache is stale and the transaction that made
 520                 * it stale has not yet committed. i.e. we are
 521                 * reallocating a busy extent. Skip this buffer and
 522                 * continue searching for an exact match.
 523                 */
 524                ASSERT(bp->b_flags & XBF_STALE);
 525                return 1;
 526        }
 527        return 0;
 528}
 529
 530static const struct rhashtable_params xfs_buf_hash_params = {
 531        .min_size               = 32,   /* empty AGs have minimal footprint */
 532        .nelem_hint             = 16,
 533        .key_len                = sizeof(xfs_daddr_t),
 534        .key_offset             = offsetof(struct xfs_buf, b_bn),
 535        .head_offset            = offsetof(struct xfs_buf, b_rhash_head),
 536        .automatic_shrinking    = true,
 537        .obj_cmpfn              = _xfs_buf_obj_cmp,
 538};
 539
 540int
 541xfs_buf_hash_init(
 542        struct xfs_perag        *pag)
 543{
 544        spin_lock_init(&pag->pag_buf_lock);
 545        return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
 546}
 547
 548void
 549xfs_buf_hash_destroy(
 550        struct xfs_perag        *pag)
 551{
 552        rhashtable_destroy(&pag->pag_buf_hash);
 553}
 554
 555/*
 556 * Look up a buffer in the buffer cache and return it referenced and locked
 557 * in @found_bp.
 558 *
 559 * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the
 560 * cache.
 561 *
 562 * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return
 563 * -EAGAIN if we fail to lock it.
 564 *
 565 * Return values are:
 566 *      -EFSCORRUPTED if have been supplied with an invalid address
 567 *      -EAGAIN on trylock failure
 568 *      -ENOENT if we fail to find a match and @new_bp was NULL
 569 *      0, with @found_bp:
 570 *              - @new_bp if we inserted it into the cache
 571 *              - the buffer we found and locked.
 572 */
 573static int
 574xfs_buf_find(
 575        struct xfs_buftarg      *btp,
 576        struct xfs_buf_map      *map,
 577        int                     nmaps,
 578        xfs_buf_flags_t         flags,
 579        struct xfs_buf          *new_bp,
 580        struct xfs_buf          **found_bp)
 581{
 582        struct xfs_perag        *pag;
 583        xfs_buf_t               *bp;
 584        struct xfs_buf_map      cmap = { .bm_bn = map[0].bm_bn };
 585        xfs_daddr_t             eofs;
 586        int                     i;
 587
 588        *found_bp = NULL;
 589
 590        for (i = 0; i < nmaps; i++)
 591                cmap.bm_len += map[i].bm_len;
 592
 593        /* Check for IOs smaller than the sector size / not sector aligned */
 594        ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
 595        ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
 596
 597        /*
 598         * Corrupted block numbers can get through to here, unfortunately, so we
 599         * have to check that the buffer falls within the filesystem bounds.
 600         */
 601        eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
 602        if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
 603                xfs_alert(btp->bt_mount,
 604                          "%s: daddr 0x%llx out of range, EOFS 0x%llx",
 605                          __func__, cmap.bm_bn, eofs);
 606                WARN_ON(1);
 607                return -EFSCORRUPTED;
 608        }
 609
 610        pag = xfs_perag_get(btp->bt_mount,
 611                            xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
 612
 613        spin_lock(&pag->pag_buf_lock);
 614        bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
 615                                    xfs_buf_hash_params);
 616        if (bp) {
 617                atomic_inc(&bp->b_hold);
 618                goto found;
 619        }
 620
 621        /* No match found */
 622        if (!new_bp) {
 623                XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
 624                spin_unlock(&pag->pag_buf_lock);
 625                xfs_perag_put(pag);
 626                return -ENOENT;
 627        }
 628
 629        /* the buffer keeps the perag reference until it is freed */
 630        new_bp->b_pag = pag;
 631        rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head,
 632                               xfs_buf_hash_params);
 633        spin_unlock(&pag->pag_buf_lock);
 634        *found_bp = new_bp;
 635        return 0;
 636
 637found:
 638        spin_unlock(&pag->pag_buf_lock);
 639        xfs_perag_put(pag);
 640
 641        if (!xfs_buf_trylock(bp)) {
 642                if (flags & XBF_TRYLOCK) {
 643                        xfs_buf_rele(bp);
 644                        XFS_STATS_INC(btp->bt_mount, xb_busy_locked);
 645                        return -EAGAIN;
 646                }
 647                xfs_buf_lock(bp);
 648                XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited);
 649        }
 650
 651        /*
 652         * if the buffer is stale, clear all the external state associated with
 653         * it. We need to keep flags such as how we allocated the buffer memory
 654         * intact here.
 655         */
 656        if (bp->b_flags & XBF_STALE) {
 657                ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
 658                ASSERT(bp->b_iodone == NULL);
 659                bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
 660                bp->b_ops = NULL;
 661        }
 662
 663        trace_xfs_buf_find(bp, flags, _RET_IP_);
 664        XFS_STATS_INC(btp->bt_mount, xb_get_locked);
 665        *found_bp = bp;
 666        return 0;
 667}
 668
 669struct xfs_buf *
 670xfs_buf_incore(
 671        struct xfs_buftarg      *target,
 672        xfs_daddr_t             blkno,
 673        size_t                  numblks,
 674        xfs_buf_flags_t         flags)
 675{
 676        struct xfs_buf          *bp;
 677        int                     error;
 678        DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
 679
 680        error = xfs_buf_find(target, &map, 1, flags, NULL, &bp);
 681        if (error)
 682                return NULL;
 683        return bp;
 684}
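/*
 * Illustrative sketch (not part of the original file): a typical cache-only
 * lookup. XBF_TRYLOCK avoids blocking on a buffer somebody else holds locked;
 * a hit must be released with xfs_buf_relse(). The helper name is
 * hypothetical.
 */
static bool __maybe_unused
xfs_buf_incore_example(
        struct xfs_buftarg      *target,
        xfs_daddr_t             blkno,
        size_t                  numblks)
{
        struct xfs_buf          *bp;

        bp = xfs_buf_incore(target, blkno, numblks, XBF_TRYLOCK);
        if (!bp)
                return false;   /* not cached, or currently locked elsewhere */

        /* ... inspect bp->b_flags, bp->b_addr, etc. while holding the lock ... */
        xfs_buf_relse(bp);      /* unlock and drop the reference */
        return true;
}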
 685
 686/*
 687 * Assembles a buffer covering the specified range. The code is optimised for
 688 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 689 * more hits than misses.
 690 */
 691int
 692xfs_buf_get_map(
 693        struct xfs_buftarg      *target,
 694        struct xfs_buf_map      *map,
 695        int                     nmaps,
 696        xfs_buf_flags_t         flags,
 697        struct xfs_buf          **bpp)
 698{
 699        struct xfs_buf          *bp;
 700        struct xfs_buf          *new_bp;
 701        int                     error = 0;
 702
 703        *bpp = NULL;
 704        error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp);
 705        if (!error)
 706                goto found;
 707        if (error != -ENOENT)
 708                return error;
 709
 710        error = _xfs_buf_alloc(target, map, nmaps, flags, &new_bp);
 711        if (error)
 712                return error;
 713
 714        error = xfs_buf_allocate_memory(new_bp, flags);
 715        if (error) {
 716                xfs_buf_free(new_bp);
 717                return error;
 718        }
 719
 720        error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp);
 721        if (error) {
 722                xfs_buf_free(new_bp);
 723                return error;
 724        }
 725
 726        if (bp != new_bp)
 727                xfs_buf_free(new_bp);
 728
 729found:
 730        if (!bp->b_addr) {
 731                error = _xfs_buf_map_pages(bp, flags);
 732                if (unlikely(error)) {
 733                        xfs_warn_ratelimited(target->bt_mount,
 734                                "%s: failed to map %u pages", __func__,
 735                                bp->b_page_count);
 736                        xfs_buf_relse(bp);
 737                        return error;
 738                }
 739        }
 740
 741        /*
 742         * Clear b_error if this is a lookup from a caller that doesn't expect
 743         * valid data to be found in the buffer.
 744         */
 745        if (!(flags & XBF_READ))
 746                xfs_buf_ioerror(bp, 0);
 747
 748        XFS_STATS_INC(target->bt_mount, xb_get);
 749        trace_xfs_buf_get(bp, flags, _RET_IP_);
 750        *bpp = bp;
 751        return 0;
 752}
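/*
 * Illustrative sketch (not part of the original file): finding or allocating a
 * single-extent buffer through the map interface. On success the buffer is
 * returned locked and referenced; its contents are undefined unless XBF_DONE
 * is set. The wrapper below is hypothetical.
 */
static int __maybe_unused
xfs_buf_get_example(
        struct xfs_buftarg      *target,
        xfs_daddr_t             blkno,
        size_t                  numblks,
        struct xfs_buf          **bpp)
{
        DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);

        return xfs_buf_get_map(target, &map, 1, 0, bpp);
}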
 753
 754STATIC int
 755_xfs_buf_read(
 756        xfs_buf_t               *bp,
 757        xfs_buf_flags_t         flags)
 758{
 759        ASSERT(!(flags & XBF_WRITE));
 760        ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
 761
 762        bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
 763        bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
 764
 765        return xfs_buf_submit(bp);
 766}
 767
 768/*
 769 * Reverify a buffer found in cache without an attached ->b_ops.
 770 *
 771 * If the caller passed an ops structure and the buffer doesn't have ops
 772 * assigned, set the ops and use it to verify the contents. If verification
 773 * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
 774 * already in XBF_DONE state on entry.
 775 *
 776 * Under normal operations, every in-core buffer is verified on read I/O
 777 * completion. There are two scenarios that can lead to in-core buffers without
 778 * an assigned ->b_ops. The first is during log recovery of buffers on a V4
 779 * filesystem, though these buffers are purged at the end of recovery. The
 780 * other is online repair, which intentionally reads with a NULL buffer ops to
 781 * run several verifiers across an in-core buffer in order to establish buffer
 782 * type.  If repair can't establish that, the buffer will be left in memory
 783 * with NULL buffer ops.
 784 */
 785int
 786xfs_buf_reverify(
 787        struct xfs_buf          *bp,
 788        const struct xfs_buf_ops *ops)
 789{
 790        ASSERT(bp->b_flags & XBF_DONE);
 791        ASSERT(bp->b_error == 0);
 792
 793        if (!ops || bp->b_ops)
 794                return 0;
 795
 796        bp->b_ops = ops;
 797        bp->b_ops->verify_read(bp);
 798        if (bp->b_error)
 799                bp->b_flags &= ~XBF_DONE;
 800        return bp->b_error;
 801}
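/*
 * Illustrative sketch (not part of the original file): attaching verifier ops
 * to a cached buffer that was read without them, as described above, and
 * checking its contents in one go. The caller below is hypothetical.
 */
static int __maybe_unused
xfs_buf_reverify_example(
        struct xfs_buftarg              *target,
        xfs_daddr_t                     blkno,
        size_t                          numblks,
        const struct xfs_buf_ops        *ops)
{
        struct xfs_buf                  *bp;
        int                             error = 0;

        bp = xfs_buf_incore(target, blkno, numblks, 0);
        if (!bp)
                return -ENOENT;

        /* only XBF_DONE buffers without recorded errors may be reverified */
        if (bp->b_flags & XBF_DONE)
                error = xfs_buf_reverify(bp, ops);
        xfs_buf_relse(bp);
        return error;
}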
 802
 803int
 804xfs_buf_read_map(
 805        struct xfs_buftarg      *target,
 806        struct xfs_buf_map      *map,
 807        int                     nmaps,
 808        xfs_buf_flags_t         flags,
 809        struct xfs_buf          **bpp,
 810        const struct xfs_buf_ops *ops,
 811        xfs_failaddr_t          fa)
 812{
 813        struct xfs_buf          *bp;
 814        int                     error;
 815
 816        flags |= XBF_READ;
 817        *bpp = NULL;
 818
 819        error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
 820        if (error)
 821                return error;
 822
 823        trace_xfs_buf_read(bp, flags, _RET_IP_);
 824
 825        if (!(bp->b_flags & XBF_DONE)) {
 826                /* Initiate the buffer read and wait. */
 827                XFS_STATS_INC(target->bt_mount, xb_get_read);
 828                bp->b_ops = ops;
 829                error = _xfs_buf_read(bp, flags);
 830
 831                /* Readahead iodone already dropped the buffer, so exit. */
 832                if (flags & XBF_ASYNC)
 833                        return 0;
 834        } else {
 835                /* Buffer already read; all we need to do is check it. */
 836                error = xfs_buf_reverify(bp, ops);
 837
 838                /* Readahead already finished; drop the buffer and exit. */
 839                if (flags & XBF_ASYNC) {
 840                        xfs_buf_relse(bp);
 841                        return 0;
 842                }
 843
 844                /* We do not want read in the flags */
 845                bp->b_flags &= ~XBF_READ;
 846                ASSERT(bp->b_ops != NULL || ops == NULL);
 847        }
 848
 849        /*
 850         * If we've had a read error, then the contents of the buffer are
 851         * invalid and should not be used. To ensure that a followup read tries
 852         * to pull the buffer from disk again, we clear the XBF_DONE flag and
 853         * mark the buffer stale. This ensures that anyone who has a current
  854         * reference to the buffer will interpret its contents correctly and
 855         * future cache lookups will also treat it as an empty, uninitialised
 856         * buffer.
 857         */
 858        if (error) {
 859                if (!XFS_FORCED_SHUTDOWN(target->bt_mount))
 860                        xfs_buf_ioerror_alert(bp, fa);
 861
 862                bp->b_flags &= ~XBF_DONE;
 863                xfs_buf_stale(bp);
 864                xfs_buf_relse(bp);
 865
 866                /* bad CRC means corrupted metadata */
 867                if (error == -EFSBADCRC)
 868                        error = -EFSCORRUPTED;
 869                return error;
 870        }
 871
 872        *bpp = bp;
 873        return 0;
 874}
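/*
 * Illustrative sketch (not part of the original file): a synchronous, verified
 * metadata read of a single extent and its error handling. The helper is
 * hypothetical; real callers normally go through xfs_buf_read() or
 * xfs_trans_read_buf().
 */
static int __maybe_unused
xfs_buf_read_example(
        struct xfs_buftarg              *target,
        xfs_daddr_t                     blkno,
        size_t                          numblks,
        const struct xfs_buf_ops        *ops,
        struct xfs_buf                  **bpp)
{
        DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
        int                             error;

        error = xfs_buf_read_map(target, &map, 1, 0, bpp, ops,
                                 __this_address);
        if (error)
                return error;   /* buffer was already staled and released */

        /* *bpp is locked, referenced and verified; release it when done */
        return 0;
}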
 875
 876/*
 877 *      If we are not low on memory then do the readahead in a deadlock
 878 *      safe manner.
 879 */
 880void
 881xfs_buf_readahead_map(
 882        struct xfs_buftarg      *target,
 883        struct xfs_buf_map      *map,
 884        int                     nmaps,
 885        const struct xfs_buf_ops *ops)
 886{
 887        struct xfs_buf          *bp;
 888
 889        if (bdi_read_congested(target->bt_bdev->bd_bdi))
 890                return;
 891
 892        xfs_buf_read_map(target, map, nmaps,
 893                     XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
 894                     __this_address);
 895}
 896
 897/*
 898 * Read an uncached buffer from disk. Allocates and returns a locked
 899 * buffer containing the disk contents or nothing.
 900 */
 901int
 902xfs_buf_read_uncached(
 903        struct xfs_buftarg      *target,
 904        xfs_daddr_t             daddr,
 905        size_t                  numblks,
 906        int                     flags,
 907        struct xfs_buf          **bpp,
 908        const struct xfs_buf_ops *ops)
 909{
 910        struct xfs_buf          *bp;
 911        int                     error;
 912
 913        *bpp = NULL;
 914
 915        error = xfs_buf_get_uncached(target, numblks, flags, &bp);
 916        if (error)
 917                return error;
 918
 919        /* set up the buffer for a read IO */
 920        ASSERT(bp->b_map_count == 1);
 921        bp->b_bn = XFS_BUF_DADDR_NULL;  /* always null for uncached buffers */
 922        bp->b_maps[0].bm_bn = daddr;
 923        bp->b_flags |= XBF_READ;
 924        bp->b_ops = ops;
 925
 926        xfs_buf_submit(bp);
 927        if (bp->b_error) {
 928                error = bp->b_error;
 929                xfs_buf_relse(bp);
 930                return error;
 931        }
 932
 933        *bpp = bp;
 934        return 0;
 935}
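/*
 * Illustrative sketch (not part of the original file): uncached reads are used
 * for one-off accesses that must not land in the buffer cache, e.g. the
 * initial superblock read during mount. The caller below is hypothetical.
 */
static int __maybe_unused
xfs_buf_read_uncached_example(
        struct xfs_buftarg              *target,
        xfs_daddr_t                     daddr,
        size_t                          numblks,
        const struct xfs_buf_ops        *ops)
{
        struct xfs_buf                  *bp;
        int                             error;

        error = xfs_buf_read_uncached(target, daddr, numblks, 0, &bp, ops);
        if (error)
                return error;

        /* ... consume bp->b_addr ... */
        xfs_buf_relse(bp);
        return 0;
}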
 936
 937int
 938xfs_buf_get_uncached(
 939        struct xfs_buftarg      *target,
 940        size_t                  numblks,
 941        int                     flags,
 942        struct xfs_buf          **bpp)
 943{
 944        unsigned long           page_count;
 945        int                     error, i;
 946        struct xfs_buf          *bp;
 947        DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
 948
 949        *bpp = NULL;
 950
 951        /* flags might contain irrelevant bits, pass only what we care about */
 952        error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
 953        if (error)
 954                goto fail;
 955
 956        page_count = PAGE_ALIGN(numblks << BBSHIFT) >> PAGE_SHIFT;
 957        error = _xfs_buf_get_pages(bp, page_count);
 958        if (error)
 959                goto fail_free_buf;
 960
 961        for (i = 0; i < page_count; i++) {
 962                bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
 963                if (!bp->b_pages[i]) {
 964                        error = -ENOMEM;
 965                        goto fail_free_mem;
 966                }
 967        }
 968        bp->b_flags |= _XBF_PAGES;
 969
 970        error = _xfs_buf_map_pages(bp, 0);
 971        if (unlikely(error)) {
 972                xfs_warn(target->bt_mount,
 973                        "%s: failed to map pages", __func__);
 974                goto fail_free_mem;
 975        }
 976
 977        trace_xfs_buf_get_uncached(bp, _RET_IP_);
 978        *bpp = bp;
 979        return 0;
 980
 981 fail_free_mem:
 982        while (--i >= 0)
 983                __free_page(bp->b_pages[i]);
 984        _xfs_buf_free_pages(bp);
 985 fail_free_buf:
 986        xfs_buf_free_maps(bp);
 987        kmem_cache_free(xfs_buf_zone, bp);
 988 fail:
 989        return error;
 990}
 991
 992/*
 993 *      Increment reference count on buffer, to hold the buffer concurrently
 994 *      with another thread which may release (free) the buffer asynchronously.
 995 *      Must hold the buffer already to call this function.
 996 */
 997void
 998xfs_buf_hold(
 999        xfs_buf_t               *bp)
1000{
1001        trace_xfs_buf_hold(bp, _RET_IP_);
1002        atomic_inc(&bp->b_hold);
1003}
1004
1005/*
1006 * Release a hold on the specified buffer. If the hold count is 1, the buffer is
1007 * placed on LRU or freed (depending on b_lru_ref).
1008 */
1009void
1010xfs_buf_rele(
1011        xfs_buf_t               *bp)
1012{
1013        struct xfs_perag        *pag = bp->b_pag;
1014        bool                    release;
1015        bool                    freebuf = false;
1016
1017        trace_xfs_buf_rele(bp, _RET_IP_);
1018
1019        if (!pag) {
1020                ASSERT(list_empty(&bp->b_lru));
1021                if (atomic_dec_and_test(&bp->b_hold)) {
1022                        xfs_buf_ioacct_dec(bp);
1023                        xfs_buf_free(bp);
1024                }
1025                return;
1026        }
1027
1028        ASSERT(atomic_read(&bp->b_hold) > 0);
1029
1030        /*
1031         * We grab the b_lock here first to serialise racing xfs_buf_rele()
1032         * calls. The pag_buf_lock being taken on the last reference only
1033         * serialises against racing lookups in xfs_buf_find(). IOWs, the second
1034         * to last reference we drop here is not serialised against the last
1035         * reference until we take bp->b_lock. Hence if we don't grab b_lock
1036         * first, the last "release" reference can win the race to the lock and
1037         * free the buffer before the second-to-last reference is processed,
1038         * leading to a use-after-free scenario.
1039         */
1040        spin_lock(&bp->b_lock);
1041        release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
1042        if (!release) {
1043                /*
1044                 * Drop the in-flight state if the buffer is already on the LRU
1045                 * and it holds the only reference. This is racy because we
 1046         * haven't acquired the pag lock, but the use of XFS_BSTATE_IN_FLIGHT
 1047         * ensures the decrement occurs only once per buffer.
1048                 */
1049                if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
1050                        __xfs_buf_ioacct_dec(bp);
1051                goto out_unlock;
1052        }
1053
1054        /* the last reference has been dropped ... */
1055        __xfs_buf_ioacct_dec(bp);
1056        if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
1057                /*
1058                 * If the buffer is added to the LRU take a new reference to the
1059                 * buffer for the LRU and clear the (now stale) dispose list
1060                 * state flag
1061                 */
1062                if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
1063                        bp->b_state &= ~XFS_BSTATE_DISPOSE;
1064                        atomic_inc(&bp->b_hold);
1065                }
1066                spin_unlock(&pag->pag_buf_lock);
1067        } else {
1068                /*
 1069                 * Most of the time buffers will already be removed from the
 1070                 * LRU, so optimise that case by checking for the
 1071                 * XFS_BSTATE_DISPOSE flag, which indicates that the last list
 1072                 * the buffer was on was the disposal list.
1073                 */
1074                if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
1075                        list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
1076                } else {
1077                        ASSERT(list_empty(&bp->b_lru));
1078                }
1079
1080                ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1081                rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
1082                                       xfs_buf_hash_params);
1083                spin_unlock(&pag->pag_buf_lock);
1084                xfs_perag_put(pag);
1085                freebuf = true;
1086        }
1087
1088out_unlock:
1089        spin_unlock(&bp->b_lock);
1090
1091        if (freebuf)
1092                xfs_buf_free(bp);
1093}
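/*
 * Illustrative sketch (not part of the original file): the hold/release
 * pairing that the reference counting above serves. Code that hands a buffer
 * to something which consumes a reference, but still needs the buffer
 * afterwards, takes its own hold first. The helper is hypothetical.
 */
static void __maybe_unused
xfs_buf_hold_example(
        struct xfs_buf  *bp)
{
        xfs_buf_hold(bp);
        /* ... hand bp off to something that consumes a reference ... */
        xfs_buf_rele(bp);       /* may place bp on the LRU or free it */
}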
1094
1095
1096/*
1097 *      Lock a buffer object, if it is not already locked.
1098 *
1099 *      If we come across a stale, pinned, locked buffer, we know that we are
1100 *      being asked to lock a buffer that has been reallocated. Because it is
1101 *      pinned, we know that the log has not been pushed to disk and hence it
1102 *      will still be locked.  Rather than continuing to have trylock attempts
1103 *      fail until someone else pushes the log, push it ourselves before
1104 *      returning.  This means that the xfsaild will not get stuck trying
1105 *      to push on stale inode buffers.
1106 */
1107int
1108xfs_buf_trylock(
1109        struct xfs_buf          *bp)
1110{
1111        int                     locked;
1112
1113        locked = down_trylock(&bp->b_sema) == 0;
1114        if (locked)
1115                trace_xfs_buf_trylock(bp, _RET_IP_);
1116        else
1117                trace_xfs_buf_trylock_fail(bp, _RET_IP_);
1118        return locked;
1119}
1120
1121/*
1122 *      Lock a buffer object.
1123 *
1124 *      If we come across a stale, pinned, locked buffer, we know that we
1125 *      are being asked to lock a buffer that has been reallocated. Because
1126 *      it is pinned, we know that the log has not been pushed to disk and
1127 *      hence it will still be locked. Rather than sleeping until someone
1128 *      else pushes the log, push it ourselves before trying to get the lock.
1129 */
1130void
1131xfs_buf_lock(
1132        struct xfs_buf          *bp)
1133{
1134        trace_xfs_buf_lock(bp, _RET_IP_);
1135
1136        if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
1137                xfs_log_force(bp->b_mount, 0);
1138        down(&bp->b_sema);
1139
1140        trace_xfs_buf_lock_done(bp, _RET_IP_);
1141}
1142
1143void
1144xfs_buf_unlock(
1145        struct xfs_buf          *bp)
1146{
1147        ASSERT(xfs_buf_islocked(bp));
1148
1149        up(&bp->b_sema);
1150        trace_xfs_buf_unlock(bp, _RET_IP_);
1151}
1152
1153STATIC void
1154xfs_buf_wait_unpin(
1155        xfs_buf_t               *bp)
1156{
1157        DECLARE_WAITQUEUE       (wait, current);
1158
1159        if (atomic_read(&bp->b_pin_count) == 0)
1160                return;
1161
1162        add_wait_queue(&bp->b_waiters, &wait);
1163        for (;;) {
1164                set_current_state(TASK_UNINTERRUPTIBLE);
1165                if (atomic_read(&bp->b_pin_count) == 0)
1166                        break;
1167                io_schedule();
1168        }
1169        remove_wait_queue(&bp->b_waiters, &wait);
1170        set_current_state(TASK_RUNNING);
1171}
1172
1173/*
1174 *      Buffer Utility Routines
1175 */
1176
1177void
1178xfs_buf_ioend(
1179        struct xfs_buf  *bp)
1180{
1181        bool            read = bp->b_flags & XBF_READ;
1182
1183        trace_xfs_buf_iodone(bp, _RET_IP_);
1184
1185        bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1186
1187        /*
1188         * Pull in IO completion errors now. We are guaranteed to be running
1189         * single threaded, so we don't need the lock to read b_io_error.
1190         */
1191        if (!bp->b_error && bp->b_io_error)
1192                xfs_buf_ioerror(bp, bp->b_io_error);
1193
1194        /* Only validate buffers that were read without errors */
1195        if (read && !bp->b_error && bp->b_ops) {
1196                ASSERT(!bp->b_iodone);
1197                bp->b_ops->verify_read(bp);
1198        }
1199
1200        if (!bp->b_error) {
1201                bp->b_flags &= ~XBF_WRITE_FAIL;
1202                bp->b_flags |= XBF_DONE;
1203        }
1204
1205        if (bp->b_iodone)
1206                (*(bp->b_iodone))(bp);
1207        else if (bp->b_flags & XBF_ASYNC)
1208                xfs_buf_relse(bp);
1209        else
1210                complete(&bp->b_iowait);
1211}
1212
1213static void
1214xfs_buf_ioend_work(
1215        struct work_struct      *work)
1216{
1217        struct xfs_buf          *bp =
1218                container_of(work, xfs_buf_t, b_ioend_work);
1219
1220        xfs_buf_ioend(bp);
1221}
1222
1223static void
1224xfs_buf_ioend_async(
1225        struct xfs_buf  *bp)
1226{
1227        INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
1228        queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
1229}
1230
1231void
1232__xfs_buf_ioerror(
1233        xfs_buf_t               *bp,
1234        int                     error,
1235        xfs_failaddr_t          failaddr)
1236{
1237        ASSERT(error <= 0 && error >= -1000);
1238        bp->b_error = error;
1239        trace_xfs_buf_ioerror(bp, error, failaddr);
1240}
1241
1242void
1243xfs_buf_ioerror_alert(
1244        struct xfs_buf          *bp,
1245        xfs_failaddr_t          func)
1246{
1247        xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
1248                "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
1249                                  func, (uint64_t)XFS_BUF_ADDR(bp),
1250                                  bp->b_length, -bp->b_error);
1251}
1252
1253/*
1254 * To simulate an I/O failure, the buffer must be locked and held with at least
1255 * three references. The LRU reference is dropped by the stale call. The buf
1256 * item reference is dropped via ioend processing. The third reference is owned
1257 * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
1258 */
1259void
1260xfs_buf_ioend_fail(
1261        struct xfs_buf  *bp)
1262{
1263        bp->b_flags &= ~XBF_DONE;
1264        xfs_buf_stale(bp);
1265        xfs_buf_ioerror(bp, -EIO);
1266        xfs_buf_ioend(bp);
1267}
1268
1269int
1270xfs_bwrite(
1271        struct xfs_buf          *bp)
1272{
1273        int                     error;
1274
1275        ASSERT(xfs_buf_islocked(bp));
1276
1277        bp->b_flags |= XBF_WRITE;
1278        bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
1279                         XBF_DONE);
1280
1281        error = xfs_buf_submit(bp);
1282        if (error)
1283                xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
1284        return error;
1285}
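/*
 * Illustrative sketch (not part of the original file): a synchronous buffer
 * write. The buffer must already be locked; xfs_bwrite() waits for completion
 * and shuts the filesystem down on error, so the caller only has to release
 * the buffer. The helper is hypothetical.
 */
static int __maybe_unused
xfs_bwrite_example(
        struct xfs_buf  *bp)
{
        int             error;

        ASSERT(xfs_buf_islocked(bp));

        /* ... modify contents via xfs_buf_offset()/xfs_buf_zero() ... */
        error = xfs_bwrite(bp);
        xfs_buf_relse(bp);
        return error;
}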
1286
1287static void
1288xfs_buf_bio_end_io(
1289        struct bio              *bio)
1290{
1291        struct xfs_buf          *bp = (struct xfs_buf *)bio->bi_private;
1292
1293        if (!bio->bi_status &&
1294            (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
1295            XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
1296                bio->bi_status = BLK_STS_IOERR;
1297
1298        /*
1299         * don't overwrite existing errors - otherwise we can lose errors on
1300         * buffers that require multiple bios to complete.
1301         */
1302        if (bio->bi_status) {
1303                int error = blk_status_to_errno(bio->bi_status);
1304
1305                cmpxchg(&bp->b_io_error, 0, error);
1306        }
1307
1308        if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1309                invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1310
1311        if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
1312                xfs_buf_ioend_async(bp);
1313        bio_put(bio);
1314}
1315
1316static void
1317xfs_buf_ioapply_map(
1318        struct xfs_buf  *bp,
1319        int             map,
1320        int             *buf_offset,
1321        int             *count,
1322        int             op)
1323{
1324        int             page_index;
1325        int             total_nr_pages = bp->b_page_count;
1326        int             nr_pages;
1327        struct bio      *bio;
1328        sector_t        sector =  bp->b_maps[map].bm_bn;
1329        int             size;
1330        int             offset;
1331
1332        /* skip the pages in the buffer before the start offset */
1333        page_index = 0;
1334        offset = *buf_offset;
1335        while (offset >= PAGE_SIZE) {
1336                page_index++;
1337                offset -= PAGE_SIZE;
1338        }
1339
1340        /*
1341         * Limit the IO size to the length of the current vector, and update the
1342         * remaining IO count for the next time around.
1343         */
1344        size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
1345        *count -= size;
1346        *buf_offset += size;
1347
1348next_chunk:
1349        atomic_inc(&bp->b_io_remaining);
1350        nr_pages = min(total_nr_pages, BIO_MAX_PAGES);
1351
1352        bio = bio_alloc(GFP_NOIO, nr_pages);
1353        bio_set_dev(bio, bp->b_target->bt_bdev);
1354        bio->bi_iter.bi_sector = sector;
1355        bio->bi_end_io = xfs_buf_bio_end_io;
1356        bio->bi_private = bp;
1357        bio->bi_opf = op;
1358
1359        for (; size && nr_pages; nr_pages--, page_index++) {
1360                int     rbytes, nbytes = PAGE_SIZE - offset;
1361
1362                if (nbytes > size)
1363                        nbytes = size;
1364
1365                rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
1366                                      offset);
1367                if (rbytes < nbytes)
1368                        break;
1369
1370                offset = 0;
1371                sector += BTOBB(nbytes);
1372                size -= nbytes;
1373                total_nr_pages--;
1374        }
1375
1376        if (likely(bio->bi_iter.bi_size)) {
1377                if (xfs_buf_is_vmapped(bp)) {
1378                        flush_kernel_vmap_range(bp->b_addr,
1379                                                xfs_buf_vmap_len(bp));
1380                }
1381                submit_bio(bio);
1382                if (size)
1383                        goto next_chunk;
1384        } else {
1385                /*
1386                 * This is guaranteed not to be the last io reference count
1387                 * because the caller (xfs_buf_submit) holds a count itself.
1388                 */
1389                atomic_dec(&bp->b_io_remaining);
1390                xfs_buf_ioerror(bp, -EIO);
1391                bio_put(bio);
1392        }
1393
1394}
1395
1396STATIC void
1397_xfs_buf_ioapply(
1398        struct xfs_buf  *bp)
1399{
1400        struct blk_plug plug;
1401        int             op;
1402        int             offset;
1403        int             size;
1404        int             i;
1405
1406        /*
1407         * Make sure we capture only current IO errors rather than stale errors
1408         * left over from previous use of the buffer (e.g. failed readahead).
1409         */
1410        bp->b_error = 0;
1411
1412        if (bp->b_flags & XBF_WRITE) {
1413                op = REQ_OP_WRITE;
1414
1415                /*
1416                 * Run the write verifier callback function if it exists. If
1417                 * this function fails it will mark the buffer with an error and
1418                 * the IO should not be dispatched.
1419                 */
1420                if (bp->b_ops) {
1421                        bp->b_ops->verify_write(bp);
1422                        if (bp->b_error) {
1423                                xfs_force_shutdown(bp->b_mount,
1424                                                   SHUTDOWN_CORRUPT_INCORE);
1425                                return;
1426                        }
1427                } else if (bp->b_bn != XFS_BUF_DADDR_NULL) {
1428                        struct xfs_mount *mp = bp->b_mount;
1429
1430                        /*
1431                         * non-crc filesystems don't attach verifiers during
1432                         * log recovery, so don't warn for such filesystems.
1433                         */
1434                        if (xfs_sb_version_hascrc(&mp->m_sb)) {
1435                                xfs_warn(mp,
1436                                        "%s: no buf ops on daddr 0x%llx len %d",
1437                                        __func__, bp->b_bn, bp->b_length);
1438                                xfs_hex_dump(bp->b_addr,
1439                                                XFS_CORRUPTION_DUMP_LEN);
1440                                dump_stack();
1441                        }
1442                }
1443        } else {
1444                op = REQ_OP_READ;
1445                if (bp->b_flags & XBF_READ_AHEAD)
1446                        op |= REQ_RAHEAD;
1447        }
1448
1449        /* we only use the buffer cache for meta-data */
1450        op |= REQ_META;
1451
1452        /*
1453         * Walk all the vectors issuing IO on them. Set up the initial offset
1454         * into the buffer and the desired IO size before we start -
 1455         * xfs_buf_ioapply_map() will modify them appropriately for each
1456         * subsequent call.
1457         */
1458        offset = bp->b_offset;
1459        size = BBTOB(bp->b_length);
1460        blk_start_plug(&plug);
1461        for (i = 0; i < bp->b_map_count; i++) {
1462                xfs_buf_ioapply_map(bp, i, &offset, &size, op);
1463                if (bp->b_error)
1464                        break;
1465                if (size <= 0)
1466                        break;  /* all done */
1467        }
1468        blk_finish_plug(&plug);
1469}
1470
1471/*
1472 * Wait for I/O completion of a sync buffer and return the I/O error code.
1473 */
1474static int
1475xfs_buf_iowait(
1476        struct xfs_buf  *bp)
1477{
1478        ASSERT(!(bp->b_flags & XBF_ASYNC));
1479
1480        trace_xfs_buf_iowait(bp, _RET_IP_);
1481        wait_for_completion(&bp->b_iowait);
1482        trace_xfs_buf_iowait_done(bp, _RET_IP_);
1483
1484        return bp->b_error;
1485}
1486
1487/*
1488 * Buffer I/O submission path, read or write. Asynchronous submission transfers
1489 * the buffer lock ownership and the current reference to the IO. It is not
1490 * safe to reference the buffer after a call to this function unless the caller
1491 * holds an additional reference itself.
1492 */
1493int
1494__xfs_buf_submit(
1495        struct xfs_buf  *bp,
1496        bool            wait)
1497{
1498        int             error = 0;
1499
1500        trace_xfs_buf_submit(bp, _RET_IP_);
1501
1502        ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1503
1504        /* on shutdown we stale and complete the buffer immediately */
1505        if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
1506                xfs_buf_ioend_fail(bp);
1507                return -EIO;
1508        }
1509
1510        /*
1511         * Grab a reference so the buffer does not go away underneath us. For
 1512         * async buffers, I/O completion drops the caller's reference, which
1513         * could occur before submission returns.
1514         */
1515        xfs_buf_hold(bp);
1516
1517        if (bp->b_flags & XBF_WRITE)
1518                xfs_buf_wait_unpin(bp);
1519
1520        /* clear the internal error state to avoid spurious errors */
1521        bp->b_io_error = 0;
1522
1523        /*
 1524         * Set the count to 1 initially; this stops an I/O completion
 1525         * callout that happens before we have started all the I/O from
 1526         * calling xfs_buf_ioend too early.
1527         */
1528        atomic_set(&bp->b_io_remaining, 1);
1529        if (bp->b_flags & XBF_ASYNC)
1530                xfs_buf_ioacct_inc(bp);
1531        _xfs_buf_ioapply(bp);
1532
1533        /*
1534         * If _xfs_buf_ioapply failed, we can get back here with only the IO
1535         * reference we took above. If we drop it to zero, run completion so
1536         * that we don't return to the caller with completion still pending.
1537         */
1538        if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
1539                if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
1540                        xfs_buf_ioend(bp);
1541                else
1542                        xfs_buf_ioend_async(bp);
1543        }
1544
1545        if (wait)
1546                error = xfs_buf_iowait(bp);
1547
1548        /*
1549         * Release the hold that keeps the buffer referenced for the entire
1550         * I/O. Note that if the buffer is async, it is not safe to reference
1551         * after this release.
1552         */
1553        xfs_buf_rele(bp);
1554        return error;
1555}
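/*
 * Illustrative sketch (not part of the original file): why callers of async
 * submission must take their own reference if they still need the buffer
 * afterwards. With XBF_ASYNC set, I/O completion owns the buffer lock and the
 * submission reference, so bp may already be freed by the time xfs_buf_submit()
 * returns. The helper is hypothetical; bp is assumed locked on entry.
 */
static int __maybe_unused
xfs_buf_submit_async_example(
        struct xfs_buf  *bp)
{
        int             error;

        bp->b_flags |= XBF_ASYNC;

        /*
         * The caller's reference and the buffer lock are consumed by I/O
         * completion; take an extra hold so bp remains valid here.
         */
        xfs_buf_hold(bp);
        error = xfs_buf_submit(bp);
        xfs_buf_rele(bp);
        return error;
}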
1556
1557void *
1558xfs_buf_offset(
1559        struct xfs_buf          *bp,
1560        size_t                  offset)
1561{
1562        struct page             *page;
1563
1564        if (bp->b_addr)
1565                return bp->b_addr + offset;
1566
1567        offset += bp->b_offset;
1568        page = bp->b_pages[offset >> PAGE_SHIFT];
1569        return page_address(page) + (offset & (PAGE_SIZE-1));
1570}
1571
1572void
1573xfs_buf_zero(
1574        struct xfs_buf          *bp,
1575        size_t                  boff,
1576        size_t                  bsize)
1577{
1578        size_t                  bend;
1579
1580        bend = boff + bsize;
1581        while (boff < bend) {
1582                struct page     *page;
1583                int             page_index, page_offset, csize;
1584
1585                page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
1586                page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
1587                page = bp->b_pages[page_index];
1588                csize = min_t(size_t, PAGE_SIZE - page_offset,
1589                                      BBTOB(bp->b_length) - boff);
1590
1591                ASSERT((csize + page_offset) <= PAGE_SIZE);
1592
1593                memset(page_address(page) + page_offset, 0, csize);
1594
1595                boff += csize;
1596        }
1597}
1598
1599/*
1600 * Log a message about, and mark stale, a buffer that a caller has decided is corrupt.
1601 *
1602 * This function should be called for the kinds of metadata corruption that
1603 * cannot be detected by a verifier, such as incorrect inter-block relationship
1604 * data.  Do /not/ call this function from a verifier function.
1605 *
1606 * The buffer must be XBF_DONE prior to the call.  Afterwards, the buffer will
1607 * be marked stale, but b_error will not be set.  The caller is responsible for
1608 * releasing the buffer or fixing it.
1609 */
1610void
1611__xfs_buf_mark_corrupt(
1612        struct xfs_buf          *bp,
1613        xfs_failaddr_t          fa)
1614{
1615        ASSERT(bp->b_flags & XBF_DONE);
1616
1617        xfs_buf_corruption_error(bp, fa);
1618        xfs_buf_stale(bp);
1619}
1620
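/*
 * Illustrative sketch (not part of xfs_buf.c): the intended calling pattern.
 * The consistency check is a hypothetical placeholder - the caller, not a
 * verifier, detects the inter-block inconsistency, marks the buffer corrupt
 * and then decides how to dispose of it.
 */
#if 0	/* example only, not compiled */
static int
example_check_sibling(
        struct xfs_buf  *bp)
{
        if (!example_siblings_consistent(bp)) { /* hypothetical check */
                __xfs_buf_mark_corrupt(bp, __this_address);
                xfs_buf_relse(bp);
                return -EFSCORRUPTED;
        }
        return 0;
}
#endif
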
1621/*
1622 *      Handling of buffer targets (buftargs).
1623 */
1624
1625/*
1626 * Wait for any bufs with callbacks that have been submitted but have not yet
1627 * returned. These buffers will have an elevated hold count, so wait on those
1628 * while freeing all the buffers only held by the LRU.
1629 */
1630static enum lru_status
1631xfs_buftarg_wait_rele(
1632        struct list_head        *item,
1633        struct list_lru_one     *lru,
1634        spinlock_t              *lru_lock,
1635        void                    *arg)
1636
1637{
1638        struct xfs_buf          *bp = container_of(item, struct xfs_buf, b_lru);
1639        struct list_head        *dispose = arg;
1640
1641        if (atomic_read(&bp->b_hold) > 1) {
1642                /* need to wait, so skip it this pass */
1643                trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
1644                return LRU_SKIP;
1645        }
1646        if (!spin_trylock(&bp->b_lock))
1647                return LRU_SKIP;
1648
1649        /*
1650         * clear the LRU reference count so the buffer doesn't get
1651         * ignored in xfs_buf_rele().
1652         */
1653        atomic_set(&bp->b_lru_ref, 0);
1654        bp->b_state |= XFS_BSTATE_DISPOSE;
1655        list_lru_isolate_move(lru, item, dispose);
1656        spin_unlock(&bp->b_lock);
1657        return LRU_REMOVED;
1658}
1659
1660void
1661xfs_wait_buftarg(
1662        struct xfs_buftarg      *btp)
1663{
1664        LIST_HEAD(dispose);
1665        int                     loop = 0;
1666        bool                    write_fail = false;
1667
1668        /*
1669         * First wait on the buftarg I/O count for all in-flight buffers to be
1670         * released. This is critical as new buffers do not make the LRU until
1671         * they are released.
1672         *
1673         * Next, flush the buffer workqueue to ensure all completion processing
1674         * has finished. Just waiting on buffer locks is not sufficient for
1675         * async IO as the reference count held over IO is not released until
1676         * after the buffer lock is dropped. Hence we need to ensure here that
1677         * all reference counts have been dropped before we start walking the
1678         * LRU list.
1679         */
1680        while (percpu_counter_sum(&btp->bt_io_count))
1681                delay(100);
1682        flush_workqueue(btp->bt_mount->m_buf_workqueue);
1683
1684        /* loop until there is nothing left on the lru list. */
1685        while (list_lru_count(&btp->bt_lru)) {
1686                list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
1687                              &dispose, LONG_MAX);
1688
1689                while (!list_empty(&dispose)) {
1690                        struct xfs_buf *bp;
1691                        bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1692                        list_del_init(&bp->b_lru);
1693                        if (bp->b_flags & XBF_WRITE_FAIL) {
1694                                write_fail = true;
1695                                xfs_buf_alert_ratelimited(bp,
1696                                        "XFS: Corruption Alert",
1697"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
1698                                        (long long)bp->b_bn);
1699                        }
1700                        xfs_buf_rele(bp);
1701                }
1702                if (loop++ != 0)
1703                        delay(100);
1704        }
1705
1706        /*
1707         * If one or more failed buffers were freed, that means dirty metadata
1708         * was thrown away. This should only ever happen after I/O completion
1709 * handling has elevated I/O error(s) to permanent failures and shut
1710 * down the fs.
1711         */
1712        if (write_fail) {
1713                ASSERT(XFS_FORCED_SHUTDOWN(btp->bt_mount));
1714                xfs_alert(btp->bt_mount,
1715              "Please run xfs_repair to determine the extent of the problem.");
1716        }
1717}
1718
1719static enum lru_status
1720xfs_buftarg_isolate(
1721        struct list_head        *item,
1722        struct list_lru_one     *lru,
1723        spinlock_t              *lru_lock,
1724        void                    *arg)
1725{
1726        struct xfs_buf          *bp = container_of(item, struct xfs_buf, b_lru);
1727        struct list_head        *dispose = arg;
1728
1729        /*
1730         * we are inverting the lru lock/bp->b_lock here, so use a trylock.
1731         * If we fail to get the lock, just skip it.
1732         */
1733        if (!spin_trylock(&bp->b_lock))
1734                return LRU_SKIP;
1735        /*
1736         * Decrement the b_lru_ref count unless the value is already
1737         * zero. If the value is already zero, we need to reclaim the
1738         * buffer, otherwise it gets another trip through the LRU.
1739         */
1740        if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1741                spin_unlock(&bp->b_lock);
1742                return LRU_ROTATE;
1743        }
1744
1745        bp->b_state |= XFS_BSTATE_DISPOSE;
1746        list_lru_isolate_move(lru, item, dispose);
1747        spin_unlock(&bp->b_lock);
1748        return LRU_REMOVED;
1749}
1750
1751static unsigned long
1752xfs_buftarg_shrink_scan(
1753        struct shrinker         *shrink,
1754        struct shrink_control   *sc)
1755{
1756        struct xfs_buftarg      *btp = container_of(shrink,
1757                                        struct xfs_buftarg, bt_shrinker);
1758        LIST_HEAD(dispose);
1759        unsigned long           freed;
1760
1761        freed = list_lru_shrink_walk(&btp->bt_lru, sc,
1762                                     xfs_buftarg_isolate, &dispose);
1763
1764        while (!list_empty(&dispose)) {
1765                struct xfs_buf *bp;
1766                bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1767                list_del_init(&bp->b_lru);
1768                xfs_buf_rele(bp);
1769        }
1770
1771        return freed;
1772}
1773
1774static unsigned long
1775xfs_buftarg_shrink_count(
1776        struct shrinker         *shrink,
1777        struct shrink_control   *sc)
1778{
1779        struct xfs_buftarg      *btp = container_of(shrink,
1780                                        struct xfs_buftarg, bt_shrinker);
1781        return list_lru_shrink_count(&btp->bt_lru, sc);
1782}
1783
1784void
1785xfs_free_buftarg(
1786        struct xfs_buftarg      *btp)
1787{
1788        unregister_shrinker(&btp->bt_shrinker);
1789        ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
1790        percpu_counter_destroy(&btp->bt_io_count);
1791        list_lru_destroy(&btp->bt_lru);
1792
1793        xfs_blkdev_issue_flush(btp);
1794
1795        kmem_free(btp);
1796}
1797
1798int
1799xfs_setsize_buftarg(
1800        xfs_buftarg_t           *btp,
1801        unsigned int            sectorsize)
1802{
1803        /* Set up metadata sector size info */
1804        btp->bt_meta_sectorsize = sectorsize;
1805        btp->bt_meta_sectormask = sectorsize - 1;
1806
1807        if (set_blocksize(btp->bt_bdev, sectorsize)) {
1808                xfs_warn(btp->bt_mount,
1809                        "Cannot set_blocksize to %u on device %pg",
1810                        sectorsize, btp->bt_bdev);
1811                return -EINVAL;
1812        }
1813
1814        /* Set up device logical sector size mask */
1815        btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
1816        btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;
1817
1818        return 0;
1819}
1820
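/*
 * Worked example (illustrative only): with a 4096 byte metadata sector size
 * the mask set up above is 0xfff, so an offset is properly aligned iff
 * (offset & btp->bt_meta_sectormask) == 0. The "sectorsize - 1" trick assumes
 * power-of-two sector sizes, which is what block devices report.
 */
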
1821/*
1822 * When allocating the initial buffer target we have not yet
1823 * read in the superblock, so don't know what sized sectors
1824 * are being used at this early stage.  Play safe.
1825 */
1826STATIC int
1827xfs_setsize_buftarg_early(
1828        xfs_buftarg_t           *btp,
1829        struct block_device     *bdev)
1830{
1831        return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
1832}
1833
1834xfs_buftarg_t *
1835xfs_alloc_buftarg(
1836        struct xfs_mount        *mp,
1837        struct block_device     *bdev,
1838        struct dax_device       *dax_dev)
1839{
1840        xfs_buftarg_t           *btp;
1841
1842        btp = kmem_zalloc(sizeof(*btp), KM_NOFS);
1843
1844        btp->bt_mount = mp;
1845        btp->bt_dev = bdev->bd_dev;
1846        btp->bt_bdev = bdev;
1847        btp->bt_daxdev = dax_dev;
1848
1849        /*
1850         * Buffer IO error rate limiting. Limit it to no more than 10 messages
1851         * per 30 seconds so as to not spam logs too much on repeated errors.
1852         */
1853        ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
1854                             DEFAULT_RATELIMIT_BURST);
1855
1856        if (xfs_setsize_buftarg_early(btp, bdev))
1857                goto error_free;
1858
1859        if (list_lru_init(&btp->bt_lru))
1860                goto error_free;
1861
1862        if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
1863                goto error_lru;
1864
1865        btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
1866        btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
1867        btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1868        btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
1869        if (register_shrinker(&btp->bt_shrinker))
1870                goto error_pcpu;
1871        return btp;
1872
1873error_pcpu:
1874        percpu_counter_destroy(&btp->bt_io_count);
1875error_lru:
1876        list_lru_destroy(&btp->bt_lru);
1877error_free:
1878        kmem_free(btp);
1879        return NULL;
1880}
1881
1882/*
1883 * Cancel a delayed write list.
1884 *
1885 * Remove each buffer from the list, clear the delwri queue flag and drop the
1886 * associated buffer reference.
1887 */
1888void
1889xfs_buf_delwri_cancel(
1890        struct list_head        *list)
1891{
1892        struct xfs_buf          *bp;
1893
1894        while (!list_empty(list)) {
1895                bp = list_first_entry(list, struct xfs_buf, b_list);
1896
1897                xfs_buf_lock(bp);
1898                bp->b_flags &= ~_XBF_DELWRI_Q;
1899                list_del_init(&bp->b_list);
1900                xfs_buf_relse(bp);
1901        }
1902}
1903
1904/*
1905 * Add a buffer to the delayed write list.
1906 *
1907 * This queues a buffer for writeout if it hasn't already been queued.  Note
1908 * that neither this routine nor the buffer list submission functions perform
1909 * any internal synchronization.  It is expected that the lists are thread-local
1910 * to the callers.
1911 *
1912 * Returns true if we queued up the buffer, or false if it was already on a
1913 * buffer list.
1914 */
1915bool
1916xfs_buf_delwri_queue(
1917        struct xfs_buf          *bp,
1918        struct list_head        *list)
1919{
1920        ASSERT(xfs_buf_islocked(bp));
1921        ASSERT(!(bp->b_flags & XBF_READ));
1922
1923        /*
1924         * If the buffer is already marked delwri it is already queued up
1925         * by someone else for immediate writeout.  Just ignore it in that
1926         * case.
1927         */
1928        if (bp->b_flags & _XBF_DELWRI_Q) {
1929                trace_xfs_buf_delwri_queued(bp, _RET_IP_);
1930                return false;
1931        }
1932
1933        trace_xfs_buf_delwri_queue(bp, _RET_IP_);
1934
1935        /*
1936         * If a buffer gets written out synchronously or marked stale while it
1937         * is on a delwri list we lazily remove it. To do this, the other party
1938         * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
1939         * It remains referenced and on the list.  In a rare corner case it
1940         * might get re-added to a delwri list after the synchronous writeout, in
1941         * which case we just need to re-add the flag here.
1942         */
1943        bp->b_flags |= _XBF_DELWRI_Q;
1944        if (list_empty(&bp->b_list)) {
1945                atomic_inc(&bp->b_hold);
1946                list_add_tail(&bp->b_list, list);
1947        }
1948
1949        return true;
1950}
1951
1952/*
1953 * The compare function is more complex than it needs to be because
1954 * the return value is only 32 bits and we are doing comparisons
1955 * on 64-bit values.
1956 */
1957static int
1958xfs_buf_cmp(
1959        void            *priv,
1960        struct list_head *a,
1961        struct list_head *b)
1962{
1963        struct xfs_buf  *ap = container_of(a, struct xfs_buf, b_list);
1964        struct xfs_buf  *bp = container_of(b, struct xfs_buf, b_list);
1965        xfs_daddr_t             diff;
1966
1967        diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
1968        if (diff < 0)
1969                return -1;
1970        if (diff > 0)
1971                return 1;
1972        return 0;
1973}
1974
1975/*
1976 * Submit buffers for write. If wait_list is specified, the buffers are
1977 * submitted using sync I/O and placed on the wait list such that the caller can
1978 * iowait each buffer. Otherwise async I/O is used and the buffers are released
1979 * at I/O completion time. In either case, buffers remain locked until I/O
1980 * completes and the buffer is released from the queue.
1981 */
1982static int
1983xfs_buf_delwri_submit_buffers(
1984        struct list_head        *buffer_list,
1985        struct list_head        *wait_list)
1986{
1987        struct xfs_buf          *bp, *n;
1988        int                     pinned = 0;
1989        struct blk_plug         plug;
1990
1991        list_sort(NULL, buffer_list, xfs_buf_cmp);
1992
1993        blk_start_plug(&plug);
1994        list_for_each_entry_safe(bp, n, buffer_list, b_list) {
1995                if (!wait_list) {
1996                        if (xfs_buf_ispinned(bp)) {
1997                                pinned++;
1998                                continue;
1999                        }
2000                        if (!xfs_buf_trylock(bp))
2001                                continue;
2002                } else {
2003                        xfs_buf_lock(bp);
2004                }
2005
2006                /*
2007                 * Someone else might have written the buffer synchronously or
2008                 * marked it stale in the meantime.  In that case only the
2009                 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
2010                 * reference and remove it from the list here.
2011                 */
2012                if (!(bp->b_flags & _XBF_DELWRI_Q)) {
2013                        list_del_init(&bp->b_list);
2014                        xfs_buf_relse(bp);
2015                        continue;
2016                }
2017
2018                trace_xfs_buf_delwri_split(bp, _RET_IP_);
2019
2020                /*
2021                 * If we have a wait list, each buffer (and associated delwri
2022                 * queue reference) transfers to it and is submitted
2023                 * synchronously. Otherwise, drop the buffer from the delwri
2024                 * queue and submit async.
2025                 */
2026                bp->b_flags &= ~_XBF_DELWRI_Q;
2027                bp->b_flags |= XBF_WRITE;
2028                if (wait_list) {
2029                        bp->b_flags &= ~XBF_ASYNC;
2030                        list_move_tail(&bp->b_list, wait_list);
2031                } else {
2032                        bp->b_flags |= XBF_ASYNC;
2033                        list_del_init(&bp->b_list);
2034                }
2035                __xfs_buf_submit(bp, false);
2036        }
2037        blk_finish_plug(&plug);
2038
2039        return pinned;
2040}
2041
2042/*
2043 * Write out a buffer list asynchronously.
2044 *
2045 * This will take the @buffer_list, write all non-locked and non-pinned buffers
2046 * out and not wait for I/O completion on any of the buffers.  This interface
2047 * is only safely usable by callers that can track I/O completion by higher
2048 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
2049 * function.
2050 *
2051 * Note: this function will skip buffers it would block on, and in doing so
2052 * leaves them on @buffer_list so they can be retried on a later pass. As such,
2053 * it is up to the caller to ensure that the buffer list is fully submitted or
2054 * cancelled appropriately when it is finished with the list. Failure to
2055 * cancel or resubmit the list until it is empty will result in leaked buffers
2056 * at unmount time.
2057 */
2058int
2059xfs_buf_delwri_submit_nowait(
2060        struct list_head        *buffer_list)
2061{
2062        return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
2063}
2064
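/*
 * Illustrative sketch (not part of xfs_buf.c): the sort of retry loop the
 * comment above expects from callers of the nowait variant. The shutdown
 * check and the delay between passes are assumptions made for the example;
 * real users such as AIL pushing have their own list handling and back-off.
 */
#if 0	/* example only, not compiled */
static void
example_push_until_empty(
        struct xfs_mount        *mp,
        struct list_head        *buffer_list)
{
        while (!list_empty(buffer_list)) {
                if (XFS_FORCED_SHUTDOWN(mp)) {
                        /* drop the delwri queue references on shutdown */
                        xfs_buf_delwri_cancel(buffer_list);
                        return;
                }
                /* skipped (locked or pinned) buffers stay on the list */
                xfs_buf_delwri_submit_nowait(buffer_list);
                if (!list_empty(buffer_list))
                        delay(100);
        }
}
#endif
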
2065/*
2066 * Write out a buffer list synchronously.
2067 *
2068 * This will take the @buffer_list, write all buffers out and wait for I/O
2069 * completion on all of the buffers. @buffer_list is consumed by the function,
2070 * so callers must have some other way of tracking buffers if they require such
2071 * functionality.
2072 */
2073int
2074xfs_buf_delwri_submit(
2075        struct list_head        *buffer_list)
2076{
2077        LIST_HEAD               (wait_list);
2078        int                     error = 0, error2;
2079        struct xfs_buf          *bp;
2080
2081        xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);
2082
2083        /* Wait for IO to complete. */
2084        while (!list_empty(&wait_list)) {
2085                bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
2086
2087                list_del_init(&bp->b_list);
2088
2089                /*
2090                 * Wait on the locked buffer, check for errors and unlock and
2091                 * release the delwri queue reference.
2092                 */
2093                error2 = xfs_buf_iowait(bp);
2094                xfs_buf_relse(bp);
2095                if (!error)
2096                        error = error2;
2097        }
2098
2099        return error;
2100}
2101
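/*
 * Illustrative sketch (not part of xfs_buf.c): the basic thread-local
 * queue-then-submit pattern described above. The helper is hypothetical;
 * real callers typically queue many buffers before submitting the list.
 */
#if 0	/* example only, not compiled */
static int
example_delwri_write_one(
        struct xfs_buf  *bp)
{
        LIST_HEAD       (buffer_list);
        bool            queued;

        /* queueing requires the buffer lock and takes its own hold */
        xfs_buf_lock(bp);
        queued = xfs_buf_delwri_queue(bp, &buffer_list);
        xfs_buf_unlock(bp);

        /* false means another list already owns the delwri reference */
        if (!queued)
                return 0;

        /* write everything on the local list and wait for the I/O */
        return xfs_buf_delwri_submit(&buffer_list);
}
#endif
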
2102/*
2103 * Push a single buffer on a delwri queue.
2104 *
2105 * The purpose of this function is to submit a single buffer from a delwri queue
2106 * and return with the buffer still on the original queue. The waiting delwri
2107 * buffer submission infrastructure guarantees transfer of the delwri queue
2108 * buffer reference to a temporary wait list. We reuse this infrastructure to
2109 * transfer the buffer back to the original queue.
2110 *
2111 * Note the buffer transitions from the queued state, to the submitted and wait
2112 * listed state and back to the queued state during this call. The buffer
2113 * locking and queue management logic between _delwri_pushbuf() and
2114 * _delwri_queue() guarantee that the buffer cannot be queued to another list
2115 * before returning.
2116 */
2117int
2118xfs_buf_delwri_pushbuf(
2119        struct xfs_buf          *bp,
2120        struct list_head        *buffer_list)
2121{
2122        LIST_HEAD               (submit_list);
2123        int                     error;
2124
2125        ASSERT(bp->b_flags & _XBF_DELWRI_Q);
2126
2127        trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);
2128
2129        /*
2130         * Isolate the buffer to a new local list so we can submit it for I/O
2131         * independently from the rest of the original list.
2132         */
2133        xfs_buf_lock(bp);
2134        list_move(&bp->b_list, &submit_list);
2135        xfs_buf_unlock(bp);
2136
2137        /*
2138         * Delwri submission clears the DELWRI_Q buffer flag and returns with
2139         * the buffer on the wait list with the original reference. Rather than
2140         * bounce the buffer from a local wait list back to the original list
2141         * after I/O completion, reuse the original list as the wait list.
2142         */
2143        xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);
2144
2145        /*
2146         * The buffer is now locked, under I/O and wait listed on the original
2147         * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
2148         * return with the buffer unlocked and on the original queue.
2149         */
2150        error = xfs_buf_iowait(bp);
2151        bp->b_flags |= _XBF_DELWRI_Q;
2152        xfs_buf_unlock(bp);
2153
2154        return error;
2155}
2156
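/*
 * Illustrative sketch (not part of xfs_buf.c): forcing one buffer to disk
 * without losing its place on a caller-owned delwri queue, roughly the
 * situation quotacheck is in. The helper is hypothetical; the comment marks
 * the postcondition guaranteed by xfs_buf_delwri_pushbuf().
 */
#if 0	/* example only, not compiled */
static int
example_flush_one(
        struct xfs_buf          *bp,
        struct list_head        *buffer_list)
{
        int     error;

        error = xfs_buf_delwri_pushbuf(bp, buffer_list);
        /* bp is unlocked and still queued on @buffer_list here */
        return error;
}
#endif
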
2157int __init
2158xfs_buf_init(void)
2159{
2160        xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0,
2161                                         SLAB_HWCACHE_ALIGN |
2162                                         SLAB_RECLAIM_ACCOUNT |
2163                                         SLAB_MEM_SPREAD,
2164                                         NULL);
2165        if (!xfs_buf_zone)
2166                goto out;
2167
2168        return 0;
2169
2170 out:
2171        return -ENOMEM;
2172}
2173
2174void
2175xfs_buf_terminate(void)
2176{
2177        kmem_cache_destroy(xfs_buf_zone);
2178}
2179
2180void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
2181{
2182        /*
2183         * Set the lru reference count to 0 based on the error injection tag.
2184         * This allows userspace to disrupt buffer caching for debug/testing
2185         * purposes.
2186         */
2187        if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
2188                lru_ref = 0;
2189
2190        atomic_set(&bp->b_lru_ref, lru_ref);
2191}
2192
2193/*
2194 * Verify an on-disk magic value against the magic value specified in the
2195 * verifier structure. The verifier magic is in disk byte order so the caller is
2196 * expected to pass the value directly from disk.
2197 */
2198bool
2199xfs_verify_magic(
2200        struct xfs_buf          *bp,
2201        __be32                  dmagic)
2202{
2203        struct xfs_mount        *mp = bp->b_mount;
2204        int                     idx;
2205
2206        idx = xfs_sb_version_hascrc(&mp->m_sb);
2207        if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
2208                return false;
2209        return dmagic == bp->b_ops->magic[idx];
2210}
2211/*
2212 * Verify an on-disk magic value against the magic value specified in the
2213 * verifier structure. The verifier magic is in disk byte order so the caller is
2214 * expected to pass the value directly from disk.
2215 */
2216bool
2217xfs_verify_magic16(
2218        struct xfs_buf          *bp,
2219        __be16                  dmagic)
2220{
2221        struct xfs_mount        *mp = bp->b_mount;
2222        int                     idx;
2223
2224        idx = xfs_sb_version_hascrc(&mp->m_sb);
2225        if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
2226                return false;
2227        return dmagic == bp->b_ops->magic16[idx];
2228}
2229
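/*
 * Illustrative sketch (not part of xfs_buf.c): how a read verifier might use
 * xfs_verify_magic(). The header layout, magic numbers and ops table are
 * simplified assumptions; real verifiers live next to their on-disk format
 * code and also check CRCs, owners, block numbers and so on. A 16-bit magic
 * field would use .magic16 and xfs_verify_magic16() in the same way.
 */
#if 0	/* example only, not compiled */
struct example_hdr {
        __be32                  magic;          /* hypothetical on-disk layout */
};

static void
example_read_verify(
        struct xfs_buf          *bp)
{
        struct example_hdr      *hdr = bp->b_addr;

        /* hdr->magic comes straight from disk, in big-endian byte order */
        if (!xfs_verify_magic(bp, hdr->magic))
                xfs_verifier_error(bp, -EFSCORRUPTED, __this_address);
}

static const struct xfs_buf_ops example_buf_ops = {
        .name           = "example",
        /* index 0: pre-CRC (V4) magic, index 1: CRC-enabled (V5) magic */
        .magic          = { cpu_to_be32(0x12345678), cpu_to_be32(0x87654321) },
        .verify_read    = example_read_verify,
};
#endif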