linux/fs/xfs/xfs_buf.c
   1/*
   2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include <linux/stddef.h>
  20#include <linux/errno.h>
  21#include <linux/gfp.h>
  22#include <linux/pagemap.h>
  23#include <linux/init.h>
  24#include <linux/vmalloc.h>
  25#include <linux/bio.h>
  26#include <linux/sysctl.h>
  27#include <linux/proc_fs.h>
  28#include <linux/workqueue.h>
  29#include <linux/percpu.h>
  30#include <linux/blkdev.h>
  31#include <linux/hash.h>
  32#include <linux/kthread.h>
  33#include <linux/migrate.h>
  34#include <linux/backing-dev.h>
  35#include <linux/freezer.h>
  36
  37#include "xfs_sb.h"
  38#include "xfs_inum.h"
  39#include "xfs_log.h"
  40#include "xfs_ag.h"
  41#include "xfs_mount.h"
  42#include "xfs_trace.h"
  43
  44static kmem_zone_t *xfs_buf_zone;
  45STATIC int xfsbufd(void *);
  46
  47static struct workqueue_struct *xfslogd_workqueue;
  48
  49#ifdef XFS_BUF_LOCK_TRACKING
  50# define XB_SET_OWNER(bp)       ((bp)->b_last_holder = current->pid)
  51# define XB_CLEAR_OWNER(bp)     ((bp)->b_last_holder = -1)
  52# define XB_GET_OWNER(bp)       ((bp)->b_last_holder)
  53#else
  54# define XB_SET_OWNER(bp)       do { } while (0)
  55# define XB_CLEAR_OWNER(bp)     do { } while (0)
  56# define XB_GET_OWNER(bp)       do { } while (0)
  57#endif
  58
  59#define xb_to_gfp(flags) \
  60        ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
  61          ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
  62
  63#define xb_to_km(flags) \
  64         (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
  65
  66
  67static inline int
  68xfs_buf_is_vmapped(
  69        struct xfs_buf  *bp)
  70{
  71        /*
  72         * Return true if the buffer is vmapped.
  73         *
  74         * The XBF_MAPPED flag is set if the buffer should be mapped, but the
  75         * code is clever enough to know it doesn't have to map a single page,
  76         * so the check has to be both for XBF_MAPPED and bp->b_page_count > 1.
  77         */
  78        return (bp->b_flags & XBF_MAPPED) && bp->b_page_count > 1;
  79}
  80
  81static inline int
  82xfs_buf_vmap_len(
  83        struct xfs_buf  *bp)
  84{
  85        return (bp->b_page_count * PAGE_SIZE) - bp->b_offset;
  86}
  87
  88/*
  89 * xfs_buf_lru_add - add a buffer to the LRU.
  90 *
  91 * The LRU takes a new reference to the buffer so that it will only be freed
  92 * once the shrinker takes the buffer off the LRU.
  93 */
  94STATIC void
  95xfs_buf_lru_add(
  96        struct xfs_buf  *bp)
  97{
  98        struct xfs_buftarg *btp = bp->b_target;
  99
 100        spin_lock(&btp->bt_lru_lock);
 101        if (list_empty(&bp->b_lru)) {
 102                atomic_inc(&bp->b_hold);
 103                list_add_tail(&bp->b_lru, &btp->bt_lru);
 104                btp->bt_lru_nr++;
 105        }
 106        spin_unlock(&btp->bt_lru_lock);
 107}
 108
 109/*
 110 * xfs_buf_lru_del - remove a buffer from the LRU
 111 *
  112 * The unlocked check is safe here because it only occurs when there are no
  113 * b_lru_ref counts left on the buffer under the pag->pag_buf_lock. It is
  114 * there to optimise the shrinker removing the buffer from the LRU and
  115 * calling xfs_buf_free(), i.e. it removes an unnecessary round trip on the
  116 * bt_lru_lock.
 117 */
 118STATIC void
 119xfs_buf_lru_del(
 120        struct xfs_buf  *bp)
 121{
 122        struct xfs_buftarg *btp = bp->b_target;
 123
 124        if (list_empty(&bp->b_lru))
 125                return;
 126
 127        spin_lock(&btp->bt_lru_lock);
 128        if (!list_empty(&bp->b_lru)) {
 129                list_del_init(&bp->b_lru);
 130                btp->bt_lru_nr--;
 131        }
 132        spin_unlock(&btp->bt_lru_lock);
 133}
 134
 135/*
 136 * When we mark a buffer stale, we remove the buffer from the LRU and clear the
 137 * b_lru_ref count so that the buffer is freed immediately when the buffer
 138 * reference count falls to zero. If the buffer is already on the LRU, we need
 139 * to remove the reference that LRU holds on the buffer.
 140 *
 141 * This prevents build-up of stale buffers on the LRU.
 142 */
 143void
 144xfs_buf_stale(
 145        struct xfs_buf  *bp)
 146{
 147        bp->b_flags |= XBF_STALE;
 148        xfs_buf_delwri_dequeue(bp);
 149        atomic_set(&(bp)->b_lru_ref, 0);
 150        if (!list_empty(&bp->b_lru)) {
 151                struct xfs_buftarg *btp = bp->b_target;
 152
 153                spin_lock(&btp->bt_lru_lock);
 154                if (!list_empty(&bp->b_lru)) {
 155                        list_del_init(&bp->b_lru);
 156                        btp->bt_lru_nr--;
 157                        atomic_dec(&bp->b_hold);
 158                }
 159                spin_unlock(&btp->bt_lru_lock);
 160        }
 161        ASSERT(atomic_read(&bp->b_hold) >= 1);
 162}
 163
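     /*
      * Allocate a struct xfs_buf from the buffer zone and initialise its
      * bookkeeping for the given target and disk range.  No data pages are
      * allocated here (that is done by xfs_buf_allocate_memory()); the buffer
      * is returned with a single hold reference and with its semaphore held,
      * i.e. locked by the caller.
      */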
 164struct xfs_buf *
 165xfs_buf_alloc(
 166        struct xfs_buftarg      *target,
 167        xfs_off_t               range_base,
 168        size_t                  range_length,
 169        xfs_buf_flags_t         flags)
 170{
 171        struct xfs_buf          *bp;
 172
 173        bp = kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags));
 174        if (unlikely(!bp))
 175                return NULL;
 176
 177        /*
 178         * We don't want certain flags to appear in b_flags.
 179         */
 180        flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);
 181
 182        memset(bp, 0, sizeof(xfs_buf_t));
 183        atomic_set(&bp->b_hold, 1);
 184        atomic_set(&bp->b_lru_ref, 1);
 185        init_completion(&bp->b_iowait);
 186        INIT_LIST_HEAD(&bp->b_lru);
 187        INIT_LIST_HEAD(&bp->b_list);
 188        RB_CLEAR_NODE(&bp->b_rbnode);
 189        sema_init(&bp->b_sema, 0); /* held, no waiters */
 190        XB_SET_OWNER(bp);
 191        bp->b_target = target;
 192        bp->b_file_offset = range_base;
 193        /*
 194         * Set buffer_length and count_desired to the same value initially.
 195         * I/O routines should use count_desired, which will be the same in
 196         * most cases but may be reset (e.g. XFS recovery).
 197         */
 198        bp->b_buffer_length = bp->b_count_desired = range_length;
 199        bp->b_flags = flags;
 200        bp->b_bn = XFS_BUF_DADDR_NULL;
 201        atomic_set(&bp->b_pin_count, 0);
 202        init_waitqueue_head(&bp->b_waiters);
 203
 204        XFS_STATS_INC(xb_create);
 205        trace_xfs_buf_init(bp, _RET_IP_);
 206
 207        return bp;
 208}
 209
 210/*
 211 *      Allocate a page array capable of holding a specified number
 212 *      of pages, and point the page buf at it.
 213 */
 214STATIC int
 215_xfs_buf_get_pages(
 216        xfs_buf_t               *bp,
 217        int                     page_count,
 218        xfs_buf_flags_t         flags)
 219{
 220        /* Make sure that we have a page list */
 221        if (bp->b_pages == NULL) {
 222                bp->b_offset = xfs_buf_poff(bp->b_file_offset);
 223                bp->b_page_count = page_count;
 224                if (page_count <= XB_PAGES) {
 225                        bp->b_pages = bp->b_page_array;
 226                } else {
 227                        bp->b_pages = kmem_alloc(sizeof(struct page *) *
 228                                        page_count, xb_to_km(flags));
 229                        if (bp->b_pages == NULL)
 230                                return -ENOMEM;
 231                }
 232                memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
 233        }
 234        return 0;
 235}
 236
 237/*
 238 *      Frees b_pages if it was allocated.
 239 */
 240STATIC void
 241_xfs_buf_free_pages(
 242        xfs_buf_t       *bp)
 243{
 244        if (bp->b_pages != bp->b_page_array) {
 245                kmem_free(bp->b_pages);
 246                bp->b_pages = NULL;
 247        }
 248}
 249
 250/*
 251 *      Releases the specified buffer.
 252 *
 253 *      The modification state of any associated pages is left unchanged.
  254 *      The buffer must not be on any hash - use xfs_buf_rele instead for
  255 *      hashed and refcounted buffers.
 256 */
 257void
 258xfs_buf_free(
 259        xfs_buf_t               *bp)
 260{
 261        trace_xfs_buf_free(bp, _RET_IP_);
 262
 263        ASSERT(list_empty(&bp->b_lru));
 264
 265        if (bp->b_flags & _XBF_PAGES) {
 266                uint            i;
 267
 268                if (xfs_buf_is_vmapped(bp))
 269                        vm_unmap_ram(bp->b_addr - bp->b_offset,
 270                                        bp->b_page_count);
 271
 272                for (i = 0; i < bp->b_page_count; i++) {
 273                        struct page     *page = bp->b_pages[i];
 274
 275                        __free_page(page);
 276                }
 277        } else if (bp->b_flags & _XBF_KMEM)
 278                kmem_free(bp->b_addr);
 279        _xfs_buf_free_pages(bp);
 280        kmem_zone_free(xfs_buf_zone, bp);
 281}
 282
 283/*
  284 * Allocates all the pages for the buffer in question and builds its page list.
 285 */
 286STATIC int
 287xfs_buf_allocate_memory(
 288        xfs_buf_t               *bp,
 289        uint                    flags)
 290{
 291        size_t                  size = bp->b_count_desired;
 292        size_t                  nbytes, offset;
 293        gfp_t                   gfp_mask = xb_to_gfp(flags);
 294        unsigned short          page_count, i;
 295        xfs_off_t               end;
 296        int                     error;
 297
 298        /*
 299         * for buffers that are contained within a single page, just allocate
 300         * the memory from the heap - there's no need for the complexity of
 301         * page arrays to keep allocation down to order 0.
 302         */
 303        if (bp->b_buffer_length < PAGE_SIZE) {
 304                bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags));
 305                if (!bp->b_addr) {
 306                        /* low memory - use alloc_page loop instead */
 307                        goto use_alloc_page;
 308                }
 309
 310                if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) &
 311                                                                PAGE_MASK) !=
 312                    ((unsigned long)bp->b_addr & PAGE_MASK)) {
 313                        /* b_addr spans two pages - use alloc_page instead */
 314                        kmem_free(bp->b_addr);
 315                        bp->b_addr = NULL;
 316                        goto use_alloc_page;
 317                }
 318                bp->b_offset = offset_in_page(bp->b_addr);
 319                bp->b_pages = bp->b_page_array;
 320                bp->b_pages[0] = virt_to_page(bp->b_addr);
 321                bp->b_page_count = 1;
 322                bp->b_flags |= XBF_MAPPED | _XBF_KMEM;
 323                return 0;
 324        }
 325
 326use_alloc_page:
 327        end = bp->b_file_offset + bp->b_buffer_length;
 328        page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
 329        error = _xfs_buf_get_pages(bp, page_count, flags);
 330        if (unlikely(error))
 331                return error;
 332
 333        offset = bp->b_offset;
 334        bp->b_flags |= _XBF_PAGES;
 335
 336        for (i = 0; i < bp->b_page_count; i++) {
 337                struct page     *page;
 338                uint            retries = 0;
 339retry:
 340                page = alloc_page(gfp_mask);
 341                if (unlikely(page == NULL)) {
 342                        if (flags & XBF_READ_AHEAD) {
 343                                bp->b_page_count = i;
 344                                error = ENOMEM;
 345                                goto out_free_pages;
 346                        }
 347
 348                        /*
 349                         * This could deadlock.
 350                         *
 351                         * But until all the XFS lowlevel code is revamped to
 352                         * handle buffer allocation failures we can't do much.
 353                         */
 354                        if (!(++retries % 100))
 355                                xfs_err(NULL,
 356                "possible memory allocation deadlock in %s (mode:0x%x)",
 357                                        __func__, gfp_mask);
 358
 359                        XFS_STATS_INC(xb_page_retries);
 360                        congestion_wait(BLK_RW_ASYNC, HZ/50);
 361                        goto retry;
 362                }
 363
 364                XFS_STATS_INC(xb_page_found);
 365
 366                nbytes = min_t(size_t, size, PAGE_SIZE - offset);
 367                size -= nbytes;
 368                bp->b_pages[i] = page;
 369                offset = 0;
 370        }
 371        return 0;
 372
 373out_free_pages:
 374        for (i = 0; i < bp->b_page_count; i++)
 375                __free_page(bp->b_pages[i]);
 376        return error;
 377}
 378
 379/*
 380 *      Map buffer into kernel address-space if necessary.
 381 */
 382STATIC int
 383_xfs_buf_map_pages(
 384        xfs_buf_t               *bp,
 385        uint                    flags)
 386{
 387        ASSERT(bp->b_flags & _XBF_PAGES);
 388        if (bp->b_page_count == 1) {
 389                /* A single page buffer is always mappable */
 390                bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 391                bp->b_flags |= XBF_MAPPED;
 392        } else if (flags & XBF_MAPPED) {
 393                int retried = 0;
 394
 395                do {
 396                        bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
 397                                                -1, PAGE_KERNEL);
 398                        if (bp->b_addr)
 399                                break;
 400                        vm_unmap_aliases();
 401                } while (retried++ <= 1);
 402
 403                if (!bp->b_addr)
 404                        return -ENOMEM;
 405                bp->b_addr += bp->b_offset;
 406                bp->b_flags |= XBF_MAPPED;
 407        }
 408
 409        return 0;
 410}
 411
 412/*
 413 *      Finding and Reading Buffers
 414 */
 415
 416/*
  417 *      Looks up, and creates if absent, a lockable buffer for
  418 *      a given range of the block device target.  The buffer is
  419 *      returned locked.  No I/O is implied by this call.
 420 */
 421xfs_buf_t *
 422_xfs_buf_find(
 423        xfs_buftarg_t           *btp,   /* block device target          */
 424        xfs_off_t               ioff,   /* starting offset of range     */
 425        size_t                  isize,  /* length of range              */
 426        xfs_buf_flags_t         flags,
 427        xfs_buf_t               *new_bp)
 428{
 429        xfs_off_t               range_base;
 430        size_t                  range_length;
 431        struct xfs_perag        *pag;
 432        struct rb_node          **rbp;
 433        struct rb_node          *parent;
 434        xfs_buf_t               *bp;
 435
 436        range_base = (ioff << BBSHIFT);
 437        range_length = (isize << BBSHIFT);
 438
 439        /* Check for IOs smaller than the sector size / not sector aligned */
 440        ASSERT(!(range_length < (1 << btp->bt_sshift)));
 441        ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
 442
 443        /* get tree root */
 444        pag = xfs_perag_get(btp->bt_mount,
 445                                xfs_daddr_to_agno(btp->bt_mount, ioff));
 446
 447        /* walk tree */
 448        spin_lock(&pag->pag_buf_lock);
 449        rbp = &pag->pag_buf_tree.rb_node;
 450        parent = NULL;
 451        bp = NULL;
 452        while (*rbp) {
 453                parent = *rbp;
 454                bp = rb_entry(parent, struct xfs_buf, b_rbnode);
 455
 456                if (range_base < bp->b_file_offset)
 457                        rbp = &(*rbp)->rb_left;
 458                else if (range_base > bp->b_file_offset)
 459                        rbp = &(*rbp)->rb_right;
 460                else {
 461                        /*
 462                         * found a block offset match. If the range doesn't
 463                         * match, the only way this is allowed is if the buffer
 464                         * in the cache is stale and the transaction that made
 465                         * it stale has not yet committed. i.e. we are
 466                         * reallocating a busy extent. Skip this buffer and
 467                         * continue searching to the right for an exact match.
 468                         */
 469                        if (bp->b_buffer_length != range_length) {
 470                                ASSERT(bp->b_flags & XBF_STALE);
 471                                rbp = &(*rbp)->rb_right;
 472                                continue;
 473                        }
 474                        atomic_inc(&bp->b_hold);
 475                        goto found;
 476                }
 477        }
 478
 479        /* No match found */
 480        if (new_bp) {
 481                rb_link_node(&new_bp->b_rbnode, parent, rbp);
 482                rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
 483                /* the buffer keeps the perag reference until it is freed */
 484                new_bp->b_pag = pag;
 485                spin_unlock(&pag->pag_buf_lock);
 486        } else {
 487                XFS_STATS_INC(xb_miss_locked);
 488                spin_unlock(&pag->pag_buf_lock);
 489                xfs_perag_put(pag);
 490        }
 491        return new_bp;
 492
 493found:
 494        spin_unlock(&pag->pag_buf_lock);
 495        xfs_perag_put(pag);
 496
 497        if (!xfs_buf_trylock(bp)) {
 498                if (flags & XBF_TRYLOCK) {
 499                        xfs_buf_rele(bp);
 500                        XFS_STATS_INC(xb_busy_locked);
 501                        return NULL;
 502                }
 503                xfs_buf_lock(bp);
 504                XFS_STATS_INC(xb_get_locked_waited);
 505        }
 506
 507        /*
 508         * if the buffer is stale, clear all the external state associated with
 509         * it. We need to keep flags such as how we allocated the buffer memory
 510         * intact here.
 511         */
 512        if (bp->b_flags & XBF_STALE) {
 513                ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
 514                bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES;
 515        }
 516
 517        trace_xfs_buf_find(bp, flags, _RET_IP_);
 518        XFS_STATS_INC(xb_get_locked);
 519        return bp;
 520}
 521
 522/*
 523 * Assembles a buffer covering the specified range. The code is optimised for
 524 * cache hits, as metadata intensive workloads will see 3 orders of magnitude
 525 * more hits than misses.
 526 */
 527struct xfs_buf *
 528xfs_buf_get(
 529        xfs_buftarg_t           *target,/* target for buffer            */
 530        xfs_off_t               ioff,   /* starting offset of range     */
 531        size_t                  isize,  /* length of range              */
 532        xfs_buf_flags_t         flags)
 533{
 534        struct xfs_buf          *bp;
 535        struct xfs_buf          *new_bp;
 536        int                     error = 0;
 537
 538        bp = _xfs_buf_find(target, ioff, isize, flags, NULL);
 539        if (likely(bp))
 540                goto found;
 541
 542        new_bp = xfs_buf_alloc(target, ioff << BBSHIFT, isize << BBSHIFT,
 543                               flags);
 544        if (unlikely(!new_bp))
 545                return NULL;
 546
 547        bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
 548        if (!bp) {
 549                kmem_zone_free(xfs_buf_zone, new_bp);
 550                return NULL;
 551        }
 552
 553        if (bp == new_bp) {
 554                error = xfs_buf_allocate_memory(bp, flags);
 555                if (error)
 556                        goto no_buffer;
 557        } else
 558                kmem_zone_free(xfs_buf_zone, new_bp);
 559
 560        /*
 561         * Now we have a workable buffer, fill in the block number so
 562         * that we can do IO on it.
 563         */
 564        bp->b_bn = ioff;
 565        bp->b_count_desired = bp->b_buffer_length;
 566
 567found:
 568        if (!(bp->b_flags & XBF_MAPPED)) {
 569                error = _xfs_buf_map_pages(bp, flags);
 570                if (unlikely(error)) {
 571                        xfs_warn(target->bt_mount,
 572                                "%s: failed to map pages\n", __func__);
 573                        goto no_buffer;
 574                }
 575        }
 576
 577        XFS_STATS_INC(xb_get);
 578        trace_xfs_buf_get(bp, flags, _RET_IP_);
 579        return bp;
 580
 581no_buffer:
 582        if (flags & (XBF_LOCK | XBF_TRYLOCK))
 583                xfs_buf_unlock(bp);
 584        xfs_buf_rele(bp);
 585        return NULL;
 586}
 587
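     /*
      * Issue the read I/O for a buffer obtained via xfs_buf_get().  The read
      * related flags are transferred onto the buffer and the request is
      * submitted; unless the caller asked for an asynchronous read we wait
      * here for completion and return the I/O error, if any.
      */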
 588STATIC int
 589_xfs_buf_read(
 590        xfs_buf_t               *bp,
 591        xfs_buf_flags_t         flags)
 592{
 593        int                     status;
 594
 595        ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
 596        ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
 597
 598        bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD);
 599        bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
 600
 601        status = xfs_buf_iorequest(bp);
 602        if (status || bp->b_error || (flags & XBF_ASYNC))
 603                return status;
 604        return xfs_buf_iowait(bp);
 605}
 606
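     /*
      * Get a buffer for the given range and issue a read for it if the cached
      * copy is not already up to date.  A rough sketch of a typical caller is
      * shown below; the mount pointer and the blkno/numblks variables are
      * purely illustrative and not defined in this file:
      *
      *     bp = xfs_buf_read(mp->m_ddev_targp, blkno, numblks,
      *                       XBF_LOCK | XBF_MAPPED);
      *     if (bp) {
      *             ... examine bp->b_addr ...
      *             xfs_buf_relse(bp);
      *     }
      */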
 607xfs_buf_t *
 608xfs_buf_read(
 609        xfs_buftarg_t           *target,
 610        xfs_off_t               ioff,
 611        size_t                  isize,
 612        xfs_buf_flags_t         flags)
 613{
 614        xfs_buf_t               *bp;
 615
 616        flags |= XBF_READ;
 617
 618        bp = xfs_buf_get(target, ioff, isize, flags);
 619        if (bp) {
 620                trace_xfs_buf_read(bp, flags, _RET_IP_);
 621
 622                if (!XFS_BUF_ISDONE(bp)) {
 623                        XFS_STATS_INC(xb_get_read);
 624                        _xfs_buf_read(bp, flags);
 625                } else if (flags & XBF_ASYNC) {
 626                        /*
 627                         * Read ahead call which is already satisfied,
 628                         * drop the buffer
 629                         */
 630                        goto no_buffer;
 631                } else {
 632                        /* We do not want read in the flags */
 633                        bp->b_flags &= ~XBF_READ;
 634                }
 635        }
 636
 637        return bp;
 638
 639 no_buffer:
 640        if (flags & (XBF_LOCK | XBF_TRYLOCK))
 641                xfs_buf_unlock(bp);
 642        xfs_buf_rele(bp);
 643        return NULL;
 644}
 645
 646/*
 647 *      If we are not low on memory then do the readahead in a deadlock
 648 *      safe manner.
 649 */
 650void
 651xfs_buf_readahead(
 652        xfs_buftarg_t           *target,
 653        xfs_off_t               ioff,
 654        size_t                  isize)
 655{
 656        if (bdi_read_congested(target->bt_bdi))
 657                return;
 658
 659        xfs_buf_read(target, ioff, isize,
 660                     XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK);
 661}
 662
 663/*
 664 * Read an uncached buffer from disk. Allocates and returns a locked
 665 * buffer containing the disk contents or nothing.
 666 */
 667struct xfs_buf *
 668xfs_buf_read_uncached(
 669        struct xfs_mount        *mp,
 670        struct xfs_buftarg      *target,
 671        xfs_daddr_t             daddr,
 672        size_t                  length,
 673        int                     flags)
 674{
 675        xfs_buf_t               *bp;
 676        int                     error;
 677
 678        bp = xfs_buf_get_uncached(target, length, flags);
 679        if (!bp)
 680                return NULL;
 681
 682        /* set up the buffer for a read IO */
 683        XFS_BUF_SET_ADDR(bp, daddr);
 684        XFS_BUF_READ(bp);
 685
 686        xfsbdstrat(mp, bp);
 687        error = xfs_buf_iowait(bp);
 688        if (error || bp->b_error) {
 689                xfs_buf_relse(bp);
 690                return NULL;
 691        }
 692        return bp;
 693}
 694
 695/*
  696 * Return a buffer that was allocated as an empty buffer and associated with
  697 * external memory via xfs_buf_associate_memory() back to its empty state.
 698 */
 699void
 700xfs_buf_set_empty(
 701        struct xfs_buf          *bp,
 702        size_t                  len)
 703{
 704        if (bp->b_pages)
 705                _xfs_buf_free_pages(bp);
 706
 707        bp->b_pages = NULL;
 708        bp->b_page_count = 0;
 709        bp->b_addr = NULL;
 710        bp->b_file_offset = 0;
 711        bp->b_buffer_length = bp->b_count_desired = len;
 712        bp->b_bn = XFS_BUF_DADDR_NULL;
 713        bp->b_flags &= ~XBF_MAPPED;
 714}
 715
 716static inline struct page *
 717mem_to_page(
 718        void                    *addr)
 719{
 720        if ((!is_vmalloc_addr(addr))) {
 721                return virt_to_page(addr);
 722        } else {
 723                return vmalloc_to_page(addr);
 724        }
 725}
 726
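     /*
      * Associate caller-supplied memory with a buffer: any previous page list
      * is freed and a new one is built covering the memory so that it can be
      * used for I/O.  The memory itself is not owned by the buffer and is not
      * freed along with it.
      */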
 727int
 728xfs_buf_associate_memory(
 729        xfs_buf_t               *bp,
 730        void                    *mem,
 731        size_t                  len)
 732{
 733        int                     rval;
 734        int                     i = 0;
 735        unsigned long           pageaddr;
 736        unsigned long           offset;
 737        size_t                  buflen;
 738        int                     page_count;
 739
 740        pageaddr = (unsigned long)mem & PAGE_MASK;
 741        offset = (unsigned long)mem - pageaddr;
 742        buflen = PAGE_ALIGN(len + offset);
 743        page_count = buflen >> PAGE_SHIFT;
 744
 745        /* Free any previous set of page pointers */
 746        if (bp->b_pages)
 747                _xfs_buf_free_pages(bp);
 748
 749        bp->b_pages = NULL;
 750        bp->b_addr = mem;
 751
 752        rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
 753        if (rval)
 754                return rval;
 755
 756        bp->b_offset = offset;
 757
 758        for (i = 0; i < bp->b_page_count; i++) {
 759                bp->b_pages[i] = mem_to_page((void *)pageaddr);
 760                pageaddr += PAGE_SIZE;
 761        }
 762
 763        bp->b_count_desired = len;
 764        bp->b_buffer_length = buflen;
 765        bp->b_flags |= XBF_MAPPED;
 766
 767        return 0;
 768}
 769
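     /*
      * Allocate a fully set up, mapped buffer that is not inserted into the
      * per-AG buffer cache.  The caller is responsible for setting the disk
      * address and issuing any I/O itself, as xfs_buf_read_uncached() above
      * does.
      */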
 770xfs_buf_t *
 771xfs_buf_get_uncached(
 772        struct xfs_buftarg      *target,
 773        size_t                  len,
 774        int                     flags)
 775{
 776        unsigned long           page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
 777        int                     error, i;
 778        xfs_buf_t               *bp;
 779
 780        bp = xfs_buf_alloc(target, 0, len, 0);
 781        if (unlikely(bp == NULL))
 782                goto fail;
 783
 784        error = _xfs_buf_get_pages(bp, page_count, 0);
 785        if (error)
 786                goto fail_free_buf;
 787
 788        for (i = 0; i < page_count; i++) {
 789                bp->b_pages[i] = alloc_page(xb_to_gfp(flags));
 790                if (!bp->b_pages[i])
 791                        goto fail_free_mem;
 792        }
 793        bp->b_flags |= _XBF_PAGES;
 794
 795        error = _xfs_buf_map_pages(bp, XBF_MAPPED);
 796        if (unlikely(error)) {
 797                xfs_warn(target->bt_mount,
 798                        "%s: failed to map pages\n", __func__);
 799                goto fail_free_mem;
 800        }
 801
 802        trace_xfs_buf_get_uncached(bp, _RET_IP_);
 803        return bp;
 804
 805 fail_free_mem:
 806        while (--i >= 0)
 807                __free_page(bp->b_pages[i]);
 808        _xfs_buf_free_pages(bp);
 809 fail_free_buf:
 810        kmem_zone_free(xfs_buf_zone, bp);
 811 fail:
 812        return NULL;
 813}
 814
 815/*
 816 *      Increment reference count on buffer, to hold the buffer concurrently
 817 *      with another thread which may release (free) the buffer asynchronously.
 818 *      Must hold the buffer already to call this function.
 819 */
 820void
 821xfs_buf_hold(
 822        xfs_buf_t               *bp)
 823{
 824        trace_xfs_buf_hold(bp, _RET_IP_);
 825        atomic_inc(&bp->b_hold);
 826}
 827
 828/*
 829 *      Releases a hold on the specified buffer.  If the
  830 *      hold count is 1, calls xfs_buf_free.
 831 */
 832void
 833xfs_buf_rele(
 834        xfs_buf_t               *bp)
 835{
 836        struct xfs_perag        *pag = bp->b_pag;
 837
 838        trace_xfs_buf_rele(bp, _RET_IP_);
 839
 840        if (!pag) {
 841                ASSERT(list_empty(&bp->b_lru));
 842                ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
 843                if (atomic_dec_and_test(&bp->b_hold))
 844                        xfs_buf_free(bp);
 845                return;
 846        }
 847
 848        ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
 849
 850        ASSERT(atomic_read(&bp->b_hold) > 0);
 851        if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
 852                if (!(bp->b_flags & XBF_STALE) &&
 853                           atomic_read(&bp->b_lru_ref)) {
 854                        xfs_buf_lru_add(bp);
 855                        spin_unlock(&pag->pag_buf_lock);
 856                } else {
 857                        xfs_buf_lru_del(bp);
 858                        ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
 859                        rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
 860                        spin_unlock(&pag->pag_buf_lock);
 861                        xfs_perag_put(pag);
 862                        xfs_buf_free(bp);
 863                }
 864        }
 865}
 866
 867
 868/*
 869 *      Lock a buffer object, if it is not already locked.
 870 *
 871 *      If we come across a stale, pinned, locked buffer, we know that we are
 872 *      being asked to lock a buffer that has been reallocated. Because it is
 873 *      pinned, we know that the log has not been pushed to disk and hence it
 874 *      will still be locked.  Rather than continuing to have trylock attempts
 875 *      fail until someone else pushes the log, push it ourselves before
 876 *      returning.  This means that the xfsaild will not get stuck trying
 877 *      to push on stale inode buffers.
 878 */
 879int
 880xfs_buf_trylock(
 881        struct xfs_buf          *bp)
 882{
 883        int                     locked;
 884
 885        locked = down_trylock(&bp->b_sema) == 0;
 886        if (locked)
 887                XB_SET_OWNER(bp);
 888        else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
 889                xfs_log_force(bp->b_target->bt_mount, 0);
 890
 891        trace_xfs_buf_trylock(bp, _RET_IP_);
 892        return locked;
 893}
 894
 895/*
 896 *      Lock a buffer object.
 897 *
 898 *      If we come across a stale, pinned, locked buffer, we know that we
 899 *      are being asked to lock a buffer that has been reallocated. Because
 900 *      it is pinned, we know that the log has not been pushed to disk and
 901 *      hence it will still be locked. Rather than sleeping until someone
 902 *      else pushes the log, push it ourselves before trying to get the lock.
 903 */
 904void
 905xfs_buf_lock(
 906        struct xfs_buf          *bp)
 907{
 908        trace_xfs_buf_lock(bp, _RET_IP_);
 909
 910        if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
 911                xfs_log_force(bp->b_target->bt_mount, 0);
 912        down(&bp->b_sema);
 913        XB_SET_OWNER(bp);
 914
 915        trace_xfs_buf_lock_done(bp, _RET_IP_);
 916}
 917
 918/*
 919 *      Releases the lock on the buffer object.
 920 *      If the buffer is marked delwri but is not queued, do so before we
 921 *      unlock the buffer as we need to set flags correctly.  We also need to
 922 *      take a reference for the delwri queue because the unlocker is going to
  923 *      drop theirs and they don't know we just queued it.
 924 */
 925void
 926xfs_buf_unlock(
 927        struct xfs_buf          *bp)
 928{
 929        XB_CLEAR_OWNER(bp);
 930        up(&bp->b_sema);
 931
 932        trace_xfs_buf_unlock(bp, _RET_IP_);
 933}
 934
 935STATIC void
 936xfs_buf_wait_unpin(
 937        xfs_buf_t               *bp)
 938{
 939        DECLARE_WAITQUEUE       (wait, current);
 940
 941        if (atomic_read(&bp->b_pin_count) == 0)
 942                return;
 943
 944        add_wait_queue(&bp->b_waiters, &wait);
 945        for (;;) {
 946                set_current_state(TASK_UNINTERRUPTIBLE);
 947                if (atomic_read(&bp->b_pin_count) == 0)
 948                        break;
 949                io_schedule();
 950        }
 951        remove_wait_queue(&bp->b_waiters, &wait);
 952        set_current_state(TASK_RUNNING);
 953}
 954
 955/*
 956 *      Buffer Utility Routines
 957 */
 958
 959STATIC void
 960xfs_buf_iodone_work(
 961        struct work_struct      *work)
 962{
 963        xfs_buf_t               *bp =
 964                container_of(work, xfs_buf_t, b_iodone_work);
 965
 966        if (bp->b_iodone)
 967                (*(bp->b_iodone))(bp);
 968        else if (bp->b_flags & XBF_ASYNC)
 969                xfs_buf_relse(bp);
 970}
 971
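     /*
      * I/O completion handling.  Clear the in-flight flags and mark the buffer
      * done if no error was recorded.  Buffers with an iodone callback or the
      * XBF_ASYNC flag set are finished through xfs_buf_iodone_work(), either
      * directly or deferred to the xfslogd workqueue when 'schedule' is set;
      * all other buffers simply have their b_iowait completion signalled.
      */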
 972void
 973xfs_buf_ioend(
 974        xfs_buf_t               *bp,
 975        int                     schedule)
 976{
 977        trace_xfs_buf_iodone(bp, _RET_IP_);
 978
 979        bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
 980        if (bp->b_error == 0)
 981                bp->b_flags |= XBF_DONE;
 982
 983        if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
 984                if (schedule) {
 985                        INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
 986                        queue_work(xfslogd_workqueue, &bp->b_iodone_work);
 987                } else {
 988                        xfs_buf_iodone_work(&bp->b_iodone_work);
 989                }
 990        } else {
 991                complete(&bp->b_iowait);
 992        }
 993}
 994
 995void
 996xfs_buf_ioerror(
 997        xfs_buf_t               *bp,
 998        int                     error)
 999{
1000        ASSERT(error >= 0 && error <= 0xffff);
1001        bp->b_error = (unsigned short)error;
1002        trace_xfs_buf_ioerror(bp, error, _RET_IP_);
1003}
1004
1005void
1006xfs_buf_ioerror_alert(
1007        struct xfs_buf          *bp,
1008        const char              *func)
1009{
1010        xfs_alert(bp->b_target->bt_mount,
1011"metadata I/O error: block 0x%llx (\"%s\") error %d buf count %zd",
1012                (__uint64_t)XFS_BUF_ADDR(bp), func,
1013                bp->b_error, XFS_BUF_COUNT(bp));
1014}
1015
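     /*
      * Write a buffer synchronously: take it off the delayed write queue,
      * submit the write and wait for completion.  An I/O error here forces a
      * filesystem shutdown.
      */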
1016int
1017xfs_bwrite(
1018        struct xfs_buf          *bp)
1019{
1020        int                     error;
1021
1022        bp->b_flags |= XBF_WRITE;
1023        bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
1024
1025        xfs_buf_delwri_dequeue(bp);
1026        xfs_bdstrat_cb(bp);
1027
1028        error = xfs_buf_iowait(bp);
1029        if (error) {
1030                xfs_force_shutdown(bp->b_target->bt_mount,
1031                                   SHUTDOWN_META_IO_ERROR);
1032        }
1033        return error;
1034}
1035
1036/*
1037 * Called when we want to stop a buffer from getting written or read.
1038 * We attach the EIO error, muck with its flags, and call xfs_buf_ioend
1039 * so that the proper iodone callbacks get called.
1040 */
1041STATIC int
1042xfs_bioerror(
1043        xfs_buf_t *bp)
1044{
1045#ifdef XFSERRORDEBUG
1046        ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
1047#endif
1048
1049        /*
1050         * No need to wait until the buffer is unpinned, we aren't flushing it.
1051         */
1052        xfs_buf_ioerror(bp, EIO);
1053
1054        /*
1055         * We're calling xfs_buf_ioend, so delete XBF_DONE flag.
1056         */
1057        XFS_BUF_UNREAD(bp);
1058        XFS_BUF_UNDONE(bp);
1059        xfs_buf_stale(bp);
1060
1061        xfs_buf_ioend(bp, 0);
1062
1063        return EIO;
1064}
1065
1066/*
1067 * Same as xfs_bioerror, except that we are releasing the buffer
1068 * here ourselves, and avoiding the xfs_buf_ioend call.
1069 * This is meant for userdata errors; metadata bufs come with
1070 * iodone functions attached, so that we can track down errors.
1071 */
1072STATIC int
1073xfs_bioerror_relse(
1074        struct xfs_buf  *bp)
1075{
1076        int64_t         fl = bp->b_flags;
1077        /*
1078         * No need to wait until the buffer is unpinned.
1079         * We aren't flushing it.
1080         *
1081         * chunkhold expects B_DONE to be set, whether
1082         * we actually finish the I/O or not. We don't want to
1083         * change that interface.
1084         */
1085        XFS_BUF_UNREAD(bp);
1086        XFS_BUF_DONE(bp);
1087        xfs_buf_stale(bp);
1088        bp->b_iodone = NULL;
1089        if (!(fl & XBF_ASYNC)) {
1090                /*
1091                 * Mark b_error and B_ERROR _both_.
 1092                 * Lots of chunkcache code assumes that.
1093                 * There's no reason to mark error for
1094                 * ASYNC buffers.
1095                 */
1096                xfs_buf_ioerror(bp, EIO);
1097                complete(&bp->b_iowait);
1098        } else {
1099                xfs_buf_relse(bp);
1100        }
1101
1102        return EIO;
1103}
1104
1105
1106/*
1107 * All xfs metadata buffers except log state machine buffers
1108 * get this attached as their b_bdstrat callback function.
1109 * This is so that we can catch a buffer
1110 * after prematurely unpinning it to forcibly shutdown the filesystem.
1111 */
1112int
1113xfs_bdstrat_cb(
1114        struct xfs_buf  *bp)
1115{
1116        if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) {
1117                trace_xfs_bdstrat_shut(bp, _RET_IP_);
1118                /*
1119                 * Metadata write that didn't get logged but
1120                 * written delayed anyway. These aren't associated
1121                 * with a transaction, and can be ignored.
1122                 */
1123                if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
1124                        return xfs_bioerror_relse(bp);
1125                else
1126                        return xfs_bioerror(bp);
1127        }
1128
1129        xfs_buf_iorequest(bp);
1130        return 0;
1131}
1132
1133/*
1134 * Wrapper around bdstrat so that we can stop data from going to disk in case
 1135 * we are shutting down the filesystem.  Typically user data goes through this
1136 * path; one of the exceptions is the superblock.
1137 */
1138void
1139xfsbdstrat(
1140        struct xfs_mount        *mp,
1141        struct xfs_buf          *bp)
1142{
1143        if (XFS_FORCED_SHUTDOWN(mp)) {
1144                trace_xfs_bdstrat_shut(bp, _RET_IP_);
1145                xfs_bioerror_relse(bp);
1146                return;
1147        }
1148
1149        xfs_buf_iorequest(bp);
1150}
1151
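     /*
      * Drop one I/O reference on the buffer and run the completion processing
      * once the final outstanding bio has finished.
      */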
1152STATIC void
1153_xfs_buf_ioend(
1154        xfs_buf_t               *bp,
1155        int                     schedule)
1156{
 1157        if (atomic_dec_and_test(&bp->b_io_remaining))
1158                xfs_buf_ioend(bp, schedule);
1159}
1160
1161STATIC void
1162xfs_buf_bio_end_io(
1163        struct bio              *bio,
1164        int                     error)
1165{
1166        xfs_buf_t               *bp = (xfs_buf_t *)bio->bi_private;
1167
1168        xfs_buf_ioerror(bp, -error);
1169
1170        if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
1171                invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
1172
1173        _xfs_buf_ioend(bp, 1);
1174        bio_put(bio);
1175}
1176
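     /*
      * Translate the buffer into block layer requests: pick the request type
      * from the buffer flags, then walk the page array building and submitting
      * as many bios as are needed to cover b_count_desired bytes starting at
      * block b_bn.
      */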
1177STATIC void
1178_xfs_buf_ioapply(
1179        xfs_buf_t               *bp)
1180{
1181        int                     rw, map_i, total_nr_pages, nr_pages;
1182        struct bio              *bio;
1183        int                     offset = bp->b_offset;
1184        int                     size = bp->b_count_desired;
1185        sector_t                sector = bp->b_bn;
1186
1187        total_nr_pages = bp->b_page_count;
1188        map_i = 0;
1189
1190        if (bp->b_flags & XBF_WRITE) {
1191                if (bp->b_flags & XBF_SYNCIO)
1192                        rw = WRITE_SYNC;
1193                else
1194                        rw = WRITE;
1195                if (bp->b_flags & XBF_FUA)
1196                        rw |= REQ_FUA;
1197                if (bp->b_flags & XBF_FLUSH)
1198                        rw |= REQ_FLUSH;
1199        } else if (bp->b_flags & XBF_READ_AHEAD) {
1200                rw = READA;
1201        } else {
1202                rw = READ;
1203        }
1204
1205        /* we only use the buffer cache for meta-data */
1206        rw |= REQ_META;
1207
1208next_chunk:
1209        atomic_inc(&bp->b_io_remaining);
1210        nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
1211        if (nr_pages > total_nr_pages)
1212                nr_pages = total_nr_pages;
1213
1214        bio = bio_alloc(GFP_NOIO, nr_pages);
1215        bio->bi_bdev = bp->b_target->bt_bdev;
1216        bio->bi_sector = sector;
1217        bio->bi_end_io = xfs_buf_bio_end_io;
1218        bio->bi_private = bp;
1219
1220
1221        for (; size && nr_pages; nr_pages--, map_i++) {
1222                int     rbytes, nbytes = PAGE_SIZE - offset;
1223
1224                if (nbytes > size)
1225                        nbytes = size;
1226
1227                rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
1228                if (rbytes < nbytes)
1229                        break;
1230
1231                offset = 0;
1232                sector += nbytes >> BBSHIFT;
1233                size -= nbytes;
1234                total_nr_pages--;
1235        }
1236
1237        if (likely(bio->bi_size)) {
1238                if (xfs_buf_is_vmapped(bp)) {
1239                        flush_kernel_vmap_range(bp->b_addr,
1240                                                xfs_buf_vmap_len(bp));
1241                }
1242                submit_bio(rw, bio);
1243                if (size)
1244                        goto next_chunk;
1245        } else {
1246                xfs_buf_ioerror(bp, EIO);
1247                bio_put(bio);
1248        }
1249}
1250
1251int
1252xfs_buf_iorequest(
1253        xfs_buf_t               *bp)
1254{
1255        trace_xfs_buf_iorequest(bp, _RET_IP_);
1256
1257        ASSERT(!(bp->b_flags & XBF_DELWRI));
1258
1259        if (bp->b_flags & XBF_WRITE)
1260                xfs_buf_wait_unpin(bp);
1261        xfs_buf_hold(bp);
1262
 1263        /* Set the count to 1 initially, so that an I/O completion
 1264         * callout which happens before we have started all the I/O
 1265         * cannot call xfs_buf_ioend too early.
1266         */
1267        atomic_set(&bp->b_io_remaining, 1);
1268        _xfs_buf_ioapply(bp);
1269        _xfs_buf_ioend(bp, 0);
1270
1271        xfs_buf_rele(bp);
1272        return 0;
1273}
1274
1275/*
1276 *      Waits for I/O to complete on the buffer supplied.
1277 *      It returns immediately if no I/O is pending.
1278 *      It returns the I/O error code, if any, or 0 if there was no error.
1279 */
1280int
1281xfs_buf_iowait(
1282        xfs_buf_t               *bp)
1283{
1284        trace_xfs_buf_iowait(bp, _RET_IP_);
1285
1286        wait_for_completion(&bp->b_iowait);
1287
1288        trace_xfs_buf_iowait_done(bp, _RET_IP_);
1289        return bp->b_error;
1290}
1291
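     /*
      * Return the kernel address of the byte at 'offset' into the buffer.
      * Unmapped buffers are handled by looking up the page directly, so the
      * returned address is only valid up to the end of that page.
      */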
1292xfs_caddr_t
1293xfs_buf_offset(
1294        xfs_buf_t               *bp,
1295        size_t                  offset)
1296{
1297        struct page             *page;
1298
1299        if (bp->b_flags & XBF_MAPPED)
1300                return bp->b_addr + offset;
1301
1302        offset += bp->b_offset;
1303        page = bp->b_pages[offset >> PAGE_SHIFT];
1304        return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1));
1305}
1306
1307/*
1308 *      Move data into or out of a buffer.
1309 */
1310void
1311xfs_buf_iomove(
1312        xfs_buf_t               *bp,    /* buffer to process            */
1313        size_t                  boff,   /* starting buffer offset       */
1314        size_t                  bsize,  /* length to copy               */
1315        void                    *data,  /* data address                 */
1316        xfs_buf_rw_t            mode)   /* read/write/zero flag         */
1317{
1318        size_t                  bend, cpoff, csize;
1319        struct page             *page;
1320
1321        bend = boff + bsize;
1322        while (boff < bend) {
1323                page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
1324                cpoff = xfs_buf_poff(boff + bp->b_offset);
1325                csize = min_t(size_t,
1326                              PAGE_SIZE-cpoff, bp->b_count_desired-boff);
1327
1328                ASSERT(((csize + cpoff) <= PAGE_SIZE));
1329
1330                switch (mode) {
1331                case XBRW_ZERO:
1332                        memset(page_address(page) + cpoff, 0, csize);
1333                        break;
1334                case XBRW_READ:
1335                        memcpy(data, page_address(page) + cpoff, csize);
1336                        break;
1337                case XBRW_WRITE:
1338                        memcpy(page_address(page) + cpoff, data, csize);
1339                }
1340
1341                boff += csize;
1342                data += csize;
1343        }
1344}
1345
1346/*
1347 *      Handling of buffer targets (buftargs).
1348 */
1349
1350/*
1351 * Wait for any bufs with callbacks that have been submitted but have not yet
1352 * returned. These buffers will have an elevated hold count, so wait on those
1353 * while freeing all the buffers only held by the LRU.
1354 */
1355void
1356xfs_wait_buftarg(
1357        struct xfs_buftarg      *btp)
1358{
1359        struct xfs_buf          *bp;
1360
1361restart:
1362        spin_lock(&btp->bt_lru_lock);
1363        while (!list_empty(&btp->bt_lru)) {
1364                bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1365                if (atomic_read(&bp->b_hold) > 1) {
1366                        spin_unlock(&btp->bt_lru_lock);
1367                        delay(100);
1368                        goto restart;
1369                }
1370                /*
1371                 * clear the LRU reference count so the buffer doesn't get
1372                 * ignored in xfs_buf_rele().
1373                 */
1374                atomic_set(&bp->b_lru_ref, 0);
1375                spin_unlock(&btp->bt_lru_lock);
1376                xfs_buf_rele(bp);
1377                spin_lock(&btp->bt_lru_lock);
1378        }
1379        spin_unlock(&btp->bt_lru_lock);
1380}
1381
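     /*
      * Shrinker callback for the per-target buffer LRU.  Buffers whose
      * b_lru_ref count is still positive have it decremented and get another
      * trip around the LRU; buffers whose count has already reached zero are
      * moved to a private dispose list and released.  Returns the number of
      * buffers left on the LRU.
      */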
1382int
1383xfs_buftarg_shrink(
1384        struct shrinker         *shrink,
1385        struct shrink_control   *sc)
1386{
1387        struct xfs_buftarg      *btp = container_of(shrink,
1388                                        struct xfs_buftarg, bt_shrinker);
1389        struct xfs_buf          *bp;
1390        int nr_to_scan = sc->nr_to_scan;
1391        LIST_HEAD(dispose);
1392
1393        if (!nr_to_scan)
1394                return btp->bt_lru_nr;
1395
1396        spin_lock(&btp->bt_lru_lock);
1397        while (!list_empty(&btp->bt_lru)) {
1398                if (nr_to_scan-- <= 0)
1399                        break;
1400
1401                bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
1402
1403                /*
1404                 * Decrement the b_lru_ref count unless the value is already
1405                 * zero. If the value is already zero, we need to reclaim the
1406                 * buffer, otherwise it gets another trip through the LRU.
1407                 */
 1408                if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
1409                        list_move_tail(&bp->b_lru, &btp->bt_lru);
1410                        continue;
1411                }
1412
1413                /*
1414                 * remove the buffer from the LRU now to avoid needing another
1415                 * lock round trip inside xfs_buf_rele().
1416                 */
1417                list_move(&bp->b_lru, &dispose);
1418                btp->bt_lru_nr--;
1419        }
1420        spin_unlock(&btp->bt_lru_lock);
1421
1422        while (!list_empty(&dispose)) {
1423                bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
1424                list_del_init(&bp->b_lru);
1425                xfs_buf_rele(bp);
1426        }
1427
1428        return btp->bt_lru_nr;
1429}
1430
1431void
1432xfs_free_buftarg(
1433        struct xfs_mount        *mp,
1434        struct xfs_buftarg      *btp)
1435{
1436        unregister_shrinker(&btp->bt_shrinker);
1437
1438        xfs_flush_buftarg(btp, 1);
1439        if (mp->m_flags & XFS_MOUNT_BARRIER)
1440                xfs_blkdev_issue_flush(btp);
1441
1442        kthread_stop(btp->bt_task);
1443        kmem_free(btp);
1444}
1445
1446STATIC int
1447xfs_setsize_buftarg_flags(
1448        xfs_buftarg_t           *btp,
1449        unsigned int            blocksize,
1450        unsigned int            sectorsize,
1451        int                     verbose)
1452{
1453        btp->bt_bsize = blocksize;
1454        btp->bt_sshift = ffs(sectorsize) - 1;
1455        btp->bt_smask = sectorsize - 1;
1456
1457        if (set_blocksize(btp->bt_bdev, sectorsize)) {
1458                char name[BDEVNAME_SIZE];
1459
1460                bdevname(btp->bt_bdev, name);
1461
1462                xfs_warn(btp->bt_mount,
1463                        "Cannot set_blocksize to %u on device %s\n",
1464                        sectorsize, name);
1465                return EINVAL;
1466        }
1467
1468        return 0;
1469}
1470
1471/*
1472 *      When allocating the initial buffer target we have not yet
1473 *      read in the superblock, so don't know what sized sectors
 1474 *      are being used at this early stage.  Play safe.
1475 */
1476STATIC int
1477xfs_setsize_buftarg_early(
1478        xfs_buftarg_t           *btp,
1479        struct block_device     *bdev)
1480{
1481        return xfs_setsize_buftarg_flags(btp,
1482                        PAGE_SIZE, bdev_logical_block_size(bdev), 0);
1483}
1484
1485int
1486xfs_setsize_buftarg(
1487        xfs_buftarg_t           *btp,
1488        unsigned int            blocksize,
1489        unsigned int            sectorsize)
1490{
1491        return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
1492}
1493
1494STATIC int
1495xfs_alloc_delwri_queue(
1496        xfs_buftarg_t           *btp,
1497        const char              *fsname)
1498{
1499        INIT_LIST_HEAD(&btp->bt_delwri_queue);
1500        spin_lock_init(&btp->bt_delwri_lock);
1501        btp->bt_flags = 0;
1502        btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1503        if (IS_ERR(btp->bt_task))
1504                return PTR_ERR(btp->bt_task);
1505        return 0;
1506}
1507
1508xfs_buftarg_t *
1509xfs_alloc_buftarg(
1510        struct xfs_mount        *mp,
1511        struct block_device     *bdev,
1512        int                     external,
1513        const char              *fsname)
1514{
1515        xfs_buftarg_t           *btp;
1516
1517        btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1518
1519        btp->bt_mount = mp;
1520        btp->bt_dev =  bdev->bd_dev;
1521        btp->bt_bdev = bdev;
1522        btp->bt_bdi = blk_get_backing_dev_info(bdev);
1523        if (!btp->bt_bdi)
1524                goto error;
1525
1526        INIT_LIST_HEAD(&btp->bt_lru);
1527        spin_lock_init(&btp->bt_lru_lock);
1528        if (xfs_setsize_buftarg_early(btp, bdev))
1529                goto error;
1530        if (xfs_alloc_delwri_queue(btp, fsname))
1531                goto error;
1532        btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1533        btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1534        register_shrinker(&btp->bt_shrinker);
1535        return btp;
1536
1537error:
1538        kmem_free(btp);
1539        return NULL;
1540}
1541
1542
1543/*
1544 *      Delayed write buffer handling
1545 */
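     /*
      * Add a buffer to the tail of the target's delayed write queue.  A buffer
      * that is already queued is simply moved to the tail; otherwise a hold
      * reference is taken, the buffer is marked XBF_DELWRI | XBF_ASYNC, and
      * xfsbufd is woken if the queue was previously empty.  The queue
      * timestamp is reset in either case.
      */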
1546void
1547xfs_buf_delwri_queue(
1548        xfs_buf_t               *bp)
1549{
1550        struct xfs_buftarg      *btp = bp->b_target;
1551
1552        trace_xfs_buf_delwri_queue(bp, _RET_IP_);
1553
1554        ASSERT(!(bp->b_flags & XBF_READ));
1555
1556        spin_lock(&btp->bt_delwri_lock);
1557        if (!list_empty(&bp->b_list)) {
1558                /* if already in the queue, move it to the tail */
1559                ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1560                list_move_tail(&bp->b_list, &btp->bt_delwri_queue);
1561        } else {
1562                /* start xfsbufd as it is about to have something to do */
1563                if (list_empty(&btp->bt_delwri_queue))
1564                        wake_up_process(bp->b_target->bt_task);
1565
1566                atomic_inc(&bp->b_hold);
1567                bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC;
1568                list_add_tail(&bp->b_list, &btp->bt_delwri_queue);
1569        }
1570        bp->b_queuetime = jiffies;
1571        spin_unlock(&btp->bt_delwri_lock);
1572}
1573
1574void
1575xfs_buf_delwri_dequeue(
1576        xfs_buf_t               *bp)
1577{
1578        int                     dequeued = 0;
1579
1580        spin_lock(&bp->b_target->bt_delwri_lock);
1581        if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
1582                ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1583                list_del_init(&bp->b_list);
1584                dequeued = 1;
1585        }
1586        bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
1587        spin_unlock(&bp->b_target->bt_delwri_lock);
1588
1589        if (dequeued)
1590                xfs_buf_rele(bp);
1591
1592        trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
1593}
1594
1595/*
1596 * If a delwri buffer needs to be pushed before it has aged out, then promote
1597 * it to the head of the delwri queue so that it will be flushed on the next
1598 * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
1599 * than the age currently needed to flush the buffer. Hence the next time the
1600 * xfsbufd sees it is guaranteed to be considered old enough to flush.
1601 */
1602void
1603xfs_buf_delwri_promote(
1604        struct xfs_buf  *bp)
1605{
1606        struct xfs_buftarg *btp = bp->b_target;
1607        long            age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
1608
1609        ASSERT(bp->b_flags & XBF_DELWRI);
1610        ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1611
1612        /*
1613         * Check the buffer age before locking the delayed write queue as we
1614         * don't need to promote buffers that are already past the flush age.
1615         */
1616        if (bp->b_queuetime < jiffies - age)
1617                return;
1618        bp->b_queuetime = jiffies - age;
1619        spin_lock(&btp->bt_delwri_lock);
1620        list_move(&bp->b_list, &btp->bt_delwri_queue);
1621        spin_unlock(&btp->bt_delwri_lock);
1622}
1623
1624/*
 1625 * Move as many buffers as specified to the supplied list,
 1626 * indicating if we skipped any buffers to prevent deadlocks.
1627 */
1628STATIC int
1629xfs_buf_delwri_split(
1630        xfs_buftarg_t   *target,
1631        struct list_head *list,
1632        unsigned long   age)
1633{
1634        xfs_buf_t       *bp, *n;
1635        int             skipped = 0;
1636        int             force;
1637
1638        force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1639        INIT_LIST_HEAD(list);
1640        spin_lock(&target->bt_delwri_lock);
1641        list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) {
1642                ASSERT(bp->b_flags & XBF_DELWRI);
1643
1644                if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
1645                        if (!force &&
1646                            time_before(jiffies, bp->b_queuetime + age)) {
1647                                xfs_buf_unlock(bp);
1648                                break;
1649                        }
1650
1651                        bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q);
1652                        bp->b_flags |= XBF_WRITE;
1653                        list_move_tail(&bp->b_list, list);
1654                        trace_xfs_buf_delwri_split(bp, _RET_IP_);
1655                } else
1656                        skipped++;
1657        }
1658
1659        spin_unlock(&target->bt_delwri_lock);
1660        return skipped;
1661}
1662
1663/*
1664 * Compare function is more complex than it needs to be because
1665 * the return value is only 32 bits and we are doing comparisons
1666 * on 64 bit values
1667 */
1668static int
1669xfs_buf_cmp(
1670        void            *priv,
1671        struct list_head *a,
1672        struct list_head *b)
1673{
1674        struct xfs_buf  *ap = container_of(a, struct xfs_buf, b_list);
1675        struct xfs_buf  *bp = container_of(b, struct xfs_buf, b_list);
1676        xfs_daddr_t             diff;
1677
1678        diff = ap->b_bn - bp->b_bn;
1679        if (diff < 0)
1680                return -1;
1681        if (diff > 0)
1682                return 1;
1683        return 0;
1684}
1685
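     /*
      * xfsbufd is the per-target delayed write daemon.  It sleeps until woken
      * or until the flush timer expires, splits the sufficiently aged buffers
      * off the delwri queue, sorts them by disk address and submits the writes
      * under a single plug.
      */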
1686STATIC int
1687xfsbufd(
1688        void            *data)
1689{
1690        xfs_buftarg_t   *target = (xfs_buftarg_t *)data;
1691
1692        current->flags |= PF_MEMALLOC;
1693
1694        set_freezable();
1695
1696        do {
1697                long    age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
1698                long    tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
1699                struct list_head tmp;
1700                struct blk_plug plug;
1701
1702                if (unlikely(freezing(current)))
1703                        try_to_freeze();
1704
1705                /* sleep for a long time if there is nothing to do. */
1706                if (list_empty(&target->bt_delwri_queue))
1707                        tout = MAX_SCHEDULE_TIMEOUT;
1708                schedule_timeout_interruptible(tout);
1709
1710                xfs_buf_delwri_split(target, &tmp, age);
1711                list_sort(NULL, &tmp, xfs_buf_cmp);
1712
1713                blk_start_plug(&plug);
1714                while (!list_empty(&tmp)) {
1715                        struct xfs_buf *bp;
1716                        bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1717                        list_del_init(&bp->b_list);
1718                        xfs_bdstrat_cb(bp);
1719                }
1720                blk_finish_plug(&plug);
1721        } while (!kthread_should_stop());
1722
1723        return 0;
1724}
1725
1726/*
 1727 *      Write back all the delayed write buffers queued against the given
 1728 *      target, optionally waiting for the I/O to complete.  Returns the
 1729 *      number of buffers skipped because they were pinned or locked.
1730 */
1731int
1732xfs_flush_buftarg(
1733        xfs_buftarg_t   *target,
1734        int             wait)
1735{
1736        xfs_buf_t       *bp;
1737        int             pincount = 0;
1738        LIST_HEAD(tmp_list);
1739        LIST_HEAD(wait_list);
1740        struct blk_plug plug;
1741
1742        flush_workqueue(xfslogd_workqueue);
1743
1744        set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1745        pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
1746
1747        /*
1748         * Dropped the delayed write list lock, now walk the temporary list.
1749         * All I/O is issued async and then if we need to wait for completion
1750         * we do that after issuing all the IO.
1751         */
1752        list_sort(NULL, &tmp_list, xfs_buf_cmp);
1753
1754        blk_start_plug(&plug);
1755        while (!list_empty(&tmp_list)) {
1756                bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1757                ASSERT(target == bp->b_target);
1758                list_del_init(&bp->b_list);
1759                if (wait) {
1760                        bp->b_flags &= ~XBF_ASYNC;
1761                        list_add(&bp->b_list, &wait_list);
1762                }
1763                xfs_bdstrat_cb(bp);
1764        }
1765        blk_finish_plug(&plug);
1766
1767        if (wait) {
1768                /* Wait for IO to complete. */
1769                while (!list_empty(&wait_list)) {
1770                        bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1771
1772                        list_del_init(&bp->b_list);
1773                        xfs_buf_iowait(bp);
1774                        xfs_buf_relse(bp);
1775                }
1776        }
1777
1778        return pincount;
1779}
1780
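     /*
      * Set up the global state for the buffer cache: the xfs_buf slab zone and
      * the workqueue used for deferred I/O completion processing.
      */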
1781int __init
1782xfs_buf_init(void)
1783{
1784        xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
1785                                                KM_ZONE_HWALIGN, NULL);
1786        if (!xfs_buf_zone)
1787                goto out;
1788
1789        xfslogd_workqueue = alloc_workqueue("xfslogd",
1790                                        WQ_MEM_RECLAIM | WQ_HIGHPRI, 1);
1791        if (!xfslogd_workqueue)
1792                goto out_free_buf_zone;
1793
1794        return 0;
1795
1796 out_free_buf_zone:
1797        kmem_zone_destroy(xfs_buf_zone);
1798 out:
1799        return -ENOMEM;
1800}
1801
1802void
1803xfs_buf_terminate(void)
1804{
1805        destroy_workqueue(xfslogd_workqueue);
1806        kmem_zone_destroy(xfs_buf_zone);
1807}
1808