linux/fs/xfs/linux-2.6/xfs_buf.c
   1/*
   2 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
   3 * All Rights Reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public License as
   7 * published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it would be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 * GNU General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public License
  15 * along with this program; if not, write the Free Software Foundation,
  16 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17 */
  18#include "xfs.h"
  19#include <linux/stddef.h>
  20#include <linux/errno.h>
  21#include <linux/slab.h>
  22#include <linux/pagemap.h>
  23#include <linux/init.h>
  24#include <linux/vmalloc.h>
  25#include <linux/bio.h>
  26#include <linux/sysctl.h>
  27#include <linux/proc_fs.h>
  28#include <linux/workqueue.h>
  29#include <linux/percpu.h>
  30#include <linux/blkdev.h>
  31#include <linux/hash.h>
  32#include <linux/kthread.h>
  33#include <linux/migrate.h>
  34#include <linux/backing-dev.h>
  35#include <linux/freezer.h>
  36
  37#include "xfs_sb.h"
  38#include "xfs_inum.h"
  39#include "xfs_ag.h"
  40#include "xfs_dmapi.h"
  41#include "xfs_mount.h"
  42
  43static kmem_zone_t *xfs_buf_zone;
  44STATIC int xfsbufd(void *);
  45STATIC int xfsbufd_wakeup(int, gfp_t);
  46STATIC void xfs_buf_delwri_queue(xfs_buf_t *, int);
  47static struct shrinker xfs_buf_shake = {
  48        .shrink = xfsbufd_wakeup,
  49        .seeks = DEFAULT_SEEKS,
  50};
  51
  52static struct workqueue_struct *xfslogd_workqueue;
  53struct workqueue_struct *xfsdatad_workqueue;
  54struct workqueue_struct *xfsconvertd_workqueue;
  55
  56#ifdef XFS_BUF_TRACE
  57void
  58xfs_buf_trace(
  59        xfs_buf_t       *bp,
  60        char            *id,
  61        void            *data,
  62        void            *ra)
  63{
  64        ktrace_enter(xfs_buf_trace_buf,
  65                bp, id,
  66                (void *)(unsigned long)bp->b_flags,
  67                (void *)(unsigned long)bp->b_hold.counter,
  68                (void *)(unsigned long)bp->b_sema.count,
  69                (void *)current,
  70                data, ra,
  71                (void *)(unsigned long)((bp->b_file_offset>>32) & 0xffffffff),
  72                (void *)(unsigned long)(bp->b_file_offset & 0xffffffff),
  73                (void *)(unsigned long)bp->b_buffer_length,
  74                NULL, NULL, NULL, NULL, NULL);
  75}
  76ktrace_t *xfs_buf_trace_buf;
  77#define XFS_BUF_TRACE_SIZE      4096
  78#define XB_TRACE(bp, id, data)  \
  79        xfs_buf_trace(bp, id, (void *)data, (void *)__builtin_return_address(0))
  80#else
  81#define XB_TRACE(bp, id, data)  do { } while (0)
  82#endif
  83
  84#ifdef XFS_BUF_LOCK_TRACKING
  85# define XB_SET_OWNER(bp)       ((bp)->b_last_holder = current->pid)
  86# define XB_CLEAR_OWNER(bp)     ((bp)->b_last_holder = -1)
  87# define XB_GET_OWNER(bp)       ((bp)->b_last_holder)
  88#else
  89# define XB_SET_OWNER(bp)       do { } while (0)
  90# define XB_CLEAR_OWNER(bp)     do { } while (0)
  91# define XB_GET_OWNER(bp)       do { } while (0)
  92#endif
  93
  94#define xb_to_gfp(flags) \
  95        ((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : \
  96          ((flags) & XBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL) | __GFP_NOWARN)
  97
  98#define xb_to_km(flags) \
  99         (((flags) & XBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
 100
 101#define xfs_buf_allocate(flags) \
 102        kmem_zone_alloc(xfs_buf_zone, xb_to_km(flags))
 103#define xfs_buf_deallocate(bp) \
 104        kmem_zone_free(xfs_buf_zone, (bp));
 105
 106/*
 107 *      Page Region interfaces.
 108 *
 109 *      For pages in filesystems where the blocksize is smaller than the
 110 *      pagesize, we use the page->private field (long) to hold a bitmap
 111 *      of uptodate regions within the page.
 112 *
 113 *      Each such region is "bytes per page / bits per long" bytes long.
 114 *
 115 *      NBPPR == number-of-bytes-per-page-region
 116 *      BTOPR == bytes-to-page-region (rounded up)
 117 *      BTOPRT == bytes-to-page-region-truncated (rounded down)
 118 */
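     /*
      *      A worked example (illustration only, assuming a 4096 byte
      *      PAGE_CACHE_SIZE and BITS_PER_LONG == 64): each region is then
      *      NBPPR == 4096/64 == 64 bytes and PRSHIFT == 12 - 6 == 6.  Marking
      *      a 512 byte block at offset 1024 gives first == BTOPR(1024) == 16
      *      and final == BTOPRT(1535) == 23, so page_region_mask() returns a
      *      mask with bits 16..23 set.  Once every region of the page has
      *      been marked, page_private() equals ~0UL and set_page_region()
      *      marks the whole page uptodate.
      */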
 119#if (BITS_PER_LONG == 32)
 120#define PRSHIFT         (PAGE_CACHE_SHIFT - 5)  /* (32 == 1<<5) */
 121#elif (BITS_PER_LONG == 64)
 122#define PRSHIFT         (PAGE_CACHE_SHIFT - 6)  /* (64 == 1<<6) */
 123#else
 124#error BITS_PER_LONG must be 32 or 64
 125#endif
 126#define NBPPR           (PAGE_CACHE_SIZE/BITS_PER_LONG)
 127#define BTOPR(b)        (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT)
 128#define BTOPRT(b)       (((unsigned int)(b) >> PRSHIFT))
 129
 130STATIC unsigned long
 131page_region_mask(
 132        size_t          offset,
 133        size_t          length)
 134{
 135        unsigned long   mask;
 136        int             first, final;
 137
 138        first = BTOPR(offset);
 139        final = BTOPRT(offset + length - 1);
 140        first = min(first, final);
 141
 142        mask = ~0UL;
  143        mask <<= BITS_PER_LONG - (final - first + 1);
  144        mask >>= BITS_PER_LONG - (final + 1);
 145
 146        ASSERT(offset + length <= PAGE_CACHE_SIZE);
 147        ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0);
 148
 149        return mask;
 150}
 151
 152STATIC_INLINE void
 153set_page_region(
 154        struct page     *page,
 155        size_t          offset,
 156        size_t          length)
 157{
 158        set_page_private(page,
 159                page_private(page) | page_region_mask(offset, length));
 160        if (page_private(page) == ~0UL)
 161                SetPageUptodate(page);
 162}
 163
 164STATIC_INLINE int
 165test_page_region(
 166        struct page     *page,
 167        size_t          offset,
 168        size_t          length)
 169{
 170        unsigned long   mask = page_region_mask(offset, length);
 171
 172        return (mask && (page_private(page) & mask) == mask);
 173}
 174
 175/*
 176 *      Mapping of multi-page buffers into contiguous virtual space
 177 */
 178
 179typedef struct a_list {
 180        void            *vm_addr;
 181        struct a_list   *next;
 182} a_list_t;
 183
 184static a_list_t         *as_free_head;
 185static int              as_list_len;
 186static DEFINE_SPINLOCK(as_lock);
 187
 188/*
 189 *      Try to batch vunmaps because they are costly.
 190 */
 191STATIC void
 192free_address(
 193        void            *addr)
 194{
 195        a_list_t        *aentry;
 196
 197#ifdef CONFIG_XEN
 198        /*
 199         * Xen needs to be able to make sure it can get an exclusive
 200         * RO mapping of pages it wants to turn into a pagetable.  If
 201         * a newly allocated page is also still being vmap()ed by xfs,
 202         * it will cause pagetable construction to fail.  This is a
 203         * quick workaround to always eagerly unmap pages so that Xen
 204         * is happy.
 205         */
 206        vunmap(addr);
 207        return;
 208#endif
 209
 210        aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
 211        if (likely(aentry)) {
 212                spin_lock(&as_lock);
 213                aentry->next = as_free_head;
 214                aentry->vm_addr = addr;
 215                as_free_head = aentry;
 216                as_list_len++;
 217                spin_unlock(&as_lock);
 218        } else {
 219                vunmap(addr);
 220        }
 221}
 222
 223STATIC void
 224purge_addresses(void)
 225{
 226        a_list_t        *aentry, *old;
 227
 228        if (as_free_head == NULL)
 229                return;
 230
 231        spin_lock(&as_lock);
 232        aentry = as_free_head;
 233        as_free_head = NULL;
 234        as_list_len = 0;
 235        spin_unlock(&as_lock);
 236
 237        while ((old = aentry) != NULL) {
 238                vunmap(aentry->vm_addr);
 239                aentry = aentry->next;
 240                kfree(old);
 241        }
 242}
 243
 244/*
 245 *      Internal xfs_buf_t object manipulation
 246 */
 247
 248STATIC void
 249_xfs_buf_initialize(
 250        xfs_buf_t               *bp,
 251        xfs_buftarg_t           *target,
 252        xfs_off_t               range_base,
 253        size_t                  range_length,
 254        xfs_buf_flags_t         flags)
 255{
 256        /*
 257         * We don't want certain flags to appear in b_flags.
 258         */
 259        flags &= ~(XBF_LOCK|XBF_MAPPED|XBF_DONT_BLOCK|XBF_READ_AHEAD);
 260
 261        memset(bp, 0, sizeof(xfs_buf_t));
 262        atomic_set(&bp->b_hold, 1);
 263        init_completion(&bp->b_iowait);
 264        INIT_LIST_HEAD(&bp->b_list);
 265        INIT_LIST_HEAD(&bp->b_hash_list);
 266        init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */
 267        XB_SET_OWNER(bp);
 268        bp->b_target = target;
 269        bp->b_file_offset = range_base;
 270        /*
 271         * Set buffer_length and count_desired to the same value initially.
 272         * I/O routines should use count_desired, which will be the same in
 273         * most cases but may be reset (e.g. XFS recovery).
 274         */
 275        bp->b_buffer_length = bp->b_count_desired = range_length;
 276        bp->b_flags = flags;
 277        bp->b_bn = XFS_BUF_DADDR_NULL;
 278        atomic_set(&bp->b_pin_count, 0);
 279        init_waitqueue_head(&bp->b_waiters);
 280
 281        XFS_STATS_INC(xb_create);
 282        XB_TRACE(bp, "initialize", target);
 283}
 284
 285/*
 286 *      Allocate a page array capable of holding a specified number
 287 *      of pages, and point the page buf at it.
 288 */
 289STATIC int
 290_xfs_buf_get_pages(
 291        xfs_buf_t               *bp,
 292        int                     page_count,
 293        xfs_buf_flags_t         flags)
 294{
 295        /* Make sure that we have a page list */
 296        if (bp->b_pages == NULL) {
 297                bp->b_offset = xfs_buf_poff(bp->b_file_offset);
 298                bp->b_page_count = page_count;
 299                if (page_count <= XB_PAGES) {
 300                        bp->b_pages = bp->b_page_array;
 301                } else {
 302                        bp->b_pages = kmem_alloc(sizeof(struct page *) *
 303                                        page_count, xb_to_km(flags));
 304                        if (bp->b_pages == NULL)
 305                                return -ENOMEM;
 306                }
 307                memset(bp->b_pages, 0, sizeof(struct page *) * page_count);
 308        }
 309        return 0;
 310}
 311
 312/*
 313 *      Frees b_pages if it was allocated.
 314 */
 315STATIC void
 316_xfs_buf_free_pages(
 317        xfs_buf_t       *bp)
 318{
 319        if (bp->b_pages != bp->b_page_array) {
 320                kmem_free(bp->b_pages);
 321        }
 322}
 323
 324/*
 325 *      Releases the specified buffer.
 326 *
 327 *      The modification state of any associated pages is left unchanged.
  328 *      The buffer must not be on any hash - use xfs_buf_rele instead for
 329 *      hashed and refcounted buffers
 330 */
 331void
 332xfs_buf_free(
 333        xfs_buf_t               *bp)
 334{
 335        XB_TRACE(bp, "free", 0);
 336
 337        ASSERT(list_empty(&bp->b_hash_list));
 338
 339        if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) {
 340                uint            i;
 341
 342                if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1))
 343                        free_address(bp->b_addr - bp->b_offset);
 344
 345                for (i = 0; i < bp->b_page_count; i++) {
 346                        struct page     *page = bp->b_pages[i];
 347
 348                        if (bp->b_flags & _XBF_PAGE_CACHE)
 349                                ASSERT(!PagePrivate(page));
 350                        page_cache_release(page);
 351                }
 352                _xfs_buf_free_pages(bp);
 353        }
 354
 355        xfs_buf_deallocate(bp);
 356}
 357
 358/*
  359 *      Finds all pages for the buffer in question and builds its page list.
 360 */
 361STATIC int
 362_xfs_buf_lookup_pages(
 363        xfs_buf_t               *bp,
 364        uint                    flags)
 365{
 366        struct address_space    *mapping = bp->b_target->bt_mapping;
 367        size_t                  blocksize = bp->b_target->bt_bsize;
 368        size_t                  size = bp->b_count_desired;
 369        size_t                  nbytes, offset;
 370        gfp_t                   gfp_mask = xb_to_gfp(flags);
 371        unsigned short          page_count, i;
 372        pgoff_t                 first;
 373        xfs_off_t               end;
 374        int                     error;
 375
 376        end = bp->b_file_offset + bp->b_buffer_length;
 377        page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset);
 378
 379        error = _xfs_buf_get_pages(bp, page_count, flags);
 380        if (unlikely(error))
 381                return error;
 382        bp->b_flags |= _XBF_PAGE_CACHE;
 383
 384        offset = bp->b_offset;
 385        first = bp->b_file_offset >> PAGE_CACHE_SHIFT;
 386
 387        for (i = 0; i < bp->b_page_count; i++) {
 388                struct page     *page;
 389                uint            retries = 0;
 390
 391              retry:
 392                page = find_or_create_page(mapping, first + i, gfp_mask);
 393                if (unlikely(page == NULL)) {
 394                        if (flags & XBF_READ_AHEAD) {
 395                                bp->b_page_count = i;
 396                                for (i = 0; i < bp->b_page_count; i++)
 397                                        unlock_page(bp->b_pages[i]);
 398                                return -ENOMEM;
 399                        }
 400
 401                        /*
 402                         * This could deadlock.
 403                         *
 404                         * But until all the XFS lowlevel code is revamped to
 405                         * handle buffer allocation failures we can't do much.
 406                         */
 407                        if (!(++retries % 100))
 408                                printk(KERN_ERR
 409                                        "XFS: possible memory allocation "
 410                                        "deadlock in %s (mode:0x%x)\n",
 411                                        __func__, gfp_mask);
 412
 413                        XFS_STATS_INC(xb_page_retries);
 414                        xfsbufd_wakeup(0, gfp_mask);
 415                        congestion_wait(BLK_RW_ASYNC, HZ/50);
 416                        goto retry;
 417                }
 418
 419                XFS_STATS_INC(xb_page_found);
 420
 421                nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset);
 422                size -= nbytes;
 423
 424                ASSERT(!PagePrivate(page));
 425                if (!PageUptodate(page)) {
 426                        page_count--;
 427                        if (blocksize >= PAGE_CACHE_SIZE) {
 428                                if (flags & XBF_READ)
 429                                        bp->b_flags |= _XBF_PAGE_LOCKED;
 430                        } else if (!PagePrivate(page)) {
 431                                if (test_page_region(page, offset, nbytes))
 432                                        page_count++;
 433                        }
 434                }
 435
 436                bp->b_pages[i] = page;
 437                offset = 0;
 438        }
 439
 440        if (!(bp->b_flags & _XBF_PAGE_LOCKED)) {
 441                for (i = 0; i < bp->b_page_count; i++)
 442                        unlock_page(bp->b_pages[i]);
 443        }
 444
 445        if (page_count == bp->b_page_count)
 446                bp->b_flags |= XBF_DONE;
 447
 448        XB_TRACE(bp, "lookup_pages", (long)page_count);
 449        return error;
 450}
 451
 452/*
  453 *      Map the buffer into kernel address space if necessary.
 454 */
 455STATIC int
 456_xfs_buf_map_pages(
 457        xfs_buf_t               *bp,
 458        uint                    flags)
 459{
 460        /* A single page buffer is always mappable */
 461        if (bp->b_page_count == 1) {
 462                bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
 463                bp->b_flags |= XBF_MAPPED;
 464        } else if (flags & XBF_MAPPED) {
 465                if (as_list_len > 64)
 466                        purge_addresses();
 467                bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
 468                                        VM_MAP, PAGE_KERNEL);
 469                if (unlikely(bp->b_addr == NULL))
 470                        return -ENOMEM;
 471                bp->b_addr += bp->b_offset;
 472                bp->b_flags |= XBF_MAPPED;
 473        }
 474
 475        return 0;
 476}
 477
 478/*
 479 *      Finding and Reading Buffers
 480 */
 481
 482/*
  483 *      Looks up, and creates if absent, a lockable buffer for
 484 *      a given range of an inode.  The buffer is returned
 485 *      locked.  If other overlapping buffers exist, they are
 486 *      released before the new buffer is created and locked,
 487 *      which may imply that this call will block until those buffers
 488 *      are unlocked.  No I/O is implied by this call.
 489 */
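     /*
      *      Typical caller pattern (a sketch only -- xfs_buf_get_flags() below
      *      is the canonical user; error handling omitted): pre-allocate a
      *      buffer, let _xfs_buf_find() either insert it or return an existing
      *      match, and release the unused one:
      *
      *              new_bp = xfs_buf_allocate(flags);
      *              bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
      *              if (bp != new_bp)
      *                      xfs_buf_deallocate(new_bp);
      */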
 490xfs_buf_t *
 491_xfs_buf_find(
 492        xfs_buftarg_t           *btp,   /* block device target          */
 493        xfs_off_t               ioff,   /* starting offset of range     */
 494        size_t                  isize,  /* length of range              */
 495        xfs_buf_flags_t         flags,
 496        xfs_buf_t               *new_bp)
 497{
 498        xfs_off_t               range_base;
 499        size_t                  range_length;
 500        xfs_bufhash_t           *hash;
 501        xfs_buf_t               *bp, *n;
 502
 503        range_base = (ioff << BBSHIFT);
 504        range_length = (isize << BBSHIFT);
 505
 506        /* Check for IOs smaller than the sector size / not sector aligned */
 507        ASSERT(!(range_length < (1 << btp->bt_sshift)));
 508        ASSERT(!(range_base & (xfs_off_t)btp->bt_smask));
 509
 510        hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)];
 511
 512        spin_lock(&hash->bh_lock);
 513
 514        list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
 515                ASSERT(btp == bp->b_target);
 516                if (bp->b_file_offset == range_base &&
 517                    bp->b_buffer_length == range_length) {
 518                        /*
 519                         * If we look at something, bring it to the
 520                         * front of the list for next time.
 521                         */
 522                        atomic_inc(&bp->b_hold);
 523                        list_move(&bp->b_hash_list, &hash->bh_list);
 524                        goto found;
 525                }
 526        }
 527
 528        /* No match found */
 529        if (new_bp) {
 530                _xfs_buf_initialize(new_bp, btp, range_base,
 531                                range_length, flags);
 532                new_bp->b_hash = hash;
 533                list_add(&new_bp->b_hash_list, &hash->bh_list);
 534        } else {
 535                XFS_STATS_INC(xb_miss_locked);
 536        }
 537
 538        spin_unlock(&hash->bh_lock);
 539        return new_bp;
 540
 541found:
 542        spin_unlock(&hash->bh_lock);
 543
  544        /* Attempt to get the semaphore without sleeping.  If this
  545         * does not work then we need to make a blocking attempt on
  546         * the semaphore; the hash spinlock was already dropped above.
 547         */
 548        if (down_trylock(&bp->b_sema)) {
 549                if (!(flags & XBF_TRYLOCK)) {
 550                        /* wait for buffer ownership */
 551                        XB_TRACE(bp, "get_lock", 0);
 552                        xfs_buf_lock(bp);
 553                        XFS_STATS_INC(xb_get_locked_waited);
 554                } else {
  555                        /* We asked for a trylock and failed; no need
  556                         * to look at file offset and length here.  We
  557                         * know that this buffer at least overlaps our
  558                         * buffer and is locked, therefore our buffer
  559                         * either does not exist or is this buffer.
 560                         */
 561                        xfs_buf_rele(bp);
 562                        XFS_STATS_INC(xb_busy_locked);
 563                        return NULL;
 564                }
 565        } else {
 566                /* trylock worked */
 567                XB_SET_OWNER(bp);
 568        }
 569
 570        if (bp->b_flags & XBF_STALE) {
 571                ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
 572                bp->b_flags &= XBF_MAPPED;
 573        }
 574        XB_TRACE(bp, "got_lock", 0);
 575        XFS_STATS_INC(xb_get_locked);
 576        return bp;
 577}
 578
 579/*
 580 *      Assembles a buffer covering the specified range.
 581 *      Storage in memory for all portions of the buffer will be allocated,
 582 *      although backing storage may not be.
 583 */
 584xfs_buf_t *
 585xfs_buf_get_flags(
 586        xfs_buftarg_t           *target,/* target for buffer            */
 587        xfs_off_t               ioff,   /* starting offset of range     */
 588        size_t                  isize,  /* length of range              */
 589        xfs_buf_flags_t         flags)
 590{
 591        xfs_buf_t               *bp, *new_bp;
 592        int                     error = 0, i;
 593
 594        new_bp = xfs_buf_allocate(flags);
 595        if (unlikely(!new_bp))
 596                return NULL;
 597
 598        bp = _xfs_buf_find(target, ioff, isize, flags, new_bp);
 599        if (bp == new_bp) {
 600                error = _xfs_buf_lookup_pages(bp, flags);
 601                if (error)
 602                        goto no_buffer;
 603        } else {
 604                xfs_buf_deallocate(new_bp);
 605                if (unlikely(bp == NULL))
 606                        return NULL;
 607        }
 608
 609        for (i = 0; i < bp->b_page_count; i++)
 610                mark_page_accessed(bp->b_pages[i]);
 611
 612        if (!(bp->b_flags & XBF_MAPPED)) {
 613                error = _xfs_buf_map_pages(bp, flags);
 614                if (unlikely(error)) {
 615                        printk(KERN_WARNING "%s: failed to map pages\n",
 616                                        __func__);
 617                        goto no_buffer;
 618                }
 619        }
 620
 621        XFS_STATS_INC(xb_get);
 622
 623        /*
  624         * Always fill in the block number now; the mapped cases can do
 625         * their own overlay of this later.
 626         */
 627        bp->b_bn = ioff;
 628        bp->b_count_desired = bp->b_buffer_length;
 629
 630        XB_TRACE(bp, "get", (unsigned long)flags);
 631        return bp;
 632
 633 no_buffer:
 634        if (flags & (XBF_LOCK | XBF_TRYLOCK))
 635                xfs_buf_unlock(bp);
 636        xfs_buf_rele(bp);
 637        return NULL;
 638}
 639
 640STATIC int
 641_xfs_buf_read(
 642        xfs_buf_t               *bp,
 643        xfs_buf_flags_t         flags)
 644{
 645        int                     status;
 646
 647        XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags);
 648
 649        ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
 650        ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
 651
 652        bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
 653                        XBF_READ_AHEAD | _XBF_RUN_QUEUES);
 654        bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \
 655                        XBF_READ_AHEAD | _XBF_RUN_QUEUES);
 656
 657        status = xfs_buf_iorequest(bp);
 658        if (!status && !(flags & XBF_ASYNC))
 659                status = xfs_buf_iowait(bp);
 660        return status;
 661}
 662
 663xfs_buf_t *
 664xfs_buf_read_flags(
 665        xfs_buftarg_t           *target,
 666        xfs_off_t               ioff,
 667        size_t                  isize,
 668        xfs_buf_flags_t         flags)
 669{
 670        xfs_buf_t               *bp;
 671
 672        flags |= XBF_READ;
 673
 674        bp = xfs_buf_get_flags(target, ioff, isize, flags);
 675        if (bp) {
 676                if (!XFS_BUF_ISDONE(bp)) {
 677                        XB_TRACE(bp, "read", (unsigned long)flags);
 678                        XFS_STATS_INC(xb_get_read);
 679                        _xfs_buf_read(bp, flags);
 680                } else if (flags & XBF_ASYNC) {
 681                        XB_TRACE(bp, "read_async", (unsigned long)flags);
 682                        /*
 683                         * Read ahead call which is already satisfied,
 684                         * drop the buffer
 685                         */
 686                        goto no_buffer;
 687                } else {
 688                        XB_TRACE(bp, "read_done", (unsigned long)flags);
 689                        /* We do not want read in the flags */
 690                        bp->b_flags &= ~XBF_READ;
 691                }
 692        }
 693
 694        return bp;
 695
 696 no_buffer:
 697        if (flags & (XBF_LOCK | XBF_TRYLOCK))
 698                xfs_buf_unlock(bp);
 699        xfs_buf_rele(bp);
 700        return NULL;
 701}
 702
 703/*
 704 *      If we are not low on memory then do the readahead in a deadlock
 705 *      safe manner.
 706 */
 707void
 708xfs_buf_readahead(
 709        xfs_buftarg_t           *target,
 710        xfs_off_t               ioff,
 711        size_t                  isize,
 712        xfs_buf_flags_t         flags)
 713{
 714        struct backing_dev_info *bdi;
 715
 716        bdi = target->bt_mapping->backing_dev_info;
 717        if (bdi_read_congested(bdi))
 718                return;
 719
 720        flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
 721        xfs_buf_read_flags(target, ioff, isize, flags);
 722}
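     /*
      *      Usage sketch (illustrative only; "blkno" and "numblks" are
      *      placeholder values in 512 byte basic blocks): start read-ahead
      *      early, then issue the blocking read later.  The second call finds
      *      the buffer already cached and, if the I/O has completed, returns
      *      it without re-reading:
      *
      *              xfs_buf_readahead(target, blkno, numblks, 0);
      *              ...
      *              bp = xfs_buf_read_flags(target, blkno, numblks,
      *                                      XBF_LOCK | XBF_MAPPED);
      */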
 723
 724xfs_buf_t *
 725xfs_buf_get_empty(
 726        size_t                  len,
 727        xfs_buftarg_t           *target)
 728{
 729        xfs_buf_t               *bp;
 730
 731        bp = xfs_buf_allocate(0);
 732        if (bp)
 733                _xfs_buf_initialize(bp, target, 0, len, 0);
 734        return bp;
 735}
 736
 737static inline struct page *
 738mem_to_page(
 739        void                    *addr)
 740{
 741        if ((!is_vmalloc_addr(addr))) {
 742                return virt_to_page(addr);
 743        } else {
 744                return vmalloc_to_page(addr);
 745        }
 746}
 747
 748int
 749xfs_buf_associate_memory(
 750        xfs_buf_t               *bp,
 751        void                    *mem,
 752        size_t                  len)
 753{
 754        int                     rval;
 755        int                     i = 0;
 756        unsigned long           pageaddr;
 757        unsigned long           offset;
 758        size_t                  buflen;
 759        int                     page_count;
 760
 761        pageaddr = (unsigned long)mem & PAGE_CACHE_MASK;
 762        offset = (unsigned long)mem - pageaddr;
 763        buflen = PAGE_CACHE_ALIGN(len + offset);
 764        page_count = buflen >> PAGE_CACHE_SHIFT;
 765
 766        /* Free any previous set of page pointers */
 767        if (bp->b_pages)
 768                _xfs_buf_free_pages(bp);
 769
 770        bp->b_pages = NULL;
 771        bp->b_addr = mem;
 772
 773        rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK);
 774        if (rval)
 775                return rval;
 776
 777        bp->b_offset = offset;
 778
 779        for (i = 0; i < bp->b_page_count; i++) {
 780                bp->b_pages[i] = mem_to_page((void *)pageaddr);
 781                pageaddr += PAGE_CACHE_SIZE;
 782        }
 783
 784        bp->b_count_desired = len;
 785        bp->b_buffer_length = buflen;
 786        bp->b_flags |= XBF_MAPPED;
 787        bp->b_flags &= ~_XBF_PAGE_LOCKED;
 788
 789        return 0;
 790}
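     /*
      *      Sketch of wrapping caller-owned memory in a buffer (illustrative;
      *      "mem" and "len" are placeholders): build an uncached buffer around
      *      an existing allocation instead of page cache pages:
      *
      *              bp = xfs_buf_get_empty(len, target);
      *              if (bp)
      *                      error = xfs_buf_associate_memory(bp, mem, len);
      */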
 791
 792xfs_buf_t *
 793xfs_buf_get_noaddr(
 794        size_t                  len,
 795        xfs_buftarg_t           *target)
 796{
 797        unsigned long           page_count = PAGE_ALIGN(len) >> PAGE_SHIFT;
 798        int                     error, i;
 799        xfs_buf_t               *bp;
 800
 801        bp = xfs_buf_allocate(0);
 802        if (unlikely(bp == NULL))
 803                goto fail;
 804        _xfs_buf_initialize(bp, target, 0, len, 0);
 805
 806        error = _xfs_buf_get_pages(bp, page_count, 0);
 807        if (error)
 808                goto fail_free_buf;
 809
 810        for (i = 0; i < page_count; i++) {
 811                bp->b_pages[i] = alloc_page(GFP_KERNEL);
 812                if (!bp->b_pages[i])
 813                        goto fail_free_mem;
 814        }
 815        bp->b_flags |= _XBF_PAGES;
 816
 817        error = _xfs_buf_map_pages(bp, XBF_MAPPED);
 818        if (unlikely(error)) {
 819                printk(KERN_WARNING "%s: failed to map pages\n",
 820                                __func__);
 821                goto fail_free_mem;
 822        }
 823
 824        xfs_buf_unlock(bp);
 825
 826        XB_TRACE(bp, "no_daddr", len);
 827        return bp;
 828
 829 fail_free_mem:
 830        while (--i >= 0)
 831                __free_page(bp->b_pages[i]);
 832        _xfs_buf_free_pages(bp);
 833 fail_free_buf:
 834        xfs_buf_deallocate(bp);
 835 fail:
 836        return NULL;
 837}
 838
 839/*
 840 *      Increment reference count on buffer, to hold the buffer concurrently
 841 *      with another thread which may release (free) the buffer asynchronously.
 842 *      Must hold the buffer already to call this function.
 843 */
 844void
 845xfs_buf_hold(
 846        xfs_buf_t               *bp)
 847{
 848        atomic_inc(&bp->b_hold);
 849        XB_TRACE(bp, "hold", 0);
 850}
 851
 852/*
 853 *      Releases a hold on the specified buffer.  If the
  854 *      hold count is 1, calls xfs_buf_free.
 855 */
 856void
 857xfs_buf_rele(
 858        xfs_buf_t               *bp)
 859{
 860        xfs_bufhash_t           *hash = bp->b_hash;
 861
 862        XB_TRACE(bp, "rele", bp->b_relse);
 863
 864        if (unlikely(!hash)) {
 865                ASSERT(!bp->b_relse);
 866                if (atomic_dec_and_test(&bp->b_hold))
 867                        xfs_buf_free(bp);
 868                return;
 869        }
 870
 871        ASSERT(atomic_read(&bp->b_hold) > 0);
 872        if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) {
 873                if (bp->b_relse) {
 874                        atomic_inc(&bp->b_hold);
 875                        spin_unlock(&hash->bh_lock);
 876                        (*(bp->b_relse)) (bp);
 877                } else if (bp->b_flags & XBF_FS_MANAGED) {
 878                        spin_unlock(&hash->bh_lock);
 879                } else {
 880                        ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
 881                        list_del_init(&bp->b_hash_list);
 882                        spin_unlock(&hash->bh_lock);
 883                        xfs_buf_free(bp);
 884                }
 885        }
 886}
 887
 888
 889/*
 890 *      Mutual exclusion on buffers.  Locking model:
 891 *
 892 *      Buffers associated with inodes for which buffer locking
 893 *      is not enabled are not protected by semaphores, and are
 894 *      assumed to be exclusively owned by the caller.  There is a
 895 *      spinlock in the buffer, used by the caller when concurrent
 896 *      access is possible.
 897 */
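     /*
      *      Illustrative pattern (a sketch, not a required call sequence): try
      *      for the lock first and fall back to a blocking lock, then drop it
      *      with xfs_buf_unlock() when done.  xfs_buf_cond_lock() returns
      *      -EBUSY when the buffer is already locked:
      *
      *              if (xfs_buf_cond_lock(bp))
      *                      xfs_buf_lock(bp);
      *              ...
      *              xfs_buf_unlock(bp);
      */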
 898
 899/*
 900 *      Locks a buffer object, if it is not already locked.
 901 *      Note that this in no way locks the underlying pages, so it is only
 902 *      useful for synchronizing concurrent use of buffer objects, not for
 903 *      synchronizing independent access to the underlying pages.
 904 */
 905int
 906xfs_buf_cond_lock(
 907        xfs_buf_t               *bp)
 908{
 909        int                     locked;
 910
 911        locked = down_trylock(&bp->b_sema) == 0;
 912        if (locked) {
 913                XB_SET_OWNER(bp);
 914        }
 915        XB_TRACE(bp, "cond_lock", (long)locked);
 916        return locked ? 0 : -EBUSY;
 917}
 918
 919#if defined(DEBUG) || defined(XFS_BLI_TRACE)
 920int
 921xfs_buf_lock_value(
 922        xfs_buf_t               *bp)
 923{
 924        return bp->b_sema.count;
 925}
 926#endif
 927
 928/*
 929 *      Locks a buffer object.
 930 *      Note that this in no way locks the underlying pages, so it is only
 931 *      useful for synchronizing concurrent use of buffer objects, not for
 932 *      synchronizing independent access to the underlying pages.
 933 */
 934void
 935xfs_buf_lock(
 936        xfs_buf_t               *bp)
 937{
 938        XB_TRACE(bp, "lock", 0);
 939        if (atomic_read(&bp->b_io_remaining))
 940                blk_run_address_space(bp->b_target->bt_mapping);
 941        down(&bp->b_sema);
 942        XB_SET_OWNER(bp);
 943        XB_TRACE(bp, "locked", 0);
 944}
 945
 946/*
 947 *      Releases the lock on the buffer object.
 948 *      If the buffer is marked delwri but is not queued, do so before we
 949 *      unlock the buffer as we need to set flags correctly.  We also need to
 950 *      take a reference for the delwri queue because the unlocker is going to
  951 *      drop theirs and they don't know we just queued it.
 952 */
 953void
 954xfs_buf_unlock(
 955        xfs_buf_t               *bp)
 956{
 957        if ((bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)) == XBF_DELWRI) {
 958                atomic_inc(&bp->b_hold);
 959                bp->b_flags |= XBF_ASYNC;
 960                xfs_buf_delwri_queue(bp, 0);
 961        }
 962
 963        XB_CLEAR_OWNER(bp);
 964        up(&bp->b_sema);
 965        XB_TRACE(bp, "unlock", 0);
 966}
 967
 968
 969/*
 970 *      Pinning Buffer Storage in Memory
 971 *      Ensure that no attempt to force a buffer to disk will succeed.
 972 */
 973void
 974xfs_buf_pin(
 975        xfs_buf_t               *bp)
 976{
 977        atomic_inc(&bp->b_pin_count);
 978        XB_TRACE(bp, "pin", (long)bp->b_pin_count.counter);
 979}
 980
 981void
 982xfs_buf_unpin(
 983        xfs_buf_t               *bp)
 984{
 985        if (atomic_dec_and_test(&bp->b_pin_count))
 986                wake_up_all(&bp->b_waiters);
 987        XB_TRACE(bp, "unpin", (long)bp->b_pin_count.counter);
 988}
 989
 990int
 991xfs_buf_ispin(
 992        xfs_buf_t               *bp)
 993{
 994        return atomic_read(&bp->b_pin_count);
 995}
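     /*
      *      Illustrative pairing (a sketch; in XFS the buffer log item code
      *      drives this): pin while the buffer's latest changes exist only in
      *      the log, unpin once the log I/O covering them completes.  Writers
      *      block in xfs_buf_wait_unpin() until the pin count reaches zero:
      *
      *              xfs_buf_pin(bp);
      *              ... log I/O for the buffer's changes in flight ...
      *              xfs_buf_unpin(bp);
      */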
 996
 997STATIC void
 998xfs_buf_wait_unpin(
 999        xfs_buf_t               *bp)
1000{
1001        DECLARE_WAITQUEUE       (wait, current);
1002
1003        if (atomic_read(&bp->b_pin_count) == 0)
1004                return;
1005
1006        add_wait_queue(&bp->b_waiters, &wait);
1007        for (;;) {
1008                set_current_state(TASK_UNINTERRUPTIBLE);
1009                if (atomic_read(&bp->b_pin_count) == 0)
1010                        break;
1011                if (atomic_read(&bp->b_io_remaining))
1012                        blk_run_address_space(bp->b_target->bt_mapping);
1013                schedule();
1014        }
1015        remove_wait_queue(&bp->b_waiters, &wait);
1016        set_current_state(TASK_RUNNING);
1017}
1018
1019/*
1020 *      Buffer Utility Routines
1021 */
1022
1023STATIC void
1024xfs_buf_iodone_work(
1025        struct work_struct      *work)
1026{
1027        xfs_buf_t               *bp =
1028                container_of(work, xfs_buf_t, b_iodone_work);
1029
1030        /*
1031         * We can get an EOPNOTSUPP to ordered writes.  Here we clear the
1032         * ordered flag and reissue them.  Because we can't tell the higher
1033         * layers directly that they should not issue ordered I/O anymore, they
1034         * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion.
1035         */
1036        if ((bp->b_error == EOPNOTSUPP) &&
1037            (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) {
1038                XB_TRACE(bp, "ordered_retry", bp->b_iodone);
1039                bp->b_flags &= ~XBF_ORDERED;
1040                bp->b_flags |= _XFS_BARRIER_FAILED;
1041                xfs_buf_iorequest(bp);
1042        } else if (bp->b_iodone)
1043                (*(bp->b_iodone))(bp);
1044        else if (bp->b_flags & XBF_ASYNC)
1045                xfs_buf_relse(bp);
1046}
1047
1048void
1049xfs_buf_ioend(
1050        xfs_buf_t               *bp,
1051        int                     schedule)
1052{
1053        bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
1054        if (bp->b_error == 0)
1055                bp->b_flags |= XBF_DONE;
1056
1057        XB_TRACE(bp, "iodone", bp->b_iodone);
1058
1059        if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
1060                if (schedule) {
1061                        INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
1062                        queue_work(xfslogd_workqueue, &bp->b_iodone_work);
1063                } else {
1064                        xfs_buf_iodone_work(&bp->b_iodone_work);
1065                }
1066        } else {
1067                complete(&bp->b_iowait);
1068        }
1069}
1070
1071void
1072xfs_buf_ioerror(
1073        xfs_buf_t               *bp,
1074        int                     error)
1075{
1076        ASSERT(error >= 0 && error <= 0xffff);
1077        bp->b_error = (unsigned short)error;
1078        XB_TRACE(bp, "ioerror", (unsigned long)error);
1079}
1080
1081int
1082xfs_bawrite(
1083        void                    *mp,
1084        struct xfs_buf          *bp)
1085{
1086        XB_TRACE(bp, "bawrite", 0);
1087
1088        ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
1089
1090        xfs_buf_delwri_dequeue(bp);
1091
1092        bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
1093        bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
1094
1095        bp->b_mount = mp;
1096        bp->b_strat = xfs_bdstrat_cb;
1097        return xfs_bdstrat_cb(bp);
1098}
1099
1100void
1101xfs_bdwrite(
1102        void                    *mp,
1103        struct xfs_buf          *bp)
1104{
1105        XB_TRACE(bp, "bdwrite", 0);
1106
1107        bp->b_strat = xfs_bdstrat_cb;
1108        bp->b_mount = mp;
1109
1110        bp->b_flags &= ~XBF_READ;
1111        bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
1112
1113        xfs_buf_delwri_queue(bp, 1);
1114}
1115
1116STATIC_INLINE void
1117_xfs_buf_ioend(
1118        xfs_buf_t               *bp,
1119        int                     schedule)
1120{
1121        if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
1122                bp->b_flags &= ~_XBF_PAGE_LOCKED;
1123                xfs_buf_ioend(bp, schedule);
1124        }
1125}
1126
1127STATIC void
1128xfs_buf_bio_end_io(
1129        struct bio              *bio,
1130        int                     error)
1131{
1132        xfs_buf_t               *bp = (xfs_buf_t *)bio->bi_private;
1133        unsigned int            blocksize = bp->b_target->bt_bsize;
1134        struct bio_vec          *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1135
1136        xfs_buf_ioerror(bp, -error);
1137
1138        do {
1139                struct page     *page = bvec->bv_page;
1140
1141                ASSERT(!PagePrivate(page));
1142                if (unlikely(bp->b_error)) {
1143                        if (bp->b_flags & XBF_READ)
1144                                ClearPageUptodate(page);
1145                } else if (blocksize >= PAGE_CACHE_SIZE) {
1146                        SetPageUptodate(page);
1147                } else if (!PagePrivate(page) &&
1148                                (bp->b_flags & _XBF_PAGE_CACHE)) {
1149                        set_page_region(page, bvec->bv_offset, bvec->bv_len);
1150                }
1151
1152                if (--bvec >= bio->bi_io_vec)
1153                        prefetchw(&bvec->bv_page->flags);
1154
1155                if (bp->b_flags & _XBF_PAGE_LOCKED)
1156                        unlock_page(page);
1157        } while (bvec >= bio->bi_io_vec);
1158
1159        _xfs_buf_ioend(bp, 1);
1160        bio_put(bio);
1161}
1162
1163STATIC void
1164_xfs_buf_ioapply(
1165        xfs_buf_t               *bp)
1166{
1167        int                     rw, map_i, total_nr_pages, nr_pages;
1168        struct bio              *bio;
1169        int                     offset = bp->b_offset;
1170        int                     size = bp->b_count_desired;
1171        sector_t                sector = bp->b_bn;
1172        unsigned int            blocksize = bp->b_target->bt_bsize;
1173
1174        total_nr_pages = bp->b_page_count;
1175        map_i = 0;
1176
1177        if (bp->b_flags & XBF_ORDERED) {
1178                ASSERT(!(bp->b_flags & XBF_READ));
1179                rw = WRITE_BARRIER;
1180        } else if (bp->b_flags & _XBF_RUN_QUEUES) {
1181                ASSERT(!(bp->b_flags & XBF_READ_AHEAD));
1182                bp->b_flags &= ~_XBF_RUN_QUEUES;
1183                rw = (bp->b_flags & XBF_WRITE) ? WRITE_SYNC : READ_SYNC;
1184        } else {
1185                rw = (bp->b_flags & XBF_WRITE) ? WRITE :
1186                     (bp->b_flags & XBF_READ_AHEAD) ? READA : READ;
1187        }
1188
 1189        /* Special code path for reading a sub-page-size buffer --
 1190         * we populate the whole page, and hence the other metadata
1191         * in the same page.  This optimization is only valid when the
1192         * filesystem block size is not smaller than the page size.
1193         */
1194        if ((bp->b_buffer_length < PAGE_CACHE_SIZE) &&
1195            ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) ==
1196              (XBF_READ|_XBF_PAGE_LOCKED)) &&
1197            (blocksize >= PAGE_CACHE_SIZE)) {
1198                bio = bio_alloc(GFP_NOIO, 1);
1199
1200                bio->bi_bdev = bp->b_target->bt_bdev;
1201                bio->bi_sector = sector - (offset >> BBSHIFT);
1202                bio->bi_end_io = xfs_buf_bio_end_io;
1203                bio->bi_private = bp;
1204
1205                bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0);
1206                size = 0;
1207
1208                atomic_inc(&bp->b_io_remaining);
1209
1210                goto submit_io;
1211        }
1212
1213next_chunk:
1214        atomic_inc(&bp->b_io_remaining);
1215        nr_pages = BIO_MAX_SECTORS >> (PAGE_SHIFT - BBSHIFT);
1216        if (nr_pages > total_nr_pages)
1217                nr_pages = total_nr_pages;
1218
1219        bio = bio_alloc(GFP_NOIO, nr_pages);
1220        bio->bi_bdev = bp->b_target->bt_bdev;
1221        bio->bi_sector = sector;
1222        bio->bi_end_io = xfs_buf_bio_end_io;
1223        bio->bi_private = bp;
1224
1225        for (; size && nr_pages; nr_pages--, map_i++) {
1226                int     rbytes, nbytes = PAGE_CACHE_SIZE - offset;
1227
1228                if (nbytes > size)
1229                        nbytes = size;
1230
1231                rbytes = bio_add_page(bio, bp->b_pages[map_i], nbytes, offset);
1232                if (rbytes < nbytes)
1233                        break;
1234
1235                offset = 0;
1236                sector += nbytes >> BBSHIFT;
1237                size -= nbytes;
1238                total_nr_pages--;
1239        }
1240
1241submit_io:
1242        if (likely(bio->bi_size)) {
1243                submit_bio(rw, bio);
1244                if (size)
1245                        goto next_chunk;
1246        } else {
1247                bio_put(bio);
1248                xfs_buf_ioerror(bp, EIO);
1249        }
1250}
1251
1252int
1253xfs_buf_iorequest(
1254        xfs_buf_t               *bp)
1255{
1256        XB_TRACE(bp, "iorequest", 0);
1257
1258        if (bp->b_flags & XBF_DELWRI) {
1259                xfs_buf_delwri_queue(bp, 1);
1260                return 0;
1261        }
1262
1263        if (bp->b_flags & XBF_WRITE) {
1264                xfs_buf_wait_unpin(bp);
1265        }
1266
1267        xfs_buf_hold(bp);
1268
 1269        /* Set the count to 1 initially; this will stop an I/O
1270         * completion callout which happens before we have started
1271         * all the I/O from calling xfs_buf_ioend too early.
1272         */
1273        atomic_set(&bp->b_io_remaining, 1);
1274        _xfs_buf_ioapply(bp);
1275        _xfs_buf_ioend(bp, 0);
1276
1277        xfs_buf_rele(bp);
1278        return 0;
1279}
1280
1281/*
1282 *      Waits for I/O to complete on the buffer supplied.
1283 *      It returns immediately if no I/O is pending.
1284 *      It returns the I/O error code, if any, or 0 if there was no error.
1285 */
1286int
1287xfs_buf_iowait(
1288        xfs_buf_t               *bp)
1289{
1290        XB_TRACE(bp, "iowait", 0);
1291        if (atomic_read(&bp->b_io_remaining))
1292                blk_run_address_space(bp->b_target->bt_mapping);
1293        wait_for_completion(&bp->b_iowait);
1294        XB_TRACE(bp, "iowaited", (long)bp->b_error);
1295        return bp->b_error;
1296}
1297
1298xfs_caddr_t
1299xfs_buf_offset(
1300        xfs_buf_t               *bp,
1301        size_t                  offset)
1302{
1303        struct page             *page;
1304
1305        if (bp->b_flags & XBF_MAPPED)
1306                return XFS_BUF_PTR(bp) + offset;
1307
1308        offset += bp->b_offset;
1309        page = bp->b_pages[offset >> PAGE_CACHE_SHIFT];
1310        return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1));
1311}
1312
1313/*
1314 *      Move data into or out of a buffer.
1315 */
1316void
1317xfs_buf_iomove(
1318        xfs_buf_t               *bp,    /* buffer to process            */
1319        size_t                  boff,   /* starting buffer offset       */
1320        size_t                  bsize,  /* length to copy               */
1321        caddr_t                 data,   /* data address                 */
1322        xfs_buf_rw_t            mode)   /* read/write/zero flag         */
1323{
1324        size_t                  bend, cpoff, csize;
1325        struct page             *page;
1326
1327        bend = boff + bsize;
1328        while (boff < bend) {
1329                page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)];
1330                cpoff = xfs_buf_poff(boff + bp->b_offset);
1331                csize = min_t(size_t,
1332                              PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff);
1333
1334                ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
1335
1336                switch (mode) {
1337                case XBRW_ZERO:
1338                        memset(page_address(page) + cpoff, 0, csize);
1339                        break;
1340                case XBRW_READ:
1341                        memcpy(data, page_address(page) + cpoff, csize);
1342                        break;
1343                case XBRW_WRITE:
1344                        memcpy(page_address(page) + cpoff, data, csize);
1345                }
1346
1347                boff += csize;
1348                data += csize;
1349        }
1350}
1351
1352/*
1353 *      Handling of buffer targets (buftargs).
1354 */
1355
1356/*
1357 *      Wait for any bufs with callbacks that have been submitted but
1358 *      have not yet returned... walk the hash list for the target.
1359 */
1360void
1361xfs_wait_buftarg(
1362        xfs_buftarg_t   *btp)
1363{
1364        xfs_buf_t       *bp, *n;
1365        xfs_bufhash_t   *hash;
1366        uint            i;
1367
1368        for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1369                hash = &btp->bt_hash[i];
1370again:
1371                spin_lock(&hash->bh_lock);
1372                list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) {
1373                        ASSERT(btp == bp->b_target);
1374                        if (!(bp->b_flags & XBF_FS_MANAGED)) {
1375                                spin_unlock(&hash->bh_lock);
1376                                /*
1377                                 * Catch superblock reference count leaks
1378                                 * immediately
1379                                 */
1380                                BUG_ON(bp->b_bn == 0);
1381                                delay(100);
1382                                goto again;
1383                        }
1384                }
1385                spin_unlock(&hash->bh_lock);
1386        }
1387}
1388
1389/*
1390 *      Allocate buffer hash table for a given target.
1391 *      For devices containing metadata (i.e. not the log/realtime devices)
1392 *      we need to allocate a much larger hash table.
1393 */
1394STATIC void
1395xfs_alloc_bufhash(
1396        xfs_buftarg_t           *btp,
1397        int                     external)
1398{
1399        unsigned int            i;
1400
1401        btp->bt_hashshift = external ? 3 : 8;   /* 8 or 256 buckets */
1402        btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
1403        btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
1404                                        sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE);
1405        for (i = 0; i < (1 << btp->bt_hashshift); i++) {
1406                spin_lock_init(&btp->bt_hash[i].bh_lock);
1407                INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
1408        }
1409}
1410
1411STATIC void
1412xfs_free_bufhash(
1413        xfs_buftarg_t           *btp)
1414{
1415        kmem_free(btp->bt_hash);
1416        btp->bt_hash = NULL;
1417}
1418
1419/*
1420 *      buftarg list for delwrite queue processing
1421 */
1422static LIST_HEAD(xfs_buftarg_list);
1423static DEFINE_SPINLOCK(xfs_buftarg_lock);
1424
1425STATIC void
1426xfs_register_buftarg(
1427        xfs_buftarg_t           *btp)
1428{
1429        spin_lock(&xfs_buftarg_lock);
1430        list_add(&btp->bt_list, &xfs_buftarg_list);
1431        spin_unlock(&xfs_buftarg_lock);
1432}
1433
1434STATIC void
1435xfs_unregister_buftarg(
1436        xfs_buftarg_t           *btp)
1437{
1438        spin_lock(&xfs_buftarg_lock);
1439        list_del(&btp->bt_list);
1440        spin_unlock(&xfs_buftarg_lock);
1441}
1442
1443void
1444xfs_free_buftarg(
1445        struct xfs_mount        *mp,
1446        struct xfs_buftarg      *btp)
1447{
1448        xfs_flush_buftarg(btp, 1);
1449        if (mp->m_flags & XFS_MOUNT_BARRIER)
1450                xfs_blkdev_issue_flush(btp);
1451        xfs_free_bufhash(btp);
1452        iput(btp->bt_mapping->host);
1453
1454        /* Unregister the buftarg first so that we don't get a
1455         * wakeup finding a non-existent task
1456         */
1457        xfs_unregister_buftarg(btp);
1458        kthread_stop(btp->bt_task);
1459
1460        kmem_free(btp);
1461}
1462
1463STATIC int
1464xfs_setsize_buftarg_flags(
1465        xfs_buftarg_t           *btp,
1466        unsigned int            blocksize,
1467        unsigned int            sectorsize,
1468        int                     verbose)
1469{
1470        btp->bt_bsize = blocksize;
1471        btp->bt_sshift = ffs(sectorsize) - 1;
1472        btp->bt_smask = sectorsize - 1;
1473
1474        if (set_blocksize(btp->bt_bdev, sectorsize)) {
1475                printk(KERN_WARNING
1476                        "XFS: Cannot set_blocksize to %u on device %s\n",
1477                        sectorsize, XFS_BUFTARG_NAME(btp));
1478                return EINVAL;
1479        }
1480
1481        if (verbose &&
1482            (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) {
1483                printk(KERN_WARNING
1484                        "XFS: %u byte sectors in use on device %s.  "
1485                        "This is suboptimal; %u or greater is ideal.\n",
1486                        sectorsize, XFS_BUFTARG_NAME(btp),
1487                        (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG);
1488        }
1489
1490        return 0;
1491}
1492
1493/*
1494 *      When allocating the initial buffer target we have not yet
 1495 *      read in the superblock, so we don't know what size sectors
 1496 *      are being used at this early stage.  Play safe.
1497 */
1498STATIC int
1499xfs_setsize_buftarg_early(
1500        xfs_buftarg_t           *btp,
1501        struct block_device     *bdev)
1502{
1503        return xfs_setsize_buftarg_flags(btp,
1504                        PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0);
1505}
1506
1507int
1508xfs_setsize_buftarg(
1509        xfs_buftarg_t           *btp,
1510        unsigned int            blocksize,
1511        unsigned int            sectorsize)
1512{
1513        return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
1514}
1515
1516STATIC int
1517xfs_mapping_buftarg(
1518        xfs_buftarg_t           *btp,
1519        struct block_device     *bdev)
1520{
1521        struct backing_dev_info *bdi;
1522        struct inode            *inode;
1523        struct address_space    *mapping;
1524        static const struct address_space_operations mapping_aops = {
1525                .sync_page = block_sync_page,
1526                .migratepage = fail_migrate_page,
1527        };
1528
1529        inode = new_inode(bdev->bd_inode->i_sb);
1530        if (!inode) {
1531                printk(KERN_WARNING
1532                        "XFS: Cannot allocate mapping inode for device %s\n",
1533                        XFS_BUFTARG_NAME(btp));
1534                return ENOMEM;
1535        }
1536        inode->i_mode = S_IFBLK;
1537        inode->i_bdev = bdev;
1538        inode->i_rdev = bdev->bd_dev;
1539        bdi = blk_get_backing_dev_info(bdev);
1540        if (!bdi)
1541                bdi = &default_backing_dev_info;
1542        mapping = &inode->i_data;
1543        mapping->a_ops = &mapping_aops;
1544        mapping->backing_dev_info = bdi;
1545        mapping_set_gfp_mask(mapping, GFP_NOFS);
1546        btp->bt_mapping = mapping;
1547        return 0;
1548}
1549
1550STATIC int
1551xfs_alloc_delwrite_queue(
1552        xfs_buftarg_t           *btp)
1553{
1554        int     error = 0;
1555
1556        INIT_LIST_HEAD(&btp->bt_list);
1557        INIT_LIST_HEAD(&btp->bt_delwrite_queue);
1558        spin_lock_init(&btp->bt_delwrite_lock);
1559        btp->bt_flags = 0;
1560        btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd");
1561        if (IS_ERR(btp->bt_task)) {
1562                error = PTR_ERR(btp->bt_task);
1563                goto out_error;
1564        }
1565        xfs_register_buftarg(btp);
1566out_error:
1567        return error;
1568}
1569
1570xfs_buftarg_t *
1571xfs_alloc_buftarg(
1572        struct block_device     *bdev,
1573        int                     external)
1574{
1575        xfs_buftarg_t           *btp;
1576
1577        btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1578
1579        btp->bt_dev =  bdev->bd_dev;
1580        btp->bt_bdev = bdev;
1581        if (xfs_setsize_buftarg_early(btp, bdev))
1582                goto error;
1583        if (xfs_mapping_buftarg(btp, bdev))
1584                goto error;
1585        if (xfs_alloc_delwrite_queue(btp))
1586                goto error;
1587        xfs_alloc_bufhash(btp, external);
1588        return btp;
1589
1590error:
1591        kmem_free(btp);
1592        return NULL;
1593}
1594
1595
1596/*
1597 *      Delayed write buffer handling
1598 */
1599STATIC void
1600xfs_buf_delwri_queue(
1601        xfs_buf_t               *bp,
1602        int                     unlock)
1603{
1604        struct list_head        *dwq = &bp->b_target->bt_delwrite_queue;
1605        spinlock_t              *dwlk = &bp->b_target->bt_delwrite_lock;
1606
1607        XB_TRACE(bp, "delwri_q", (long)unlock);
1608        ASSERT((bp->b_flags&(XBF_DELWRI|XBF_ASYNC)) == (XBF_DELWRI|XBF_ASYNC));
1609
1610        spin_lock(dwlk);
1611        /* If already in the queue, dequeue and place at tail */
1612        if (!list_empty(&bp->b_list)) {
1613                ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1614                if (unlock)
1615                        atomic_dec(&bp->b_hold);
1616                list_del(&bp->b_list);
1617        }
1618
1619        bp->b_flags |= _XBF_DELWRI_Q;
1620        list_add_tail(&bp->b_list, dwq);
1621        bp->b_queuetime = jiffies;
1622        spin_unlock(dwlk);
1623
1624        if (unlock)
1625                xfs_buf_unlock(bp);
1626}
1627
1628void
1629xfs_buf_delwri_dequeue(
1630        xfs_buf_t               *bp)
1631{
1632        spinlock_t              *dwlk = &bp->b_target->bt_delwrite_lock;
1633        int                     dequeued = 0;
1634
1635        spin_lock(dwlk);
1636        if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
1637                ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1638                list_del_init(&bp->b_list);
1639                dequeued = 1;
1640        }
1641        bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
1642        spin_unlock(dwlk);
1643
1644        if (dequeued)
1645                xfs_buf_rele(bp);
1646
1647        XB_TRACE(bp, "delwri_dq", (long)dequeued);
1648}
1649
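/*
 * Wait for all currently queued work on the given workqueue to complete.
 */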
1650STATIC void
1651xfs_buf_runall_queues(
1652        struct workqueue_struct *queue)
1653{
1654        flush_workqueue(queue);
1655}
1656
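/*
 * Memory shrinker callback (hooked up via xfs_buf_shake): ask every
 * registered buffer target to flush its delayed write queue by setting
 * XBT_FORCE_FLUSH and waking its xfsbufd thread.  Targets whose thread
 * has been parked by the freezer are skipped.
 */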
1657STATIC int
1658xfsbufd_wakeup(
1659        int                     priority,
1660        gfp_t                   mask)
1661{
1662        xfs_buftarg_t           *btp;
1663
1664        spin_lock(&xfs_buftarg_lock);
1665        list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
1666                if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
1667                        continue;
1668                set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
1669                wake_up_process(btp->bt_task);
1670        }
1671        spin_unlock(&xfs_buftarg_lock);
1672        return 0;
1673}
1674
1675/*
1676 * Move as many buffers as specified to the supplied list,
1677 * indicating if we skipped any buffers to prevent deadlocks.
1678 */
1679STATIC int
1680xfs_buf_delwri_split(
1681        xfs_buftarg_t   *target,
1682        struct list_head *list,
1683        unsigned long   age)
1684{
1685        xfs_buf_t       *bp, *n;
1686        struct list_head *dwq = &target->bt_delwrite_queue;
1687        spinlock_t      *dwlk = &target->bt_delwrite_lock;
1688        int             skipped = 0;
1689        int             force;
1690
1691        force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1692        INIT_LIST_HEAD(list);
1693        spin_lock(dwlk);
1694        list_for_each_entry_safe(bp, n, dwq, b_list) {
1695                XB_TRACE(bp, "walkq1", (long)xfs_buf_ispin(bp));
1696                ASSERT(bp->b_flags & XBF_DELWRI);
1697
1698                if (!xfs_buf_ispin(bp) && !xfs_buf_cond_lock(bp)) {
1699                        if (!force &&
1700                            time_before(jiffies, bp->b_queuetime + age)) {
1701                                xfs_buf_unlock(bp);
1702                                break;
1703                        }
1704
1705                        bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q|
1706                                         _XBF_RUN_QUEUES);
1707                        bp->b_flags |= XBF_WRITE;
1708                        list_move_tail(&bp->b_list, list);
1709                } else
1710                        skipped++;
1711        }
1712        spin_unlock(dwlk);
1713
1714        return skipped;
1715
1716}
1717
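/*
 * Per-target delayed write flushing daemon.  Wake up periodically (every
 * xfs_buf_timer_centisecs), pull buffers that have aged past
 * xfs_buf_age_centisecs (or everything, if a flush was forced) off the
 * delayed write queue and push them out to disk.
 */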
1718STATIC int
1719xfsbufd(
1720        void            *data)
1721{
1722        struct list_head tmp;
1723        xfs_buftarg_t   *target = (xfs_buftarg_t *)data;
1724        int             count;
1725        xfs_buf_t       *bp;
1726
1727        current->flags |= PF_MEMALLOC;
1728
1729        set_freezable();
1730
1731        do {
1732                if (unlikely(freezing(current))) {
1733                        set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1734                        refrigerator();
1735                } else {
1736                        clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
1737                }
1738
1739                schedule_timeout_interruptible(
1740                        xfs_buf_timer_centisecs * msecs_to_jiffies(10));
1741
1742                xfs_buf_delwri_split(target, &tmp,
1743                                xfs_buf_age_centisecs * msecs_to_jiffies(10));
1744
1745                count = 0;
1746                while (!list_empty(&tmp)) {
1747                        bp = list_entry(tmp.next, xfs_buf_t, b_list);
1748                        ASSERT(target == bp->b_target);
1749
1750                        list_del_init(&bp->b_list);
1751                        xfs_buf_iostrategy(bp);
1752                        count++;
1753                }
1754
1755                if (as_list_len > 0)
1756                        purge_addresses();
1757                if (count)
1758                        blk_run_address_space(target->bt_mapping);
1759
1760        } while (!kthread_should_stop());
1761
1762        return 0;
1763}
1764
1765/*
1766 *      Write out all delayed write buffers queued against the given target,
1767 *      optionally waiting for the I/O to complete. This is used in filesystem
1768 *      error handling to preserve the consistency of its metadata.
1769 */
1770int
1771xfs_flush_buftarg(
1772        xfs_buftarg_t   *target,
1773        int             wait)
1774{
1775        struct list_head tmp;
1776        xfs_buf_t       *bp, *n;
1777        int             pincount = 0;
1778
1779        xfs_buf_runall_queues(xfsconvertd_workqueue);
1780        xfs_buf_runall_queues(xfsdatad_workqueue);
1781        xfs_buf_runall_queues(xfslogd_workqueue);
1782
1783        set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1784        pincount = xfs_buf_delwri_split(target, &tmp, 0);
1785
1786        /*
1787         * Dropped the delayed write list lock, now walk the temporary list
1788         */
1789        list_for_each_entry_safe(bp, n, &tmp, b_list) {
1790                ASSERT(target == bp->b_target);
1791                if (wait)
1792                        bp->b_flags &= ~XBF_ASYNC;
1793                else
1794                        list_del_init(&bp->b_list);
1795
1796                xfs_buf_iostrategy(bp);
1797        }
1798
1799        if (wait)
1800                blk_run_address_space(target->bt_mapping);
1801
1802        /*
1803         * Remaining list items must be flushed before returning
1804         */
1805        while (!list_empty(&tmp)) {
1806                bp = list_entry(tmp.next, xfs_buf_t, b_list);
1807
1808                list_del_init(&bp->b_list);
1809                xfs_iowait(bp);
1810                xfs_buf_relse(bp);
1811        }
1812
1813        return pincount;
1814}
1815
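/*
 * Buffer cache initialisation: set up the optional trace buffer, the
 * xfs_buf zone, the xfslogd/xfsdatad/xfsconvertd workqueues and the
 * memory shrinker, unwinding everything already set up on failure.
 */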
1816int __init
1817xfs_buf_init(void)
1818{
1819#ifdef XFS_BUF_TRACE
1820        xfs_buf_trace_buf = ktrace_alloc(XFS_BUF_TRACE_SIZE, KM_NOFS);
1821#endif
1822
1823        xfs_buf_zone = kmem_zone_init_flags(sizeof(xfs_buf_t), "xfs_buf",
1824                                                KM_ZONE_HWALIGN, NULL);
1825        if (!xfs_buf_zone)
1826                goto out_free_trace_buf;
1827
1828        xfslogd_workqueue = create_workqueue("xfslogd");
1829        if (!xfslogd_workqueue)
1830                goto out_free_buf_zone;
1831
1832        xfsdatad_workqueue = create_workqueue("xfsdatad");
1833        if (!xfsdatad_workqueue)
1834                goto out_destroy_xfslogd_workqueue;
1835
1836        xfsconvertd_workqueue = create_workqueue("xfsconvertd");
1837        if (!xfsconvertd_workqueue)
1838                goto out_destroy_xfsdatad_workqueue;
1839
1840        register_shrinker(&xfs_buf_shake);
1841        return 0;
1842
1843 out_destroy_xfsdatad_workqueue:
1844        destroy_workqueue(xfsdatad_workqueue);
1845 out_destroy_xfslogd_workqueue:
1846        destroy_workqueue(xfslogd_workqueue);
1847 out_free_buf_zone:
1848        kmem_zone_destroy(xfs_buf_zone);
1849 out_free_trace_buf:
1850#ifdef XFS_BUF_TRACE
1851        ktrace_free(xfs_buf_trace_buf);
1852#endif
1853        return -ENOMEM;
1854}
1855
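/*
 * Tear down everything set up by xfs_buf_init, in reverse order.
 */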
1856void
1857xfs_buf_terminate(void)
1858{
1859        unregister_shrinker(&xfs_buf_shake);
1860        destroy_workqueue(xfsconvertd_workqueue);
1861        destroy_workqueue(xfsdatad_workqueue);
1862        destroy_workqueue(xfslogd_workqueue);
1863        kmem_zone_destroy(xfs_buf_zone);
1864#ifdef XFS_BUF_TRACE
1865        ktrace_free(xfs_buf_trace_buf);
1866#endif
1867}
1868
1869#ifdef CONFIG_KDB_MODULES
1870struct list_head *
1871xfs_get_buftarg_list(void)
1872{
1873        return &xfs_buftarg_list;
1874}
1875#endif
1876