linux/drivers/md/dm-bufio.c
   1/*
   2 * Copyright (C) 2009-2011 Red Hat, Inc.
   3 *
   4 * Author: Mikulas Patocka <mpatocka@redhat.com>
   5 *
   6 * This file is released under the GPL.
   7 */
   8
   9#include <linux/dm-bufio.h>
  10
  11#include <linux/device-mapper.h>
  12#include <linux/dm-io.h>
  13#include <linux/slab.h>
  14#include <linux/sched/mm.h>
  15#include <linux/jiffies.h>
  16#include <linux/vmalloc.h>
  17#include <linux/shrinker.h>
  18#include <linux/module.h>
  19#include <linux/rbtree.h>
  20#include <linux/stacktrace.h>
  21
  22#define DM_MSG_PREFIX "bufio"
  23
  24/*
  25 * Memory management policy:
  26 *      Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
  27 *      or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
  28 *      Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
   29 *      Start background writeback when the number of dirty buffers
   30 *      exceeds DM_BUFIO_WRITEBACK_RATIO times the number of clean buffers.
  31 */
  32#define DM_BUFIO_MIN_BUFFERS            8
  33
  34#define DM_BUFIO_MEMORY_PERCENT         2
  35#define DM_BUFIO_VMALLOC_PERCENT        25
  36#define DM_BUFIO_WRITEBACK_RATIO        3
  37#define DM_BUFIO_LOW_WATERMARK_RATIO    16
  38
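/*
 * Editor's illustrative sketch (not part of the original file): how the two
 * percentages above combine into a byte limit.  The real default is derived
 * in dm_bufio_init(), outside this excerpt; the helper name and parameters
 * here are assumptions for illustration only.  For example, with 8GiB of
 * main memory and 128MiB of vmalloc space this picks
 * min(~163MiB, 32MiB) = 32MiB.
 */
static inline unsigned long example_default_cache_bytes(unsigned long ram_bytes,
							 unsigned long vmalloc_bytes)
{
	unsigned long from_ram = ram_bytes / 100 * DM_BUFIO_MEMORY_PERCENT;
	unsigned long from_vmalloc = vmalloc_bytes / 100 * DM_BUFIO_VMALLOC_PERCENT;

	return min(from_ram, from_vmalloc);
}
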
  39/*
  40 * Check buffer ages in this interval (seconds)
  41 */
  42#define DM_BUFIO_WORK_TIMER_SECS        30
  43
  44/*
  45 * Free buffers when they are older than this (seconds)
  46 */
  47#define DM_BUFIO_DEFAULT_AGE_SECS       300
  48
  49/*
   50 * The number of bytes of cached data to keep around.
  51 */
  52#define DM_BUFIO_DEFAULT_RETAIN_BYTES   (256 * 1024)
  53
  54/*
  55 * Align buffer writes to this boundary.
  56 * Tests show that SSDs have the highest IOPS when using 4k writes.
  57 */
  58#define DM_BUFIO_WRITE_ALIGN            4096
  59
  60/*
  61 * dm_buffer->list_mode
  62 */
  63#define LIST_CLEAN      0
  64#define LIST_DIRTY      1
  65#define LIST_SIZE       2
  66
  67/*
  68 * Linking of buffers:
  69 *      All buffers are linked to buffer_tree with their node field.
  70 *
  71 *      Clean buffers that are not being written (B_WRITING not set)
  72 *      are linked to lru[LIST_CLEAN] with their lru_list field.
  73 *
  74 *      Dirty and clean buffers that are being written are linked to
  75 *      lru[LIST_DIRTY] with their lru_list field. When the write
  76 *      finishes, the buffer cannot be relinked immediately (because we
  77 *      are in an interrupt context and relinking requires process
  78 *      context), so some clean-not-writing buffers can be held on
  79 *      dirty_lru too.  They are later added to lru in the process
  80 *      context.
  81 */
  82struct dm_bufio_client {
  83        struct mutex lock;
  84
  85        struct list_head lru[LIST_SIZE];
  86        unsigned long n_buffers[LIST_SIZE];
  87
  88        struct block_device *bdev;
  89        unsigned block_size;
  90        s8 sectors_per_block_bits;
  91        void (*alloc_callback)(struct dm_buffer *);
  92        void (*write_callback)(struct dm_buffer *);
  93
  94        struct kmem_cache *slab_buffer;
  95        struct kmem_cache *slab_cache;
  96        struct dm_io_client *dm_io;
  97
  98        struct list_head reserved_buffers;
  99        unsigned need_reserved_buffers;
 100
 101        unsigned minimum_buffers;
 102
 103        struct rb_root buffer_tree;
 104        wait_queue_head_t free_buffer_wait;
 105
 106        sector_t start;
 107
 108        int async_write_error;
 109
 110        struct list_head client_list;
 111        struct shrinker shrinker;
 112};
 113
 114/*
 115 * Buffer state bits.
 116 */
 117#define B_READING       0
 118#define B_WRITING       1
 119#define B_DIRTY         2
 120
 121/*
 122 * Describes how the block was allocated:
 123 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 124 * See the comment at alloc_buffer_data.
 125 */
 126enum data_mode {
 127        DATA_MODE_SLAB = 0,
 128        DATA_MODE_GET_FREE_PAGES = 1,
 129        DATA_MODE_VMALLOC = 2,
 130        DATA_MODE_LIMIT = 3
 131};
 132
 133struct dm_buffer {
 134        struct rb_node node;
 135        struct list_head lru_list;
 136        struct list_head global_list;
 137        sector_t block;
 138        void *data;
 139        unsigned char data_mode;                /* DATA_MODE_* */
 140        unsigned char list_mode;                /* LIST_* */
 141        blk_status_t read_error;
 142        blk_status_t write_error;
 143        unsigned accessed;
 144        unsigned hold_count;
 145        unsigned long state;
 146        unsigned long last_accessed;
 147        unsigned dirty_start;
 148        unsigned dirty_end;
 149        unsigned write_start;
 150        unsigned write_end;
 151        struct dm_bufio_client *c;
 152        struct list_head write_list;
 153        void (*end_io)(struct dm_buffer *, blk_status_t);
 154#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
 155#define MAX_STACK 10
 156        unsigned int stack_len;
 157        unsigned long stack_entries[MAX_STACK];
 158#endif
 159};
 160
 161/*----------------------------------------------------------------*/
 162
 163#define dm_bufio_in_request()   (!!current->bio_list)
 164
 165static void dm_bufio_lock(struct dm_bufio_client *c)
 166{
 167        mutex_lock_nested(&c->lock, dm_bufio_in_request());
 168}
 169
 170static int dm_bufio_trylock(struct dm_bufio_client *c)
 171{
 172        return mutex_trylock(&c->lock);
 173}
 174
 175static void dm_bufio_unlock(struct dm_bufio_client *c)
 176{
 177        mutex_unlock(&c->lock);
 178}
 179
 180/*----------------------------------------------------------------*/
 181
 182/*
 183 * Default cache size: available memory divided by the ratio.
 184 */
 185static unsigned long dm_bufio_default_cache_size;
 186
 187/*
 188 * Total cache size set by the user.
 189 */
 190static unsigned long dm_bufio_cache_size;
 191
 192/*
 193 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 194 * at any time.  If it disagrees, the user has changed cache size.
 195 */
 196static unsigned long dm_bufio_cache_size_latch;
 197
 198static DEFINE_SPINLOCK(global_spinlock);
 199
 200static LIST_HEAD(global_queue);
 201
 202static unsigned long global_num = 0;
 203
 204/*
 205 * Buffers are freed after this timeout
 206 */
 207static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
 208static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
 209
 210static unsigned long dm_bufio_peak_allocated;
 211static unsigned long dm_bufio_allocated_kmem_cache;
 212static unsigned long dm_bufio_allocated_get_free_pages;
 213static unsigned long dm_bufio_allocated_vmalloc;
 214static unsigned long dm_bufio_current_allocated;
 215
 216/*----------------------------------------------------------------*/
 217
 218/*
 219 * The current number of clients.
 220 */
 221static int dm_bufio_client_count;
 222
 223/*
 224 * The list of all clients.
 225 */
 226static LIST_HEAD(dm_bufio_all_clients);
 227
 228/*
 229 * This mutex protects dm_bufio_cache_size_latch and dm_bufio_client_count
 230 */
 231static DEFINE_MUTEX(dm_bufio_clients_lock);
 232
 233static struct workqueue_struct *dm_bufio_wq;
 234static struct delayed_work dm_bufio_cleanup_old_work;
 235static struct work_struct dm_bufio_replacement_work;
 236
 237
 238#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
 239static void buffer_record_stack(struct dm_buffer *b)
 240{
 241        b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
 242}
 243#endif
 244
 245/*----------------------------------------------------------------
 246 * A red/black tree acts as an index for all the buffers.
 247 *--------------------------------------------------------------*/
 248static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
 249{
 250        struct rb_node *n = c->buffer_tree.rb_node;
 251        struct dm_buffer *b;
 252
 253        while (n) {
 254                b = container_of(n, struct dm_buffer, node);
 255
 256                if (b->block == block)
 257                        return b;
 258
 259                n = block < b->block ? n->rb_left : n->rb_right;
 260        }
 261
 262        return NULL;
 263}
 264
 265static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block)
 266{
 267        struct rb_node *n = c->buffer_tree.rb_node;
 268        struct dm_buffer *b;
 269        struct dm_buffer *best = NULL;
 270
 271        while (n) {
 272                b = container_of(n, struct dm_buffer, node);
 273
 274                if (b->block == block)
 275                        return b;
 276
 277                if (block <= b->block) {
 278                        n = n->rb_left;
 279                        best = b;
 280                } else {
 281                        n = n->rb_right;
 282                }
 283        }
 284
 285        return best;
 286}
 287
 288static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
 289{
 290        struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
 291        struct dm_buffer *found;
 292
 293        while (*new) {
 294                found = container_of(*new, struct dm_buffer, node);
 295
 296                if (found->block == b->block) {
 297                        BUG_ON(found != b);
 298                        return;
 299                }
 300
 301                parent = *new;
 302                new = b->block < found->block ?
 303                        &found->node.rb_left : &found->node.rb_right;
 304        }
 305
 306        rb_link_node(&b->node, parent, new);
 307        rb_insert_color(&b->node, &c->buffer_tree);
 308}
 309
 310static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
 311{
 312        rb_erase(&b->node, &c->buffer_tree);
 313}
 314
 315/*----------------------------------------------------------------*/
 316
 317static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
 318{
 319        unsigned char data_mode;
 320        long diff;
 321
 322        static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
 323                &dm_bufio_allocated_kmem_cache,
 324                &dm_bufio_allocated_get_free_pages,
 325                &dm_bufio_allocated_vmalloc,
 326        };
 327
 328        data_mode = b->data_mode;
 329        diff = (long)b->c->block_size;
 330        if (unlink)
 331                diff = -diff;
 332
 333        spin_lock(&global_spinlock);
 334
 335        *class_ptr[data_mode] += diff;
 336
 337        dm_bufio_current_allocated += diff;
 338
 339        if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
 340                dm_bufio_peak_allocated = dm_bufio_current_allocated;
 341
 342        b->accessed = 1;
 343
 344        if (!unlink) {
 345                list_add(&b->global_list, &global_queue);
 346                global_num++;
 347                if (dm_bufio_current_allocated > dm_bufio_cache_size)
 348                        queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
 349        } else {
 350                list_del(&b->global_list);
 351                global_num--;
 352        }
 353
 354        spin_unlock(&global_spinlock);
 355}
 356
 357/*
 358 * Change the number of clients and recalculate per-client limit.
 359 */
 360static void __cache_size_refresh(void)
 361{
 362        BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
 363        BUG_ON(dm_bufio_client_count < 0);
 364
 365        dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
 366
 367        /*
 368         * Use default if set to 0 and report the actual cache size used.
 369         */
 370        if (!dm_bufio_cache_size_latch) {
 371                (void)cmpxchg(&dm_bufio_cache_size, 0,
 372                              dm_bufio_default_cache_size);
 373                dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
 374        }
 375}
 376
 377/*
 378 * Allocating buffer data.
 379 *
 380 * Small buffers are allocated with kmem_cache, to use space optimally.
 381 *
 382 * For large buffers, we choose between get_free_pages and vmalloc.
 383 * Each has advantages and disadvantages.
 384 *
 385 * __get_free_pages can randomly fail if the memory is fragmented.
 386 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 387 * as low as 128M) so using it for caching is not appropriate.
 388 *
 389 * If the allocation may fail we use __get_free_pages. Memory fragmentation
  390 * won't have a fatal effect here; it just causes some other buffers to be
  391 * flushed and more I/O to be performed. Don't use __get_free_pages if it
 392 * always fails (i.e. order >= MAX_ORDER).
 393 *
 394 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 395 * initial reserve allocation, so there's no risk of wasting all vmalloc
 396 * space.
 397 */
 398static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
 399                               unsigned char *data_mode)
 400{
 401        if (unlikely(c->slab_cache != NULL)) {
 402                *data_mode = DATA_MODE_SLAB;
 403                return kmem_cache_alloc(c->slab_cache, gfp_mask);
 404        }
 405
 406        if (c->block_size <= KMALLOC_MAX_SIZE &&
 407            gfp_mask & __GFP_NORETRY) {
 408                *data_mode = DATA_MODE_GET_FREE_PAGES;
 409                return (void *)__get_free_pages(gfp_mask,
 410                                                c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
 411        }
 412
 413        *data_mode = DATA_MODE_VMALLOC;
 414
 415        /*
 416         * __vmalloc allocates the data pages and auxiliary structures with
 417         * gfp_flags that were specified, but pagetables are always allocated
 418         * with GFP_KERNEL, no matter what was specified as gfp_mask.
 419         *
 420         * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
 421         * all allocations done by this process (including pagetables) are done
 422         * as if GFP_NOIO was specified.
 423         */
 424        if (gfp_mask & __GFP_NORETRY) {
 425                unsigned noio_flag = memalloc_noio_save();
 426                void *ptr = __vmalloc(c->block_size, gfp_mask);
 427
 428                memalloc_noio_restore(noio_flag);
 429                return ptr;
 430        }
 431
 432        return __vmalloc(c->block_size, gfp_mask);
 433}
 434
 435/*
 436 * Free buffer's data.
 437 */
 438static void free_buffer_data(struct dm_bufio_client *c,
 439                             void *data, unsigned char data_mode)
 440{
 441        switch (data_mode) {
 442        case DATA_MODE_SLAB:
 443                kmem_cache_free(c->slab_cache, data);
 444                break;
 445
 446        case DATA_MODE_GET_FREE_PAGES:
 447                free_pages((unsigned long)data,
 448                           c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
 449                break;
 450
 451        case DATA_MODE_VMALLOC:
 452                vfree(data);
 453                break;
 454
 455        default:
 456                DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
 457                       data_mode);
 458                BUG();
 459        }
 460}
 461
 462/*
 463 * Allocate buffer and its data.
 464 */
 465static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
 466{
 467        struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);
 468
 469        if (!b)
 470                return NULL;
 471
 472        b->c = c;
 473
 474        b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
 475        if (!b->data) {
 476                kmem_cache_free(c->slab_buffer, b);
 477                return NULL;
 478        }
 479
 480#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
 481        b->stack_len = 0;
 482#endif
 483        return b;
 484}
 485
 486/*
 487 * Free buffer and its data.
 488 */
 489static void free_buffer(struct dm_buffer *b)
 490{
 491        struct dm_bufio_client *c = b->c;
 492
 493        free_buffer_data(c, b->data, b->data_mode);
 494        kmem_cache_free(c->slab_buffer, b);
 495}
 496
 497/*
 498 * Link buffer to the buffer tree and clean or dirty queue.
 499 */
 500static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
 501{
 502        struct dm_bufio_client *c = b->c;
 503
 504        c->n_buffers[dirty]++;
 505        b->block = block;
 506        b->list_mode = dirty;
 507        list_add(&b->lru_list, &c->lru[dirty]);
 508        __insert(b->c, b);
 509        b->last_accessed = jiffies;
 510
 511        adjust_total_allocated(b, false);
 512}
 513
 514/*
 515 * Unlink buffer from the buffer tree and dirty or clean queue.
 516 */
 517static void __unlink_buffer(struct dm_buffer *b)
 518{
 519        struct dm_bufio_client *c = b->c;
 520
 521        BUG_ON(!c->n_buffers[b->list_mode]);
 522
 523        c->n_buffers[b->list_mode]--;
 524        __remove(b->c, b);
 525        list_del(&b->lru_list);
 526
 527        adjust_total_allocated(b, true);
 528}
 529
 530/*
 531 * Place the buffer to the head of dirty or clean LRU queue.
 532 */
 533static void __relink_lru(struct dm_buffer *b, int dirty)
 534{
 535        struct dm_bufio_client *c = b->c;
 536
 537        b->accessed = 1;
 538
 539        BUG_ON(!c->n_buffers[b->list_mode]);
 540
 541        c->n_buffers[b->list_mode]--;
 542        c->n_buffers[dirty]++;
 543        b->list_mode = dirty;
 544        list_move(&b->lru_list, &c->lru[dirty]);
 545        b->last_accessed = jiffies;
 546}
 547
 548/*----------------------------------------------------------------
 549 * Submit I/O on the buffer.
 550 *
 551 * Bio interface is faster but it has some problems:
 552 *      the vector list is limited (increasing this limit increases
 553 *      memory-consumption per buffer, so it is not viable);
 554 *
 555 *      the memory must be direct-mapped, not vmalloced;
 556 *
  557 * If the buffer is small enough and it is not vmalloced, try using the
  558 * bio interface.
 559 *
 560 * If the buffer is big, if it is vmalloced or if the underlying device
 561 * rejects the bio because it is too large, use dm-io layer to do the I/O.
 562 * The dm-io layer splits the I/O into multiple requests, avoiding the above
 563 * shortcomings.
 564 *--------------------------------------------------------------*/
 565
 566/*
 567 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
 568 * that the request was handled directly with bio interface.
 569 */
 570static void dmio_complete(unsigned long error, void *context)
 571{
 572        struct dm_buffer *b = context;
 573
 574        b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
 575}
 576
 577static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
 578                     unsigned n_sectors, unsigned offset)
 579{
 580        int r;
 581        struct dm_io_request io_req = {
 582                .bi_op = rw,
 583                .bi_op_flags = 0,
 584                .notify.fn = dmio_complete,
 585                .notify.context = b,
 586                .client = b->c->dm_io,
 587        };
 588        struct dm_io_region region = {
 589                .bdev = b->c->bdev,
 590                .sector = sector,
 591                .count = n_sectors,
 592        };
 593
 594        if (b->data_mode != DATA_MODE_VMALLOC) {
 595                io_req.mem.type = DM_IO_KMEM;
 596                io_req.mem.ptr.addr = (char *)b->data + offset;
 597        } else {
 598                io_req.mem.type = DM_IO_VMA;
 599                io_req.mem.ptr.vma = (char *)b->data + offset;
 600        }
 601
 602        r = dm_io(&io_req, 1, &region, NULL);
 603        if (unlikely(r))
 604                b->end_io(b, errno_to_blk_status(r));
 605}
 606
 607static void bio_complete(struct bio *bio)
 608{
 609        struct dm_buffer *b = bio->bi_private;
 610        blk_status_t status = bio->bi_status;
 611        bio_put(bio);
 612        b->end_io(b, status);
 613}
 614
 615static void use_bio(struct dm_buffer *b, int rw, sector_t sector,
 616                    unsigned n_sectors, unsigned offset)
 617{
 618        struct bio *bio;
 619        char *ptr;
 620        unsigned vec_size, len;
 621
 622        vec_size = b->c->block_size >> PAGE_SHIFT;
 623        if (unlikely(b->c->sectors_per_block_bits < PAGE_SHIFT - SECTOR_SHIFT))
 624                vec_size += 2;
 625
 626        bio = bio_kmalloc(GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN, vec_size);
 627        if (!bio) {
 628dmio:
 629                use_dmio(b, rw, sector, n_sectors, offset);
 630                return;
 631        }
 632
 633        bio->bi_iter.bi_sector = sector;
 634        bio_set_dev(bio, b->c->bdev);
 635        bio_set_op_attrs(bio, rw, 0);
 636        bio->bi_end_io = bio_complete;
 637        bio->bi_private = b;
 638
 639        ptr = (char *)b->data + offset;
 640        len = n_sectors << SECTOR_SHIFT;
 641
 642        do {
 643                unsigned this_step = min((unsigned)(PAGE_SIZE - offset_in_page(ptr)), len);
 644                if (!bio_add_page(bio, virt_to_page(ptr), this_step,
 645                                  offset_in_page(ptr))) {
 646                        bio_put(bio);
 647                        goto dmio;
 648                }
 649
 650                len -= this_step;
 651                ptr += this_step;
 652        } while (len > 0);
 653
 654        submit_bio(bio);
 655}
 656
 657static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
 658{
 659        sector_t sector;
 660
 661        if (likely(c->sectors_per_block_bits >= 0))
 662                sector = block << c->sectors_per_block_bits;
 663        else
 664                sector = block * (c->block_size >> SECTOR_SHIFT);
 665        sector += c->start;
 666
 667        return sector;
 668}
 669
 670static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t))
 671{
 672        unsigned n_sectors;
 673        sector_t sector;
 674        unsigned offset, end;
 675
 676        b->end_io = end_io;
 677
 678        sector = block_to_sector(b->c, b->block);
 679
 680        if (rw != REQ_OP_WRITE) {
 681                n_sectors = b->c->block_size >> SECTOR_SHIFT;
 682                offset = 0;
 683        } else {
 684                if (b->c->write_callback)
 685                        b->c->write_callback(b);
 686                offset = b->write_start;
 687                end = b->write_end;
 688                offset &= -DM_BUFIO_WRITE_ALIGN;
 689                end += DM_BUFIO_WRITE_ALIGN - 1;
 690                end &= -DM_BUFIO_WRITE_ALIGN;
 691                if (unlikely(end > b->c->block_size))
 692                        end = b->c->block_size;
 693
 694                sector += offset >> SECTOR_SHIFT;
 695                n_sectors = (end - offset) >> SECTOR_SHIFT;
 696        }
 697
 698        if (b->data_mode != DATA_MODE_VMALLOC)
 699                use_bio(b, rw, sector, n_sectors, offset);
 700        else
 701                use_dmio(b, rw, sector, n_sectors, offset);
 702}
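
/*
 * Editor's illustrative sketch (not part of the original file): the write
 * alignment performed above, with concrete numbers.  Assuming a dirty range
 * of bytes 700..5000 inside a 16KiB block, the submitted write covers
 * bytes 0..8192, i.e. 16 aligned sectors.
 */
static inline unsigned example_write_align(void)
{
	unsigned offset = 700, end = 5000;	/* hypothetical dirty range */

	offset &= -DM_BUFIO_WRITE_ALIGN;	/* 700 rounded down  -> 0    */
	end += DM_BUFIO_WRITE_ALIGN - 1;	/* 5000 + 4095       = 9095  */
	end &= -DM_BUFIO_WRITE_ALIGN;		/* 9095 rounded down -> 8192 */

	return (end - offset) >> SECTOR_SHIFT;	/* 16 sectors submitted */
}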
 703
 704/*----------------------------------------------------------------
 705 * Writing dirty buffers
 706 *--------------------------------------------------------------*/
 707
 708/*
 709 * The endio routine for write.
 710 *
 711 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 712 * it.
 713 */
 714static void write_endio(struct dm_buffer *b, blk_status_t status)
 715{
 716        b->write_error = status;
 717        if (unlikely(status)) {
 718                struct dm_bufio_client *c = b->c;
 719
 720                (void)cmpxchg(&c->async_write_error, 0,
 721                                blk_status_to_errno(status));
 722        }
 723
 724        BUG_ON(!test_bit(B_WRITING, &b->state));
 725
 726        smp_mb__before_atomic();
 727        clear_bit(B_WRITING, &b->state);
 728        smp_mb__after_atomic();
 729
 730        wake_up_bit(&b->state, B_WRITING);
 731}
 732
 733/*
 734 * Initiate a write on a dirty buffer, but don't wait for it.
 735 *
 736 * - If the buffer is not dirty, exit.
  737 * - If there is a previous write going on, wait for it to finish (we can't
 738 *   have two writes on the same buffer simultaneously).
 739 * - Submit our write and don't wait on it. We set B_WRITING indicating
 740 *   that there is a write in progress.
 741 */
 742static void __write_dirty_buffer(struct dm_buffer *b,
 743                                 struct list_head *write_list)
 744{
 745        if (!test_bit(B_DIRTY, &b->state))
 746                return;
 747
 748        clear_bit(B_DIRTY, &b->state);
 749        wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
 750
 751        b->write_start = b->dirty_start;
 752        b->write_end = b->dirty_end;
 753
 754        if (!write_list)
 755                submit_io(b, REQ_OP_WRITE, write_endio);
 756        else
 757                list_add_tail(&b->write_list, write_list);
 758}
 759
 760static void __flush_write_list(struct list_head *write_list)
 761{
 762        struct blk_plug plug;
 763        blk_start_plug(&plug);
 764        while (!list_empty(write_list)) {
 765                struct dm_buffer *b =
 766                        list_entry(write_list->next, struct dm_buffer, write_list);
 767                list_del(&b->write_list);
 768                submit_io(b, REQ_OP_WRITE, write_endio);
 769                cond_resched();
 770        }
 771        blk_finish_plug(&plug);
 772}
 773
 774/*
 775 * Wait until any activity on the buffer finishes.  Possibly write the
 776 * buffer if it is dirty.  When this function finishes, there is no I/O
 777 * running on the buffer and the buffer is not dirty.
 778 */
 779static void __make_buffer_clean(struct dm_buffer *b)
 780{
 781        BUG_ON(b->hold_count);
 782
 783        if (!b->state)  /* fast case */
 784                return;
 785
 786        wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
 787        __write_dirty_buffer(b, NULL);
 788        wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
 789}
 790
 791/*
 792 * Find some buffer that is not held by anybody, clean it, unlink it and
 793 * return it.
 794 */
 795static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
 796{
 797        struct dm_buffer *b;
 798
 799        list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
 800                BUG_ON(test_bit(B_WRITING, &b->state));
 801                BUG_ON(test_bit(B_DIRTY, &b->state));
 802
 803                if (!b->hold_count) {
 804                        __make_buffer_clean(b);
 805                        __unlink_buffer(b);
 806                        return b;
 807                }
 808                cond_resched();
 809        }
 810
 811        list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
 812                BUG_ON(test_bit(B_READING, &b->state));
 813
 814                if (!b->hold_count) {
 815                        __make_buffer_clean(b);
 816                        __unlink_buffer(b);
 817                        return b;
 818                }
 819                cond_resched();
 820        }
 821
 822        return NULL;
 823}
 824
 825/*
  826 * Wait until some other thread frees a buffer or releases its hold count
  827 * on a buffer.
 828 *
 829 * This function is entered with c->lock held, drops it and regains it
 830 * before exiting.
 831 */
 832static void __wait_for_free_buffer(struct dm_bufio_client *c)
 833{
 834        DECLARE_WAITQUEUE(wait, current);
 835
 836        add_wait_queue(&c->free_buffer_wait, &wait);
 837        set_current_state(TASK_UNINTERRUPTIBLE);
 838        dm_bufio_unlock(c);
 839
 840        io_schedule();
 841
 842        remove_wait_queue(&c->free_buffer_wait, &wait);
 843
 844        dm_bufio_lock(c);
 845}
 846
 847enum new_flag {
 848        NF_FRESH = 0,
 849        NF_READ = 1,
 850        NF_GET = 2,
 851        NF_PREFETCH = 3
 852};
 853
 854/*
 855 * Allocate a new buffer. If the allocation is not possible, wait until
 856 * some other thread frees a buffer.
 857 *
 858 * May drop the lock and regain it.
 859 */
 860static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
 861{
 862        struct dm_buffer *b;
 863        bool tried_noio_alloc = false;
 864
 865        /*
 866         * dm-bufio is resistant to allocation failures (it just keeps
  867         * one buffer reserved in case all the allocations fail).
 868         * So set flags to not try too hard:
 869         *      GFP_NOWAIT: don't wait; if we need to sleep we'll release our
 870         *                  mutex and wait ourselves.
 871         *      __GFP_NORETRY: don't retry and rather return failure
 872         *      __GFP_NOMEMALLOC: don't use emergency reserves
 873         *      __GFP_NOWARN: don't print a warning in case of failure
 874         *
 875         * For debugging, if we set the cache size to 1, no new buffers will
 876         * be allocated.
 877         */
 878        while (1) {
 879                if (dm_bufio_cache_size_latch != 1) {
 880                        b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
 881                        if (b)
 882                                return b;
 883                }
 884
 885                if (nf == NF_PREFETCH)
 886                        return NULL;
 887
 888                if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
 889                        dm_bufio_unlock(c);
 890                        b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
 891                        dm_bufio_lock(c);
 892                        if (b)
 893                                return b;
 894                        tried_noio_alloc = true;
 895                }
 896
 897                if (!list_empty(&c->reserved_buffers)) {
 898                        b = list_entry(c->reserved_buffers.next,
 899                                       struct dm_buffer, lru_list);
 900                        list_del(&b->lru_list);
 901                        c->need_reserved_buffers++;
 902
 903                        return b;
 904                }
 905
 906                b = __get_unclaimed_buffer(c);
 907                if (b)
 908                        return b;
 909
 910                __wait_for_free_buffer(c);
 911        }
 912}
 913
 914static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
 915{
 916        struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
 917
 918        if (!b)
 919                return NULL;
 920
 921        if (c->alloc_callback)
 922                c->alloc_callback(b);
 923
 924        return b;
 925}
 926
 927/*
 928 * Free a buffer and wake other threads waiting for free buffers.
 929 */
 930static void __free_buffer_wake(struct dm_buffer *b)
 931{
 932        struct dm_bufio_client *c = b->c;
 933
 934        if (!c->need_reserved_buffers)
 935                free_buffer(b);
 936        else {
 937                list_add(&b->lru_list, &c->reserved_buffers);
 938                c->need_reserved_buffers--;
 939        }
 940
 941        wake_up(&c->free_buffer_wait);
 942}
 943
 944static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
 945                                        struct list_head *write_list)
 946{
 947        struct dm_buffer *b, *tmp;
 948
 949        list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
 950                BUG_ON(test_bit(B_READING, &b->state));
 951
 952                if (!test_bit(B_DIRTY, &b->state) &&
 953                    !test_bit(B_WRITING, &b->state)) {
 954                        __relink_lru(b, LIST_CLEAN);
 955                        continue;
 956                }
 957
 958                if (no_wait && test_bit(B_WRITING, &b->state))
 959                        return;
 960
 961                __write_dirty_buffer(b, write_list);
 962                cond_resched();
 963        }
 964}
 965
 966/*
  967 * Check if we're over the writeback watermark.
  968 * If dirty buffers outnumber clean buffers by more than
  969 * DM_BUFIO_WRITEBACK_RATIO to one, start writing them back asynchronously.
 970 */
 971static void __check_watermark(struct dm_bufio_client *c,
 972                              struct list_head *write_list)
 973{
 974        if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO)
 975                __write_dirty_buffers_async(c, 1, write_list);
 976}
 977
 978/*----------------------------------------------------------------
 979 * Getting a buffer
 980 *--------------------------------------------------------------*/
 981
 982static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
 983                                     enum new_flag nf, int *need_submit,
 984                                     struct list_head *write_list)
 985{
 986        struct dm_buffer *b, *new_b = NULL;
 987
 988        *need_submit = 0;
 989
 990        b = __find(c, block);
 991        if (b)
 992                goto found_buffer;
 993
 994        if (nf == NF_GET)
 995                return NULL;
 996
 997        new_b = __alloc_buffer_wait(c, nf);
 998        if (!new_b)
 999                return NULL;
1000
1001        /*
1002         * We've had a period where the mutex was unlocked, so need to
1003         * recheck the buffer tree.
1004         */
1005        b = __find(c, block);
1006        if (b) {
1007                __free_buffer_wake(new_b);
1008                goto found_buffer;
1009        }
1010
1011        __check_watermark(c, write_list);
1012
1013        b = new_b;
1014        b->hold_count = 1;
1015        b->read_error = 0;
1016        b->write_error = 0;
1017        __link_buffer(b, block, LIST_CLEAN);
1018
1019        if (nf == NF_FRESH) {
1020                b->state = 0;
1021                return b;
1022        }
1023
1024        b->state = 1 << B_READING;
1025        *need_submit = 1;
1026
1027        return b;
1028
1029found_buffer:
1030        if (nf == NF_PREFETCH)
1031                return NULL;
1032        /*
1033         * Note: it is essential that we don't wait for the buffer to be
1034         * read if dm_bufio_get function is used. Both dm_bufio_get and
1035         * dm_bufio_prefetch can be used in the driver request routine.
1036         * If the user called both dm_bufio_prefetch and dm_bufio_get on
1037         * the same buffer, it would deadlock if we waited.
1038         */
1039        if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
1040                return NULL;
1041
1042        b->hold_count++;
1043        __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
1044                     test_bit(B_WRITING, &b->state));
1045        return b;
1046}
1047
1048/*
1049 * The endio routine for reading: set the error, clear the bit and wake up
1050 * anyone waiting on the buffer.
1051 */
1052static void read_endio(struct dm_buffer *b, blk_status_t status)
1053{
1054        b->read_error = status;
1055
1056        BUG_ON(!test_bit(B_READING, &b->state));
1057
1058        smp_mb__before_atomic();
1059        clear_bit(B_READING, &b->state);
1060        smp_mb__after_atomic();
1061
1062        wake_up_bit(&b->state, B_READING);
1063}
1064
1065/*
1066 * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
1067 * functions is similar except that dm_bufio_new doesn't read the
1068 * buffer from the disk (assuming that the caller overwrites all the data
1069 * and uses dm_bufio_mark_buffer_dirty to write new data back).
1070 */
1071static void *new_read(struct dm_bufio_client *c, sector_t block,
1072                      enum new_flag nf, struct dm_buffer **bp)
1073{
1074        int need_submit;
1075        struct dm_buffer *b;
1076
1077        LIST_HEAD(write_list);
1078
1079        dm_bufio_lock(c);
1080        b = __bufio_new(c, block, nf, &need_submit, &write_list);
1081#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1082        if (b && b->hold_count == 1)
1083                buffer_record_stack(b);
1084#endif
1085        dm_bufio_unlock(c);
1086
1087        __flush_write_list(&write_list);
1088
1089        if (!b)
1090                return NULL;
1091
1092        if (need_submit)
1093                submit_io(b, REQ_OP_READ, read_endio);
1094
1095        wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1096
1097        if (b->read_error) {
1098                int error = blk_status_to_errno(b->read_error);
1099
1100                dm_bufio_release(b);
1101
1102                return ERR_PTR(error);
1103        }
1104
1105        *bp = b;
1106
1107        return b->data;
1108}
1109
1110void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1111                   struct dm_buffer **bp)
1112{
1113        return new_read(c, block, NF_GET, bp);
1114}
1115EXPORT_SYMBOL_GPL(dm_bufio_get);
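
/*
 * Editor's illustrative sketch (not part of the original file): because
 * dm_bufio_get() never performs I/O, a hypothetical fast path running from
 * a request routine can peek at a block only if it is already cached.  The
 * helper name is an assumption.
 */
static bool example_peek_cached(struct dm_bufio_client *c, sector_t block)
{
	struct dm_buffer *b;
	void *data = dm_bufio_get(c, block, &b);

	if (!data || IS_ERR(data))
		return false;		/* not cached, or cached with an error */

	/* ... inspect the block contents via data ... */

	dm_bufio_release(b);
	return true;
}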
1116
1117void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1118                    struct dm_buffer **bp)
1119{
1120        BUG_ON(dm_bufio_in_request());
1121
1122        return new_read(c, block, NF_READ, bp);
1123}
1124EXPORT_SYMBOL_GPL(dm_bufio_read);
1125
1126void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1127                   struct dm_buffer **bp)
1128{
1129        BUG_ON(dm_bufio_in_request());
1130
1131        return new_read(c, block, NF_FRESH, bp);
1132}
1133EXPORT_SYMBOL_GPL(dm_bufio_new);
1134
1135void dm_bufio_prefetch(struct dm_bufio_client *c,
1136                       sector_t block, unsigned n_blocks)
1137{
1138        struct blk_plug plug;
1139
1140        LIST_HEAD(write_list);
1141
1142        BUG_ON(dm_bufio_in_request());
1143
1144        blk_start_plug(&plug);
1145        dm_bufio_lock(c);
1146
1147        for (; n_blocks--; block++) {
1148                int need_submit;
1149                struct dm_buffer *b;
1150                b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
1151                                &write_list);
1152                if (unlikely(!list_empty(&write_list))) {
1153                        dm_bufio_unlock(c);
1154                        blk_finish_plug(&plug);
1155                        __flush_write_list(&write_list);
1156                        blk_start_plug(&plug);
1157                        dm_bufio_lock(c);
1158                }
1159                if (unlikely(b != NULL)) {
1160                        dm_bufio_unlock(c);
1161
1162                        if (need_submit)
1163                                submit_io(b, REQ_OP_READ, read_endio);
1164                        dm_bufio_release(b);
1165
1166                        cond_resched();
1167
1168                        if (!n_blocks)
1169                                goto flush_plug;
1170                        dm_bufio_lock(c);
1171                }
1172        }
1173
1174        dm_bufio_unlock(c);
1175
1176flush_plug:
1177        blk_finish_plug(&plug);
1178}
1179EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
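
/*
 * Editor's illustrative sketch (not part of the original file): warming the
 * cache ahead of a batch of reads.  The helper name and the batch size of 8
 * blocks are assumptions.
 */
static void example_warm_then_read(struct dm_bufio_client *c, sector_t first)
{
	unsigned i;

	/* start reads for the whole batch without waiting for any of them */
	dm_bufio_prefetch(c, first, 8);

	for (i = 0; i < 8; i++) {
		struct dm_buffer *b;
		void *data = dm_bufio_read(c, first + i, &b);

		if (IS_ERR(data))
			continue;	/* I/O error on this block */

		/* ... consume the block ... */
		dm_bufio_release(b);
	}
}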
1180
1181void dm_bufio_release(struct dm_buffer *b)
1182{
1183        struct dm_bufio_client *c = b->c;
1184
1185        dm_bufio_lock(c);
1186
1187        BUG_ON(!b->hold_count);
1188
1189        b->hold_count--;
1190        if (!b->hold_count) {
1191                wake_up(&c->free_buffer_wait);
1192
1193                /*
1194                 * If there were errors on the buffer, and the buffer is not
1195                 * to be written, free the buffer. There is no point in caching
 1196                 * an invalid buffer.
1197                 */
1198                if ((b->read_error || b->write_error) &&
1199                    !test_bit(B_READING, &b->state) &&
1200                    !test_bit(B_WRITING, &b->state) &&
1201                    !test_bit(B_DIRTY, &b->state)) {
1202                        __unlink_buffer(b);
1203                        __free_buffer_wake(b);
1204                }
1205        }
1206
1207        dm_bufio_unlock(c);
1208}
1209EXPORT_SYMBOL_GPL(dm_bufio_release);
1210
1211void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
1212                                        unsigned start, unsigned end)
1213{
1214        struct dm_bufio_client *c = b->c;
1215
1216        BUG_ON(start >= end);
1217        BUG_ON(end > b->c->block_size);
1218
1219        dm_bufio_lock(c);
1220
1221        BUG_ON(test_bit(B_READING, &b->state));
1222
1223        if (!test_and_set_bit(B_DIRTY, &b->state)) {
1224                b->dirty_start = start;
1225                b->dirty_end = end;
1226                __relink_lru(b, LIST_DIRTY);
1227        } else {
1228                if (start < b->dirty_start)
1229                        b->dirty_start = start;
1230                if (end > b->dirty_end)
1231                        b->dirty_end = end;
1232        }
1233
1234        dm_bufio_unlock(c);
1235}
1236EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
1237
1238void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1239{
1240        dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
1241}
1242EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
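
/*
 * Editor's illustrative sketch (not part of the original file): updating a
 * few bytes of a cached block and recording only the touched range, so the
 * eventual write can be trimmed to DM_BUFIO_WRITE_ALIGN boundaries.  The
 * helper name is an assumption.
 */
static int example_patch_block(struct dm_bufio_client *c, sector_t block,
			       unsigned offset, const void *src, unsigned len)
{
	struct dm_buffer *b;
	void *data = dm_bufio_read(c, block, &b);

	if (IS_ERR(data))
		return PTR_ERR(data);

	memcpy((char *)data + offset, src, len);
	dm_bufio_mark_partial_buffer_dirty(b, offset, offset + len);
	dm_bufio_release(b);

	return 0;
}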
1243
1244void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
1245{
1246        LIST_HEAD(write_list);
1247
1248        BUG_ON(dm_bufio_in_request());
1249
1250        dm_bufio_lock(c);
1251        __write_dirty_buffers_async(c, 0, &write_list);
1252        dm_bufio_unlock(c);
1253        __flush_write_list(&write_list);
1254}
1255EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
1256
1257/*
1258 * For performance, it is essential that the buffers are written asynchronously
1259 * and simultaneously (so that the block layer can merge the writes) and then
1260 * waited upon.
1261 *
1262 * Finally, we flush hardware disk cache.
1263 */
1264int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
1265{
1266        int a, f;
1267        unsigned long buffers_processed = 0;
1268        struct dm_buffer *b, *tmp;
1269
1270        LIST_HEAD(write_list);
1271
1272        dm_bufio_lock(c);
1273        __write_dirty_buffers_async(c, 0, &write_list);
1274        dm_bufio_unlock(c);
1275        __flush_write_list(&write_list);
1276        dm_bufio_lock(c);
1277
1278again:
1279        list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
1280                int dropped_lock = 0;
1281
1282                if (buffers_processed < c->n_buffers[LIST_DIRTY])
1283                        buffers_processed++;
1284
1285                BUG_ON(test_bit(B_READING, &b->state));
1286
1287                if (test_bit(B_WRITING, &b->state)) {
1288                        if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
1289                                dropped_lock = 1;
1290                                b->hold_count++;
1291                                dm_bufio_unlock(c);
1292                                wait_on_bit_io(&b->state, B_WRITING,
1293                                               TASK_UNINTERRUPTIBLE);
1294                                dm_bufio_lock(c);
1295                                b->hold_count--;
1296                        } else
1297                                wait_on_bit_io(&b->state, B_WRITING,
1298                                               TASK_UNINTERRUPTIBLE);
1299                }
1300
1301                if (!test_bit(B_DIRTY, &b->state) &&
1302                    !test_bit(B_WRITING, &b->state))
1303                        __relink_lru(b, LIST_CLEAN);
1304
1305                cond_resched();
1306
1307                /*
1308                 * If we dropped the lock, the list is no longer consistent,
1309                 * so we must restart the search.
1310                 *
1311                 * In the most common case, the buffer just processed is
1312                 * relinked to the clean list, so we won't loop scanning the
1313                 * same buffer again and again.
1314                 *
1315                 * This may livelock if there is another thread simultaneously
1316                 * dirtying buffers, so we count the number of buffers walked
1317                 * and if it exceeds the total number of buffers, it means that
1318                 * someone is doing some writes simultaneously with us.  In
1319                 * this case, stop, dropping the lock.
1320                 */
1321                if (dropped_lock)
1322                        goto again;
1323        }
1324        wake_up(&c->free_buffer_wait);
1325        dm_bufio_unlock(c);
1326
1327        a = xchg(&c->async_write_error, 0);
1328        f = dm_bufio_issue_flush(c);
1329        if (a)
1330                return a;
1331
1332        return f;
1333}
1334EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
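
/*
 * Editor's illustrative sketch (not part of the original file): a two-phase
 * commit in a hypothetical metadata layer.  All ordinary dirty blocks are
 * written and flushed first; only then is the new superblock dirtied and
 * written, so a crash in between never exposes a superblock that points at
 * unwritten data.
 */
static int example_commit(struct dm_bufio_client *c, struct dm_buffer *sb)
{
	int r;

	/* push all ordinary dirty blocks out and flush the device cache */
	r = dm_bufio_write_dirty_buffers(c);
	if (r) {
		dm_bufio_release(sb);
		return r;
	}

	/* only then make the new superblock visible */
	dm_bufio_mark_buffer_dirty(sb);
	dm_bufio_release(sb);

	return dm_bufio_write_dirty_buffers(c);
}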
1335
1336/*
1337 * Use dm-io to send an empty barrier to flush the device.
1338 */
1339int dm_bufio_issue_flush(struct dm_bufio_client *c)
1340{
1341        struct dm_io_request io_req = {
1342                .bi_op = REQ_OP_WRITE,
1343                .bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
1344                .mem.type = DM_IO_KMEM,
1345                .mem.ptr.addr = NULL,
1346                .client = c->dm_io,
1347        };
1348        struct dm_io_region io_reg = {
1349                .bdev = c->bdev,
1350                .sector = 0,
1351                .count = 0,
1352        };
1353
1354        BUG_ON(dm_bufio_in_request());
1355
1356        return dm_io(&io_req, 1, &io_reg, NULL);
1357}
1358EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
1359
1360/*
 1361 * Use dm-io to send a discard request to the device.
1362 */
1363int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
1364{
1365        struct dm_io_request io_req = {
1366                .bi_op = REQ_OP_DISCARD,
1367                .bi_op_flags = REQ_SYNC,
1368                .mem.type = DM_IO_KMEM,
1369                .mem.ptr.addr = NULL,
1370                .client = c->dm_io,
1371        };
1372        struct dm_io_region io_reg = {
1373                .bdev = c->bdev,
1374                .sector = block_to_sector(c, block),
1375                .count = block_to_sector(c, count),
1376        };
1377
1378        BUG_ON(dm_bufio_in_request());
1379
1380        return dm_io(&io_req, 1, &io_reg, NULL);
1381}
1382EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
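
/*
 * Editor's illustrative sketch (not part of the original file): dropping any
 * cached copies of a range before discarding it, so stale data cannot be
 * served from the cache afterwards.  The helper name is an assumption.
 */
static int example_discard_range(struct dm_bufio_client *c,
				 sector_t block, sector_t n_blocks)
{
	dm_bufio_forget_buffers(c, block, n_blocks);

	return dm_bufio_issue_discard(c, block, n_blocks);
}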
1383
1384/*
1385 * We first delete any other buffer that may be at that new location.
1386 *
1387 * Then, we write the buffer to the original location if it was dirty.
1388 *
1389 * Then, if we are the only one who is holding the buffer, relink the buffer
1390 * in the buffer tree for the new location.
1391 *
1392 * If there was someone else holding the buffer, we write it to the new
1393 * location but not relink it, because that other user needs to have the buffer
1394 * at the same place.
1395 */
1396void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
1397{
1398        struct dm_bufio_client *c = b->c;
1399        struct dm_buffer *new;
1400
1401        BUG_ON(dm_bufio_in_request());
1402
1403        dm_bufio_lock(c);
1404
1405retry:
1406        new = __find(c, new_block);
1407        if (new) {
1408                if (new->hold_count) {
1409                        __wait_for_free_buffer(c);
1410                        goto retry;
1411                }
1412
1413                /*
1414                 * FIXME: Is there any point waiting for a write that's going
1415                 * to be overwritten in a bit?
1416                 */
1417                __make_buffer_clean(new);
1418                __unlink_buffer(new);
1419                __free_buffer_wake(new);
1420        }
1421
1422        BUG_ON(!b->hold_count);
1423        BUG_ON(test_bit(B_READING, &b->state));
1424
1425        __write_dirty_buffer(b, NULL);
1426        if (b->hold_count == 1) {
1427                wait_on_bit_io(&b->state, B_WRITING,
1428                               TASK_UNINTERRUPTIBLE);
1429                set_bit(B_DIRTY, &b->state);
1430                b->dirty_start = 0;
1431                b->dirty_end = c->block_size;
1432                __unlink_buffer(b);
1433                __link_buffer(b, new_block, LIST_DIRTY);
1434        } else {
1435                sector_t old_block;
1436                wait_on_bit_lock_io(&b->state, B_WRITING,
1437                                    TASK_UNINTERRUPTIBLE);
1438                /*
1439                 * Relink buffer to "new_block" so that write_callback
1440                 * sees "new_block" as a block number.
1441                 * After the write, link the buffer back to old_block.
1442                 * All this must be done in bufio lock, so that block number
1443                 * change isn't visible to other threads.
1444                 */
1445                old_block = b->block;
1446                __unlink_buffer(b);
1447                __link_buffer(b, new_block, b->list_mode);
1448                submit_io(b, REQ_OP_WRITE, write_endio);
1449                wait_on_bit_io(&b->state, B_WRITING,
1450                               TASK_UNINTERRUPTIBLE);
1451                __unlink_buffer(b);
1452                __link_buffer(b, old_block, b->list_mode);
1453        }
1454
1455        dm_bufio_unlock(c);
1456        dm_bufio_release(b);
1457}
1458EXPORT_SYMBOL_GPL(dm_bufio_release_move);
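
/*
 * Editor's illustrative sketch (not part of the original file): relocating a
 * cached block with the helper above, as a hypothetical space-remapping
 * caller might do.  dm_bufio_release_move() drops our hold count itself.
 */
static int example_copy_block(struct dm_bufio_client *c,
			      sector_t old_block, sector_t new_block)
{
	struct dm_buffer *b;
	void *data = dm_bufio_read(c, old_block, &b);

	if (IS_ERR(data))
		return PTR_ERR(data);

	/* marks the data dirty at new_block (or writes it there directly) */
	dm_bufio_release_move(b, new_block);

	return dm_bufio_write_dirty_buffers(c);
}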
1459
1460static void forget_buffer_locked(struct dm_buffer *b)
1461{
1462        if (likely(!b->hold_count) && likely(!b->state)) {
1463                __unlink_buffer(b);
1464                __free_buffer_wake(b);
1465        }
1466}
1467
1468/*
1469 * Free the given buffer.
1470 *
1471 * This is just a hint, if the buffer is in use or dirty, this function
1472 * does nothing.
1473 */
1474void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
1475{
1476        struct dm_buffer *b;
1477
1478        dm_bufio_lock(c);
1479
1480        b = __find(c, block);
1481        if (b)
1482                forget_buffer_locked(b);
1483
1484        dm_bufio_unlock(c);
1485}
1486EXPORT_SYMBOL_GPL(dm_bufio_forget);
1487
1488void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
1489{
1490        struct dm_buffer *b;
1491        sector_t end_block = block + n_blocks;
1492
1493        while (block < end_block) {
1494                dm_bufio_lock(c);
1495
1496                b = __find_next(c, block);
1497                if (b) {
1498                        block = b->block + 1;
1499                        forget_buffer_locked(b);
1500                }
1501
1502                dm_bufio_unlock(c);
1503
1504                if (!b)
1505                        break;
1506        }
1507
1508}
1509EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
1510
1511void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
1512{
1513        c->minimum_buffers = n;
1514}
1515EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
1516
1517unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
1518{
1519        return c->block_size;
1520}
1521EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
1522
1523sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
1524{
1525        sector_t s = i_size_read(c->bdev->bd_inode) >> SECTOR_SHIFT;
1526        if (likely(c->sectors_per_block_bits >= 0))
1527                s >>= c->sectors_per_block_bits;
1528        else
1529                sector_div(s, c->block_size >> SECTOR_SHIFT);
1530        return s;
1531}
1532EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
1533
1534sector_t dm_bufio_get_block_number(struct dm_buffer *b)
1535{
1536        return b->block;
1537}
1538EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
1539
1540void *dm_bufio_get_block_data(struct dm_buffer *b)
1541{
1542        return b->data;
1543}
1544EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
1545
1546void *dm_bufio_get_aux_data(struct dm_buffer *b)
1547{
1548        return b + 1;
1549}
1550EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
1551
1552struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
1553{
1554        return b->c;
1555}
1556EXPORT_SYMBOL_GPL(dm_bufio_get_client);
1557
1558static void drop_buffers(struct dm_bufio_client *c)
1559{
1560        struct dm_buffer *b;
1561        int i;
1562        bool warned = false;
1563
1564        BUG_ON(dm_bufio_in_request());
1565
1566        /*
1567         * An optimization so that the buffers are not written one-by-one.
1568         */
1569        dm_bufio_write_dirty_buffers_async(c);
1570
1571        dm_bufio_lock(c);
1572
1573        while ((b = __get_unclaimed_buffer(c)))
1574                __free_buffer_wake(b);
1575
1576        for (i = 0; i < LIST_SIZE; i++)
1577                list_for_each_entry(b, &c->lru[i], lru_list) {
1578                        WARN_ON(!warned);
1579                        warned = true;
1580                        DMERR("leaked buffer %llx, hold count %u, list %d",
1581                              (unsigned long long)b->block, b->hold_count, i);
1582#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1583                        stack_trace_print(b->stack_entries, b->stack_len, 1);
1584                        /* mark unclaimed to avoid BUG_ON below */
1585                        b->hold_count = 0;
1586#endif
1587                }
1588
1589#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1590        while ((b = __get_unclaimed_buffer(c)))
1591                __free_buffer_wake(b);
1592#endif
1593
1594        for (i = 0; i < LIST_SIZE; i++)
1595                BUG_ON(!list_empty(&c->lru[i]));
1596
1597        dm_bufio_unlock(c);
1598}
1599
1600/*
 1601 * We may not be able to evict this buffer if IO is pending or the client
1602 * is still using it.  Caller is expected to know buffer is too old.
1603 *
1604 * And if GFP_NOFS is used, we must not do any I/O because we hold
1605 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
 1606 * rerouted to a different bufio client.
1607 */
1608static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
1609{
1610        if (!(gfp & __GFP_FS)) {
1611                if (test_bit(B_READING, &b->state) ||
1612                    test_bit(B_WRITING, &b->state) ||
1613                    test_bit(B_DIRTY, &b->state))
1614                        return false;
1615        }
1616
1617        if (b->hold_count)
1618                return false;
1619
1620        __make_buffer_clean(b);
1621        __unlink_buffer(b);
1622        __free_buffer_wake(b);
1623
1624        return true;
1625}
1626
1627static unsigned long get_retain_buffers(struct dm_bufio_client *c)
1628{
1629        unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
1630        if (likely(c->sectors_per_block_bits >= 0))
1631                retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
1632        else
1633                retain_bytes /= c->block_size;
1634        return retain_bytes;
1635}
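
/*
 * Editor's illustrative sketch (not part of the original file): the retain
 * calculation above with the defaults.  256KiB retained over 4KiB blocks
 * (sectors_per_block_bits == 3) keeps 64 buffers cached even under shrinker
 * pressure.
 */
static inline unsigned long example_retain_buffers_4k(void)
{
	return DM_BUFIO_DEFAULT_RETAIN_BYTES >> (3 + SECTOR_SHIFT);	/* 64 */
}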
1636
1637static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
1638                            gfp_t gfp_mask)
1639{
1640        int l;
1641        struct dm_buffer *b, *tmp;
1642        unsigned long freed = 0;
1643        unsigned long count = c->n_buffers[LIST_CLEAN] +
1644                              c->n_buffers[LIST_DIRTY];
1645        unsigned long retain_target = get_retain_buffers(c);
1646
1647        for (l = 0; l < LIST_SIZE; l++) {
1648                list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
1649                        if (__try_evict_buffer(b, gfp_mask))
1650                                freed++;
1651                        if (!--nr_to_scan || ((count - freed) <= retain_target))
1652                                return freed;
1653                        cond_resched();
1654                }
1655        }
1656        return freed;
1657}
1658
1659static unsigned long
1660dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
1661{
1662        struct dm_bufio_client *c;
1663        unsigned long freed;
1664
1665        c = container_of(shrink, struct dm_bufio_client, shrinker);
1666        if (sc->gfp_mask & __GFP_FS)
1667                dm_bufio_lock(c);
1668        else if (!dm_bufio_trylock(c))
1669                return SHRINK_STOP;
1670
1671        freed  = __scan(c, sc->nr_to_scan, sc->gfp_mask);
1672        dm_bufio_unlock(c);
1673        return freed;
1674}
1675
1676static unsigned long
1677dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
1678{
1679        struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
1680        unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) +
1681                              READ_ONCE(c->n_buffers[LIST_DIRTY]);
1682        unsigned long retain_target = get_retain_buffers(c);
1683
1684        return (count < retain_target) ? 0 : (count - retain_target);
1685}
1686
1687/*
1688 * Create the buffering interface
1689 */
1690struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
1691                                               unsigned reserved_buffers, unsigned aux_size,
1692                                               void (*alloc_callback)(struct dm_buffer *),
1693                                               void (*write_callback)(struct dm_buffer *))
1694{
1695        int r;
1696        struct dm_bufio_client *c;
1697        unsigned i;
1698        char slab_name[27];
1699
1700        if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
1701                DMERR("%s: block size not specified or is not multiple of 512b", __func__);
1702                r = -EINVAL;
1703                goto bad_client;
1704        }
1705
1706        c = kzalloc(sizeof(*c), GFP_KERNEL);
1707        if (!c) {
1708                r = -ENOMEM;
1709                goto bad_client;
1710        }
1711        c->buffer_tree = RB_ROOT;
1712
1713        c->bdev = bdev;
1714        c->block_size = block_size;
1715        if (is_power_of_2(block_size))
1716                c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
1717        else
1718                c->sectors_per_block_bits = -1;
1719
1720        c->alloc_callback = alloc_callback;
1721        c->write_callback = write_callback;
1722
1723        for (i = 0; i < LIST_SIZE; i++) {
1724                INIT_LIST_HEAD(&c->lru[i]);
1725                c->n_buffers[i] = 0;
1726        }
1727
1728        mutex_init(&c->lock);
1729        INIT_LIST_HEAD(&c->reserved_buffers);
1730        c->need_reserved_buffers = reserved_buffers;
1731
1732        dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);
1733
1734        init_waitqueue_head(&c->free_buffer_wait);
1735        c->async_write_error = 0;
1736
1737        c->dm_io = dm_io_client_create();
1738        if (IS_ERR(c->dm_io)) {
1739                r = PTR_ERR(c->dm_io);
1740                goto bad_dm_io;
1741        }
1742
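            /*
             * Block data for small or non-power-of-two block sizes comes from
             * a per-client slab cache; other block sizes are served by the
             * page allocator or vmalloc in the data allocation helpers
             * earlier in this file.
             */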
1743        if (block_size <= KMALLOC_MAX_SIZE &&
1744            (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
1745                unsigned align = min(1U << __ffs(block_size), (unsigned)PAGE_SIZE);
1746                snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", block_size);
1747                c->slab_cache = kmem_cache_create(slab_name, block_size, align,
1748                                                  SLAB_RECLAIM_ACCOUNT, NULL);
1749                if (!c->slab_cache) {
1750                        r = -ENOMEM;
1751                        goto bad;
1752                }
1753        }
1754        if (aux_size)
1755                snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer-%u", aux_size);
1756        else
1757                snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer");
1758        c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
1759                                           0, SLAB_RECLAIM_ACCOUNT, NULL);
1760        if (!c->slab_buffer) {
1761                r = -ENOMEM;
1762                goto bad;
1763        }
1764
1765        while (c->need_reserved_buffers) {
1766                struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
1767
1768                if (!b) {
1769                        r = -ENOMEM;
1770                        goto bad;
1771                }
1772                __free_buffer_wake(b);
1773        }
1774
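            /*
             * Register a per-client shrinker so that the VM can reclaim
             * clean, unused buffers from this cache under memory pressure.
             */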
1775        c->shrinker.count_objects = dm_bufio_shrink_count;
1776        c->shrinker.scan_objects = dm_bufio_shrink_scan;
1777        c->shrinker.seeks = 1;
1778        c->shrinker.batch = 0;
1779        r = register_shrinker(&c->shrinker);
1780        if (r)
1781                goto bad;
1782
1783        mutex_lock(&dm_bufio_clients_lock);
1784        dm_bufio_client_count++;
1785        list_add(&c->client_list, &dm_bufio_all_clients);
1786        __cache_size_refresh();
1787        mutex_unlock(&dm_bufio_clients_lock);
1788
1789        return c;
1790
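    /*
     * Error paths: release whatever was set up above before returning the
     * error to the caller.
     */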
1791bad:
1792        while (!list_empty(&c->reserved_buffers)) {
1793                struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1794                                                 struct dm_buffer, lru_list);
1795                list_del(&b->lru_list);
1796                free_buffer(b);
1797        }
1798        kmem_cache_destroy(c->slab_cache);
1799        kmem_cache_destroy(c->slab_buffer);
1800        dm_io_client_destroy(c->dm_io);
1801bad_dm_io:
1802        mutex_destroy(&c->lock);
1803        kfree(c);
1804bad_client:
1805        return ERR_PTR(r);
1806}
1807EXPORT_SYMBOL_GPL(dm_bufio_client_create);
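
    /*
     * Illustrative sketch of a caller (not part of this file): a
     * hypothetical device-mapper target creating a client for 4KiB blocks,
     * reading one block and tearing the client down again.  "my_bdev" and
     * "block_nr" are placeholders and error handling is abbreviated.
     *
     *        struct dm_bufio_client *c;
     *        struct dm_buffer *b;
     *        void *data;
     *
     *        c = dm_bufio_client_create(my_bdev, 4096, 1, 0, NULL, NULL);
     *        if (IS_ERR(c))
     *                return PTR_ERR(c);
     *
     *        data = dm_bufio_read(c, block_nr, &b);
     *        if (!IS_ERR(data)) {
     *                ... use the 4096 bytes at "data" ...
     *                dm_bufio_release(b);
     *        }
     *
     *        dm_bufio_client_destroy(c);
     */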
1808
1809/*
1810 * Free the buffering interface.
1811 * It is required that there are no outstanding references to any buffers.
1812 */
1813void dm_bufio_client_destroy(struct dm_bufio_client *c)
1814{
1815        unsigned i;
1816
1817        drop_buffers(c);
1818
1819        unregister_shrinker(&c->shrinker);
1820
1821        mutex_lock(&dm_bufio_clients_lock);
1822
1823        list_del(&c->client_list);
1824        dm_bufio_client_count--;
1825        __cache_size_refresh();
1826
1827        mutex_unlock(&dm_bufio_clients_lock);
1828
1829        BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
1830        BUG_ON(c->need_reserved_buffers);
1831
1832        while (!list_empty(&c->reserved_buffers)) {
1833                struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1834                                                 struct dm_buffer, lru_list);
1835                list_del(&b->lru_list);
1836                free_buffer(b);
1837        }
1838
1839        for (i = 0; i < LIST_SIZE; i++)
1840                if (c->n_buffers[i])
1841                        DMERR("leaked buffer count %d: %lu", i, c->n_buffers[i]);
1842
1843        for (i = 0; i < LIST_SIZE; i++)
1844                BUG_ON(c->n_buffers[i]);
1845
1846        kmem_cache_destroy(c->slab_cache);
1847        kmem_cache_destroy(c->slab_buffer);
1848        dm_io_client_destroy(c->dm_io);
1849        mutex_destroy(&c->lock);
1850        kfree(c);
1851}
1852EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
1853
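    /*
     * Set an offset, in sectors, that is added to a buffer's position
     * whenever it is read from or written to the underlying device.
     */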
1854void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
1855{
1856        c->start = start;
1857}
1858EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
1859
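    /*
     * Clamp dm_bufio_max_age so that converting it to jiffies cannot
     * overflow an unsigned int.
     */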
1860static unsigned get_max_age_hz(void)
1861{
1862        unsigned max_age = READ_ONCE(dm_bufio_max_age);
1863
1864        if (max_age > UINT_MAX / HZ)
1865                max_age = UINT_MAX / HZ;
1866
1867        return max_age * HZ;
1868}
1869
1870static bool older_than(struct dm_buffer *b, unsigned long age_hz)
1871{
1872        return time_after_eq(jiffies, b->last_accessed + age_hz);
1873}
1874
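    /*
     * Called from the periodic work item: write out dirty buffers if we are
     * over the dirty watermark, then evict clean buffers that have not been
     * accessed for age_hz jiffies, never shrinking below the retain target.
     */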
1875static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
1876{
1877        struct dm_buffer *b, *tmp;
1878        unsigned long retain_target = get_retain_buffers(c);
1879        unsigned long count;
1880        LIST_HEAD(write_list);
1881
1882        dm_bufio_lock(c);
1883
1884        __check_watermark(c, &write_list);
1885        if (unlikely(!list_empty(&write_list))) {
1886                dm_bufio_unlock(c);
1887                __flush_write_list(&write_list);
1888                dm_bufio_lock(c);
1889        }
1890
1891        count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
1892        list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
1893                if (count <= retain_target)
1894                        break;
1895
1896                if (!older_than(b, age_hz))
1897                        break;
1898
1899                if (__try_evict_buffer(b, 0))
1900                        count--;
1901
1902                cond_resched();
1903        }
1904
1905        dm_bufio_unlock(c);
1906}
1907
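    /*
     * Global cleanup, queued when the total allocation exceeds the cache
     * size limit: walk the global LRU from the oldest end, give recently
     * accessed buffers a second chance, and evict the rest until allocation
     * drops below the low watermark.  Per-client locks are taken with
     * trylock first; if that fails, the global spinlock is dropped before
     * blocking on the client lock.
     */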
1908static void do_global_cleanup(struct work_struct *w)
1909{
1910        struct dm_bufio_client *locked_client = NULL;
1911        struct dm_bufio_client *current_client;
1912        struct dm_buffer *b;
1913        unsigned spinlock_hold_count;
1914        unsigned long threshold = dm_bufio_cache_size -
1915                dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
1916        unsigned long loops = global_num * 2;
1917
1918        mutex_lock(&dm_bufio_clients_lock);
1919
1920        while (1) {
1921                cond_resched();
1922
1923                spin_lock(&global_spinlock);
1924                if (unlikely(dm_bufio_current_allocated <= threshold))
1925                        break;
1926
1927                spinlock_hold_count = 0;
1928get_next:
1929                if (!loops--)
1930                        break;
1931                if (unlikely(list_empty(&global_queue)))
1932                        break;
1933                b = list_entry(global_queue.prev, struct dm_buffer, global_list);
1934
1935                if (b->accessed) {
1936                        b->accessed = 0;
1937                        list_move(&b->global_list, &global_queue);
1938                        if (likely(++spinlock_hold_count < 16))
1939                                goto get_next;
1940                        spin_unlock(&global_spinlock);
1941                        continue;
1942                }
1943
1944                current_client = b->c;
1945                if (unlikely(current_client != locked_client)) {
1946                        if (locked_client)
1947                                dm_bufio_unlock(locked_client);
1948
1949                        if (!dm_bufio_trylock(current_client)) {
1950                                spin_unlock(&global_spinlock);
1951                                dm_bufio_lock(current_client);
1952                                locked_client = current_client;
1953                                continue;
1954                        }
1955
1956                        locked_client = current_client;
1957                }
1958
1959                spin_unlock(&global_spinlock);
1960
1961                if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) {
1962                        spin_lock(&global_spinlock);
1963                        list_move(&b->global_list, &global_queue);
1964                        spin_unlock(&global_spinlock);
1965                }
1966        }
1967
1968        spin_unlock(&global_spinlock);
1969
1970        if (locked_client)
1971                dm_bufio_unlock(locked_client);
1972
1973        mutex_unlock(&dm_bufio_clients_lock);
1974}
1975
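    /*
     * Periodic work: evict buffers older than dm_bufio_max_age from every
     * registered client.  work_fn() below reschedules this every
     * DM_BUFIO_WORK_TIMER_SECS seconds.
     */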
1976static void cleanup_old_buffers(void)
1977{
1978        unsigned long max_age_hz = get_max_age_hz();
1979        struct dm_bufio_client *c;
1980
1981        mutex_lock(&dm_bufio_clients_lock);
1982
1983        __cache_size_refresh();
1984
1985        list_for_each_entry(c, &dm_bufio_all_clients, client_list)
1986                __evict_old_buffers(c, max_age_hz);
1987
1988        mutex_unlock(&dm_bufio_clients_lock);
1989}
1990
1991static void work_fn(struct work_struct *w)
1992{
1993        cleanup_old_buffers();
1994
1995        queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
1996                           DM_BUFIO_WORK_TIMER_SECS * HZ);
1997}
1998
1999/*----------------------------------------------------------------
2000 * Module setup
2001 *--------------------------------------------------------------*/
2002
2003/*
2004 * This is called only once for the whole dm_bufio module.
2005 * It initializes the memory limit.
2006 */
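
    /*
     * For example, on a machine with 8GiB of low memory the computation
     * below yields a default cache size of 8GiB * 2 / 100, roughly 164MiB,
     * further capped by the vmalloc-based limit on CONFIG_MMU kernels.
     */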
2007static int __init dm_bufio_init(void)
2008{
2009        __u64 mem;
2010
2011        dm_bufio_allocated_kmem_cache = 0;
2012        dm_bufio_allocated_get_free_pages = 0;
2013        dm_bufio_allocated_vmalloc = 0;
2014        dm_bufio_current_allocated = 0;
2015
2016        mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
2017                               DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
2018
2019        if (mem > ULONG_MAX)
2020                mem = ULONG_MAX;
2021
2022#ifdef CONFIG_MMU
2023        if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
2024                mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
2025#endif
2026
2027        dm_bufio_default_cache_size = mem;
2028
2029        mutex_lock(&dm_bufio_clients_lock);
2030        __cache_size_refresh();
2031        mutex_unlock(&dm_bufio_clients_lock);
2032
2033        dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
2034        if (!dm_bufio_wq)
2035                return -ENOMEM;
2036
2037        INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
2038        INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
2039        queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2040                           DM_BUFIO_WORK_TIMER_SECS * HZ);
2041
2042        return 0;
2043}
2044
2045/*
2046 * This is called once when unloading the dm_bufio module.
2047 */
2048static void __exit dm_bufio_exit(void)
2049{
2050        int bug = 0;
2051
2052        cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
2053        flush_workqueue(dm_bufio_wq);
2054        destroy_workqueue(dm_bufio_wq);
2055
2056        if (dm_bufio_client_count) {
2057                DMCRIT("%s: dm_bufio_client_count leaked: %d",
2058                        __func__, dm_bufio_client_count);
2059                bug = 1;
2060        }
2061
2062        if (dm_bufio_current_allocated) {
2063                DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
2064                        __func__, dm_bufio_current_allocated);
2065                bug = 1;
2066        }
2067
2068        if (dm_bufio_allocated_get_free_pages) {
2069                DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
2070                       __func__, dm_bufio_allocated_get_free_pages);
2071                bug = 1;
2072        }
2073
2074        if (dm_bufio_allocated_vmalloc) {
2075                DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
2076                       __func__, dm_bufio_allocated_vmalloc);
2077                bug = 1;
2078        }
2079
2080        BUG_ON(bug);
2081}
2082
2083module_init(dm_bufio_init)
2084module_exit(dm_bufio_exit)
2085
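    /*
     * Module parameters.  The writable ones (S_IWUSR) can also be changed at
     * run time via /sys/module/dm_bufio/parameters/<name>.
     */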
2086module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
2087MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
2088
2089module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
2090MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
2091
2092module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR);
2093MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
2094
2095module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
2096MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
2097
2098module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
2099MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
2100
2101module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
2102MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
2103
2104module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
2105MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
2106
2107module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
2108MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
2109
2110MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2111MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
2112MODULE_LICENSE("GPL");
2113