linux/drivers/md/dm-bufio.c
   1/*
   2 * Copyright (C) 2009-2011 Red Hat, Inc.
   3 *
   4 * Author: Mikulas Patocka <mpatocka@redhat.com>
   5 *
   6 * This file is released under the GPL.
   7 */
   8
   9#include "dm-bufio.h"
  10
  11#include <linux/device-mapper.h>
  12#include <linux/dm-io.h>
  13#include <linux/slab.h>
  14#include <linux/sched/mm.h>
  15#include <linux/jiffies.h>
  16#include <linux/vmalloc.h>
  17#include <linux/shrinker.h>
  18#include <linux/module.h>
  19#include <linux/rbtree.h>
  20#include <linux/stacktrace.h>
  21
  22#define DM_MSG_PREFIX "bufio"
  23
  24/*
  25 * Memory management policy:
  26 *      Limit the number of buffers to DM_BUFIO_MEMORY_PERCENT of main memory
  27 *      or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
  28 *      Always allocate at least DM_BUFIO_MIN_BUFFERS buffers.
  29 *      Start background writeback when there are DM_BUFIO_WRITEBACK_PERCENT
  30 *      dirty buffers.
  31 */
  32#define DM_BUFIO_MIN_BUFFERS            8
  33
  34#define DM_BUFIO_MEMORY_PERCENT         2
  35#define DM_BUFIO_VMALLOC_PERCENT        25
  36#define DM_BUFIO_WRITEBACK_PERCENT      75
  37
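/*
 * Illustrative sketch (not part of the driver): the policy above caps the
 * cache at DM_BUFIO_MEMORY_PERCENT of main memory or DM_BUFIO_VMALLOC_PERCENT
 * of the vmalloc area, whichever is lower.  The helper name and the use of
 * totalram_pages/VMALLOC_TOTAL below are assumptions made only for this
 * example.
 *
 *	static unsigned long example_default_cache_size(void)
 *	{
 *		unsigned long mem = mult_frac(totalram_pages,
 *					      DM_BUFIO_MEMORY_PERCENT, 100)
 *					<< PAGE_SHIFT;
 *		unsigned long vm = mult_frac(VMALLOC_TOTAL,
 *					     DM_BUFIO_VMALLOC_PERCENT, 100);
 *
 *		return min(mem, vm);
 *	}
 */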
  38/*
  39 * Check buffer ages in this interval (seconds)
  40 */
  41#define DM_BUFIO_WORK_TIMER_SECS        30
  42
  43/*
  44 * Free buffers when they are older than this (seconds)
  45 */
  46#define DM_BUFIO_DEFAULT_AGE_SECS       300
  47
  48/*
  49 * The nr of bytes of cached data to keep around.
  50 */
  51#define DM_BUFIO_DEFAULT_RETAIN_BYTES   (256 * 1024)
  52
  53/*
  54 * The number of bvec entries that are embedded directly in the buffer.
  55 * If the chunk size is larger, dm-io is used to do the io.
  56 */
  57#define DM_BUFIO_INLINE_VECS            16
  58
  59/*
  60 * Don't try to use kmem_cache_alloc for blocks larger than this.
  61 * For explanation, see alloc_buffer_data below.
  62 */
  63#define DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT  (PAGE_SIZE >> 1)
  64#define DM_BUFIO_BLOCK_SIZE_GFP_LIMIT   (PAGE_SIZE << (MAX_ORDER - 1))
  65
  66/*
  67 * Align buffer writes to this boundary.
  68 * Tests show that SSDs have the highest IOPS when using 4k writes.
  69 */
  70#define DM_BUFIO_WRITE_ALIGN            4096
  71
  72/*
  73 * dm_buffer->list_mode
  74 */
  75#define LIST_CLEAN      0
  76#define LIST_DIRTY      1
  77#define LIST_SIZE       2
  78
  79/*
  80 * Linking of buffers:
   81 *      All buffers are indexed in the buffer_tree red/black tree with their node field.
  82 *
  83 *      Clean buffers that are not being written (B_WRITING not set)
  84 *      are linked to lru[LIST_CLEAN] with their lru_list field.
  85 *
  86 *      Dirty and clean buffers that are being written are linked to
  87 *      lru[LIST_DIRTY] with their lru_list field. When the write
  88 *      finishes, the buffer cannot be relinked immediately (because we
  89 *      are in an interrupt context and relinking requires process
  90 *      context), so some clean-not-writing buffers can be held on
  91 *      dirty_lru too.  They are later added to lru in the process
  92 *      context.
  93 */
  94struct dm_bufio_client {
  95        struct mutex lock;
  96
  97        struct list_head lru[LIST_SIZE];
  98        unsigned long n_buffers[LIST_SIZE];
  99
 100        struct block_device *bdev;
 101        unsigned block_size;
 102        unsigned char sectors_per_block_bits;
 103        unsigned char pages_per_block_bits;
 104        unsigned char blocks_per_page_bits;
 105        unsigned aux_size;
 106        void (*alloc_callback)(struct dm_buffer *);
 107        void (*write_callback)(struct dm_buffer *);
 108
 109        struct dm_io_client *dm_io;
 110
 111        struct list_head reserved_buffers;
 112        unsigned need_reserved_buffers;
 113
 114        unsigned minimum_buffers;
 115
 116        struct rb_root buffer_tree;
 117        wait_queue_head_t free_buffer_wait;
 118
 119        sector_t start;
 120
 121        int async_write_error;
 122
 123        struct list_head client_list;
 124        struct shrinker shrinker;
 125};
 126
 127/*
 128 * Buffer state bits.
 129 */
 130#define B_READING       0
 131#define B_WRITING       1
 132#define B_DIRTY         2
 133
 134/*
 135 * Describes how the block was allocated:
 136 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 137 * See the comment at alloc_buffer_data.
 138 */
 139enum data_mode {
 140        DATA_MODE_SLAB = 0,
 141        DATA_MODE_GET_FREE_PAGES = 1,
 142        DATA_MODE_VMALLOC = 2,
 143        DATA_MODE_LIMIT = 3
 144};
 145
 146struct dm_buffer {
 147        struct rb_node node;
 148        struct list_head lru_list;
 149        sector_t block;
 150        void *data;
 151        enum data_mode data_mode;
 152        unsigned char list_mode;                /* LIST_* */
 153        unsigned hold_count;
 154        blk_status_t read_error;
 155        blk_status_t write_error;
 156        unsigned long state;
 157        unsigned long last_accessed;
 158        unsigned dirty_start;
 159        unsigned dirty_end;
 160        unsigned write_start;
 161        unsigned write_end;
 162        struct dm_bufio_client *c;
 163        struct list_head write_list;
 164        struct bio bio;
 165        struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
 166#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
 167#define MAX_STACK 10
 168        struct stack_trace stack_trace;
 169        unsigned long stack_entries[MAX_STACK];
 170#endif
 171};
 172
 173/*----------------------------------------------------------------*/
 174
 175static struct kmem_cache *dm_bufio_caches[PAGE_SHIFT - SECTOR_SHIFT];
 176static char *dm_bufio_cache_names[PAGE_SHIFT - SECTOR_SHIFT];
 177
 178static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
 179{
 180        unsigned ret = c->blocks_per_page_bits - 1;
 181
 182        BUG_ON(ret >= ARRAY_SIZE(dm_bufio_caches));
 183
 184        return ret;
 185}
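/*
 * Example of the indexing above (illustrative only): with 4 KiB pages,
 * sub-page block sizes map onto the per-size slab caches as follows:
 *
 *	block_size  512 -> blocks_per_block_bits would not apply; see below
 *	block_size  512 -> blocks_per_page_bits = 3 -> cache index 2
 *	block_size 1024 -> blocks_per_page_bits = 2 -> cache index 1
 *	block_size 2048 -> blocks_per_page_bits = 1 -> cache index 0
 *
 * Blocks of PAGE_SIZE and larger have blocks_per_page_bits == 0 and never
 * use these caches (see alloc_buffer_data below).
 */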
 186
 187#define DM_BUFIO_CACHE(c)       (dm_bufio_caches[dm_bufio_cache_index(c)])
 188#define DM_BUFIO_CACHE_NAME(c)  (dm_bufio_cache_names[dm_bufio_cache_index(c)])
 189
 190#define dm_bufio_in_request()   (!!current->bio_list)
 191
 192static void dm_bufio_lock(struct dm_bufio_client *c)
 193{
 194        mutex_lock_nested(&c->lock, dm_bufio_in_request());
 195}
 196
 197static int dm_bufio_trylock(struct dm_bufio_client *c)
 198{
 199        return mutex_trylock(&c->lock);
 200}
 201
 202static void dm_bufio_unlock(struct dm_bufio_client *c)
 203{
 204        mutex_unlock(&c->lock);
 205}
 206
 207/*----------------------------------------------------------------*/
 208
 209/*
 210 * Default cache size: available memory divided by the ratio.
 211 */
 212static unsigned long dm_bufio_default_cache_size;
 213
 214/*
 215 * Total cache size set by the user.
 216 */
 217static unsigned long dm_bufio_cache_size;
 218
 219/*
 220 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
  221 * at any time.  If the two values disagree, the user has changed the cache size.
 222 */
 223static unsigned long dm_bufio_cache_size_latch;
 224
 225static DEFINE_SPINLOCK(param_spinlock);
 226
 227/*
 228 * Buffers are freed after this timeout
 229 */
 230static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
 231static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
 232
 233static unsigned long dm_bufio_peak_allocated;
 234static unsigned long dm_bufio_allocated_kmem_cache;
 235static unsigned long dm_bufio_allocated_get_free_pages;
 236static unsigned long dm_bufio_allocated_vmalloc;
 237static unsigned long dm_bufio_current_allocated;
 238
 239/*----------------------------------------------------------------*/
 240
 241/*
 242 * Per-client cache: dm_bufio_cache_size / dm_bufio_client_count
 243 */
 244static unsigned long dm_bufio_cache_size_per_client;
 245
 246/*
 247 * The current number of clients.
 248 */
 249static int dm_bufio_client_count;
 250
 251/*
 252 * The list of all clients.
 253 */
 254static LIST_HEAD(dm_bufio_all_clients);
 255
 256/*
 257 * This mutex protects dm_bufio_cache_size_latch,
 258 * dm_bufio_cache_size_per_client and dm_bufio_client_count
 259 */
 260static DEFINE_MUTEX(dm_bufio_clients_lock);
 261
 262#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
 263static void buffer_record_stack(struct dm_buffer *b)
 264{
 265        b->stack_trace.nr_entries = 0;
 266        b->stack_trace.max_entries = MAX_STACK;
 267        b->stack_trace.entries = b->stack_entries;
 268        b->stack_trace.skip = 2;
 269        save_stack_trace(&b->stack_trace);
 270}
 271#endif
 272
 273/*----------------------------------------------------------------
 274 * A red/black tree acts as an index for all the buffers.
 275 *--------------------------------------------------------------*/
 276static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
 277{
 278        struct rb_node *n = c->buffer_tree.rb_node;
 279        struct dm_buffer *b;
 280
 281        while (n) {
 282                b = container_of(n, struct dm_buffer, node);
 283
 284                if (b->block == block)
 285                        return b;
 286
  287                n = (b->block < block) ? n->rb_right : n->rb_left;
 288        }
 289
 290        return NULL;
 291}
 292
 293static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
 294{
 295        struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
 296        struct dm_buffer *found;
 297
 298        while (*new) {
 299                found = container_of(*new, struct dm_buffer, node);
 300
 301                if (found->block == b->block) {
 302                        BUG_ON(found != b);
 303                        return;
 304                }
 305
 306                parent = *new;
 307                new = (found->block < b->block) ?
  308                        &((*new)->rb_right) : &((*new)->rb_left);
 309        }
 310
 311        rb_link_node(&b->node, parent, new);
 312        rb_insert_color(&b->node, &c->buffer_tree);
 313}
 314
 315static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
 316{
 317        rb_erase(&b->node, &c->buffer_tree);
 318}
 319
 320/*----------------------------------------------------------------*/
 321
 322static void adjust_total_allocated(enum data_mode data_mode, long diff)
 323{
 324        static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
 325                &dm_bufio_allocated_kmem_cache,
 326                &dm_bufio_allocated_get_free_pages,
 327                &dm_bufio_allocated_vmalloc,
 328        };
 329
 330        spin_lock(&param_spinlock);
 331
 332        *class_ptr[data_mode] += diff;
 333
 334        dm_bufio_current_allocated += diff;
 335
 336        if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
 337                dm_bufio_peak_allocated = dm_bufio_current_allocated;
 338
 339        spin_unlock(&param_spinlock);
 340}
 341
 342/*
 343 * Change the number of clients and recalculate per-client limit.
 344 */
 345static void __cache_size_refresh(void)
 346{
 347        BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
 348        BUG_ON(dm_bufio_client_count < 0);
 349
 350        dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
 351
 352        /*
 353         * Use default if set to 0 and report the actual cache size used.
 354         */
 355        if (!dm_bufio_cache_size_latch) {
 356                (void)cmpxchg(&dm_bufio_cache_size, 0,
 357                              dm_bufio_default_cache_size);
 358                dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
 359        }
 360
 361        dm_bufio_cache_size_per_client = dm_bufio_cache_size_latch /
 362                                         (dm_bufio_client_count ? : 1);
 363}
 364
 365/*
 366 * Allocating buffer data.
 367 *
 368 * Small buffers are allocated with kmem_cache, to use space optimally.
 369 *
 370 * For large buffers, we choose between get_free_pages and vmalloc.
 371 * Each has advantages and disadvantages.
 372 *
 373 * __get_free_pages can randomly fail if the memory is fragmented.
 374 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 375 * as low as 128M) so using it for caching is not appropriate.
 376 *
 377 * If the allocation may fail we use __get_free_pages. Memory fragmentation
 378 * won't have a fatal effect here, but it just causes flushes of some other
 379 * buffers and more I/O will be performed. Don't use __get_free_pages if it
 380 * always fails (i.e. order >= MAX_ORDER).
 381 *
 382 * If the allocation shouldn't fail we use __vmalloc. This is only for the
 383 * initial reserve allocation, so there's no risk of wasting all vmalloc
 384 * space.
 385 */
 386static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
 387                               enum data_mode *data_mode)
 388{
 389        unsigned noio_flag;
 390        void *ptr;
 391
 392        if (c->block_size <= DM_BUFIO_BLOCK_SIZE_SLAB_LIMIT) {
 393                *data_mode = DATA_MODE_SLAB;
 394                return kmem_cache_alloc(DM_BUFIO_CACHE(c), gfp_mask);
 395        }
 396
 397        if (c->block_size <= DM_BUFIO_BLOCK_SIZE_GFP_LIMIT &&
 398            gfp_mask & __GFP_NORETRY) {
 399                *data_mode = DATA_MODE_GET_FREE_PAGES;
 400                return (void *)__get_free_pages(gfp_mask,
 401                                                c->pages_per_block_bits);
 402        }
 403
 404        *data_mode = DATA_MODE_VMALLOC;
 405
 406        /*
 407         * __vmalloc allocates the data pages and auxiliary structures with
 408         * gfp_flags that were specified, but pagetables are always allocated
 409         * with GFP_KERNEL, no matter what was specified as gfp_mask.
 410         *
 411         * Consequently, we must set per-process flag PF_MEMALLOC_NOIO so that
 412         * all allocations done by this process (including pagetables) are done
 413         * as if GFP_NOIO was specified.
 414         */
 415
 416        if (gfp_mask & __GFP_NORETRY)
 417                noio_flag = memalloc_noio_save();
 418
 419        ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
 420
 421        if (gfp_mask & __GFP_NORETRY)
 422                memalloc_noio_restore(noio_flag);
 423
 424        return ptr;
 425}
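/*
 * Worked example of the selection above (illustrative, assuming 4 KiB pages
 * and MAX_ORDER == 11, so the GFP limit works out to 4 MiB):
 *
 *	block_size 512 B .. 2 KiB                  -> DATA_MODE_SLAB
 *	block_size 4 KiB .. 4 MiB, __GFP_NORETRY   -> DATA_MODE_GET_FREE_PAGES
 *	anything else (reserve allocations or
 *	blocks past the GFP limit)                 -> DATA_MODE_VMALLOC
 */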
 426
 427/*
 428 * Free buffer's data.
 429 */
 430static void free_buffer_data(struct dm_bufio_client *c,
 431                             void *data, enum data_mode data_mode)
 432{
 433        switch (data_mode) {
 434        case DATA_MODE_SLAB:
 435                kmem_cache_free(DM_BUFIO_CACHE(c), data);
 436                break;
 437
 438        case DATA_MODE_GET_FREE_PAGES:
 439                free_pages((unsigned long)data, c->pages_per_block_bits);
 440                break;
 441
 442        case DATA_MODE_VMALLOC:
 443                vfree(data);
 444                break;
 445
 446        default:
 447                DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
 448                       data_mode);
 449                BUG();
 450        }
 451}
 452
 453/*
 454 * Allocate buffer and its data.
 455 */
 456static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
 457{
 458        struct dm_buffer *b = kmalloc(sizeof(struct dm_buffer) + c->aux_size,
 459                                      gfp_mask);
 460
 461        if (!b)
 462                return NULL;
 463
 464        b->c = c;
 465
 466        b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
 467        if (!b->data) {
 468                kfree(b);
 469                return NULL;
 470        }
 471
 472        adjust_total_allocated(b->data_mode, (long)c->block_size);
 473
 474#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
 475        memset(&b->stack_trace, 0, sizeof(b->stack_trace));
 476#endif
 477        return b;
 478}
 479
 480/*
 481 * Free buffer and its data.
 482 */
 483static void free_buffer(struct dm_buffer *b)
 484{
 485        struct dm_bufio_client *c = b->c;
 486
 487        adjust_total_allocated(b->data_mode, -(long)c->block_size);
 488
 489        free_buffer_data(c, b->data, b->data_mode);
 490        kfree(b);
 491}
 492
 493/*
 494 * Link buffer to the hash list and clean or dirty queue.
 495 */
 496static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
 497{
 498        struct dm_bufio_client *c = b->c;
 499
 500        c->n_buffers[dirty]++;
 501        b->block = block;
 502        b->list_mode = dirty;
 503        list_add(&b->lru_list, &c->lru[dirty]);
 504        __insert(b->c, b);
 505        b->last_accessed = jiffies;
 506}
 507
 508/*
 509 * Unlink buffer from the hash list and dirty or clean queue.
 510 */
 511static void __unlink_buffer(struct dm_buffer *b)
 512{
 513        struct dm_bufio_client *c = b->c;
 514
 515        BUG_ON(!c->n_buffers[b->list_mode]);
 516
 517        c->n_buffers[b->list_mode]--;
 518        __remove(b->c, b);
 519        list_del(&b->lru_list);
 520}
 521
 522/*
 523 * Place the buffer to the head of dirty or clean LRU queue.
 524 */
 525static void __relink_lru(struct dm_buffer *b, int dirty)
 526{
 527        struct dm_bufio_client *c = b->c;
 528
 529        BUG_ON(!c->n_buffers[b->list_mode]);
 530
 531        c->n_buffers[b->list_mode]--;
 532        c->n_buffers[dirty]++;
 533        b->list_mode = dirty;
 534        list_move(&b->lru_list, &c->lru[dirty]);
 535        b->last_accessed = jiffies;
 536}
 537
 538/*----------------------------------------------------------------
 539 * Submit I/O on the buffer.
 540 *
 541 * Bio interface is faster but it has some problems:
 542 *      the vector list is limited (increasing this limit increases
 543 *      memory-consumption per buffer, so it is not viable);
 544 *
 545 *      the memory must be direct-mapped, not vmalloced;
 546 *
 547 *      the I/O driver can reject requests spuriously if it thinks that
 548 *      the requests are too big for the device or if they cross a
 549 *      controller-defined memory boundary.
 550 *
 551 * If the buffer is small enough (up to DM_BUFIO_INLINE_VECS pages) and
 552 * it is not vmalloced, try using the bio interface.
 553 *
 554 * If the buffer is big, if it is vmalloced or if the underlying device
 555 * rejects the bio because it is too large, use dm-io layer to do the I/O.
 556 * The dm-io layer splits the I/O into multiple requests, avoiding the above
 557 * shortcomings.
 558 *--------------------------------------------------------------*/
 559
 560/*
 561 * dm-io completion routine. It just calls b->bio.bi_end_io, pretending
 562 * that the request was handled directly with bio interface.
 563 */
 564static void dmio_complete(unsigned long error, void *context)
 565{
 566        struct dm_buffer *b = context;
 567
 568        b->bio.bi_status = error ? BLK_STS_IOERR : 0;
 569        b->bio.bi_end_io(&b->bio);
 570}
 571
 572static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
 573                     unsigned n_sectors, unsigned offset, bio_end_io_t *end_io)
 574{
 575        int r;
 576        struct dm_io_request io_req = {
 577                .bi_op = rw,
 578                .bi_op_flags = 0,
 579                .notify.fn = dmio_complete,
 580                .notify.context = b,
 581                .client = b->c->dm_io,
 582        };
 583        struct dm_io_region region = {
 584                .bdev = b->c->bdev,
 585                .sector = sector,
 586                .count = n_sectors,
 587        };
 588
 589        if (b->data_mode != DATA_MODE_VMALLOC) {
 590                io_req.mem.type = DM_IO_KMEM;
 591                io_req.mem.ptr.addr = (char *)b->data + offset;
 592        } else {
 593                io_req.mem.type = DM_IO_VMA;
 594                io_req.mem.ptr.vma = (char *)b->data + offset;
 595        }
 596
 597        b->bio.bi_end_io = end_io;
 598
 599        r = dm_io(&io_req, 1, &region, NULL);
 600        if (r) {
 601                b->bio.bi_status = errno_to_blk_status(r);
 602                end_io(&b->bio);
 603        }
 604}
 605
 606static void inline_endio(struct bio *bio)
 607{
 608        bio_end_io_t *end_fn = bio->bi_private;
 609        blk_status_t status = bio->bi_status;
 610
 611        /*
 612         * Reset the bio to free any attached resources
 613         * (e.g. bio integrity profiles).
 614         */
 615        bio_reset(bio);
 616
 617        bio->bi_status = status;
 618        end_fn(bio);
 619}
 620
 621static void use_inline_bio(struct dm_buffer *b, int rw, sector_t sector,
 622                           unsigned n_sectors, unsigned offset, bio_end_io_t *end_io)
 623{
 624        char *ptr;
 625        unsigned len;
 626
 627        bio_init(&b->bio, b->bio_vec, DM_BUFIO_INLINE_VECS);
 628        b->bio.bi_iter.bi_sector = sector;
 629        bio_set_dev(&b->bio, b->c->bdev);
 630        b->bio.bi_end_io = inline_endio;
 631        /*
 632         * Use of .bi_private isn't a problem here because
 633         * the dm_buffer's inline bio is local to bufio.
 634         */
 635        b->bio.bi_private = end_io;
 636        bio_set_op_attrs(&b->bio, rw, 0);
 637
 638        ptr = (char *)b->data + offset;
 639        len = n_sectors << SECTOR_SHIFT;
 640
 641        do {
 642                unsigned this_step = min((unsigned)(PAGE_SIZE - offset_in_page(ptr)), len);
 643                if (!bio_add_page(&b->bio, virt_to_page(ptr), this_step,
 644                                  offset_in_page(ptr))) {
 645                        BUG_ON(b->c->block_size <= PAGE_SIZE);
 646                        use_dmio(b, rw, sector, n_sectors, offset, end_io);
 647                        return;
 648                }
 649
 650                len -= this_step;
 651                ptr += this_step;
 652        } while (len > 0);
 653
 654        submit_bio(&b->bio);
 655}
 656
 657static void submit_io(struct dm_buffer *b, int rw, bio_end_io_t *end_io)
 658{
 659        unsigned n_sectors;
 660        sector_t sector;
 661        unsigned offset, end;
 662
 663        sector = (b->block << b->c->sectors_per_block_bits) + b->c->start;
 664
 665        if (rw != WRITE) {
 666                n_sectors = 1 << b->c->sectors_per_block_bits;
 667                offset = 0;
 668        } else {
 669                if (b->c->write_callback)
 670                        b->c->write_callback(b);
 671                offset = b->write_start;
 672                end = b->write_end;
 673                offset &= -DM_BUFIO_WRITE_ALIGN;
 674                end += DM_BUFIO_WRITE_ALIGN - 1;
 675                end &= -DM_BUFIO_WRITE_ALIGN;
 676                if (unlikely(end > b->c->block_size))
 677                        end = b->c->block_size;
 678
 679                sector += offset >> SECTOR_SHIFT;
 680                n_sectors = (end - offset) >> SECTOR_SHIFT;
 681        }
 682
 683        if (n_sectors <= ((DM_BUFIO_INLINE_VECS * PAGE_SIZE) >> SECTOR_SHIFT) &&
 684            b->data_mode != DATA_MODE_VMALLOC)
 685                use_inline_bio(b, rw, sector, n_sectors, offset, end_io);
 686        else
 687                use_dmio(b, rw, sector, n_sectors, offset, end_io);
 688}
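/*
 * Worked example of the write alignment above (illustrative): with an 8 KiB
 * block that was partially dirtied at bytes [100, 5000):
 *
 *	offset = 100 & -4096          = 0
 *	end    = (5000 + 4095) & -4096 = 8192
 *
 * so the write covers sectors [0, 16) of the block, i.e. the whole block is
 * written even though only part of it changed, keeping writes 4 KiB aligned.
 */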
 689
 690/*----------------------------------------------------------------
 691 * Writing dirty buffers
 692 *--------------------------------------------------------------*/
 693
 694/*
 695 * The endio routine for write.
 696 *
 697 * Set the error, clear B_WRITING bit and wake anyone who was waiting on
 698 * it.
 699 */
 700static void write_endio(struct bio *bio)
 701{
 702        struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 703
 704        b->write_error = bio->bi_status;
 705        if (unlikely(bio->bi_status)) {
 706                struct dm_bufio_client *c = b->c;
 707
 708                (void)cmpxchg(&c->async_write_error, 0,
 709                                blk_status_to_errno(bio->bi_status));
 710        }
 711
 712        BUG_ON(!test_bit(B_WRITING, &b->state));
 713
 714        smp_mb__before_atomic();
 715        clear_bit(B_WRITING, &b->state);
 716        smp_mb__after_atomic();
 717
 718        wake_up_bit(&b->state, B_WRITING);
 719}
 720
 721/*
 722 * Initiate a write on a dirty buffer, but don't wait for it.
 723 *
 724 * - If the buffer is not dirty, exit.
  725 * - If there is some previous write going on, wait for it to finish (we can't
 726 *   have two writes on the same buffer simultaneously).
 727 * - Submit our write and don't wait on it. We set B_WRITING indicating
 728 *   that there is a write in progress.
 729 */
 730static void __write_dirty_buffer(struct dm_buffer *b,
 731                                 struct list_head *write_list)
 732{
 733        if (!test_bit(B_DIRTY, &b->state))
 734                return;
 735
 736        clear_bit(B_DIRTY, &b->state);
 737        wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
 738
 739        b->write_start = b->dirty_start;
 740        b->write_end = b->dirty_end;
 741
 742        if (!write_list)
 743                submit_io(b, WRITE, write_endio);
 744        else
 745                list_add_tail(&b->write_list, write_list);
 746}
 747
 748static void __flush_write_list(struct list_head *write_list)
 749{
 750        struct blk_plug plug;
 751        blk_start_plug(&plug);
 752        while (!list_empty(write_list)) {
 753                struct dm_buffer *b =
 754                        list_entry(write_list->next, struct dm_buffer, write_list);
 755                list_del(&b->write_list);
 756                submit_io(b, WRITE, write_endio);
 757                cond_resched();
 758        }
 759        blk_finish_plug(&plug);
 760}
 761
 762/*
 763 * Wait until any activity on the buffer finishes.  Possibly write the
 764 * buffer if it is dirty.  When this function finishes, there is no I/O
 765 * running on the buffer and the buffer is not dirty.
 766 */
 767static void __make_buffer_clean(struct dm_buffer *b)
 768{
 769        BUG_ON(b->hold_count);
 770
 771        if (!b->state)  /* fast case */
 772                return;
 773
 774        wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
 775        __write_dirty_buffer(b, NULL);
 776        wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
 777}
 778
 779/*
 780 * Find some buffer that is not held by anybody, clean it, unlink it and
 781 * return it.
 782 */
 783static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
 784{
 785        struct dm_buffer *b;
 786
 787        list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
 788                BUG_ON(test_bit(B_WRITING, &b->state));
 789                BUG_ON(test_bit(B_DIRTY, &b->state));
 790
 791                if (!b->hold_count) {
 792                        __make_buffer_clean(b);
 793                        __unlink_buffer(b);
 794                        return b;
 795                }
 796                cond_resched();
 797        }
 798
 799        list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
 800                BUG_ON(test_bit(B_READING, &b->state));
 801
 802                if (!b->hold_count) {
 803                        __make_buffer_clean(b);
 804                        __unlink_buffer(b);
 805                        return b;
 806                }
 807                cond_resched();
 808        }
 809
 810        return NULL;
 811}
 812
 813/*
 814 * Wait until some other threads free some buffer or release hold count on
 815 * some buffer.
 816 *
 817 * This function is entered with c->lock held, drops it and regains it
 818 * before exiting.
 819 */
 820static void __wait_for_free_buffer(struct dm_bufio_client *c)
 821{
 822        DECLARE_WAITQUEUE(wait, current);
 823
 824        add_wait_queue(&c->free_buffer_wait, &wait);
 825        set_current_state(TASK_UNINTERRUPTIBLE);
 826        dm_bufio_unlock(c);
 827
 828        io_schedule();
 829
 830        remove_wait_queue(&c->free_buffer_wait, &wait);
 831
 832        dm_bufio_lock(c);
 833}
 834
 835enum new_flag {
 836        NF_FRESH = 0,
 837        NF_READ = 1,
 838        NF_GET = 2,
 839        NF_PREFETCH = 3
 840};
 841
 842/*
 843 * Allocate a new buffer. If the allocation is not possible, wait until
 844 * some other thread frees a buffer.
 845 *
 846 * May drop the lock and regain it.
 847 */
 848static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
 849{
 850        struct dm_buffer *b;
 851        bool tried_noio_alloc = false;
 852
 853        /*
 854         * dm-bufio is resistant to allocation failures (it just keeps
  855         * one buffer reserved in case all the allocations fail).
 856         * So set flags to not try too hard:
 857         *      GFP_NOWAIT: don't wait; if we need to sleep we'll release our
 858         *                  mutex and wait ourselves.
 859         *      __GFP_NORETRY: don't retry and rather return failure
 860         *      __GFP_NOMEMALLOC: don't use emergency reserves
 861         *      __GFP_NOWARN: don't print a warning in case of failure
 862         *
 863         * For debugging, if we set the cache size to 1, no new buffers will
 864         * be allocated.
 865         */
 866        while (1) {
 867                if (dm_bufio_cache_size_latch != 1) {
 868                        b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
 869                        if (b)
 870                                return b;
 871                }
 872
 873                if (nf == NF_PREFETCH)
 874                        return NULL;
 875
 876                if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
 877                        dm_bufio_unlock(c);
 878                        b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
 879                        dm_bufio_lock(c);
 880                        if (b)
 881                                return b;
 882                        tried_noio_alloc = true;
 883                }
 884
 885                if (!list_empty(&c->reserved_buffers)) {
 886                        b = list_entry(c->reserved_buffers.next,
 887                                       struct dm_buffer, lru_list);
 888                        list_del(&b->lru_list);
 889                        c->need_reserved_buffers++;
 890
 891                        return b;
 892                }
 893
 894                b = __get_unclaimed_buffer(c);
 895                if (b)
 896                        return b;
 897
 898                __wait_for_free_buffer(c);
 899        }
 900}
 901
 902static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
 903{
 904        struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
 905
 906        if (!b)
 907                return NULL;
 908
 909        if (c->alloc_callback)
 910                c->alloc_callback(b);
 911
 912        return b;
 913}
 914
 915/*
 916 * Free a buffer and wake other threads waiting for free buffers.
 917 */
 918static void __free_buffer_wake(struct dm_buffer *b)
 919{
 920        struct dm_bufio_client *c = b->c;
 921
 922        if (!c->need_reserved_buffers)
 923                free_buffer(b);
 924        else {
 925                list_add(&b->lru_list, &c->reserved_buffers);
 926                c->need_reserved_buffers--;
 927        }
 928
 929        wake_up(&c->free_buffer_wait);
 930}
 931
 932static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
 933                                        struct list_head *write_list)
 934{
 935        struct dm_buffer *b, *tmp;
 936
 937        list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
 938                BUG_ON(test_bit(B_READING, &b->state));
 939
 940                if (!test_bit(B_DIRTY, &b->state) &&
 941                    !test_bit(B_WRITING, &b->state)) {
 942                        __relink_lru(b, LIST_CLEAN);
 943                        continue;
 944                }
 945
 946                if (no_wait && test_bit(B_WRITING, &b->state))
 947                        return;
 948
 949                __write_dirty_buffer(b, write_list);
 950                cond_resched();
 951        }
 952}
 953
 954/*
 955 * Get writeback threshold and buffer limit for a given client.
 956 */
 957static void __get_memory_limit(struct dm_bufio_client *c,
 958                               unsigned long *threshold_buffers,
 959                               unsigned long *limit_buffers)
 960{
 961        unsigned long buffers;
 962
 963        if (unlikely(READ_ONCE(dm_bufio_cache_size) != dm_bufio_cache_size_latch)) {
 964                if (mutex_trylock(&dm_bufio_clients_lock)) {
 965                        __cache_size_refresh();
 966                        mutex_unlock(&dm_bufio_clients_lock);
 967                }
 968        }
 969
 970        buffers = dm_bufio_cache_size_per_client >>
 971                  (c->sectors_per_block_bits + SECTOR_SHIFT);
 972
 973        if (buffers < c->minimum_buffers)
 974                buffers = c->minimum_buffers;
 975
 976        *limit_buffers = buffers;
 977        *threshold_buffers = mult_frac(buffers,
 978                                       DM_BUFIO_WRITEBACK_PERCENT, 100);
 979}
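/*
 * Example of the limits above (illustrative): with 4 KiB blocks
 * (sectors_per_block_bits == 3) and a 64 MiB per-client share:
 *
 *	limit_buffers     = 64 MiB / 4 KiB = 16384
 *	threshold_buffers = 16384 * DM_BUFIO_WRITEBACK_PERCENT / 100 = 12288
 *
 * so background writeback starts once more than 12288 buffers are dirty,
 * and clean buffers start being evicted above 16384 buffers total.
 */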
 980
 981/*
  982 * Check if we're over the watermarks.
  983 * If we're over "limit_buffers", free unclaimed buffers until we get back under the limit.
  984 * If the number of dirty buffers exceeds "threshold_buffers", start asynchronous writeback.
 985 */
 986static void __check_watermark(struct dm_bufio_client *c,
 987                              struct list_head *write_list)
 988{
 989        unsigned long threshold_buffers, limit_buffers;
 990
 991        __get_memory_limit(c, &threshold_buffers, &limit_buffers);
 992
 993        while (c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY] >
 994               limit_buffers) {
 995
 996                struct dm_buffer *b = __get_unclaimed_buffer(c);
 997
 998                if (!b)
 999                        return;
1000
1001                __free_buffer_wake(b);
1002                cond_resched();
1003        }
1004
1005        if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
1006                __write_dirty_buffers_async(c, 1, write_list);
1007}
1008
1009/*----------------------------------------------------------------
1010 * Getting a buffer
1011 *--------------------------------------------------------------*/
1012
1013static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
1014                                     enum new_flag nf, int *need_submit,
1015                                     struct list_head *write_list)
1016{
1017        struct dm_buffer *b, *new_b = NULL;
1018
1019        *need_submit = 0;
1020
1021        b = __find(c, block);
1022        if (b)
1023                goto found_buffer;
1024
1025        if (nf == NF_GET)
1026                return NULL;
1027
1028        new_b = __alloc_buffer_wait(c, nf);
1029        if (!new_b)
1030                return NULL;
1031
1032        /*
1033         * We've had a period where the mutex was unlocked, so need to
1034         * recheck the hash table.
1035         */
1036        b = __find(c, block);
1037        if (b) {
1038                __free_buffer_wake(new_b);
1039                goto found_buffer;
1040        }
1041
1042        __check_watermark(c, write_list);
1043
1044        b = new_b;
1045        b->hold_count = 1;
1046        b->read_error = 0;
1047        b->write_error = 0;
1048        __link_buffer(b, block, LIST_CLEAN);
1049
1050        if (nf == NF_FRESH) {
1051                b->state = 0;
1052                return b;
1053        }
1054
1055        b->state = 1 << B_READING;
1056        *need_submit = 1;
1057
1058        return b;
1059
1060found_buffer:
1061        if (nf == NF_PREFETCH)
1062                return NULL;
1063        /*
1064         * Note: it is essential that we don't wait for the buffer to be
1065         * read if dm_bufio_get function is used. Both dm_bufio_get and
1066         * dm_bufio_prefetch can be used in the driver request routine.
1067         * If the user called both dm_bufio_prefetch and dm_bufio_get on
1068         * the same buffer, it would deadlock if we waited.
1069         */
1070        if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
1071                return NULL;
1072
1073        b->hold_count++;
1074        __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
1075                     test_bit(B_WRITING, &b->state));
1076        return b;
1077}
1078
1079/*
1080 * The endio routine for reading: set the error, clear the bit and wake up
1081 * anyone waiting on the buffer.
1082 */
1083static void read_endio(struct bio *bio)
1084{
1085        struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
1086
1087        b->read_error = bio->bi_status;
1088
1089        BUG_ON(!test_bit(B_READING, &b->state));
1090
1091        smp_mb__before_atomic();
1092        clear_bit(B_READING, &b->state);
1093        smp_mb__after_atomic();
1094
1095        wake_up_bit(&b->state, B_READING);
1096}
1097
1098/*
1099 * A common routine for dm_bufio_new and dm_bufio_read.  Operation of these
1100 * functions is similar except that dm_bufio_new doesn't read the
1101 * buffer from the disk (assuming that the caller overwrites all the data
1102 * and uses dm_bufio_mark_buffer_dirty to write new data back).
1103 */
1104static void *new_read(struct dm_bufio_client *c, sector_t block,
1105                      enum new_flag nf, struct dm_buffer **bp)
1106{
1107        int need_submit;
1108        struct dm_buffer *b;
1109
1110        LIST_HEAD(write_list);
1111
1112        dm_bufio_lock(c);
1113        b = __bufio_new(c, block, nf, &need_submit, &write_list);
1114#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1115        if (b && b->hold_count == 1)
1116                buffer_record_stack(b);
1117#endif
1118        dm_bufio_unlock(c);
1119
1120        __flush_write_list(&write_list);
1121
1122        if (!b)
1123                return NULL;
1124
1125        if (need_submit)
1126                submit_io(b, READ, read_endio);
1127
1128        wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1129
1130        if (b->read_error) {
1131                int error = blk_status_to_errno(b->read_error);
1132
1133                dm_bufio_release(b);
1134
1135                return ERR_PTR(error);
1136        }
1137
1138        *bp = b;
1139
1140        return b->data;
1141}
1142
1143void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1144                   struct dm_buffer **bp)
1145{
1146        return new_read(c, block, NF_GET, bp);
1147}
1148EXPORT_SYMBOL_GPL(dm_bufio_get);
1149
1150void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1151                    struct dm_buffer **bp)
1152{
1153        BUG_ON(dm_bufio_in_request());
1154
1155        return new_read(c, block, NF_READ, bp);
1156}
1157EXPORT_SYMBOL_GPL(dm_bufio_read);
1158
1159void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1160                   struct dm_buffer **bp)
1161{
1162        BUG_ON(dm_bufio_in_request());
1163
1164        return new_read(c, block, NF_FRESH, bp);
1165}
1166EXPORT_SYMBOL_GPL(dm_bufio_new);
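/*
 * Usage sketch (illustrative, not part of the driver): a typical
 * read-modify-write cycle using the interfaces above.  The client pointer,
 * block number and byte offset are made up for the example.
 *
 *	struct dm_buffer *buf;
 *	u8 *data;
 *
 *	data = dm_bufio_read(client, block_nr, &buf);
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *
 *	data[42] ^= 0xff;			  modify the cached block
 *	dm_bufio_mark_buffer_dirty(buf);	  schedule it for writeback
 *	dm_bufio_release(buf);			  drop our hold count
 *
 * dm_bufio_new() is used the same way when the whole block is about to be
 * overwritten, so the read from disk can be skipped.
 */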
1167
1168void dm_bufio_prefetch(struct dm_bufio_client *c,
1169                       sector_t block, unsigned n_blocks)
1170{
1171        struct blk_plug plug;
1172
1173        LIST_HEAD(write_list);
1174
1175        BUG_ON(dm_bufio_in_request());
1176
1177        blk_start_plug(&plug);
1178        dm_bufio_lock(c);
1179
1180        for (; n_blocks--; block++) {
1181                int need_submit;
1182                struct dm_buffer *b;
1183                b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
1184                                &write_list);
1185                if (unlikely(!list_empty(&write_list))) {
1186                        dm_bufio_unlock(c);
1187                        blk_finish_plug(&plug);
1188                        __flush_write_list(&write_list);
1189                        blk_start_plug(&plug);
1190                        dm_bufio_lock(c);
1191                }
1192                if (unlikely(b != NULL)) {
1193                        dm_bufio_unlock(c);
1194
1195                        if (need_submit)
1196                                submit_io(b, READ, read_endio);
1197                        dm_bufio_release(b);
1198
1199                        cond_resched();
1200
1201                        if (!n_blocks)
1202                                goto flush_plug;
1203                        dm_bufio_lock(c);
1204                }
1205        }
1206
1207        dm_bufio_unlock(c);
1208
1209flush_plug:
1210        blk_finish_plug(&plug);
1211}
1212EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
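/*
 * Usage sketch (illustrative): issuing a prefetch before the blocks are
 * actually needed, so that later dm_bufio_read() calls are likely to hit
 * the cache.  The block range is made up for the example.
 *
 *	dm_bufio_prefetch(client, first_block, 8);
 *	...
 *	data = dm_bufio_read(client, first_block, &buf);
 *
 * dm_bufio_prefetch() never sleeps waiting for the reads to finish; it only
 * starts the I/O and releases the buffers again.
 */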
1213
1214void dm_bufio_release(struct dm_buffer *b)
1215{
1216        struct dm_bufio_client *c = b->c;
1217
1218        dm_bufio_lock(c);
1219
1220        BUG_ON(!b->hold_count);
1221
1222        b->hold_count--;
1223        if (!b->hold_count) {
1224                wake_up(&c->free_buffer_wait);
1225
1226                /*
1227                 * If there were errors on the buffer, and the buffer is not
1228                 * to be written, free the buffer. There is no point in caching
 1229                 * an invalid buffer.
1230                 */
1231                if ((b->read_error || b->write_error) &&
1232                    !test_bit(B_READING, &b->state) &&
1233                    !test_bit(B_WRITING, &b->state) &&
1234                    !test_bit(B_DIRTY, &b->state)) {
1235                        __unlink_buffer(b);
1236                        __free_buffer_wake(b);
1237                }
1238        }
1239
1240        dm_bufio_unlock(c);
1241}
1242EXPORT_SYMBOL_GPL(dm_bufio_release);
1243
1244void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
1245                                        unsigned start, unsigned end)
1246{
1247        struct dm_bufio_client *c = b->c;
1248
1249        BUG_ON(start >= end);
1250        BUG_ON(end > b->c->block_size);
1251
1252        dm_bufio_lock(c);
1253
1254        BUG_ON(test_bit(B_READING, &b->state));
1255
1256        if (!test_and_set_bit(B_DIRTY, &b->state)) {
1257                b->dirty_start = start;
1258                b->dirty_end = end;
1259                __relink_lru(b, LIST_DIRTY);
1260        } else {
1261                if (start < b->dirty_start)
1262                        b->dirty_start = start;
1263                if (end > b->dirty_end)
1264                        b->dirty_end = end;
1265        }
1266
1267        dm_bufio_unlock(c);
1268}
1269EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
1270
1271void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1272{
1273        dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
1274}
1275EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
1276
1277void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
1278{
1279        LIST_HEAD(write_list);
1280
1281        BUG_ON(dm_bufio_in_request());
1282
1283        dm_bufio_lock(c);
1284        __write_dirty_buffers_async(c, 0, &write_list);
1285        dm_bufio_unlock(c);
1286        __flush_write_list(&write_list);
1287}
1288EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
1289
1290/*
1291 * For performance, it is essential that the buffers are written asynchronously
1292 * and simultaneously (so that the block layer can merge the writes) and then
1293 * waited upon.
1294 *
1295 * Finally, we flush hardware disk cache.
1296 */
1297int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
1298{
1299        int a, f;
1300        unsigned long buffers_processed = 0;
1301        struct dm_buffer *b, *tmp;
1302
1303        LIST_HEAD(write_list);
1304
1305        dm_bufio_lock(c);
1306        __write_dirty_buffers_async(c, 0, &write_list);
1307        dm_bufio_unlock(c);
1308        __flush_write_list(&write_list);
1309        dm_bufio_lock(c);
1310
1311again:
1312        list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
1313                int dropped_lock = 0;
1314
1315                if (buffers_processed < c->n_buffers[LIST_DIRTY])
1316                        buffers_processed++;
1317
1318                BUG_ON(test_bit(B_READING, &b->state));
1319
1320                if (test_bit(B_WRITING, &b->state)) {
1321                        if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
1322                                dropped_lock = 1;
1323                                b->hold_count++;
1324                                dm_bufio_unlock(c);
1325                                wait_on_bit_io(&b->state, B_WRITING,
1326                                               TASK_UNINTERRUPTIBLE);
1327                                dm_bufio_lock(c);
1328                                b->hold_count--;
1329                        } else
1330                                wait_on_bit_io(&b->state, B_WRITING,
1331                                               TASK_UNINTERRUPTIBLE);
1332                }
1333
1334                if (!test_bit(B_DIRTY, &b->state) &&
1335                    !test_bit(B_WRITING, &b->state))
1336                        __relink_lru(b, LIST_CLEAN);
1337
1338                cond_resched();
1339
1340                /*
1341                 * If we dropped the lock, the list is no longer consistent,
1342                 * so we must restart the search.
1343                 *
1344                 * In the most common case, the buffer just processed is
1345                 * relinked to the clean list, so we won't loop scanning the
1346                 * same buffer again and again.
1347                 *
1348                 * This may livelock if there is another thread simultaneously
1349                 * dirtying buffers, so we count the number of buffers walked
1350                 * and if it exceeds the total number of buffers, it means that
1351                 * someone is doing some writes simultaneously with us.  In
1352                 * this case, stop, dropping the lock.
1353                 */
1354                if (dropped_lock)
1355                        goto again;
1356        }
1357        wake_up(&c->free_buffer_wait);
1358        dm_bufio_unlock(c);
1359
1360        a = xchg(&c->async_write_error, 0);
1361        f = dm_bufio_issue_flush(c);
1362        if (a)
1363                return a;
1364
1365        return f;
1366}
1367EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
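/*
 * Usage sketch (illustrative): a commit point for a client that batches
 * metadata updates.  Dirty buffers are written out and waited for, and the
 * return value carries any asynchronous write error as well as the result
 * of the final cache flush.
 *
 *	dm_bufio_mark_buffer_dirty(buf);
 *	dm_bufio_release(buf);
 *	...
 *	r = dm_bufio_write_dirty_buffers(client);
 *	if (r)
 *		return r;
 */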
1368
1369/*
 1370 * Use dm-io to send an empty barrier to flush the device.
1371 */
1372int dm_bufio_issue_flush(struct dm_bufio_client *c)
1373{
1374        struct dm_io_request io_req = {
1375                .bi_op = REQ_OP_WRITE,
1376                .bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
1377                .mem.type = DM_IO_KMEM,
1378                .mem.ptr.addr = NULL,
1379                .client = c->dm_io,
1380        };
1381        struct dm_io_region io_reg = {
1382                .bdev = c->bdev,
1383                .sector = 0,
1384                .count = 0,
1385        };
1386
1387        BUG_ON(dm_bufio_in_request());
1388
1389        return dm_io(&io_req, 1, &io_reg, NULL);
1390}
1391EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
1392
1393/*
1394 * We first delete any other buffer that may be at that new location.
1395 *
1396 * Then, we write the buffer to the original location if it was dirty.
1397 *
1398 * Then, if we are the only one who is holding the buffer, relink the buffer
1399 * in the hash queue for the new location.
1400 *
1401 * If there was someone else holding the buffer, we write it to the new
1402 * location but not relink it, because that other user needs to have the buffer
1403 * at the same place.
1404 */
1405void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
1406{
1407        struct dm_bufio_client *c = b->c;
1408        struct dm_buffer *new;
1409
1410        BUG_ON(dm_bufio_in_request());
1411
1412        dm_bufio_lock(c);
1413
1414retry:
1415        new = __find(c, new_block);
1416        if (new) {
1417                if (new->hold_count) {
1418                        __wait_for_free_buffer(c);
1419                        goto retry;
1420                }
1421
1422                /*
1423                 * FIXME: Is there any point waiting for a write that's going
1424                 * to be overwritten in a bit?
1425                 */
1426                __make_buffer_clean(new);
1427                __unlink_buffer(new);
1428                __free_buffer_wake(new);
1429        }
1430
1431        BUG_ON(!b->hold_count);
1432        BUG_ON(test_bit(B_READING, &b->state));
1433
1434        __write_dirty_buffer(b, NULL);
1435        if (b->hold_count == 1) {
1436                wait_on_bit_io(&b->state, B_WRITING,
1437                               TASK_UNINTERRUPTIBLE);
1438                set_bit(B_DIRTY, &b->state);
1439                b->dirty_start = 0;
1440                b->dirty_end = c->block_size;
1441                __unlink_buffer(b);
1442                __link_buffer(b, new_block, LIST_DIRTY);
1443        } else {
1444                sector_t old_block;
1445                wait_on_bit_lock_io(&b->state, B_WRITING,
1446                                    TASK_UNINTERRUPTIBLE);
1447                /*
1448                 * Relink buffer to "new_block" so that write_callback
1449                 * sees "new_block" as a block number.
1450                 * After the write, link the buffer back to old_block.
1451                 * All this must be done in bufio lock, so that block number
1452                 * change isn't visible to other threads.
1453                 */
1454                old_block = b->block;
1455                __unlink_buffer(b);
1456                __link_buffer(b, new_block, b->list_mode);
1457                submit_io(b, WRITE, write_endio);
1458                wait_on_bit_io(&b->state, B_WRITING,
1459                               TASK_UNINTERRUPTIBLE);
1460                __unlink_buffer(b);
1461                __link_buffer(b, old_block, b->list_mode);
1462        }
1463
1464        dm_bufio_unlock(c);
1465        dm_bufio_release(b);
1466}
1467EXPORT_SYMBOL_GPL(dm_bufio_release_move);
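/*
 * Usage sketch (illustrative): writing the contents of a held buffer to a
 * different block, e.g. when metadata is being relocated.  The buffer is
 * released by the call itself, so no separate dm_bufio_release() is needed.
 *
 *	data = dm_bufio_read(client, old_block, &buf);
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	dm_bufio_release_move(buf, new_block);
 */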
1468
1469/*
1470 * Free the given buffer.
1471 *
 1472 * This is just a hint: if the buffer is in use or dirty, this function
1473 * does nothing.
1474 */
1475void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
1476{
1477        struct dm_buffer *b;
1478
1479        dm_bufio_lock(c);
1480
1481        b = __find(c, block);
1482        if (b && likely(!b->hold_count) && likely(!b->state)) {
1483                __unlink_buffer(b);
1484                __free_buffer_wake(b);
1485        }
1486
1487        dm_bufio_unlock(c);
1488}
1489EXPORT_SYMBOL(dm_bufio_forget);
1490
1491void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
1492{
1493        c->minimum_buffers = n;
1494}
1495EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);
1496
1497unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
1498{
1499        return c->block_size;
1500}
1501EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
1502
1503sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
1504{
1505        return i_size_read(c->bdev->bd_inode) >>
1506                           (SECTOR_SHIFT + c->sectors_per_block_bits);
1507}
1508EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
1509
1510sector_t dm_bufio_get_block_number(struct dm_buffer *b)
1511{
1512        return b->block;
1513}
1514EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
1515
1516void *dm_bufio_get_block_data(struct dm_buffer *b)
1517{
1518        return b->data;
1519}
1520EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
1521
1522void *dm_bufio_get_aux_data(struct dm_buffer *b)
1523{
1524        return b + 1;
1525}
1526EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
1527
1528struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
1529{
1530        return b->c;
1531}
1532EXPORT_SYMBOL_GPL(dm_bufio_get_client);
1533
1534static void drop_buffers(struct dm_bufio_client *c)
1535{
1536        struct dm_buffer *b;
1537        int i;
1538        bool warned = false;
1539
1540        BUG_ON(dm_bufio_in_request());
1541
1542        /*
1543         * An optimization so that the buffers are not written one-by-one.
1544         */
1545        dm_bufio_write_dirty_buffers_async(c);
1546
1547        dm_bufio_lock(c);
1548
1549        while ((b = __get_unclaimed_buffer(c)))
1550                __free_buffer_wake(b);
1551
1552        for (i = 0; i < LIST_SIZE; i++)
1553                list_for_each_entry(b, &c->lru[i], lru_list) {
1554                        WARN_ON(!warned);
1555                        warned = true;
1556                        DMERR("leaked buffer %llx, hold count %u, list %d",
1557                              (unsigned long long)b->block, b->hold_count, i);
1558#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1559                        print_stack_trace(&b->stack_trace, 1);
1560                        b->hold_count = 0; /* mark unclaimed to avoid BUG_ON below */
1561#endif
1562                }
1563
1564#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1565        while ((b = __get_unclaimed_buffer(c)))
1566                __free_buffer_wake(b);
1567#endif
1568
1569        for (i = 0; i < LIST_SIZE; i++)
1570                BUG_ON(!list_empty(&c->lru[i]));
1571
1572        dm_bufio_unlock(c);
1573}
1574
1575/*
 1576 * We may not be able to evict this buffer if I/O is pending or the client
 1577 * is still using it.  The caller is expected to know the buffer is too old.
1578 *
1579 * And if GFP_NOFS is used, we must not do any I/O because we hold
1580 * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
 1581 * rerouted to a different bufio client.
1582 */
1583static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
1584{
1585        if (!(gfp & __GFP_FS)) {
1586                if (test_bit(B_READING, &b->state) ||
1587                    test_bit(B_WRITING, &b->state) ||
1588                    test_bit(B_DIRTY, &b->state))
1589                        return false;
1590        }
1591
1592        if (b->hold_count)
1593                return false;
1594
1595        __make_buffer_clean(b);
1596        __unlink_buffer(b);
1597        __free_buffer_wake(b);
1598
1599        return true;
1600}
1601
1602static unsigned long get_retain_buffers(struct dm_bufio_client *c)
1603{
1604        unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
1605        return retain_bytes >> (c->sectors_per_block_bits + SECTOR_SHIFT);
1606}
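/*
 * Example (illustrative): with the default DM_BUFIO_DEFAULT_RETAIN_BYTES of
 * 256 KiB and 4 KiB blocks, get_retain_buffers() returns 64, so the shrinker
 * keeps at least 64 of this client's buffers resident even under memory
 * pressure.
 */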
1607
1608static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
1609                            gfp_t gfp_mask)
1610{
1611        int l;
1612        struct dm_buffer *b, *tmp;
1613        unsigned long freed = 0;
1614        unsigned long count = c->n_buffers[LIST_CLEAN] +
1615                              c->n_buffers[LIST_DIRTY];
1616        unsigned long retain_target = get_retain_buffers(c);
1617
1618        for (l = 0; l < LIST_SIZE; l++) {
1619                list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
1620                        if (__try_evict_buffer(b, gfp_mask))
1621                                freed++;
1622                        if (!--nr_to_scan || ((count - freed) <= retain_target))
1623                                return freed;
1624                        cond_resched();
1625                }
1626        }
1627        return freed;
1628}
1629
1630static unsigned long
1631dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
1632{
1633        struct dm_bufio_client *c;
1634        unsigned long freed;
1635
1636        c = container_of(shrink, struct dm_bufio_client, shrinker);
1637        if (sc->gfp_mask & __GFP_FS)
1638                dm_bufio_lock(c);
1639        else if (!dm_bufio_trylock(c))
1640                return SHRINK_STOP;
1641
1642        freed  = __scan(c, sc->nr_to_scan, sc->gfp_mask);
1643        dm_bufio_unlock(c);
1644        return freed;
1645}
1646
1647static unsigned long
1648dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
1649{
1650        struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
1651        unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) +
1652                              READ_ONCE(c->n_buffers[LIST_DIRTY]);
1653        unsigned long retain_target = get_retain_buffers(c);
1654
1655        return (count < retain_target) ? 0 : (count - retain_target);
1656}
1657
1658/*
1659 * Create the buffering interface
1660 */
1661struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
1662                                               unsigned reserved_buffers, unsigned aux_size,
1663                                               void (*alloc_callback)(struct dm_buffer *),
1664                                               void (*write_callback)(struct dm_buffer *))
1665{
1666        int r;
1667        struct dm_bufio_client *c;
1668        unsigned i;
1669
1670        BUG_ON(block_size < 1 << SECTOR_SHIFT ||
1671               (block_size & (block_size - 1)));
1672
1673        c = kzalloc(sizeof(*c), GFP_KERNEL);
1674        if (!c) {
1675                r = -ENOMEM;
1676                goto bad_client;
1677        }
1678        c->buffer_tree = RB_ROOT;
1679
1680        c->bdev = bdev;
1681        c->block_size = block_size;
1682        c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
1683        c->pages_per_block_bits = (__ffs(block_size) >= PAGE_SHIFT) ?
1684                                  __ffs(block_size) - PAGE_SHIFT : 0;
1685        c->blocks_per_page_bits = (__ffs(block_size) < PAGE_SHIFT ?
1686                                  PAGE_SHIFT - __ffs(block_size) : 0);
1687
1688        c->aux_size = aux_size;
1689        c->alloc_callback = alloc_callback;
1690        c->write_callback = write_callback;
1691
1692        for (i = 0; i < LIST_SIZE; i++) {
1693                INIT_LIST_HEAD(&c->lru[i]);
1694                c->n_buffers[i] = 0;
1695        }
1696
1697        mutex_init(&c->lock);
1698        INIT_LIST_HEAD(&c->reserved_buffers);
1699        c->need_reserved_buffers = reserved_buffers;
1700
1701        c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;
1702
1703        init_waitqueue_head(&c->free_buffer_wait);
1704        c->async_write_error = 0;
1705
1706        c->dm_io = dm_io_client_create();
1707        if (IS_ERR(c->dm_io)) {
1708                r = PTR_ERR(c->dm_io);
1709                goto bad_dm_io;
1710        }
1711
1712        mutex_lock(&dm_bufio_clients_lock);
1713        if (c->blocks_per_page_bits) {
1714                if (!DM_BUFIO_CACHE_NAME(c)) {
1715                        DM_BUFIO_CACHE_NAME(c) = kasprintf(GFP_KERNEL, "dm_bufio_cache-%u", c->block_size);
1716                        if (!DM_BUFIO_CACHE_NAME(c)) {
1717                                r = -ENOMEM;
1718                                mutex_unlock(&dm_bufio_clients_lock);
1719                                goto bad_cache;
1720                        }
1721                }
1722
1723                if (!DM_BUFIO_CACHE(c)) {
1724                        DM_BUFIO_CACHE(c) = kmem_cache_create(DM_BUFIO_CACHE_NAME(c),
1725                                                              c->block_size,
1726                                                              c->block_size, 0, NULL);
1727                        if (!DM_BUFIO_CACHE(c)) {
1728                                r = -ENOMEM;
1729                                mutex_unlock(&dm_bufio_clients_lock);
1730                                goto bad_cache;
1731                        }
1732                }
1733        }
1734        mutex_unlock(&dm_bufio_clients_lock);
1735
1736        while (c->need_reserved_buffers) {
1737                struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
1738
1739                if (!b) {
1740                        r = -ENOMEM;
1741                        goto bad_buffer;
1742                }
1743                __free_buffer_wake(b);
1744        }
1745
1746        mutex_lock(&dm_bufio_clients_lock);
1747        dm_bufio_client_count++;
1748        list_add(&c->client_list, &dm_bufio_all_clients);
1749        __cache_size_refresh();
1750        mutex_unlock(&dm_bufio_clients_lock);
1751
1752        c->shrinker.count_objects = dm_bufio_shrink_count;
1753        c->shrinker.scan_objects = dm_bufio_shrink_scan;
1754        c->shrinker.seeks = 1;
1755        c->shrinker.batch = 0;
1756        register_shrinker(&c->shrinker);
1757
1758        return c;
1759
1760bad_buffer:
1761bad_cache:
1762        while (!list_empty(&c->reserved_buffers)) {
1763                struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1764                                                 struct dm_buffer, lru_list);
1765                list_del(&b->lru_list);
1766                free_buffer(b);
1767        }
1768        dm_io_client_destroy(c->dm_io);
1769bad_dm_io:
1770        kfree(c);
1771bad_client:
1772        return ERR_PTR(r);
1773}
1774EXPORT_SYMBOL_GPL(dm_bufio_client_create);
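
/*
 * Usage sketch (editorial, not part of the original file): how a caller
 * such as a device-mapper target might create a client for 4KiB metadata
 * blocks.  The helper name is illustrative only.
 */
static struct dm_bufio_client *example_open_bufio(struct block_device *bdev)
{
        /*
         * 4096-byte blocks, 1 reserved buffer, no aux data, no callbacks.
         * On failure an ERR_PTR() is returned, as in the constructor above.
         */
        return dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
}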
1775
1776/*
1777 * Free the buffering interface.
1778 * The caller must ensure that no references are held on any buffers.
1779 */
1780void dm_bufio_client_destroy(struct dm_bufio_client *c)
1781{
1782        unsigned i;
1783
1784        drop_buffers(c);
1785
1786        unregister_shrinker(&c->shrinker);
1787
1788        mutex_lock(&dm_bufio_clients_lock);
1789
1790        list_del(&c->client_list);
1791        dm_bufio_client_count--;
1792        __cache_size_refresh();
1793
1794        mutex_unlock(&dm_bufio_clients_lock);
1795
1796        BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
1797        BUG_ON(c->need_reserved_buffers);
1798
1799        while (!list_empty(&c->reserved_buffers)) {
1800                struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1801                                                 struct dm_buffer, lru_list);
1802                list_del(&b->lru_list);
1803                free_buffer(b);
1804        }
1805
1806        for (i = 0; i < LIST_SIZE; i++)
1807                if (c->n_buffers[i])
1808                        DMERR("leaked buffer count %u: %lu", i, c->n_buffers[i]);
1809
1810        for (i = 0; i < LIST_SIZE; i++)
1811                BUG_ON(c->n_buffers[i]);
1812
1813        dm_io_client_destroy(c->dm_io);
1814        kfree(c);
1815}
1816EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
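
/*
 * Teardown sketch (editorial): every buffer obtained with dm_bufio_read(),
 * dm_bufio_get() or dm_bufio_new() must be released with dm_bufio_release()
 * before the client is destroyed.  The helper below is illustrative only.
 */
static void example_close_bufio(struct dm_bufio_client *c, struct dm_buffer *last_b)
{
        dm_bufio_release(last_b);       /* drop the last outstanding reference */
        dm_bufio_client_destroy(c);     /* now no buffers are held */
}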
1817
1818void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
1819{
1820        c->start = start;
1821}
1822EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
1823
1824static unsigned get_max_age_hz(void)
1825{
1826        unsigned max_age = READ_ONCE(dm_bufio_max_age);
1827
1828        if (max_age > UINT_MAX / HZ)
1829                max_age = UINT_MAX / HZ;
1830
1831        return max_age * HZ;
1832}
1833
1834static bool older_than(struct dm_buffer *b, unsigned long age_hz)
1835{
1836        return time_after_eq(jiffies, b->last_accessed + age_hz);
1837}
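
/*
 * Example (editorial): with max_age_seconds set to 300 and HZ=250,
 * get_max_age_hz() returns 75000, so a buffer becomes evictable once
 * jiffies has advanced 75000 ticks (~300s) past its last_accessed stamp.
 */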
1838
1839static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
1840{
1841        struct dm_buffer *b, *tmp;
1842        unsigned long retain_target = get_retain_buffers(c);
1843        unsigned long count;
1844        LIST_HEAD(write_list);
1845
1846        dm_bufio_lock(c);
1847
1848        __check_watermark(c, &write_list);
1849        if (unlikely(!list_empty(&write_list))) {
1850                dm_bufio_unlock(c);
1851                __flush_write_list(&write_list);
1852                dm_bufio_lock(c);
1853        }
1854
1855        count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
1856        list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
1857                if (count <= retain_target)
1858                        break;
1859
1860                if (!older_than(b, age_hz))
1861                        break;
1862
1863                if (__try_evict_buffer(b, 0))
1864                        count--;
1865
1866                cond_resched();
1867        }
1868
1869        dm_bufio_unlock(c);
1870}
1871
1872static void cleanup_old_buffers(void)
1873{
1874        unsigned long max_age_hz = get_max_age_hz();
1875        struct dm_bufio_client *c;
1876
1877        mutex_lock(&dm_bufio_clients_lock);
1878
1879        __cache_size_refresh();
1880
1881        list_for_each_entry(c, &dm_bufio_all_clients, client_list)
1882                __evict_old_buffers(c, max_age_hz);
1883
1884        mutex_unlock(&dm_bufio_clients_lock);
1885}
1886
1887static struct workqueue_struct *dm_bufio_wq;
1888static struct delayed_work dm_bufio_work;
1889
1890static void work_fn(struct work_struct *w)
1891{
1892        cleanup_old_buffers();
1893
1894        queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
1895                           DM_BUFIO_WORK_TIMER_SECS * HZ);
1896}
1897
1898/*----------------------------------------------------------------
1899 * Module setup
1900 *--------------------------------------------------------------*/
1901
1902/*
1903 * This is called only once for the whole dm_bufio module.
1904 * It initializes the memory limit.
1905 */
1906static int __init dm_bufio_init(void)
1907{
1908        __u64 mem;
1909
1910        dm_bufio_allocated_kmem_cache = 0;
1911        dm_bufio_allocated_get_free_pages = 0;
1912        dm_bufio_allocated_vmalloc = 0;
1913        dm_bufio_current_allocated = 0;
1914
1915        memset(&dm_bufio_caches, 0, sizeof dm_bufio_caches);
1916        memset(&dm_bufio_cache_names, 0, sizeof dm_bufio_cache_names);
1917
1918        mem = (__u64)mult_frac(totalram_pages - totalhigh_pages,
1919                               DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
1920
1921        if (mem > ULONG_MAX)
1922                mem = ULONG_MAX;
1923
1924#ifdef CONFIG_MMU
1925        if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
1926                mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
1927#endif
1928
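        /*
         * Worked example (editorial): on a 64-bit box with 8GiB of usable
         * RAM, 2% gives roughly 164MiB and the vmalloc cap is far larger,
         * so the RAM-based limit wins.  On 32-bit with ~768MiB of lowmem
         * and ~128MiB of vmalloc space, the limit is
         * min(2% of 768MiB, 25% of 128MiB) = min(~15MiB, 32MiB) = ~15MiB.
         */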
1929        dm_bufio_default_cache_size = mem;
1930
1931        mutex_lock(&dm_bufio_clients_lock);
1932        __cache_size_refresh();
1933        mutex_unlock(&dm_bufio_clients_lock);
1934
1935        dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
1936        if (!dm_bufio_wq)
1937                return -ENOMEM;
1938
1939        INIT_DELAYED_WORK(&dm_bufio_work, work_fn);
1940        queue_delayed_work(dm_bufio_wq, &dm_bufio_work,
1941                           DM_BUFIO_WORK_TIMER_SECS * HZ);
1942
1943        return 0;
1944}
1945
1946/*
1947 * This is called once when unloading the dm_bufio module.
1948 */
1949static void __exit dm_bufio_exit(void)
1950{
1951        int bug = 0;
1952        int i;
1953
1954        cancel_delayed_work_sync(&dm_bufio_work);
1955        destroy_workqueue(dm_bufio_wq);
1956
1957        for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++)
1958                kmem_cache_destroy(dm_bufio_caches[i]);
1959
1960        for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
1961                kfree(dm_bufio_cache_names[i]);
1962
1963        if (dm_bufio_client_count) {
1964                DMCRIT("%s: dm_bufio_client_count leaked: %d",
1965                        __func__, dm_bufio_client_count);
1966                bug = 1;
1967        }
1968
1969        if (dm_bufio_current_allocated) {
1970                DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
1971                        __func__, dm_bufio_current_allocated);
1972                bug = 1;
1973        }
1974
1975        if (dm_bufio_allocated_get_free_pages) {
1976                DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
1977                       __func__, dm_bufio_allocated_get_free_pages);
1978                bug = 1;
1979        }
1980
1981        if (dm_bufio_allocated_vmalloc) {
1982                DMCRIT("%s: dm_bufio_allocated_vmalloc leaked: %lu",
1983                       __func__, dm_bufio_allocated_vmalloc);
1984                bug = 1;
1985        }
1986
1987        BUG_ON(bug);
1988}
1989
1990module_init(dm_bufio_init)
1991module_exit(dm_bufio_exit)
1992
1993module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
1994MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
1995
1996module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
1997MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
1998
1999module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR);
2000MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
2001
2002module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
2003MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
2004
2005module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
2006MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
2007
2008module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
2009MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
2010
2011module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
2012MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
2013
2014module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
2015MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
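
/*
 * Runtime tuning sketch (editorial): once the module is loaded, the
 * writable parameters above appear under /sys/module/dm_bufio/parameters/;
 * for example, writing a new value to max_cache_size_bytes updates
 * dm_bufio_cache_size, and the periodic worker (work_fn ->
 * cleanup_old_buffers) picks it up on its next run via
 * __cache_size_refresh().
 */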
2016
2017MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2018MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
2019MODULE_LICENSE("GPL");
2020