linux/drivers/md/dm-writecache.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2018 Red Hat. All rights reserved.
   4 *
   5 * This file is released under the GPL.
   6 */
   7
   8#include <linux/device-mapper.h>
   9#include <linux/module.h>
  10#include <linux/init.h>
  11#include <linux/vmalloc.h>
  12#include <linux/kthread.h>
  13#include <linux/dm-io.h>
  14#include <linux/dm-kcopyd.h>
  15#include <linux/dax.h>
  16#include <linux/pfn_t.h>
  17#include <linux/libnvdimm.h>
  18#include <linux/delay.h>
  19#include "dm-io-tracker.h"
  20
  21#define DM_MSG_PREFIX "writecache"
  22
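/*
 * Default tunables: the watermarks are percentages of the cache blocks that
 * control when background writeback starts and stops, AUTOCOMMIT_BLOCKS_*
 * and AUTOCOMMIT_MSEC are the default autocommit thresholds for SSD and
 * pmem mode, and MAX_AGE_DIV is the fraction of max_age used as the period
 * of the max_age timer below.
 */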
  23#define HIGH_WATERMARK                  50
  24#define LOW_WATERMARK                   45
  25#define MAX_WRITEBACK_JOBS              0
  26#define ENDIO_LATENCY                   16
  27#define WRITEBACK_LATENCY               64
  28#define AUTOCOMMIT_BLOCKS_SSD           65536
  29#define AUTOCOMMIT_BLOCKS_PMEM          64
  30#define AUTOCOMMIT_MSEC                 1000
  31#define MAX_AGE_DIV                     16
  32#define MAX_AGE_UNSPECIFIED             -1UL
  33#define PAUSE_WRITEBACK                 (HZ * 3)
  34
  35#define BITMAP_GRANULARITY      65536
  36#if BITMAP_GRANULARITY < PAGE_SIZE
  37#undef BITMAP_GRANULARITY
  38#define BITMAP_GRANULARITY      PAGE_SIZE
  39#endif
  40
  41#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
  42#define DM_WRITECACHE_HAS_PMEM
  43#endif
  44
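/*
 * pmem_assign() stores a value into persistent memory through
 * memcpy_flushcache() so that the write reaches the persistence domain;
 * without the pmem API it is an ordinary assignment.
 */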
  45#ifdef DM_WRITECACHE_HAS_PMEM
  46#define pmem_assign(dest, src)                                  \
  47do {                                                            \
  48        typeof(dest) uniq = (src);                              \
  49        memcpy_flushcache(&(dest), &uniq, sizeof(dest));        \
  50} while (0)
  51#else
  52#define pmem_assign(dest, src)  ((dest) = (src))
  53#endif
  54
  55#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
  56#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  57#endif
  58
  59#define MEMORY_SUPERBLOCK_MAGIC         0x23489321
  60#define MEMORY_SUPERBLOCK_VERSION       1
  61
  62struct wc_memory_entry {
  63        __le64 original_sector;
  64        __le64 seq_count;
  65};
  66
  67struct wc_memory_superblock {
  68        union {
  69                struct {
  70                        __le32 magic;
  71                        __le32 version;
  72                        __le32 block_size;
  73                        __le32 pad;
  74                        __le64 n_blocks;
  75                        __le64 seq_count;
  76                };
  77                __le64 padding[8];
  78        };
  79        struct wc_memory_entry entries[];
  80};
  81
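/*
 * In-core descriptor of one cache block: rb_node indexes it by original
 * sector, lru links it on the LRU or the freelist, and index selects the
 * corresponding on-media wc_memory_entry and data block.  When hardware
 * memory errors are handled, original_sector and seq_count are also
 * shadowed in RAM so that lookups need not read possibly-failing
 * persistent memory.
 */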
  82struct wc_entry {
  83        struct rb_node rb_node;
  84        struct list_head lru;
  85        unsigned short wc_list_contiguous;
  86        bool write_in_progress
  87#if BITS_PER_LONG == 64
  88                :1
  89#endif
  90        ;
  91        unsigned long index
  92#if BITS_PER_LONG == 64
  93                :47
  94#endif
  95        ;
  96        unsigned long age;
  97#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  98        uint64_t original_sector;
  99        uint64_t seq_count;
 100#endif
 101};
 102
 103#ifdef DM_WRITECACHE_HAS_PMEM
 104#define WC_MODE_PMEM(wc)                        ((wc)->pmem_mode)
 105#define WC_MODE_FUA(wc)                         ((wc)->writeback_fua)
 106#else
 107#define WC_MODE_PMEM(wc)                        false
 108#define WC_MODE_FUA(wc)                         false
 109#endif
 110#define WC_MODE_SORT_FREELIST(wc)               (!WC_MODE_PMEM(wc))
 111
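/*
 * Per-target state.  In SSD mode the free entries are kept on an rb-tree
 * (freetree) sorted by entry address so that allocations tend to be
 * contiguous on the cache device; in pmem mode a plain freelist is used
 * (see WC_MODE_SORT_FREELIST).
 */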
 112struct dm_writecache {
 113        struct mutex lock;
 114        struct list_head lru;
 115        union {
 116                struct list_head freelist;
 117                struct {
 118                        struct rb_root freetree;
 119                        struct wc_entry *current_free;
 120                };
 121        };
 122        struct rb_root tree;
 123
 124        size_t freelist_size;
 125        size_t writeback_size;
 126        size_t freelist_high_watermark;
 127        size_t freelist_low_watermark;
 128        unsigned long max_age;
 129        unsigned long pause;
 130
 131        unsigned uncommitted_blocks;
 132        unsigned autocommit_blocks;
 133        unsigned max_writeback_jobs;
 134
 135        int error;
 136
 137        unsigned long autocommit_jiffies;
 138        struct timer_list autocommit_timer;
 139        struct wait_queue_head freelist_wait;
 140
 141        struct timer_list max_age_timer;
 142
 143        atomic_t bio_in_progress[2];
 144        struct wait_queue_head bio_in_progress_wait[2];
 145
 146        struct dm_target *ti;
 147        struct dm_dev *dev;
 148        struct dm_dev *ssd_dev;
 149        sector_t start_sector;
 150        void *memory_map;
 151        uint64_t memory_map_size;
 152        size_t metadata_sectors;
 153        size_t n_blocks;
 154        uint64_t seq_count;
 155        sector_t data_device_sectors;
 156        void *block_start;
 157        struct wc_entry *entries;
 158        unsigned block_size;
 159        unsigned char block_size_bits;
 160
 161        bool pmem_mode:1;
 162        bool writeback_fua:1;
 163
 164        bool overwrote_committed:1;
 165        bool memory_vmapped:1;
 166
 167        bool start_sector_set:1;
 168        bool high_wm_percent_set:1;
 169        bool low_wm_percent_set:1;
 170        bool max_writeback_jobs_set:1;
 171        bool autocommit_blocks_set:1;
 172        bool autocommit_time_set:1;
 173        bool max_age_set:1;
 174        bool writeback_fua_set:1;
 175        bool flush_on_suspend:1;
 176        bool cleaner:1;
 177        bool cleaner_set:1;
 178        bool metadata_only:1;
 179        bool pause_set:1;
 180
 181        unsigned high_wm_percent_value;
 182        unsigned low_wm_percent_value;
 183        unsigned autocommit_time_value;
 184        unsigned max_age_value;
 185        unsigned pause_value;
 186
 187        unsigned writeback_all;
 188        struct workqueue_struct *writeback_wq;
 189        struct work_struct writeback_work;
 190        struct work_struct flush_work;
 191
 192        struct dm_io_tracker iot;
 193
 194        struct dm_io_client *dm_io;
 195
 196        raw_spinlock_t endio_list_lock;
 197        struct list_head endio_list;
 198        struct task_struct *endio_thread;
 199
 200        struct task_struct *flush_thread;
 201        struct bio_list flush_list;
 202
 203        struct dm_kcopyd_client *dm_kcopyd;
 204        unsigned long *dirty_bitmap;
 205        unsigned dirty_bitmap_size;
 206
 207        struct bio_set bio_set;
 208        mempool_t copy_pool;
 209};
 210
 211#define WB_LIST_INLINE          16
 212
 213struct writeback_struct {
 214        struct list_head endio_entry;
 215        struct dm_writecache *wc;
 216        struct wc_entry **wc_list;
 217        unsigned wc_list_n;
 218        struct wc_entry *wc_list_inline[WB_LIST_INLINE];
 219        struct bio bio;
 220};
 221
 222struct copy_struct {
 223        struct list_head endio_entry;
 224        struct dm_writecache *wc;
 225        struct wc_entry *e;
 226        unsigned n_entries;
 227        int error;
 228};
 229
 230DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
 231                                            "A percentage of time allocated for data copying");
 232
 233static void wc_lock(struct dm_writecache *wc)
 234{
 235        mutex_lock(&wc->lock);
 236}
 237
 238static void wc_unlock(struct dm_writecache *wc)
 239{
 240        mutex_unlock(&wc->lock);
 241}
 242
 243#ifdef DM_WRITECACHE_HAS_PMEM
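/*
 * Map the DAX device backing the cache into the kernel address space.  If
 * dax_direct_access() cannot return the whole region as one contiguous
 * mapping, the individual pages are collected and vmap()ed instead.
 */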
 244static int persistent_memory_claim(struct dm_writecache *wc)
 245{
 246        int r;
 247        loff_t s;
 248        long p, da;
 249        pfn_t pfn;
 250        int id;
 251        struct page **pages;
 252        sector_t offset;
 253
 254        wc->memory_vmapped = false;
 255
 256        s = wc->memory_map_size;
 257        p = s >> PAGE_SHIFT;
 258        if (!p) {
 259                r = -EINVAL;
 260                goto err1;
 261        }
 262        if (p != s >> PAGE_SHIFT) {
 263                r = -EOVERFLOW;
 264                goto err1;
 265        }
 266
 267        offset = get_start_sect(wc->ssd_dev->bdev);
 268        if (offset & (PAGE_SIZE / 512 - 1)) {
 269                r = -EINVAL;
 270                goto err1;
 271        }
 272        offset >>= PAGE_SHIFT - 9;
 273
 274        id = dax_read_lock();
 275
 276        da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, &wc->memory_map, &pfn);
 277        if (da < 0) {
 278                wc->memory_map = NULL;
 279                r = da;
 280                goto err2;
 281        }
 282        if (!pfn_t_has_page(pfn)) {
 283                wc->memory_map = NULL;
 284                r = -EOPNOTSUPP;
 285                goto err2;
 286        }
 287        if (da != p) {
 288                long i;
 289                wc->memory_map = NULL;
 290                pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
 291                if (!pages) {
 292                        r = -ENOMEM;
 293                        goto err2;
 294                }
 295                i = 0;
 296                do {
 297                        long daa;
 298                        daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, p - i,
 299                                                NULL, &pfn);
 300                        if (daa <= 0) {
 301                                r = daa ? daa : -EINVAL;
 302                                goto err3;
 303                        }
 304                        if (!pfn_t_has_page(pfn)) {
 305                                r = -EOPNOTSUPP;
 306                                goto err3;
 307                        }
 308                        while (daa-- && i < p) {
 309                                pages[i++] = pfn_t_to_page(pfn);
 310                                pfn.val++;
 311                                if (!(i & 15))
 312                                        cond_resched();
 313                        }
 314                } while (i < p);
 315                wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
 316                if (!wc->memory_map) {
 317                        r = -ENOMEM;
 318                        goto err3;
 319                }
 320                kvfree(pages);
 321                wc->memory_vmapped = true;
 322        }
 323
 324        dax_read_unlock(id);
 325
 326        wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
 327        wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;
 328
 329        return 0;
 330err3:
 331        kvfree(pages);
 332err2:
 333        dax_read_unlock(id);
 334err1:
 335        return r;
 336}
 337#else
 338static int persistent_memory_claim(struct dm_writecache *wc)
 339{
 340        return -EOPNOTSUPP;
 341}
 342#endif
 343
 344static void persistent_memory_release(struct dm_writecache *wc)
 345{
 346        if (wc->memory_vmapped)
 347                vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
 348}
 349
 350static struct page *persistent_memory_page(void *addr)
 351{
 352        if (is_vmalloc_addr(addr))
 353                return vmalloc_to_page(addr);
 354        else
 355                return virt_to_page(addr);
 356}
 357
 358static unsigned persistent_memory_page_offset(void *addr)
 359{
 360        return (unsigned long)addr & (PAGE_SIZE - 1);
 361}
 362
 363static void persistent_memory_flush_cache(void *ptr, size_t size)
 364{
 365        if (is_vmalloc_addr(ptr))
 366                flush_kernel_vmap_range(ptr, size);
 367}
 368
 369static void persistent_memory_invalidate_cache(void *ptr, size_t size)
 370{
 371        if (is_vmalloc_addr(ptr))
 372                invalidate_kernel_vmap_range(ptr, size);
 373}
 374
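/*
 * The cache device (or pmem region) starts with the superblock and the
 * array of wc_memory_entry metadata slots, followed by the data blocks.
 * These helpers translate an entry index into its metadata slot, its data
 * address and its sector offset on the cache device.
 */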
 375static struct wc_memory_superblock *sb(struct dm_writecache *wc)
 376{
 377        return wc->memory_map;
 378}
 379
 380static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
 381{
 382        return &sb(wc)->entries[e->index];
 383}
 384
 385static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
 386{
 387        return (char *)wc->block_start + (e->index << wc->block_size_bits);
 388}
 389
 390static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
 391{
 392        return wc->start_sector + wc->metadata_sectors +
 393                ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
 394}
 395
 396static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
 397{
 398#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 399        return e->original_sector;
 400#else
 401        return le64_to_cpu(memory_entry(wc, e)->original_sector);
 402#endif
 403}
 404
 405static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
 406{
 407#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 408        return e->seq_count;
 409#else
 410        return le64_to_cpu(memory_entry(wc, e)->seq_count);
 411#endif
 412}
 413
 414static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
 415{
 416#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 417        e->seq_count = -1;
 418#endif
 419        pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
 420}
 421
 422static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
 423                                            uint64_t original_sector, uint64_t seq_count)
 424{
 425        struct wc_memory_entry me;
 426#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 427        e->original_sector = original_sector;
 428        e->seq_count = seq_count;
 429#endif
 430        me.original_sector = cpu_to_le64(original_sector);
 431        me.seq_count = cpu_to_le64(seq_count);
 432        pmem_assign(*memory_entry(wc, e), me);
 433}
 434
 435#define writecache_error(wc, err, msg, arg...)                          \
 436do {                                                                    \
 437        if (!cmpxchg(&(wc)->error, 0, err))                             \
 438                DMERR(msg, ##arg);                                      \
 439        wake_up(&(wc)->freelist_wait);                                  \
 440} while (0)
 441
 442#define writecache_has_error(wc)        (unlikely(READ_ONCE((wc)->error)))
 443
 444static void writecache_flush_all_metadata(struct dm_writecache *wc)
 445{
 446        if (!WC_MODE_PMEM(wc))
 447                memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
 448}
 449
 450static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
 451{
 452        if (!WC_MODE_PMEM(wc))
 453                __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
 454                          wc->dirty_bitmap);
 455}
 456
 457static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
 458
 459struct io_notify {
 460        struct dm_writecache *wc;
 461        struct completion c;
 462        atomic_t count;
 463};
 464
 465static void writecache_notify_io(unsigned long error, void *context)
 466{
 467        struct io_notify *endio = context;
 468
 469        if (unlikely(error != 0))
 470                writecache_error(endio->wc, -EIO, "error writing metadata");
 471        BUG_ON(atomic_read(&endio->count) <= 0);
 472        if (atomic_dec_and_test(&endio->count))
 473                complete(&endio->c);
 474}
 475
 476static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
 477{
 478        wait_event(wc->bio_in_progress_wait[direction],
 479                   !atomic_read(&wc->bio_in_progress[direction]));
 480}
 481
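/*
 * Commit flushed metadata in SSD mode: walk the dirty bitmap, write every
 * dirty metadata range to the cache device with asynchronous dm-io, wait
 * for all of them to complete, optionally wait for in-flight bios and
 * finally issue a flush to the cache device.
 */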
 482static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 483{
 484        struct dm_io_region region;
 485        struct dm_io_request req;
 486        struct io_notify endio = {
 487                wc,
 488                COMPLETION_INITIALIZER_ONSTACK(endio.c),
 489                ATOMIC_INIT(1),
 490        };
 491        unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
 492        unsigned i = 0;
 493
 494        while (1) {
 495                unsigned j;
 496                i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
 497                if (unlikely(i == bitmap_bits))
 498                        break;
 499                j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
 500
 501                region.bdev = wc->ssd_dev->bdev;
 502                region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
 503                region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
 504
 505                if (unlikely(region.sector >= wc->metadata_sectors))
 506                        break;
 507                if (unlikely(region.sector + region.count > wc->metadata_sectors))
 508                        region.count = wc->metadata_sectors - region.sector;
 509
 510                region.sector += wc->start_sector;
 511                atomic_inc(&endio.count);
 512                req.bi_op = REQ_OP_WRITE;
 513                req.bi_op_flags = REQ_SYNC;
 514                req.mem.type = DM_IO_VMA;
 515                req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
 516                req.client = wc->dm_io;
 517                req.notify.fn = writecache_notify_io;
 518                req.notify.context = &endio;
 519
 520                /* writing via async dm-io (implied by notify.fn above) won't return an error */
 521                (void) dm_io(&req, 1, &region, NULL);
 522                i = j;
 523        }
 524
 525        writecache_notify_io(0, &endio);
 526        wait_for_completion_io(&endio.c);
 527
 528        if (wait_for_ios)
 529                writecache_wait_for_ios(wc, WRITE);
 530
 531        writecache_disk_flush(wc, wc->ssd_dev);
 532
 533        memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
 534}
 535
 536static void ssd_commit_superblock(struct dm_writecache *wc)
 537{
 538        int r;
 539        struct dm_io_region region;
 540        struct dm_io_request req;
 541
 542        region.bdev = wc->ssd_dev->bdev;
 543        region.sector = 0;
 544        region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;
 545
 546        if (unlikely(region.sector + region.count > wc->metadata_sectors))
 547                region.count = wc->metadata_sectors - region.sector;
 548
 549        region.sector += wc->start_sector;
 550
 551        req.bi_op = REQ_OP_WRITE;
 552        req.bi_op_flags = REQ_SYNC | REQ_FUA;
 553        req.mem.type = DM_IO_VMA;
 554        req.mem.ptr.vma = (char *)wc->memory_map;
 555        req.client = wc->dm_io;
 556        req.notify.fn = NULL;
 557        req.notify.context = NULL;
 558
 559        r = dm_io(&req, 1, &region, NULL);
 560        if (unlikely(r))
 561                writecache_error(wc, r, "error writing superblock");
 562}
 563
 564static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 565{
 566        if (WC_MODE_PMEM(wc))
 567                pmem_wmb();
 568        else
 569                ssd_commit_flushed(wc, wait_for_ios);
 570}
 571
 572static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
 573{
 574        int r;
 575        struct dm_io_region region;
 576        struct dm_io_request req;
 577
 578        region.bdev = dev->bdev;
 579        region.sector = 0;
 580        region.count = 0;
 581        req.bi_op = REQ_OP_WRITE;
 582        req.bi_op_flags = REQ_PREFLUSH;
 583        req.mem.type = DM_IO_KMEM;
 584        req.mem.ptr.addr = NULL;
 585        req.client = wc->dm_io;
 586        req.notify.fn = NULL;
 587
 588        r = dm_io(&req, 1, &region, NULL);
 589        if (unlikely(r))
 590                writecache_error(wc, r, "error flushing metadata: %d", r);
 591}
 592
 593#define WFE_RETURN_FOLLOWING    1
 594#define WFE_LOWEST_SEQ          2
 595
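/*
 * Look up the cache entry for @block.  With WFE_RETURN_FOLLOWING a miss
 * returns the entry with the next higher original sector instead of NULL;
 * WFE_LOWEST_SEQ selects the oldest of several entries for the same sector.
 */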
 596static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
 597                                              uint64_t block, int flags)
 598{
 599        struct wc_entry *e;
 600        struct rb_node *node = wc->tree.rb_node;
 601
 602        if (unlikely(!node))
 603                return NULL;
 604
 605        while (1) {
 606                e = container_of(node, struct wc_entry, rb_node);
 607                if (read_original_sector(wc, e) == block)
 608                        break;
 609
 610                node = (read_original_sector(wc, e) >= block ?
 611                        e->rb_node.rb_left : e->rb_node.rb_right);
 612                if (unlikely(!node)) {
 613                        if (!(flags & WFE_RETURN_FOLLOWING))
 614                                return NULL;
 615                        if (read_original_sector(wc, e) >= block) {
 616                                return e;
 617                        } else {
 618                                node = rb_next(&e->rb_node);
 619                                if (unlikely(!node))
 620                                        return NULL;
 621                                e = container_of(node, struct wc_entry, rb_node);
 622                                return e;
 623                        }
 624                }
 625        }
 626
 627        while (1) {
 628                struct wc_entry *e2;
 629                if (flags & WFE_LOWEST_SEQ)
 630                        node = rb_prev(&e->rb_node);
 631                else
 632                        node = rb_next(&e->rb_node);
 633                if (unlikely(!node))
 634                        return e;
 635                e2 = container_of(node, struct wc_entry, rb_node);
 636                if (read_original_sector(wc, e2) != block)
 637                        return e;
 638                e = e2;
 639        }
 640}
 641
 642static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
 643{
 644        struct wc_entry *e;
 645        struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
 646
 647        while (*node) {
 648                e = container_of(*node, struct wc_entry, rb_node);
 649                parent = &e->rb_node;
 650                if (read_original_sector(wc, e) > read_original_sector(wc, ins))
 651                        node = &parent->rb_left;
 652                else
 653                        node = &parent->rb_right;
 654        }
 655        rb_link_node(&ins->rb_node, parent, node);
 656        rb_insert_color(&ins->rb_node, &wc->tree);
 657        list_add(&ins->lru, &wc->lru);
 658        ins->age = jiffies;
 659}
 660
 661static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
 662{
 663        list_del(&e->lru);
 664        rb_erase(&e->rb_node, &wc->tree);
 665}
 666
 667static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
 668{
 669        if (WC_MODE_SORT_FREELIST(wc)) {
 670                struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
 671                if (unlikely(!*node))
 672                        wc->current_free = e;
 673                while (*node) {
 674                        parent = *node;
 675                        if (&e->rb_node < *node)
 676                                node = &parent->rb_left;
 677                        else
 678                                node = &parent->rb_right;
 679                }
 680                rb_link_node(&e->rb_node, parent, node);
 681                rb_insert_color(&e->rb_node, &wc->freetree);
 682        } else {
 683                list_add_tail(&e->lru, &wc->freelist);
 684        }
 685        wc->freelist_size++;
 686}
 687
 688static inline void writecache_verify_watermark(struct dm_writecache *wc)
 689{
 690        if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
 691                queue_work(wc->writeback_wq, &wc->writeback_work);
 692}
 693
 694static void writecache_max_age_timer(struct timer_list *t)
 695{
 696        struct dm_writecache *wc = from_timer(wc, t, max_age_timer);
 697
 698        if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
 699                queue_work(wc->writeback_wq, &wc->writeback_work);
 700                mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
 701        }
 702}
 703
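/*
 * Take an entry off the freelist.  If @expected_sector is not -1, the
 * caller only wants a block that is physically contiguous with the previous
 * one on the cache device and NULL is returned otherwise.
 */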
 704static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
 705{
 706        struct wc_entry *e;
 707
 708        if (WC_MODE_SORT_FREELIST(wc)) {
 709                struct rb_node *next;
 710                if (unlikely(!wc->current_free))
 711                        return NULL;
 712                e = wc->current_free;
 713                if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
 714                        return NULL;
 715                next = rb_next(&e->rb_node);
 716                rb_erase(&e->rb_node, &wc->freetree);
 717                if (unlikely(!next))
 718                        next = rb_first(&wc->freetree);
 719                wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
 720        } else {
 721                if (unlikely(list_empty(&wc->freelist)))
 722                        return NULL;
 723                e = container_of(wc->freelist.next, struct wc_entry, lru);
 724                if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
 725                        return NULL;
 726                list_del(&e->lru);
 727        }
 728        wc->freelist_size--;
 729
 730        writecache_verify_watermark(wc);
 731
 732        return e;
 733}
 734
 735static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
 736{
 737        writecache_unlink(wc, e);
 738        writecache_add_to_freelist(wc, e);
 739        clear_seq_count(wc, e);
 740        writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
 741        if (unlikely(waitqueue_active(&wc->freelist_wait)))
 742                wake_up(&wc->freelist_wait);
 743}
 744
 745static void writecache_wait_on_freelist(struct dm_writecache *wc)
 746{
 747        DEFINE_WAIT(wait);
 748
 749        prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
 750        wc_unlock(wc);
 751        io_schedule();
 752        finish_wait(&wc->freelist_wait, &wait);
 753        wc_lock(wc);
 754}
 755
 756static void writecache_poison_lists(struct dm_writecache *wc)
 757{
 758        /*
 759         * Catch incorrect access to these values while the device is suspended.
 760         */
 761        memset(&wc->tree, -1, sizeof wc->tree);
 762        wc->lru.next = LIST_POISON1;
 763        wc->lru.prev = LIST_POISON2;
 764        wc->freelist.next = LIST_POISON1;
 765        wc->freelist.prev = LIST_POISON2;
 766}
 767
 768static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
 769{
 770        writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
 771        if (WC_MODE_PMEM(wc))
 772                writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
 773}
 774
 775static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
 776{
 777        return read_seq_count(wc, e) < wc->seq_count;
 778}
 779
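/*
 * Commit all data written so far: flush the metadata (and, in pmem mode,
 * the data) of every uncommitted entry, persist the incremented sequence
 * count in the superblock and then free older entries superseded by the
 * new writes.
 */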
 780static void writecache_flush(struct dm_writecache *wc)
 781{
 782        struct wc_entry *e, *e2;
 783        bool need_flush_after_free;
 784
 785        wc->uncommitted_blocks = 0;
 786        del_timer(&wc->autocommit_timer);
 787
 788        if (list_empty(&wc->lru))
 789                return;
 790
 791        e = container_of(wc->lru.next, struct wc_entry, lru);
 792        if (writecache_entry_is_committed(wc, e)) {
 793                if (wc->overwrote_committed) {
 794                        writecache_wait_for_ios(wc, WRITE);
 795                        writecache_disk_flush(wc, wc->ssd_dev);
 796                        wc->overwrote_committed = false;
 797                }
 798                return;
 799        }
 800        while (1) {
 801                writecache_flush_entry(wc, e);
 802                if (unlikely(e->lru.next == &wc->lru))
 803                        break;
 804                e2 = container_of(e->lru.next, struct wc_entry, lru);
 805                if (writecache_entry_is_committed(wc, e2))
 806                        break;
 807                e = e2;
 808                cond_resched();
 809        }
 810        writecache_commit_flushed(wc, true);
 811
 812        wc->seq_count++;
 813        pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
 814        if (WC_MODE_PMEM(wc))
 815                writecache_commit_flushed(wc, false);
 816        else
 817                ssd_commit_superblock(wc);
 818
 819        wc->overwrote_committed = false;
 820
 821        need_flush_after_free = false;
 822        while (1) {
 823                /* Free another committed entry with lower seq-count */
 824                struct rb_node *rb_node = rb_prev(&e->rb_node);
 825
 826                if (rb_node) {
 827                        e2 = container_of(rb_node, struct wc_entry, rb_node);
 828                        if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
 829                            likely(!e2->write_in_progress)) {
 830                                writecache_free_entry(wc, e2);
 831                                need_flush_after_free = true;
 832                        }
 833                }
 834                if (unlikely(e->lru.prev == &wc->lru))
 835                        break;
 836                e = container_of(e->lru.prev, struct wc_entry, lru);
 837                cond_resched();
 838        }
 839
 840        if (need_flush_after_free)
 841                writecache_commit_flushed(wc, false);
 842}
 843
 844static void writecache_flush_work(struct work_struct *work)
 845{
 846        struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
 847
 848        wc_lock(wc);
 849        writecache_flush(wc);
 850        wc_unlock(wc);
 851}
 852
 853static void writecache_autocommit_timer(struct timer_list *t)
 854{
 855        struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
 856        if (!writecache_has_error(wc))
 857                queue_work(wc->writeback_wq, &wc->flush_work);
 858}
 859
 860static void writecache_schedule_autocommit(struct dm_writecache *wc)
 861{
 862        if (!timer_pending(&wc->autocommit_timer))
 863                mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
 864}
 865
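/*
 * Drop every cache entry whose original sector falls into [start, end) and
 * return it to the freelist.
 */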
 866static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
 867{
 868        struct wc_entry *e;
 869        bool discarded_something = false;
 870
 871        e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
 872        if (unlikely(!e))
 873                return;
 874
 875        while (read_original_sector(wc, e) < end) {
 876                struct rb_node *node = rb_next(&e->rb_node);
 877
 878                if (likely(!e->write_in_progress)) {
 879                        if (!discarded_something) {
 880                                if (!WC_MODE_PMEM(wc)) {
 881                                        writecache_wait_for_ios(wc, READ);
 882                                        writecache_wait_for_ios(wc, WRITE);
 883                                }
 884                                discarded_something = true;
 885                        }
 886                        if (!writecache_entry_is_committed(wc, e))
 887                                wc->uncommitted_blocks--;
 888                        writecache_free_entry(wc, e);
 889                }
 890
 891                if (unlikely(!node))
 892                        break;
 893
 894                e = container_of(node, struct wc_entry, rb_node);
 895        }
 896
 897        if (discarded_something)
 898                writecache_commit_flushed(wc, false);
 899}
 900
 901static bool writecache_wait_for_writeback(struct dm_writecache *wc)
 902{
 903        if (wc->writeback_size) {
 904                writecache_wait_on_freelist(wc);
 905                return true;
 906        }
 907        return false;
 908}
 909
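/*
 * Suspend: commit outstanding data, optionally trigger a full writeback
 * (flush_on_suspend), drain the writeback workqueue and poison the in-core
 * lists so that stale accesses are caught while suspended.
 */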
 910static void writecache_suspend(struct dm_target *ti)
 911{
 912        struct dm_writecache *wc = ti->private;
 913        bool flush_on_suspend;
 914
 915        del_timer_sync(&wc->autocommit_timer);
 916        del_timer_sync(&wc->max_age_timer);
 917
 918        wc_lock(wc);
 919        writecache_flush(wc);
 920        flush_on_suspend = wc->flush_on_suspend;
 921        if (flush_on_suspend) {
 922                wc->flush_on_suspend = false;
 923                wc->writeback_all++;
 924                queue_work(wc->writeback_wq, &wc->writeback_work);
 925        }
 926        wc_unlock(wc);
 927
 928        drain_workqueue(wc->writeback_wq);
 929
 930        wc_lock(wc);
 931        if (flush_on_suspend)
 932                wc->writeback_all--;
 933        while (writecache_wait_for_writeback(wc));
 934
 935        if (WC_MODE_PMEM(wc))
 936                persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
 937
 938        writecache_poison_lists(wc);
 939
 940        wc_unlock(wc);
 941}
 942
 943static int writecache_alloc_entries(struct dm_writecache *wc)
 944{
 945        size_t b;
 946
 947        if (wc->entries)
 948                return 0;
 949        wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
 950        if (!wc->entries)
 951                return -ENOMEM;
 952        for (b = 0; b < wc->n_blocks; b++) {
 953                struct wc_entry *e = &wc->entries[b];
 954                e->index = b;
 955                e->write_in_progress = false;
 956                cond_resched();
 957        }
 958
 959        return 0;
 960}
 961
 962static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
 963{
 964        struct dm_io_region region;
 965        struct dm_io_request req;
 966
 967        region.bdev = wc->ssd_dev->bdev;
 968        region.sector = wc->start_sector;
 969        region.count = n_sectors;
 970        req.bi_op = REQ_OP_READ;
 971        req.bi_op_flags = REQ_SYNC;
 972        req.mem.type = DM_IO_VMA;
 973        req.mem.ptr.vma = (char *)wc->memory_map;
 974        req.client = wc->dm_io;
 975        req.notify.fn = NULL;
 976
 977        return dm_io(&req, 1, &region, NULL);
 978}
 979
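/*
 * Resume: re-read the metadata in SSD mode, then rebuild the in-core
 * rb-tree, LRU and freelist from it.  Entries with a stale or duplicate
 * sequence count are erased or freed, and a metadata commit is issued if
 * anything changed.
 */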
 980static void writecache_resume(struct dm_target *ti)
 981{
 982        struct dm_writecache *wc = ti->private;
 983        size_t b;
 984        bool need_flush = false;
 985        __le64 sb_seq_count;
 986        int r;
 987
 988        wc_lock(wc);
 989
 990        wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);
 991
 992        if (WC_MODE_PMEM(wc)) {
 993                persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
 994        } else {
 995                r = writecache_read_metadata(wc, wc->metadata_sectors);
 996                if (r) {
 997                        size_t sb_entries_offset;
 998                        writecache_error(wc, r, "unable to read metadata: %d", r);
 999                        sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
1000                        memset((char *)wc->memory_map + sb_entries_offset, -1,
1001                               (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
1002                }
1003        }
1004
1005        wc->tree = RB_ROOT;
1006        INIT_LIST_HEAD(&wc->lru);
1007        if (WC_MODE_SORT_FREELIST(wc)) {
1008                wc->freetree = RB_ROOT;
1009                wc->current_free = NULL;
1010        } else {
1011                INIT_LIST_HEAD(&wc->freelist);
1012        }
1013        wc->freelist_size = 0;
1014
1015        r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
1016                              sizeof(uint64_t));
1017        if (r) {
1018                writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
1019                sb_seq_count = cpu_to_le64(0);
1020        }
1021        wc->seq_count = le64_to_cpu(sb_seq_count);
1022
1023#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
1024        for (b = 0; b < wc->n_blocks; b++) {
1025                struct wc_entry *e = &wc->entries[b];
1026                struct wc_memory_entry wme;
1027                if (writecache_has_error(wc)) {
1028                        e->original_sector = -1;
1029                        e->seq_count = -1;
1030                        continue;
1031                }
1032                r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
1033                                      sizeof(struct wc_memory_entry));
1034                if (r) {
1035                        writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
1036                                         (unsigned long)b, r);
1037                        e->original_sector = -1;
1038                        e->seq_count = -1;
1039                } else {
1040                        e->original_sector = le64_to_cpu(wme.original_sector);
1041                        e->seq_count = le64_to_cpu(wme.seq_count);
1042                }
1043                cond_resched();
1044        }
1045#endif
1046        for (b = 0; b < wc->n_blocks; b++) {
1047                struct wc_entry *e = &wc->entries[b];
1048                if (!writecache_entry_is_committed(wc, e)) {
1049                        if (read_seq_count(wc, e) != -1) {
1050erase_this:
1051                                clear_seq_count(wc, e);
1052                                need_flush = true;
1053                        }
1054                        writecache_add_to_freelist(wc, e);
1055                } else {
1056                        struct wc_entry *old;
1057
1058                        old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
1059                        if (!old) {
1060                                writecache_insert_entry(wc, e);
1061                        } else {
1062                                if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
1063                                        writecache_error(wc, -EINVAL,
1064                                                 "two identical entries, position %llu, sector %llu, sequence %llu",
1065                                                 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
1066                                                 (unsigned long long)read_seq_count(wc, e));
1067                                }
1068                                if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
1069                                        goto erase_this;
1070                                } else {
1071                                        writecache_free_entry(wc, old);
1072                                        writecache_insert_entry(wc, e);
1073                                        need_flush = true;
1074                                }
1075                        }
1076                }
1077                cond_resched();
1078        }
1079
1080        if (need_flush) {
1081                writecache_flush_all_metadata(wc);
1082                writecache_commit_flushed(wc, false);
1083        }
1084
1085        writecache_verify_watermark(wc);
1086
1087        if (wc->max_age != MAX_AGE_UNSPECIFIED)
1088                mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
1089
1090        wc_unlock(wc);
1091}
1092
1093static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1094{
1095        if (argc != 1)
1096                return -EINVAL;
1097
1098        wc_lock(wc);
1099        if (dm_suspended(wc->ti)) {
1100                wc_unlock(wc);
1101                return -EBUSY;
1102        }
1103        if (writecache_has_error(wc)) {
1104                wc_unlock(wc);
1105                return -EIO;
1106        }
1107
1108        writecache_flush(wc);
1109        wc->writeback_all++;
1110        queue_work(wc->writeback_wq, &wc->writeback_work);
1111        wc_unlock(wc);
1112
1113        flush_workqueue(wc->writeback_wq);
1114
1115        wc_lock(wc);
1116        wc->writeback_all--;
1117        if (writecache_has_error(wc)) {
1118                wc_unlock(wc);
1119                return -EIO;
1120        }
1121        wc_unlock(wc);
1122
1123        return 0;
1124}
1125
1126static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1127{
1128        if (argc != 1)
1129                return -EINVAL;
1130
1131        wc_lock(wc);
1132        wc->flush_on_suspend = true;
1133        wc_unlock(wc);
1134
1135        return 0;
1136}
1137
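/*
 * Cleaner mode: raise both watermarks to the full cache size so that
 * writeback drains every block, and flush the cache on suspend.
 */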
1138static void activate_cleaner(struct dm_writecache *wc)
1139{
1140        wc->flush_on_suspend = true;
1141        wc->cleaner = true;
1142        wc->freelist_high_watermark = wc->n_blocks;
1143        wc->freelist_low_watermark = wc->n_blocks;
1144}
1145
1146static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1147{
1148        if (argc != 1)
1149                return -EINVAL;
1150
1151        wc_lock(wc);
1152        activate_cleaner(wc);
1153        if (!dm_suspended(wc->ti))
1154                writecache_verify_watermark(wc);
1155        wc_unlock(wc);
1156
1157        return 0;
1158}
1159
1160static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
1161                              char *result, unsigned maxlen)
1162{
1163        int r = -EINVAL;
1164        struct dm_writecache *wc = ti->private;
1165
1166        if (!strcasecmp(argv[0], "flush"))
1167                r = process_flush_mesg(argc, argv, wc);
1168        else if (!strcasecmp(argv[0], "flush_on_suspend"))
1169                r = process_flush_on_suspend_mesg(argc, argv, wc);
1170        else if (!strcasecmp(argv[0], "cleaner"))
1171                r = process_cleaner_mesg(argc, argv, wc);
1172        else
1173                DMERR("unrecognised message received: %s", argv[0]);
1174
1175        return r;
1176}
1177
1178static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
1179{
1180        /*
1181         * clflushopt performs better with block size 1024, 2048, 4096
1182         * non-temporal stores perform better with block size 512
1183         *
1184         * block size   512             1024            2048            4096
1185         * movnti       496 MB/s        642 MB/s        725 MB/s        744 MB/s
1186         * clflushopt   373 MB/s        688 MB/s        1.1 GB/s        1.2 GB/s
1187         *
1188         * We see that movnti performs better for 512-byte blocks, and
1189         * clflushopt performs better for 1024-byte and larger blocks. So, we
1190         * prefer clflushopt for sizes >= 768.
1191         *
1192         * NOTE: this happens to be the case now (with dm-writecache's single
1193         * threaded model) but re-evaluate this once memcpy_flushcache() is
1194         * enabled to use movdir64b which might invalidate this performance
1195         * advantage seen with cache-allocating-writes plus flushing.
1196         */
1197#ifdef CONFIG_X86
1198        if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
1199            likely(boot_cpu_data.x86_clflush_size == 64) &&
1200            likely(size >= 768)) {
1201                do {
1202                        memcpy((void *)dest, (void *)source, 64);
1203                        clflushopt((void *)dest);
1204                        dest += 64;
1205                        source += 64;
1206                        size -= 64;
1207                } while (size >= 64);
1208                return;
1209        }
1210#endif
1211        memcpy_flushcache(dest, source, size);
1212}
1213
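/*
 * Copy one cache block between the bio and persistent memory, one bio_vec
 * at a time.  Reads go through copy_mc_to_kernel() so that a hardware
 * memory error is turned into an I/O error; writes use the
 * clflushopt/movnti variant chosen above.
 */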
1214static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
1215{
1216        void *buf;
1217        unsigned long flags;
1218        unsigned size;
1219        int rw = bio_data_dir(bio);
1220        unsigned remaining_size = wc->block_size;
1221
1222        do {
1223                struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
1224                buf = bvec_kmap_irq(&bv, &flags);
1225                size = bv.bv_len;
1226                if (unlikely(size > remaining_size))
1227                        size = remaining_size;
1228
1229                if (rw == READ) {
1230                        int r;
1231                        r = copy_mc_to_kernel(buf, data, size);
1232                        flush_dcache_page(bio_page(bio));
1233                        if (unlikely(r)) {
1234                                writecache_error(wc, r, "hardware memory error when reading data: %d", r);
1235                                bio->bi_status = BLK_STS_IOERR;
1236                        }
1237                } else {
1238                        flush_dcache_page(bio_page(bio));
1239                        memcpy_flushcache_optimized(data, buf, size);
1240                }
1241
1242                bvec_kunmap_irq(buf, &flags);
1243
1244                data = (char *)data + size;
1245                remaining_size -= size;
1246                bio_advance(bio, size);
1247        } while (unlikely(remaining_size));
1248}
1249
1250static int writecache_flush_thread(void *data)
1251{
1252        struct dm_writecache *wc = data;
1253
1254        while (1) {
1255                struct bio *bio;
1256
1257                wc_lock(wc);
1258                bio = bio_list_pop(&wc->flush_list);
1259                if (!bio) {
1260                        set_current_state(TASK_INTERRUPTIBLE);
1261                        wc_unlock(wc);
1262
1263                        if (unlikely(kthread_should_stop())) {
1264                                set_current_state(TASK_RUNNING);
1265                                break;
1266                        }
1267
1268                        schedule();
1269                        continue;
1270                }
1271
1272                if (bio_op(bio) == REQ_OP_DISCARD) {
1273                        writecache_discard(wc, bio->bi_iter.bi_sector,
1274                                           bio_end_sector(bio));
1275                        wc_unlock(wc);
1276                        bio_set_dev(bio, wc->dev->bdev);
1277                        submit_bio_noacct(bio);
1278                } else {
1279                        writecache_flush(wc);
1280                        wc_unlock(wc);
1281                        if (writecache_has_error(wc))
1282                                bio->bi_status = BLK_STS_IOERR;
1283                        bio_endio(bio);
1284                }
1285        }
1286
1287        return 0;
1288}
1289
1290static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
1291{
1292        if (bio_list_empty(&wc->flush_list))
1293                wake_up_process(wc->flush_thread);
1294        bio_list_add(&wc->flush_list, bio);
1295}
1296
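/*
 * The map function.  PREFLUSH and DISCARD bios get special handling; read
 * hits are served from the cache and read misses are remapped to the origin
 * device; writes allocate cache entries from the freelist (growing the
 * allocation contiguously in SSD mode) and fall back to the origin device
 * in cleaner or metadata-only mode, or when SSD mode runs out of free
 * entries on a miss.
 */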
1297static int writecache_map(struct dm_target *ti, struct bio *bio)
1298{
1299        struct wc_entry *e;
1300        struct dm_writecache *wc = ti->private;
1301
1302        bio->bi_private = NULL;
1303
1304        wc_lock(wc);
1305
1306        if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1307                if (writecache_has_error(wc))
1308                        goto unlock_error;
1309                if (WC_MODE_PMEM(wc)) {
1310                        writecache_flush(wc);
1311                        if (writecache_has_error(wc))
1312                                goto unlock_error;
1313                        if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
1314                                goto unlock_remap_origin;
1315                        goto unlock_submit;
1316                } else {
1317                        if (dm_bio_get_target_bio_nr(bio))
1318                                goto unlock_remap_origin;
1319                        writecache_offload_bio(wc, bio);
1320                        goto unlock_return;
1321                }
1322        }
1323
1324        bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1325
1326        if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
1327                                (wc->block_size / 512 - 1)) != 0)) {
1328                DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
1329                      (unsigned long long)bio->bi_iter.bi_sector,
1330                      bio->bi_iter.bi_size, wc->block_size);
1331                goto unlock_error;
1332        }
1333
1334        if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
1335                if (writecache_has_error(wc))
1336                        goto unlock_error;
1337                if (WC_MODE_PMEM(wc)) {
1338                        writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
1339                        goto unlock_remap_origin;
1340                } else {
1341                        writecache_offload_bio(wc, bio);
1342                        goto unlock_return;
1343                }
1344        }
1345
1346        if (bio_data_dir(bio) == READ) {
1347read_next_block:
1348                e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1349                if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
1350                        if (WC_MODE_PMEM(wc)) {
1351                                bio_copy_block(wc, bio, memory_data(wc, e));
1352                                if (bio->bi_iter.bi_size)
1353                                        goto read_next_block;
1354                                goto unlock_submit;
1355                        } else {
1356                                dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1357                                bio_set_dev(bio, wc->ssd_dev->bdev);
1358                                bio->bi_iter.bi_sector = cache_sector(wc, e);
1359                                if (!writecache_entry_is_committed(wc, e))
1360                                        writecache_wait_for_ios(wc, WRITE);
1361                                goto unlock_remap;
1362                        }
1363                } else {
1364                        if (e) {
1365                                sector_t next_boundary =
1366                                        read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1367                                if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
1368                                        dm_accept_partial_bio(bio, next_boundary);
1369                                }
1370                        }
1371                        goto unlock_remap_origin;
1372                }
1373        } else {
1374                do {
1375                        bool found_entry = false;
1376                        bool search_used = false;
1377                        if (writecache_has_error(wc))
1378                                goto unlock_error;
1379                        e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
1380                        if (e) {
1381                                if (!writecache_entry_is_committed(wc, e)) {
1382                                        search_used = true;
1383                                        goto bio_copy;
1384                                }
1385                                if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
1386                                        wc->overwrote_committed = true;
1387                                        search_used = true;
1388                                        goto bio_copy;
1389                                }
1390                                found_entry = true;
1391                        } else {
1392                                if (unlikely(wc->cleaner) ||
1393                                    (wc->metadata_only && !(bio->bi_opf & REQ_META)))
1394                                        goto direct_write;
1395                        }
1396                        e = writecache_pop_from_freelist(wc, (sector_t)-1);
1397                        if (unlikely(!e)) {
1398                                if (!WC_MODE_PMEM(wc) && !found_entry) {
1399direct_write:
1400                                        e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1401                                        if (e) {
1402                                                sector_t next_boundary = read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1403                                                BUG_ON(!next_boundary);
1404                                                if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT) {
1405                                                        dm_accept_partial_bio(bio, next_boundary);
1406                                                }
1407                                        }
1408                                        goto unlock_remap_origin;
1409                                }
1410                                writecache_wait_on_freelist(wc);
1411                                continue;
1412                        }
1413                        write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
1414                        writecache_insert_entry(wc, e);
1415                        wc->uncommitted_blocks++;
1416bio_copy:
1417                        if (WC_MODE_PMEM(wc)) {
1418                                bio_copy_block(wc, bio, memory_data(wc, e));
1419                        } else {
1420                                unsigned bio_size = wc->block_size;
1421                                sector_t start_cache_sec = cache_sector(wc, e);
1422                                sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
1423
1424                                while (bio_size < bio->bi_iter.bi_size) {
1425                                        if (!search_used) {
1426                                                struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
1427                                                if (!f)
1428                                                        break;
1429                                                write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
1430                                                                                (bio_size >> SECTOR_SHIFT), wc->seq_count);
1431                                                writecache_insert_entry(wc, f);
1432                                                wc->uncommitted_blocks++;
1433                                        } else {
1434                                                struct wc_entry *f;
1435                                                struct rb_node *next = rb_next(&e->rb_node);
1436                                                if (!next)
1437                                                        break;
1438                                                f = container_of(next, struct wc_entry, rb_node);
1439                                                if (f != e + 1)
1440                                                        break;
1441                                                if (read_original_sector(wc, f) !=
1442                                                    read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1443                                                        break;
1444                                                if (unlikely(f->write_in_progress))
1445                                                        break;
1446                                                if (writecache_entry_is_committed(wc, f))
1447                                                        wc->overwrote_committed = true;
1448                                                e = f;
1449                                        }
1450                                        bio_size += wc->block_size;
1451                                        current_cache_sec += wc->block_size >> SECTOR_SHIFT;
1452                                }
1453
1454                                bio_set_dev(bio, wc->ssd_dev->bdev);
1455                                bio->bi_iter.bi_sector = start_cache_sec;
1456                                dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
1457
1458                                if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
1459                                        wc->uncommitted_blocks = 0;
1460                                        queue_work(wc->writeback_wq, &wc->flush_work);
1461                                } else {
1462                                        writecache_schedule_autocommit(wc);
1463                                }
1464                                goto unlock_remap;
1465                        }
1466                } while (bio->bi_iter.bi_size);
1467
1468                if (unlikely(bio->bi_opf & REQ_FUA ||
1469                             wc->uncommitted_blocks >= wc->autocommit_blocks))
1470                        writecache_flush(wc);
1471                else
1472                        writecache_schedule_autocommit(wc);
1473                goto unlock_submit;
1474        }
1475
1476unlock_remap_origin:
1477        if (likely(wc->pause != 0)) {
1478                if (bio_op(bio) == REQ_OP_WRITE) {
1479                        dm_iot_io_begin(&wc->iot, 1);
1480                        bio->bi_private = (void *)2;
1481                }
1482        }
1483        bio_set_dev(bio, wc->dev->bdev);
1484        wc_unlock(wc);
1485        return DM_MAPIO_REMAPPED;
1486
1487unlock_remap:
1488        /* make sure that writecache_end_io decrements bio_in_progress: */
1489        bio->bi_private = (void *)1;
1490        atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
1491        wc_unlock(wc);
1492        return DM_MAPIO_REMAPPED;
1493
1494unlock_submit:
1495        wc_unlock(wc);
1496        bio_endio(bio);
1497        return DM_MAPIO_SUBMITTED;
1498
1499unlock_return:
1500        wc_unlock(wc);
1501        return DM_MAPIO_SUBMITTED;
1502
1503unlock_error:
1504        wc_unlock(wc);
1505        bio_io_error(bio);
1506        return DM_MAPIO_SUBMITTED;
1507}
1508
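/*
 * Per-bio completion hook.  writecache_map() tags bios through bi_private:
 * (void *)1 means the bio was remapped to the cache device, so drop the
 * bio_in_progress count and wake any waiter; (void *)2 means the bio went
 * to the origin device and was counted in the io tracker, so balance the
 * dm_iot_io_begin() call with dm_iot_io_end().
 */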
1509static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
1510{
1511        struct dm_writecache *wc = ti->private;
1512
1513        if (bio->bi_private == (void *)1) {
1514                int dir = bio_data_dir(bio);
1515                if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
1516                        if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
1517                                wake_up(&wc->bio_in_progress_wait[dir]);
1518        } else if (bio->bi_private == (void *)2) {
1519                dm_iot_io_end(&wc->iot, 1);
1520        }
1521        return 0;
1522}
1523
1524static int writecache_iterate_devices(struct dm_target *ti,
1525                                      iterate_devices_callout_fn fn, void *data)
1526{
1527        struct dm_writecache *wc = ti->private;
1528
1529        return fn(ti, wc->dev, 0, ti->len, data);
1530}
1531
1532static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
1533{
1534        struct dm_writecache *wc = ti->private;
1535
1536        if (limits->logical_block_size < wc->block_size)
1537                limits->logical_block_size = wc->block_size;
1538
1539        if (limits->physical_block_size < wc->block_size)
1540                limits->physical_block_size = wc->block_size;
1541
1542        if (limits->io_min < wc->block_size)
1543                limits->io_min = wc->block_size;
1544}
1545
1546
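/*
 * Completion of a pmem-mode writeback bio: queue the writeback_struct on
 * the endio list and wake the endio thread if the list was empty.
 */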
1547static void writecache_writeback_endio(struct bio *bio)
1548{
1549        struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
1550        struct dm_writecache *wc = wb->wc;
1551        unsigned long flags;
1552
1553        raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
1554        if (unlikely(list_empty(&wc->endio_list)))
1555                wake_up_process(wc->endio_thread);
1556        list_add_tail(&wb->endio_entry, &wc->endio_list);
1557        raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
1558}
1559
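/*
 * Completion of an ssd-mode kcopyd copy: record any error and hand the
 * copy_struct to the endio thread in the same way.
 */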
1560static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
1561{
1562        struct copy_struct *c = ptr;
1563        struct dm_writecache *wc = c->wc;
1564
1565        c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
1566
1567        raw_spin_lock_irq(&wc->endio_list_lock);
1568        if (unlikely(list_empty(&wc->endio_list)))
1569                wake_up_process(wc->endio_thread);
1570        list_add_tail(&c->endio_entry, &wc->endio_list);
1571        raw_spin_unlock_irq(&wc->endio_list_lock);
1572}
1573
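/*
 * Runs in the endio thread with wc->lock held: for each completed writeback
 * bio, mark its entries as no longer in progress and free them (unless the
 * cache is in an error state).  Every ENDIO_LATENCY entries the metadata is
 * committed and the lock is briefly dropped to bound the latency seen by
 * other waiters.
 */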
1574static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
1575{
1576        unsigned i;
1577        struct writeback_struct *wb;
1578        struct wc_entry *e;
1579        unsigned long n_walked = 0;
1580
1581        do {
1582                wb = list_entry(list->next, struct writeback_struct, endio_entry);
1583                list_del(&wb->endio_entry);
1584
1585                if (unlikely(wb->bio.bi_status != BLK_STS_OK))
1586                        writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
1587                                        "write error %d", wb->bio.bi_status);
1588                i = 0;
1589                do {
1590                        e = wb->wc_list[i];
1591                        BUG_ON(!e->write_in_progress);
1592                        e->write_in_progress = false;
1593                        INIT_LIST_HEAD(&e->lru);
1594                        if (!writecache_has_error(wc))
1595                                writecache_free_entry(wc, e);
1596                        BUG_ON(!wc->writeback_size);
1597                        wc->writeback_size--;
1598                        n_walked++;
1599                        if (unlikely(n_walked >= ENDIO_LATENCY)) {
1600                                writecache_commit_flushed(wc, false);
1601                                wc_unlock(wc);
1602                                wc_lock(wc);
1603                                n_walked = 0;
1604                        }
1605                } while (++i < wb->wc_list_n);
1606
1607                if (wb->wc_list != wb->wc_list_inline)
1608                        kfree(wb->wc_list);
1609                bio_put(&wb->bio);
1610        } while (!list_empty(list));
1611}
1612
1613static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
1614{
1615        struct copy_struct *c;
1616        struct wc_entry *e;
1617
1618        do {
1619                c = list_entry(list->next, struct copy_struct, endio_entry);
1620                list_del(&c->endio_entry);
1621
1622                if (unlikely(c->error))
1623                        writecache_error(wc, c->error, "copy error");
1624
1625                e = c->e;
1626                do {
1627                        BUG_ON(!e->write_in_progress);
1628                        e->write_in_progress = false;
1629                        INIT_LIST_HEAD(&e->lru);
1630                        if (!writecache_has_error(wc))
1631                                writecache_free_entry(wc, e);
1632
1633                        BUG_ON(!wc->writeback_size);
1634                        wc->writeback_size--;
1635                        e++;
1636                } while (--c->n_entries);
1637                mempool_free(c, &wc->copy_pool);
1638        } while (!list_empty(list));
1639}
1640
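/*
 * Kernel thread that drains the endio list: it snapshots the list under
 * endio_list_lock, flushes the origin device (unless FUA writeback already
 * persisted the data), then releases the written-back entries and commits
 * the metadata.
 */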
1641static int writecache_endio_thread(void *data)
1642{
1643        struct dm_writecache *wc = data;
1644
1645        while (1) {
1646                struct list_head list;
1647
1648                raw_spin_lock_irq(&wc->endio_list_lock);
1649                if (!list_empty(&wc->endio_list))
1650                        goto pop_from_list;
1651                set_current_state(TASK_INTERRUPTIBLE);
1652                raw_spin_unlock_irq(&wc->endio_list_lock);
1653
1654                if (unlikely(kthread_should_stop())) {
1655                        set_current_state(TASK_RUNNING);
1656                        break;
1657                }
1658
1659                schedule();
1660
1661                continue;
1662
1663pop_from_list:
1664                list = wc->endio_list;
1665                list.next->prev = list.prev->next = &list;
1666                INIT_LIST_HEAD(&wc->endio_list);
1667                raw_spin_unlock_irq(&wc->endio_list_lock);
1668
1669                if (!WC_MODE_FUA(wc))
1670                        writecache_disk_flush(wc, wc->dev);
1671
1672                wc_lock(wc);
1673
1674                if (WC_MODE_PMEM(wc)) {
1675                        __writecache_endio_pmem(wc, &list);
1676                } else {
1677                        __writecache_endio_ssd(wc, &list);
1678                        writecache_wait_for_ios(wc, READ);
1679                }
1680
1681                writecache_commit_flushed(wc, false);
1682
1683                wc_unlock(wc);
1684        }
1685
1686        return 0;
1687}
1688
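/*
 * Flush one cache block out of the CPU cache and add its page to the
 * writeback bio.  A block that starts at or beyond the end of the origin
 * device is reported as added so that it is simply skipped.
 */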
1689static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
1690{
1691        struct dm_writecache *wc = wb->wc;
1692        unsigned block_size = wc->block_size;
1693        void *address = memory_data(wc, e);
1694
1695        persistent_memory_flush_cache(address, block_size);
1696
1697        if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
1698                return true;
1699
1700        return bio_add_page(&wb->bio, persistent_memory_page(address),
1701                            block_size, persistent_memory_page_offset(address)) != 0;
1702}
1703
1704struct writeback_list {
1705        struct list_head list;
1706        size_t size;
1707};
1708
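/*
 * If a writeback_jobs limit is configured, wait until the number of
 * in-flight writeback blocks drops below it before issuing more work.
 */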
1709static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
1710{
1711        if (unlikely(wc->max_writeback_jobs)) {
1712                if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
1713                        wc_lock(wc);
1714                        while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
1715                                writecache_wait_on_freelist(wc);
1716                        wc_unlock(wc);
1717                }
1718        }
1719        cond_resched();
1720}
1721
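/*
 * pmem writeback: take entries from the tail of the writeback list and
 * build bios directly from the mapped cache memory, merging entries whose
 * original sectors are contiguous into a single bio of up to
 * wc_list_contiguous pages.
 */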
1722static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
1723{
1724        struct wc_entry *e, *f;
1725        struct bio *bio;
1726        struct writeback_struct *wb;
1727        unsigned max_pages;
1728
1729        while (wbl->size) {
1730                wbl->size--;
1731                e = container_of(wbl->list.prev, struct wc_entry, lru);
1732                list_del(&e->lru);
1733
1734                max_pages = e->wc_list_contiguous;
1735
1736                bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
1737                wb = container_of(bio, struct writeback_struct, bio);
1738                wb->wc = wc;
1739                bio->bi_end_io = writecache_writeback_endio;
1740                bio_set_dev(bio, wc->dev->bdev);
1741                bio->bi_iter.bi_sector = read_original_sector(wc, e);
1742                if (max_pages <= WB_LIST_INLINE ||
1743                    unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
1744                                                           GFP_NOIO | __GFP_NORETRY |
1745                                                           __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
1746                        wb->wc_list = wb->wc_list_inline;
1747                        max_pages = WB_LIST_INLINE;
1748                }
1749
1750                BUG_ON(!wc_add_block(wb, e));
1751
1752                wb->wc_list[0] = e;
1753                wb->wc_list_n = 1;
1754
1755                while (wbl->size && wb->wc_list_n < max_pages) {
1756                        f = container_of(wbl->list.prev, struct wc_entry, lru);
1757                        if (read_original_sector(wc, f) !=
1758                            read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1759                                break;
1760                        if (!wc_add_block(wb, f))
1761                                break;
1762                        wbl->size--;
1763                        list_del(&f->lru);
1764                        wb->wc_list[wb->wc_list_n++] = f;
1765                        e = f;
1766                }
1767                bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
1768                if (writecache_has_error(wc)) {
1769                        bio->bi_status = BLK_STS_IOERR;
1770                        bio_endio(bio);
1771                } else if (unlikely(!bio_sectors(bio))) {
1772                        bio->bi_status = BLK_STS_OK;
1773                        bio_endio(bio);
1774                } else {
1775                        submit_bio(bio);
1776                }
1777
1778                __writeback_throttle(wc, wbl);
1779        }
1780}
1781
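/*
 * ssd writeback: copy contiguous runs of cache blocks back to the origin
 * device with dm-kcopyd.  A copy that would extend past the end of the
 * origin device is truncated, or completed immediately if it starts
 * beyond it.
 */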
1782static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
1783{
1784        struct wc_entry *e, *f;
1785        struct dm_io_region from, to;
1786        struct copy_struct *c;
1787
1788        while (wbl->size) {
1789                unsigned n_sectors;
1790
1791                wbl->size--;
1792                e = container_of(wbl->list.prev, struct wc_entry, lru);
1793                list_del(&e->lru);
1794
1795                n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
1796
1797                from.bdev = wc->ssd_dev->bdev;
1798                from.sector = cache_sector(wc, e);
1799                from.count = n_sectors;
1800                to.bdev = wc->dev->bdev;
1801                to.sector = read_original_sector(wc, e);
1802                to.count = n_sectors;
1803
1804                c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
1805                c->wc = wc;
1806                c->e = e;
1807                c->n_entries = e->wc_list_contiguous;
1808
1809                while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
1810                        wbl->size--;
1811                        f = container_of(wbl->list.prev, struct wc_entry, lru);
1812                        BUG_ON(f != e + 1);
1813                        list_del(&f->lru);
1814                        e = f;
1815                }
1816
1817                if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
1818                        if (to.sector >= wc->data_device_sectors) {
1819                                writecache_copy_endio(0, 0, c);
1820                                continue;
1821                        }
1822                        from.count = to.count = wc->data_device_sectors - to.sector;
1823                }
1824
1825                dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
1826
1827                __writeback_throttle(wc, wbl);
1828        }
1829}
1830
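/*
 * The writeback work function: optionally wait for the origin device to
 * become idle (pause_writeback), then walk the LRU from its oldest end,
 * collecting committed entries - and runs of entries contiguous on the
 * origin - into a writeback list that is submitted through the pmem or
 * ssd path above.
 */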
1831static void writecache_writeback(struct work_struct *work)
1832{
1833        struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
1834        struct blk_plug plug;
1835        struct wc_entry *f, *g, *e = NULL;
1836        struct rb_node *node, *next_node;
1837        struct list_head skipped;
1838        struct writeback_list wbl;
1839        unsigned long n_walked;
1840
1841        if (!WC_MODE_PMEM(wc)) {
1842                /* Wait for any active kcopyd work on behalf of ssd writeback */
1843                dm_kcopyd_client_flush(wc->dm_kcopyd);
1844        }
1845
1846        if (likely(wc->pause != 0)) {
1847                while (1) {
1848                        unsigned long idle;
1849                        if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
1850                            unlikely(dm_suspended(wc->ti)))
1851                                break;
1852                        idle = dm_iot_idle_time(&wc->iot);
1853                        if (idle >= wc->pause)
1854                                break;
1855                        idle = wc->pause - idle;
1856                        if (idle > HZ)
1857                                idle = HZ;
1858                        schedule_timeout_idle(idle);
1859                }
1860        }
1861
1862        wc_lock(wc);
1863restart:
1864        if (writecache_has_error(wc)) {
1865                wc_unlock(wc);
1866                return;
1867        }
1868
1869        if (unlikely(wc->writeback_all)) {
1870                if (writecache_wait_for_writeback(wc))
1871                        goto restart;
1872        }
1873
1874        if (wc->overwrote_committed) {
1875                writecache_wait_for_ios(wc, WRITE);
1876        }
1877
1878        n_walked = 0;
1879        INIT_LIST_HEAD(&skipped);
1880        INIT_LIST_HEAD(&wbl.list);
1881        wbl.size = 0;
1882        while (!list_empty(&wc->lru) &&
1883               (wc->writeback_all ||
1884                wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
1885                (jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
1886                 wc->max_age - wc->max_age / MAX_AGE_DIV))) {
1887
1888                n_walked++;
1889                if (unlikely(n_walked > WRITEBACK_LATENCY) &&
1890                    likely(!wc->writeback_all)) {
1891                        if (likely(!dm_suspended(wc->ti)))
1892                                queue_work(wc->writeback_wq, &wc->writeback_work);
1893                        break;
1894                }
1895
1896                if (unlikely(wc->writeback_all)) {
1897                        if (unlikely(!e)) {
1898                                writecache_flush(wc);
1899                                e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
1900                        } else
1901                                e = g;
1902                } else
1903                        e = container_of(wc->lru.prev, struct wc_entry, lru);
1904                BUG_ON(e->write_in_progress);
1905                if (unlikely(!writecache_entry_is_committed(wc, e))) {
1906                        writecache_flush(wc);
1907                }
1908                node = rb_prev(&e->rb_node);
1909                if (node) {
1910                        f = container_of(node, struct wc_entry, rb_node);
1911                        if (unlikely(read_original_sector(wc, f) ==
1912                                     read_original_sector(wc, e))) {
1913                                BUG_ON(!f->write_in_progress);
1914                                list_move(&e->lru, &skipped);
1915                                cond_resched();
1916                                continue;
1917                        }
1918                }
1919                wc->writeback_size++;
1920                list_move(&e->lru, &wbl.list);
1921                wbl.size++;
1922                e->write_in_progress = true;
1923                e->wc_list_contiguous = 1;
1924
1925                f = e;
1926
1927                while (1) {
1928                        next_node = rb_next(&f->rb_node);
1929                        if (unlikely(!next_node))
1930                                break;
1931                        g = container_of(next_node, struct wc_entry, rb_node);
1932                        if (unlikely(read_original_sector(wc, g) ==
1933                            read_original_sector(wc, f))) {
1934                                f = g;
1935                                continue;
1936                        }
1937                        if (read_original_sector(wc, g) !=
1938                            read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
1939                                break;
1940                        if (unlikely(g->write_in_progress))
1941                                break;
1942                        if (unlikely(!writecache_entry_is_committed(wc, g)))
1943                                break;
1944
1945                        if (!WC_MODE_PMEM(wc)) {
1946                                if (g != f + 1)
1947                                        break;
1948                        }
1949
1950                        n_walked++;
1951                        //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
1952                        //      break;
1953
1954                        wc->writeback_size++;
1955                        list_move(&g->lru, &wbl.list);
1956                        wbl.size++;
1957                        g->write_in_progress = true;
1958                        g->wc_list_contiguous = BIO_MAX_VECS;
1959                        f = g;
1960                        e->wc_list_contiguous++;
1961                        if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
1962                                if (unlikely(wc->writeback_all)) {
1963                                        next_node = rb_next(&f->rb_node);
1964                                        if (likely(next_node))
1965                                                g = container_of(next_node, struct wc_entry, rb_node);
1966                                }
1967                                break;
1968                        }
1969                }
1970                cond_resched();
1971        }
1972
1973        if (!list_empty(&skipped)) {
1974                list_splice_tail(&skipped, &wc->lru);
1975                /*
1976                 * If we didn't make any progress, we must wait until some
1977                 * writeback finishes to avoid burning CPU in a loop
1978                 */
1979                if (unlikely(!wbl.size))
1980                        writecache_wait_for_writeback(wc);
1981        }
1982
1983        wc_unlock(wc);
1984
1985        blk_start_plug(&plug);
1986
1987        if (WC_MODE_PMEM(wc))
1988                __writecache_writeback_pmem(wc, &wbl);
1989        else
1990                __writecache_writeback_ssd(wc, &wbl);
1991
1992        blk_finish_plug(&plug);
1993
1994        if (unlikely(wc->writeback_all)) {
1995                wc_lock(wc);
1996                while (writecache_wait_for_writeback(wc));
1997                wc_unlock(wc);
1998        }
1999}
2000
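/*
 * Work out how many cache blocks fit on the cache device.  Each block
 * needs block_size bytes of data plus a struct wc_memory_entry of
 * metadata, and the metadata area (superblock plus entries) is rounded up
 * to a whole number of blocks.  For example, a 1 GiB device with 4096-byte
 * blocks yields 261123 data blocks behind 1021 metadata blocks.
 */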
2001static int calculate_memory_size(uint64_t device_size, unsigned block_size,
2002                                 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
2003{
2004        uint64_t n_blocks, offset;
2005        struct wc_entry e;
2006
2007        n_blocks = device_size;
2008        do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
2009
2010        while (1) {
2011                if (!n_blocks)
2012                        return -ENOSPC;
2013                /* Verify that computing the offset of entries[n_blocks] cannot overflow */
2014                if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
2015                                 sizeof(struct wc_memory_entry)))
2016                        return -EFBIG;
2017                offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
2018                offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
2019                if (offset + n_blocks * block_size <= device_size)
2020                        break;
2021                n_blocks--;
2022        }
2023
2024        /* check if the bit field overflows */
2025        e.index = n_blocks;
2026        if (e.index != n_blocks)
2027                return -EFBIG;
2028
2029        if (n_blocks_p)
2030                *n_blocks_p = n_blocks;
2031        if (n_metadata_blocks_p)
2032                *n_metadata_blocks_p = offset >> __ffs(block_size);
2033        return 0;
2034}
2035
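/*
 * Format an empty cache: initialize the superblock fields, invalidate
 * every metadata entry, and only then write the magic number, so that a
 * partially initialized device is never mistaken for a valid cache.
 */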
2036static int init_memory(struct dm_writecache *wc)
2037{
2038        size_t b;
2039        int r;
2040
2041        r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
2042        if (r)
2043                return r;
2044
2045        r = writecache_alloc_entries(wc);
2046        if (r)
2047                return r;
2048
2049        for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
2050                pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
2051        pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
2052        pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
2053        pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
2054        pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
2055
2056        for (b = 0; b < wc->n_blocks; b++) {
2057                write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
2058                cond_resched();
2059        }
2060
2061        writecache_flush_all_metadata(wc);
2062        writecache_commit_flushed(wc, false);
2063        pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
2064        writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
2065        writecache_commit_flushed(wc, false);
2066
2067        return 0;
2068}
2069
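/*
 * Tear down the target, releasing resources in roughly the reverse order
 * of their allocation.  Every pointer is checked first, so this is also
 * safe to call from the constructor's error path on a partially built
 * target.
 */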
2070static void writecache_dtr(struct dm_target *ti)
2071{
2072        struct dm_writecache *wc = ti->private;
2073
2074        if (!wc)
2075                return;
2076
2077        if (wc->endio_thread)
2078                kthread_stop(wc->endio_thread);
2079
2080        if (wc->flush_thread)
2081                kthread_stop(wc->flush_thread);
2082
2083        bioset_exit(&wc->bio_set);
2084
2085        mempool_exit(&wc->copy_pool);
2086
2087        if (wc->writeback_wq)
2088                destroy_workqueue(wc->writeback_wq);
2089
2090        if (wc->dev)
2091                dm_put_device(ti, wc->dev);
2092
2093        if (wc->ssd_dev)
2094                dm_put_device(ti, wc->ssd_dev);
2095
2096        vfree(wc->entries);
2097
2098        if (wc->memory_map) {
2099                if (WC_MODE_PMEM(wc))
2100                        persistent_memory_release(wc);
2101                else
2102                        vfree(wc->memory_map);
2103        }
2104
2105        if (wc->dm_kcopyd)
2106                dm_kcopyd_client_destroy(wc->dm_kcopyd);
2107
2108        if (wc->dm_io)
2109                dm_io_client_destroy(wc->dm_io);
2110
2111        vfree(wc->dirty_bitmap);
2112
2113        kfree(wc);
2114}
2115
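/*
 * Construct a writecache target.  Table line:
 *   writecache <p|s> <origin dev> <cache dev> <block size>
 *              <#optional args> [<optional arg>...]
 * where 'p' selects persistent-memory mode and 's' selects ssd mode.
 */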
2116static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2117{
2118        struct dm_writecache *wc;
2119        struct dm_arg_set as;
2120        const char *string;
2121        unsigned opt_params;
2122        size_t offset, data_size;
2123        int i, r;
2124        char dummy;
2125        int high_wm_percent = HIGH_WATERMARK;
2126        int low_wm_percent = LOW_WATERMARK;
2127        uint64_t x;
2128        struct wc_memory_superblock s;
2129
2130        static struct dm_arg _args[] = {
2131                {0, 18, "Invalid number of feature args"},
2132        };
2133
2134        as.argc = argc;
2135        as.argv = argv;
2136
2137        wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
2138        if (!wc) {
2139                ti->error = "Cannot allocate writecache structure";
2140                r = -ENOMEM;
2141                goto bad;
2142        }
2143        ti->private = wc;
2144        wc->ti = ti;
2145
2146        mutex_init(&wc->lock);
2147        wc->max_age = MAX_AGE_UNSPECIFIED;
2148        writecache_poison_lists(wc);
2149        init_waitqueue_head(&wc->freelist_wait);
2150        timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
2151        timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);
2152
2153        for (i = 0; i < 2; i++) {
2154                atomic_set(&wc->bio_in_progress[i], 0);
2155                init_waitqueue_head(&wc->bio_in_progress_wait[i]);
2156        }
2157
2158        wc->dm_io = dm_io_client_create();
2159        if (IS_ERR(wc->dm_io)) {
2160                r = PTR_ERR(wc->dm_io);
2161                ti->error = "Unable to allocate dm-io client";
2162                wc->dm_io = NULL;
2163                goto bad;
2164        }
2165
2166        wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
2167        if (!wc->writeback_wq) {
2168                r = -ENOMEM;
2169                ti->error = "Could not allocate writeback workqueue";
2170                goto bad;
2171        }
2172        INIT_WORK(&wc->writeback_work, writecache_writeback);
2173        INIT_WORK(&wc->flush_work, writecache_flush_work);
2174
2175        dm_iot_init(&wc->iot);
2176
2177        raw_spin_lock_init(&wc->endio_list_lock);
2178        INIT_LIST_HEAD(&wc->endio_list);
2179        wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
2180        if (IS_ERR(wc->endio_thread)) {
2181                r = PTR_ERR(wc->endio_thread);
2182                wc->endio_thread = NULL;
2183                ti->error = "Couldn't spawn endio thread";
2184                goto bad;
2185        }
2186        wake_up_process(wc->endio_thread);
2187
2188        /*
2189         * Parse the mode (pmem or ssd)
2190         */
2191        string = dm_shift_arg(&as);
2192        if (!string)
2193                goto bad_arguments;
2194
2195        if (!strcasecmp(string, "s")) {
2196                wc->pmem_mode = false;
2197        } else if (!strcasecmp(string, "p")) {
2198#ifdef DM_WRITECACHE_HAS_PMEM
2199                wc->pmem_mode = true;
2200                wc->writeback_fua = true;
2201#else
2202                /*
2203                 * If the architecture doesn't support persistent memory or
2204                 * the kernel doesn't support any DAX drivers, this driver can
2205                 * only be used in SSD-only mode.
2206                 */
2207                r = -EOPNOTSUPP;
2208                ti->error = "Persistent memory or DAX not supported on this system";
2209                goto bad;
2210#endif
2211        } else {
2212                goto bad_arguments;
2213        }
2214
2215        if (WC_MODE_PMEM(wc)) {
2216                r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
2217                                offsetof(struct writeback_struct, bio),
2218                                BIOSET_NEED_BVECS);
2219                if (r) {
2220                        ti->error = "Could not allocate bio set";
2221                        goto bad;
2222                }
2223        } else {
2224                wc->pause = PAUSE_WRITEBACK;
2225                r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
2226                if (r) {
2227                        ti->error = "Could not allocate mempool";
2228                        goto bad;
2229                }
2230        }
2231
2232        /*
2233         * Parse the origin data device
2234         */
2235        string = dm_shift_arg(&as);
2236        if (!string)
2237                goto bad_arguments;
2238        r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
2239        if (r) {
2240                ti->error = "Origin data device lookup failed";
2241                goto bad;
2242        }
2243
2244        /*
2245         * Parse cache data device (be it pmem or ssd)
2246         */
2247        string = dm_shift_arg(&as);
2248        if (!string)
2249                goto bad_arguments;
2250
2251        r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
2252        if (r) {
2253                ti->error = "Cache data device lookup failed";
2254                goto bad;
2255        }
2256        wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
2257
2258        /*
2259         * Parse the cache block size
2260         */
2261        string = dm_shift_arg(&as);
2262        if (!string)
2263                goto bad_arguments;
2264        if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
2265            wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
2266            (wc->block_size & (wc->block_size - 1))) {
2267                r = -EINVAL;
2268                ti->error = "Invalid block size";
2269                goto bad;
2270        }
2271        if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
2272            wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
2273                r = -EINVAL;
2274                ti->error = "Block size is smaller than device logical block size";
2275                goto bad;
2276        }
2277        wc->block_size_bits = __ffs(wc->block_size);
2278
2279        wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
2280        wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
2281        wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
2282
2283        /*
2284         * Parse optional arguments
2285         */
2286        r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
2287        if (r)
2288                goto bad;
2289
2290        while (opt_params) {
2291                string = dm_shift_arg(&as), opt_params--;
2292                if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
2293                        unsigned long long start_sector;
2294                        string = dm_shift_arg(&as), opt_params--;
2295                        if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
2296                                goto invalid_optional;
2297                        wc->start_sector = start_sector;
2298                        wc->start_sector_set = true;
2299                        if (wc->start_sector != start_sector ||
2300                            wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
2301                                goto invalid_optional;
2302                } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
2303                        string = dm_shift_arg(&as), opt_params--;
2304                        if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
2305                                goto invalid_optional;
2306                        if (high_wm_percent < 0 || high_wm_percent > 100)
2307                                goto invalid_optional;
2308                        wc->high_wm_percent_value = high_wm_percent;
2309                        wc->high_wm_percent_set = true;
2310                } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
2311                        string = dm_shift_arg(&as), opt_params--;
2312                        if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
2313                                goto invalid_optional;
2314                        if (low_wm_percent < 0 || low_wm_percent > 100)
2315                                goto invalid_optional;
2316                        wc->low_wm_percent_value = low_wm_percent;
2317                        wc->low_wm_percent_set = true;
2318                } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
2319                        string = dm_shift_arg(&as), opt_params--;
2320                        if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
2321                                goto invalid_optional;
2322                        wc->max_writeback_jobs_set = true;
2323                } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
2324                        string = dm_shift_arg(&as), opt_params--;
2325                        if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
2326                                goto invalid_optional;
2327                        wc->autocommit_blocks_set = true;
2328                } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
2329                        unsigned autocommit_msecs;
2330                        string = dm_shift_arg(&as), opt_params--;
2331                        if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
2332                                goto invalid_optional;
2333                        if (autocommit_msecs > 3600000)
2334                                goto invalid_optional;
2335                        wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
2336                        wc->autocommit_time_value = autocommit_msecs;
2337                        wc->autocommit_time_set = true;
2338                } else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
2339                        unsigned max_age_msecs;
2340                        string = dm_shift_arg(&as), opt_params--;
2341                        if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
2342                                goto invalid_optional;
2343                        if (max_age_msecs > 86400000)
2344                                goto invalid_optional;
2345                        wc->max_age = msecs_to_jiffies(max_age_msecs);
2346                        wc->max_age_set = true;
2347                        wc->max_age_value = max_age_msecs;
2348                } else if (!strcasecmp(string, "cleaner")) {
2349                        wc->cleaner_set = true;
2350                        wc->cleaner = true;
2351                } else if (!strcasecmp(string, "fua")) {
2352                        if (WC_MODE_PMEM(wc)) {
2353                                wc->writeback_fua = true;
2354                                wc->writeback_fua_set = true;
2355                        } else goto invalid_optional;
2356                } else if (!strcasecmp(string, "nofua")) {
2357                        if (WC_MODE_PMEM(wc)) {
2358                                wc->writeback_fua = false;
2359                                wc->writeback_fua_set = true;
2360                        } else goto invalid_optional;
2361                } else if (!strcasecmp(string, "metadata_only")) {
2362                        wc->metadata_only = true;
2363                } else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
2364                        unsigned pause_msecs;
2365                        if (WC_MODE_PMEM(wc))
2366                                goto invalid_optional;
2367                        string = dm_shift_arg(&as), opt_params--;
2368                        if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
2369                                goto invalid_optional;
2370                        if (pause_msecs > 60000)
2371                                goto invalid_optional;
2372                        wc->pause = msecs_to_jiffies(pause_msecs);
2373                        wc->pause_set = true;
2374                        wc->pause_value = pause_msecs;
2375                } else {
2376invalid_optional:
2377                        r = -EINVAL;
2378                        ti->error = "Invalid optional argument";
2379                        goto bad;
2380                }
2381        }
2382
2383        if (high_wm_percent < low_wm_percent) {
2384                r = -EINVAL;
2385                ti->error = "High watermark must be greater than or equal to low watermark";
2386                goto bad;
2387        }
2388
2389        if (WC_MODE_PMEM(wc)) {
2390                if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
2391                        r = -EOPNOTSUPP;
2392                        ti->error = "Asynchronous persistent memory not supported as pmem cache";
2393                        goto bad;
2394                }
2395
2396                r = persistent_memory_claim(wc);
2397                if (r) {
2398                        ti->error = "Unable to map persistent memory for cache";
2399                        goto bad;
2400                }
2401        } else {
2402                size_t n_blocks, n_metadata_blocks;
2403                uint64_t n_bitmap_bits;
2404
2405                wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
2406
2407                bio_list_init(&wc->flush_list);
2408                wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
2409                if (IS_ERR(wc->flush_thread)) {
2410                        r = PTR_ERR(wc->flush_thread);
2411                        wc->flush_thread = NULL;
2412                        ti->error = "Couldn't spawn flush thread";
2413                        goto bad;
2414                }
2415                wake_up_process(wc->flush_thread);
2416
2417                r = calculate_memory_size(wc->memory_map_size, wc->block_size,
2418                                          &n_blocks, &n_metadata_blocks);
2419                if (r) {
2420                        ti->error = "Invalid device size";
2421                        goto bad;
2422                }
2423
2424                n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
2425                                 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
2426                /* this is a limitation of the test_bit functions */
2427                if (n_bitmap_bits > 1U << 31) {
2428                        r = -EFBIG;
2429                        ti->error = "Invalid device size";
2430                        goto bad;
2431                }
2432
2433                wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
2434                if (!wc->memory_map) {
2435                        r = -ENOMEM;
2436                        ti->error = "Unable to allocate memory for metadata";
2437                        goto bad;
2438                }
2439
2440                wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2441                if (IS_ERR(wc->dm_kcopyd)) {
2442                        r = PTR_ERR(wc->dm_kcopyd);
2443                        ti->error = "Unable to allocate dm-kcopyd client";
2444                        wc->dm_kcopyd = NULL;
2445                        goto bad;
2446                }
2447
2448                wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
2449                wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
2450                        BITS_PER_LONG * sizeof(unsigned long);
2451                wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
2452                if (!wc->dirty_bitmap) {
2453                        r = -ENOMEM;
2454                        ti->error = "Unable to allocate dirty bitmap";
2455                        goto bad;
2456                }
2457
2458                r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
2459                if (r) {
2460                        ti->error = "Unable to read first block of metadata";
2461                        goto bad;
2462                }
2463        }
2464
2465        r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
2466        if (r) {
2467                ti->error = "Hardware memory error when reading superblock";
2468                goto bad;
2469        }
2470        if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
2471                r = init_memory(wc);
2472                if (r) {
2473                        ti->error = "Unable to initialize device";
2474                        goto bad;
2475                }
2476                r = copy_mc_to_kernel(&s, sb(wc),
2477                                      sizeof(struct wc_memory_superblock));
2478                if (r) {
2479                        ti->error = "Hardware memory error when reading superblock";
2480                        goto bad;
2481                }
2482        }
2483
2484        if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
2485                ti->error = "Invalid magic in the superblock";
2486                r = -EINVAL;
2487                goto bad;
2488        }
2489
2490        if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
2491                ti->error = "Invalid version in the superblock";
2492                r = -EINVAL;
2493                goto bad;
2494        }
2495
2496        if (le32_to_cpu(s.block_size) != wc->block_size) {
2497                ti->error = "Block size does not match superblock";
2498                r = -EINVAL;
2499                goto bad;
2500        }
2501
2502        wc->n_blocks = le64_to_cpu(s.n_blocks);
2503
2504        offset = wc->n_blocks * sizeof(struct wc_memory_entry);
2505        if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
2506overflow:
2507                ti->error = "Overflow in size calculation";
2508                r = -EINVAL;
2509                goto bad;
2510        }
2511        offset += sizeof(struct wc_memory_superblock);
2512        if (offset < sizeof(struct wc_memory_superblock))
2513                goto overflow;
2514        offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
2515        data_size = wc->n_blocks * (size_t)wc->block_size;
2516        if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
2517            (offset + data_size < offset))
2518                goto overflow;
2519        if (offset + data_size > wc->memory_map_size) {
2520                ti->error = "Memory area is too small";
2521                r = -EINVAL;
2522                goto bad;
2523        }
2524
2525        wc->metadata_sectors = offset >> SECTOR_SHIFT;
2526        wc->block_start = (char *)sb(wc) + offset;
2527
2528        x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
2529        x += 50;
2530        do_div(x, 100);
2531        wc->freelist_high_watermark = x;
2532        x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
2533        x += 50;
2534        do_div(x, 100);
2535        wc->freelist_low_watermark = x;
2536
2537        if (wc->cleaner)
2538                activate_cleaner(wc);
2539
2540        r = writecache_alloc_entries(wc);
2541        if (r) {
2542                ti->error = "Cannot allocate memory";
2543                goto bad;
2544        }
2545
2546        ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
2547        ti->flush_supported = true;
2548        ti->num_discard_bios = 1;
2549
2550        if (WC_MODE_PMEM(wc))
2551                persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
2552
2553        return 0;
2554
2555bad_arguments:
2556        r = -EINVAL;
2557        ti->error = "Bad arguments";
2558bad:
2559        writecache_dtr(ti);
2560        return r;
2561}
2562
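/*
 * STATUSTYPE_INFO reports the error state and the total, free and
 * under-writeback block counts; STATUSTYPE_TABLE reconstructs the table
 * line, emitting only those optional arguments that were explicitly set.
 */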
2563static void writecache_status(struct dm_target *ti, status_type_t type,
2564                              unsigned status_flags, char *result, unsigned maxlen)
2565{
2566        struct dm_writecache *wc = ti->private;
2567        unsigned extra_args;
2568        unsigned sz = 0;
2569
2570        switch (type) {
2571        case STATUSTYPE_INFO:
2572                DMEMIT("%ld %llu %llu %llu", writecache_has_error(wc),
2573                       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
2574                       (unsigned long long)wc->writeback_size);
2575                break;
2576        case STATUSTYPE_TABLE:
2577                DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
2578                                wc->dev->name, wc->ssd_dev->name, wc->block_size);
2579                extra_args = 0;
2580                if (wc->start_sector_set)
2581                        extra_args += 2;
2582                if (wc->high_wm_percent_set)
2583                        extra_args += 2;
2584                if (wc->low_wm_percent_set)
2585                        extra_args += 2;
2586                if (wc->max_writeback_jobs_set)
2587                        extra_args += 2;
2588                if (wc->autocommit_blocks_set)
2589                        extra_args += 2;
2590                if (wc->autocommit_time_set)
2591                        extra_args += 2;
2592                if (wc->max_age_set)
2593                        extra_args += 2;
2594                if (wc->cleaner_set)
2595                        extra_args++;
2596                if (wc->writeback_fua_set)
2597                        extra_args++;
2598                if (wc->metadata_only)
2599                        extra_args++;
2600                if (wc->pause_set)
2601                        extra_args += 2;
2602
2603                DMEMIT("%u", extra_args);
2604                if (wc->start_sector_set)
2605                        DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
2606                if (wc->high_wm_percent_set)
2607                        DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
2608                if (wc->low_wm_percent_set)
2609                        DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
2610                if (wc->max_writeback_jobs_set)
2611                        DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
2612                if (wc->autocommit_blocks_set)
2613                        DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
2614                if (wc->autocommit_time_set)
2615                        DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
2616                if (wc->max_age_set)
2617                        DMEMIT(" max_age %u", wc->max_age_value);
2618                if (wc->cleaner_set)
2619                        DMEMIT(" cleaner");
2620                if (wc->writeback_fua_set)
2621                        DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
2622                if (wc->metadata_only)
2623                        DMEMIT(" metadata_only");
2624                if (wc->pause_set)
2625                        DMEMIT(" pause_writeback %u", wc->pause_value);
2626                break;
2627        }
2628}
2629
2630static struct target_type writecache_target = {
2631        .name                   = "writecache",
2632        .version                = {1, 5, 0},
2633        .module                 = THIS_MODULE,
2634        .ctr                    = writecache_ctr,
2635        .dtr                    = writecache_dtr,
2636        .status                 = writecache_status,
2637        .postsuspend            = writecache_suspend,
2638        .resume                 = writecache_resume,
2639        .message                = writecache_message,
2640        .map                    = writecache_map,
2641        .end_io                 = writecache_end_io,
2642        .iterate_devices        = writecache_iterate_devices,
2643        .io_hints               = writecache_io_hints,
2644};
2645
2646static int __init dm_writecache_init(void)
2647{
2648        int r;
2649
2650        r = dm_register_target(&writecache_target);
2651        if (r < 0) {
2652                DMERR("register failed %d", r);
2653                return r;
2654        }
2655
2656        return 0;
2657}
2658
2659static void __exit dm_writecache_exit(void)
2660{
2661        dm_unregister_target(&writecache_target);
2662}
2663
2664module_init(dm_writecache_init);
2665module_exit(dm_writecache_exit);
2666
2667MODULE_DESCRIPTION(DM_NAME " writecache target");
2668MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2669MODULE_LICENSE("GPL");
2670