linux/drivers/md/dm-writecache.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2018 Red Hat. All rights reserved.
   4 *
   5 * This file is released under the GPL.
   6 */
   7
   8#include <linux/device-mapper.h>
   9#include <linux/module.h>
  10#include <linux/init.h>
  11#include <linux/vmalloc.h>
  12#include <linux/kthread.h>
  13#include <linux/dm-io.h>
  14#include <linux/dm-kcopyd.h>
  15#include <linux/dax.h>
  16#include <linux/pfn_t.h>
  17#include <linux/libnvdimm.h>
  18#include <linux/delay.h>
  19#include "dm-io-tracker.h"
  20
  21#define DM_MSG_PREFIX "writecache"
  22
  23#define HIGH_WATERMARK                  50
  24#define LOW_WATERMARK                   45
  25#define MAX_WRITEBACK_JOBS              0
  26#define ENDIO_LATENCY                   16
  27#define WRITEBACK_LATENCY               64
  28#define AUTOCOMMIT_BLOCKS_SSD           65536
  29#define AUTOCOMMIT_BLOCKS_PMEM          64
  30#define AUTOCOMMIT_MSEC                 1000
  31#define MAX_AGE_DIV                     16
  32#define MAX_AGE_UNSPECIFIED             -1UL
  33#define PAUSE_WRITEBACK                 (HZ * 3)
  34
  35#define BITMAP_GRANULARITY      65536
  36#if BITMAP_GRANULARITY < PAGE_SIZE
  37#undef BITMAP_GRANULARITY
  38#define BITMAP_GRANULARITY      PAGE_SIZE
  39#endif
  40
  41#if IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API) && IS_ENABLED(CONFIG_DAX_DRIVER)
  42#define DM_WRITECACHE_HAS_PMEM
  43#endif
  44
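/*
 * Descriptive note (not in the original source): pmem_assign() stores a value
 * into the persistent metadata.  On builds with DM_WRITECACHE_HAS_PMEM it goes
 * through memcpy_flushcache() so the store is pushed out of the CPU cache
 * towards the persistence domain; otherwise it is a plain assignment and the
 * change only reaches the SSD when the dirtied metadata region is written out
 * (see ssd_commit_flushed() below).
 */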
  45#ifdef DM_WRITECACHE_HAS_PMEM
  46#define pmem_assign(dest, src)                                  \
  47do {                                                            \
  48        typeof(dest) uniq = (src);                              \
  49        memcpy_flushcache(&(dest), &uniq, sizeof(dest));        \
  50} while (0)
  51#else
  52#define pmem_assign(dest, src)  ((dest) = (src))
  53#endif
  54
  55#if IS_ENABLED(CONFIG_ARCH_HAS_COPY_MC) && defined(DM_WRITECACHE_HAS_PMEM)
  56#define DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  57#endif
  58
  59#define MEMORY_SUPERBLOCK_MAGIC         0x23489321
  60#define MEMORY_SUPERBLOCK_VERSION       1
  61
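/*
 * Descriptive note (not in the original source): on-media metadata layout is a
 * superblock (magic, version, cache block size, number of blocks and the
 * current sequence counter) followed by one wc_memory_entry per cache block.
 * Each entry records which original sector the block caches and the sequence
 * counter value it was written under; freed blocks are marked with a
 * seq_count of -1.
 */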
  62struct wc_memory_entry {
  63        __le64 original_sector;
  64        __le64 seq_count;
  65};
  66
  67struct wc_memory_superblock {
  68        union {
  69                struct {
  70                        __le32 magic;
  71                        __le32 version;
  72                        __le32 block_size;
  73                        __le32 pad;
  74                        __le64 n_blocks;
  75                        __le64 seq_count;
  76                };
  77                __le64 padding[8];
  78        };
  79        struct wc_memory_entry entries[];
  80};
  81
  82struct wc_entry {
  83        struct rb_node rb_node;
  84        struct list_head lru;
  85        unsigned short wc_list_contiguous;
  86        bool write_in_progress
  87#if BITS_PER_LONG == 64
  88                :1
  89#endif
  90        ;
  91        unsigned long index
  92#if BITS_PER_LONG == 64
  93                :47
  94#endif
  95        ;
  96        unsigned long age;
  97#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  98        uint64_t original_sector;
  99        uint64_t seq_count;
 100#endif
 101};
 102
 103#ifdef DM_WRITECACHE_HAS_PMEM
 104#define WC_MODE_PMEM(wc)                        ((wc)->pmem_mode)
 105#define WC_MODE_FUA(wc)                         ((wc)->writeback_fua)
 106#else
 107#define WC_MODE_PMEM(wc)                        false
 108#define WC_MODE_FUA(wc)                         false
 109#endif
 110#define WC_MODE_SORT_FREELIST(wc)               (!WC_MODE_PMEM(wc))
 111
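/*
 * Descriptive note (not in the original source): WC_MODE_PMEM() selects the
 * persistent-memory data path, where data is copied directly into the DAX
 * mapping.  Otherwise the cache sits on an SSD and the free list is kept
 * sorted (WC_MODE_SORT_FREELIST) so that newly allocated blocks tend to be
 * contiguous on the cache device.
 */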
 112struct dm_writecache {
 113        struct mutex lock;
 114        struct list_head lru;
 115        union {
 116                struct list_head freelist;
 117                struct {
 118                        struct rb_root freetree;
 119                        struct wc_entry *current_free;
 120                };
 121        };
 122        struct rb_root tree;
 123
 124        size_t freelist_size;
 125        size_t writeback_size;
 126        size_t freelist_high_watermark;
 127        size_t freelist_low_watermark;
 128        unsigned long max_age;
 129        unsigned long pause;
 130
 131        unsigned uncommitted_blocks;
 132        unsigned autocommit_blocks;
 133        unsigned max_writeback_jobs;
 134
 135        int error;
 136
 137        unsigned long autocommit_jiffies;
 138        struct timer_list autocommit_timer;
 139        struct wait_queue_head freelist_wait;
 140
 141        struct timer_list max_age_timer;
 142
 143        atomic_t bio_in_progress[2];
 144        struct wait_queue_head bio_in_progress_wait[2];
 145
 146        struct dm_target *ti;
 147        struct dm_dev *dev;
 148        struct dm_dev *ssd_dev;
 149        sector_t start_sector;
 150        void *memory_map;
 151        uint64_t memory_map_size;
 152        size_t metadata_sectors;
 153        size_t n_blocks;
 154        uint64_t seq_count;
 155        sector_t data_device_sectors;
 156        void *block_start;
 157        struct wc_entry *entries;
 158        unsigned block_size;
 159        unsigned char block_size_bits;
 160
 161        bool pmem_mode:1;
 162        bool writeback_fua:1;
 163
 164        bool overwrote_committed:1;
 165        bool memory_vmapped:1;
 166
 167        bool start_sector_set:1;
 168        bool high_wm_percent_set:1;
 169        bool low_wm_percent_set:1;
 170        bool max_writeback_jobs_set:1;
 171        bool autocommit_blocks_set:1;
 172        bool autocommit_time_set:1;
 173        bool max_age_set:1;
 174        bool writeback_fua_set:1;
 175        bool flush_on_suspend:1;
 176        bool cleaner:1;
 177        bool cleaner_set:1;
 178        bool metadata_only:1;
 179        bool pause_set:1;
 180
 181        unsigned high_wm_percent_value;
 182        unsigned low_wm_percent_value;
 183        unsigned autocommit_time_value;
 184        unsigned max_age_value;
 185        unsigned pause_value;
 186
 187        unsigned writeback_all;
 188        struct workqueue_struct *writeback_wq;
 189        struct work_struct writeback_work;
 190        struct work_struct flush_work;
 191
 192        struct dm_io_tracker iot;
 193
 194        struct dm_io_client *dm_io;
 195
 196        raw_spinlock_t endio_list_lock;
 197        struct list_head endio_list;
 198        struct task_struct *endio_thread;
 199
 200        struct task_struct *flush_thread;
 201        struct bio_list flush_list;
 202
 203        struct dm_kcopyd_client *dm_kcopyd;
 204        unsigned long *dirty_bitmap;
 205        unsigned dirty_bitmap_size;
 206
 207        struct bio_set bio_set;
 208        mempool_t copy_pool;
 209
 210        struct {
 211                unsigned long long reads;
 212                unsigned long long read_hits;
 213                unsigned long long writes;
 214                unsigned long long write_hits_uncommitted;
 215                unsigned long long write_hits_committed;
 216                unsigned long long writes_around;
 217                unsigned long long writes_allocate;
 218                unsigned long long writes_blocked_on_freelist;
 219                unsigned long long flushes;
 220                unsigned long long discards;
 221        } stats;
 222};
 223
 224#define WB_LIST_INLINE          16
 225
 226struct writeback_struct {
 227        struct list_head endio_entry;
 228        struct dm_writecache *wc;
 229        struct wc_entry **wc_list;
 230        unsigned wc_list_n;
 231        struct wc_entry *wc_list_inline[WB_LIST_INLINE];
 232        struct bio bio;
 233};
 234
 235struct copy_struct {
 236        struct list_head endio_entry;
 237        struct dm_writecache *wc;
 238        struct wc_entry *e;
 239        unsigned n_entries;
 240        int error;
 241};
 242
 243DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(dm_writecache_throttle,
 244                                            "A percentage of time allocated for data copying");
 245
 246static void wc_lock(struct dm_writecache *wc)
 247{
 248        mutex_lock(&wc->lock);
 249}
 250
 251static void wc_unlock(struct dm_writecache *wc)
 252{
 253        mutex_unlock(&wc->lock);
 254}
 255
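/*
 * Descriptive note (not in the original source): persistent_memory_claim()
 * maps the DAX-capable cache device into the kernel address space.  If
 * dax_direct_access() cannot hand back the whole range as one linear mapping,
 * the individual pages are collected and vmap()ed instead, and memory_vmapped
 * is set so persistent_memory_release() knows to vunmap().
 */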
 256#ifdef DM_WRITECACHE_HAS_PMEM
 257static int persistent_memory_claim(struct dm_writecache *wc)
 258{
 259        int r;
 260        loff_t s;
 261        long p, da;
 262        pfn_t pfn;
 263        int id;
 264        struct page **pages;
 265        sector_t offset;
 266
 267        wc->memory_vmapped = false;
 268
 269        s = wc->memory_map_size;
 270        p = s >> PAGE_SHIFT;
 271        if (!p) {
 272                r = -EINVAL;
 273                goto err1;
 274        }
 275        if (p != s >> PAGE_SHIFT) {
 276                r = -EOVERFLOW;
 277                goto err1;
 278        }
 279
 280        offset = get_start_sect(wc->ssd_dev->bdev);
 281        if (offset & (PAGE_SIZE / 512 - 1)) {
 282                r = -EINVAL;
 283                goto err1;
 284        }
 285        offset >>= PAGE_SHIFT - 9;
 286
 287        id = dax_read_lock();
 288
 289        da = dax_direct_access(wc->ssd_dev->dax_dev, offset, p, &wc->memory_map, &pfn);
 290        if (da < 0) {
 291                wc->memory_map = NULL;
 292                r = da;
 293                goto err2;
 294        }
 295        if (!pfn_t_has_page(pfn)) {
 296                wc->memory_map = NULL;
 297                r = -EOPNOTSUPP;
 298                goto err2;
 299        }
 300        if (da != p) {
 301                long i;
 302                wc->memory_map = NULL;
 303                pages = kvmalloc_array(p, sizeof(struct page *), GFP_KERNEL);
 304                if (!pages) {
 305                        r = -ENOMEM;
 306                        goto err2;
 307                }
 308                i = 0;
 309                do {
 310                        long daa;
 311                        daa = dax_direct_access(wc->ssd_dev->dax_dev, offset + i, p - i,
 312                                                NULL, &pfn);
 313                        if (daa <= 0) {
 314                                r = daa ? daa : -EINVAL;
 315                                goto err3;
 316                        }
 317                        if (!pfn_t_has_page(pfn)) {
 318                                r = -EOPNOTSUPP;
 319                                goto err3;
 320                        }
 321                        while (daa-- && i < p) {
 322                                pages[i++] = pfn_t_to_page(pfn);
 323                                pfn.val++;
 324                                if (!(i & 15))
 325                                        cond_resched();
 326                        }
 327                } while (i < p);
 328                wc->memory_map = vmap(pages, p, VM_MAP, PAGE_KERNEL);
 329                if (!wc->memory_map) {
 330                        r = -ENOMEM;
 331                        goto err3;
 332                }
 333                kvfree(pages);
 334                wc->memory_vmapped = true;
 335        }
 336
 337        dax_read_unlock(id);
 338
 339        wc->memory_map += (size_t)wc->start_sector << SECTOR_SHIFT;
 340        wc->memory_map_size -= (size_t)wc->start_sector << SECTOR_SHIFT;
 341
 342        return 0;
 343err3:
 344        kvfree(pages);
 345err2:
 346        dax_read_unlock(id);
 347err1:
 348        return r;
 349}
 350#else
 351static int persistent_memory_claim(struct dm_writecache *wc)
 352{
 353        return -EOPNOTSUPP;
 354}
 355#endif
 356
 357static void persistent_memory_release(struct dm_writecache *wc)
 358{
 359        if (wc->memory_vmapped)
 360                vunmap(wc->memory_map - ((size_t)wc->start_sector << SECTOR_SHIFT));
 361}
 362
 363static struct page *persistent_memory_page(void *addr)
 364{
 365        if (is_vmalloc_addr(addr))
 366                return vmalloc_to_page(addr);
 367        else
 368                return virt_to_page(addr);
 369}
 370
 371static unsigned persistent_memory_page_offset(void *addr)
 372{
 373        return (unsigned long)addr & (PAGE_SIZE - 1);
 374}
 375
 376static void persistent_memory_flush_cache(void *ptr, size_t size)
 377{
 378        if (is_vmalloc_addr(ptr))
 379                flush_kernel_vmap_range(ptr, size);
 380}
 381
 382static void persistent_memory_invalidate_cache(void *ptr, size_t size)
 383{
 384        if (is_vmalloc_addr(ptr))
 385                invalidate_kernel_vmap_range(ptr, size);
 386}
 387
 388static struct wc_memory_superblock *sb(struct dm_writecache *wc)
 389{
 390        return wc->memory_map;
 391}
 392
 393static struct wc_memory_entry *memory_entry(struct dm_writecache *wc, struct wc_entry *e)
 394{
 395        return &sb(wc)->entries[e->index];
 396}
 397
 398static void *memory_data(struct dm_writecache *wc, struct wc_entry *e)
 399{
 400        return (char *)wc->block_start + (e->index << wc->block_size_bits);
 401}
 402
 403static sector_t cache_sector(struct dm_writecache *wc, struct wc_entry *e)
 404{
 405        return wc->start_sector + wc->metadata_sectors +
 406                ((sector_t)e->index << (wc->block_size_bits - SECTOR_SHIFT));
 407}
 408
 409static uint64_t read_original_sector(struct dm_writecache *wc, struct wc_entry *e)
 410{
 411#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 412        return e->original_sector;
 413#else
 414        return le64_to_cpu(memory_entry(wc, e)->original_sector);
 415#endif
 416}
 417
 418static uint64_t read_seq_count(struct dm_writecache *wc, struct wc_entry *e)
 419{
 420#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 421        return e->seq_count;
 422#else
 423        return le64_to_cpu(memory_entry(wc, e)->seq_count);
 424#endif
 425}
 426
 427static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
 428{
 429#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 430        e->seq_count = -1;
 431#endif
 432        pmem_assign(memory_entry(wc, e)->seq_count, cpu_to_le64(-1));
 433}
 434
 435static void write_original_sector_seq_count(struct dm_writecache *wc, struct wc_entry *e,
 436                                            uint64_t original_sector, uint64_t seq_count)
 437{
 438        struct wc_memory_entry me;
 439#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
 440        e->original_sector = original_sector;
 441        e->seq_count = seq_count;
 442#endif
 443        me.original_sector = cpu_to_le64(original_sector);
 444        me.seq_count = cpu_to_le64(seq_count);
 445        pmem_assign(*memory_entry(wc, e), me);
 446}
 447
 448#define writecache_error(wc, err, msg, arg...)                          \
 449do {                                                                    \
 450        if (!cmpxchg(&(wc)->error, 0, err))                             \
 451                DMERR(msg, ##arg);                                      \
 452        wake_up(&(wc)->freelist_wait);                                  \
 453} while (0)
 454
 455#define writecache_has_error(wc)        (unlikely(READ_ONCE((wc)->error)))
 456
 457static void writecache_flush_all_metadata(struct dm_writecache *wc)
 458{
 459        if (!WC_MODE_PMEM(wc))
 460                memset(wc->dirty_bitmap, -1, wc->dirty_bitmap_size);
 461}
 462
 463static void writecache_flush_region(struct dm_writecache *wc, void *ptr, size_t size)
 464{
 465        if (!WC_MODE_PMEM(wc))
 466                __set_bit(((char *)ptr - (char *)wc->memory_map) / BITMAP_GRANULARITY,
 467                          wc->dirty_bitmap);
 468}
 469
 470static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev);
 471
 472struct io_notify {
 473        struct dm_writecache *wc;
 474        struct completion c;
 475        atomic_t count;
 476};
 477
 478static void writecache_notify_io(unsigned long error, void *context)
 479{
 480        struct io_notify *endio = context;
 481
 482        if (unlikely(error != 0))
 483                writecache_error(endio->wc, -EIO, "error writing metadata");
 484        BUG_ON(atomic_read(&endio->count) <= 0);
 485        if (atomic_dec_and_test(&endio->count))
 486                complete(&endio->c);
 487}
 488
 489static void writecache_wait_for_ios(struct dm_writecache *wc, int direction)
 490{
 491        wait_event(wc->bio_in_progress_wait[direction],
 492                   !atomic_read(&wc->bio_in_progress[direction]));
 493}
 494
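/*
 * Descriptive note (not in the original source): SSD-mode metadata commit.
 * Walk the dirty bitmap and write every dirty BITMAP_GRANULARITY-sized chunk
 * of the in-memory metadata copy to the metadata area of the cache device
 * with async dm-io, wait for all of the writes to complete, optionally wait
 * for in-flight WRITE bios, then flush the SSD and clear the bitmap.
 */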
 495static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 496{
 497        struct dm_io_region region;
 498        struct dm_io_request req;
 499        struct io_notify endio = {
 500                wc,
 501                COMPLETION_INITIALIZER_ONSTACK(endio.c),
 502                ATOMIC_INIT(1),
 503        };
 504        unsigned bitmap_bits = wc->dirty_bitmap_size * 8;
 505        unsigned i = 0;
 506
 507        while (1) {
 508                unsigned j;
 509                i = find_next_bit(wc->dirty_bitmap, bitmap_bits, i);
 510                if (unlikely(i == bitmap_bits))
 511                        break;
 512                j = find_next_zero_bit(wc->dirty_bitmap, bitmap_bits, i);
 513
 514                region.bdev = wc->ssd_dev->bdev;
 515                region.sector = (sector_t)i * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
 516                region.count = (sector_t)(j - i) * (BITMAP_GRANULARITY >> SECTOR_SHIFT);
 517
 518                if (unlikely(region.sector >= wc->metadata_sectors))
 519                        break;
 520                if (unlikely(region.sector + region.count > wc->metadata_sectors))
 521                        region.count = wc->metadata_sectors - region.sector;
 522
 523                region.sector += wc->start_sector;
 524                atomic_inc(&endio.count);
 525                req.bi_op = REQ_OP_WRITE;
 526                req.bi_op_flags = REQ_SYNC;
 527                req.mem.type = DM_IO_VMA;
 528                req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
 529                req.client = wc->dm_io;
 530                req.notify.fn = writecache_notify_io;
 531                req.notify.context = &endio;
 532
 533                /* writing via async dm-io (implied by notify.fn above) won't return an error */
 534                (void) dm_io(&req, 1, &region, NULL);
 535                i = j;
 536        }
 537
 538        writecache_notify_io(0, &endio);
 539        wait_for_completion_io(&endio.c);
 540
 541        if (wait_for_ios)
 542                writecache_wait_for_ios(wc, WRITE);
 543
 544        writecache_disk_flush(wc, wc->ssd_dev);
 545
 546        memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
 547}
 548
 549static void ssd_commit_superblock(struct dm_writecache *wc)
 550{
 551        int r;
 552        struct dm_io_region region;
 553        struct dm_io_request req;
 554
 555        region.bdev = wc->ssd_dev->bdev;
 556        region.sector = 0;
 557        region.count = max(4096U, wc->block_size) >> SECTOR_SHIFT;
 558
 559        if (unlikely(region.sector + region.count > wc->metadata_sectors))
 560                region.count = wc->metadata_sectors - region.sector;
 561
 562        region.sector += wc->start_sector;
 563
 564        req.bi_op = REQ_OP_WRITE;
 565        req.bi_op_flags = REQ_SYNC | REQ_FUA;
 566        req.mem.type = DM_IO_VMA;
 567        req.mem.ptr.vma = (char *)wc->memory_map;
 568        req.client = wc->dm_io;
 569        req.notify.fn = NULL;
 570        req.notify.context = NULL;
 571
 572        r = dm_io(&req, 1, &region, NULL);
 573        if (unlikely(r))
 574                writecache_error(wc, r, "error writing superblock");
 575}
 576
 577static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
 578{
 579        if (WC_MODE_PMEM(wc))
 580                pmem_wmb();
 581        else
 582                ssd_commit_flushed(wc, wait_for_ios);
 583}
 584
 585static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
 586{
 587        int r;
 588        struct dm_io_region region;
 589        struct dm_io_request req;
 590
 591        region.bdev = dev->bdev;
 592        region.sector = 0;
 593        region.count = 0;
 594        req.bi_op = REQ_OP_WRITE;
 595        req.bi_op_flags = REQ_PREFLUSH;
 596        req.mem.type = DM_IO_KMEM;
 597        req.mem.ptr.addr = NULL;
 598        req.client = wc->dm_io;
 599        req.notify.fn = NULL;
 600
 601        r = dm_io(&req, 1, &region, NULL);
 602        if (unlikely(r))
 603                writecache_error(wc, r, "error flushing metadata: %d", r);
 604}
 605
 606#define WFE_RETURN_FOLLOWING    1
 607#define WFE_LOWEST_SEQ          2
 608
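/*
 * Descriptive note (not in the original source): look up the cache entry for
 * a given original block.  Without flags the newest entry (highest seq_count)
 * for that block is returned; WFE_LOWEST_SEQ returns the oldest duplicate
 * instead, and WFE_RETURN_FOLLOWING makes the lookup return the next entry
 * above the requested block when there is no exact match.
 */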
 609static struct wc_entry *writecache_find_entry(struct dm_writecache *wc,
 610                                              uint64_t block, int flags)
 611{
 612        struct wc_entry *e;
 613        struct rb_node *node = wc->tree.rb_node;
 614
 615        if (unlikely(!node))
 616                return NULL;
 617
 618        while (1) {
 619                e = container_of(node, struct wc_entry, rb_node);
 620                if (read_original_sector(wc, e) == block)
 621                        break;
 622
 623                node = (read_original_sector(wc, e) >= block ?
 624                        e->rb_node.rb_left : e->rb_node.rb_right);
 625                if (unlikely(!node)) {
 626                        if (!(flags & WFE_RETURN_FOLLOWING))
 627                                return NULL;
 628                        if (read_original_sector(wc, e) >= block) {
 629                                return e;
 630                        } else {
 631                                node = rb_next(&e->rb_node);
 632                                if (unlikely(!node))
 633                                        return NULL;
 634                                e = container_of(node, struct wc_entry, rb_node);
 635                                return e;
 636                        }
 637                }
 638        }
 639
 640        while (1) {
 641                struct wc_entry *e2;
 642                if (flags & WFE_LOWEST_SEQ)
 643                        node = rb_prev(&e->rb_node);
 644                else
 645                        node = rb_next(&e->rb_node);
 646                if (unlikely(!node))
 647                        return e;
 648                e2 = container_of(node, struct wc_entry, rb_node);
 649                if (read_original_sector(wc, e2) != block)
 650                        return e;
 651                e = e2;
 652        }
 653}
 654
 655static void writecache_insert_entry(struct dm_writecache *wc, struct wc_entry *ins)
 656{
 657        struct wc_entry *e;
 658        struct rb_node **node = &wc->tree.rb_node, *parent = NULL;
 659
 660        while (*node) {
 661                e = container_of(*node, struct wc_entry, rb_node);
 662                parent = &e->rb_node;
 663                if (read_original_sector(wc, e) > read_original_sector(wc, ins))
 664                        node = &parent->rb_left;
 665                else
 666                        node = &parent->rb_right;
 667        }
 668        rb_link_node(&ins->rb_node, parent, node);
 669        rb_insert_color(&ins->rb_node, &wc->tree);
 670        list_add(&ins->lru, &wc->lru);
 671        ins->age = jiffies;
 672}
 673
 674static void writecache_unlink(struct dm_writecache *wc, struct wc_entry *e)
 675{
 676        list_del(&e->lru);
 677        rb_erase(&e->rb_node, &wc->tree);
 678}
 679
 680static void writecache_add_to_freelist(struct dm_writecache *wc, struct wc_entry *e)
 681{
 682        if (WC_MODE_SORT_FREELIST(wc)) {
 683                struct rb_node **node = &wc->freetree.rb_node, *parent = NULL;
 684                if (unlikely(!*node))
 685                        wc->current_free = e;
 686                while (*node) {
 687                        parent = *node;
 688                        if (&e->rb_node < *node)
 689                                node = &parent->rb_left;
 690                        else
 691                                node = &parent->rb_right;
 692                }
 693                rb_link_node(&e->rb_node, parent, node);
 694                rb_insert_color(&e->rb_node, &wc->freetree);
 695        } else {
 696                list_add_tail(&e->lru, &wc->freelist);
 697        }
 698        wc->freelist_size++;
 699}
 700
 701static inline void writecache_verify_watermark(struct dm_writecache *wc)
 702{
 703        if (unlikely(wc->freelist_size + wc->writeback_size <= wc->freelist_high_watermark))
 704                queue_work(wc->writeback_wq, &wc->writeback_work);
 705}
 706
 707static void writecache_max_age_timer(struct timer_list *t)
 708{
 709        struct dm_writecache *wc = from_timer(wc, t, max_age_timer);
 710
 711        if (!dm_suspended(wc->ti) && !writecache_has_error(wc)) {
 712                queue_work(wc->writeback_wq, &wc->writeback_work);
 713                mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
 714        }
 715}
 716
 717static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
 718{
 719        struct wc_entry *e;
 720
 721        if (WC_MODE_SORT_FREELIST(wc)) {
 722                struct rb_node *next;
 723                if (unlikely(!wc->current_free))
 724                        return NULL;
 725                e = wc->current_free;
 726                if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
 727                        return NULL;
 728                next = rb_next(&e->rb_node);
 729                rb_erase(&e->rb_node, &wc->freetree);
 730                if (unlikely(!next))
 731                        next = rb_first(&wc->freetree);
 732                wc->current_free = next ? container_of(next, struct wc_entry, rb_node) : NULL;
 733        } else {
 734                if (unlikely(list_empty(&wc->freelist)))
 735                        return NULL;
 736                e = container_of(wc->freelist.next, struct wc_entry, lru);
 737                if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
 738                        return NULL;
 739                list_del(&e->lru);
 740        }
 741        wc->freelist_size--;
 742
 743        writecache_verify_watermark(wc);
 744
 745        return e;
 746}
 747
 748static void writecache_free_entry(struct dm_writecache *wc, struct wc_entry *e)
 749{
 750        writecache_unlink(wc, e);
 751        writecache_add_to_freelist(wc, e);
 752        clear_seq_count(wc, e);
 753        writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
 754        if (unlikely(waitqueue_active(&wc->freelist_wait)))
 755                wake_up(&wc->freelist_wait);
 756}
 757
 758static void writecache_wait_on_freelist(struct dm_writecache *wc)
 759{
 760        DEFINE_WAIT(wait);
 761
 762        prepare_to_wait(&wc->freelist_wait, &wait, TASK_UNINTERRUPTIBLE);
 763        wc_unlock(wc);
 764        io_schedule();
 765        finish_wait(&wc->freelist_wait, &wait);
 766        wc_lock(wc);
 767}
 768
 769static void writecache_poison_lists(struct dm_writecache *wc)
 770{
 771        /*
 772         * Catch incorrect access to these values while the device is suspended.
 773         */
 774        memset(&wc->tree, -1, sizeof wc->tree);
 775        wc->lru.next = LIST_POISON1;
 776        wc->lru.prev = LIST_POISON2;
 777        wc->freelist.next = LIST_POISON1;
 778        wc->freelist.prev = LIST_POISON2;
 779}
 780
 781static void writecache_flush_entry(struct dm_writecache *wc, struct wc_entry *e)
 782{
 783        writecache_flush_region(wc, memory_entry(wc, e), sizeof(struct wc_memory_entry));
 784        if (WC_MODE_PMEM(wc))
 785                writecache_flush_region(wc, memory_data(wc, e), wc->block_size);
 786}
 787
 788static bool writecache_entry_is_committed(struct dm_writecache *wc, struct wc_entry *e)
 789{
 790        return read_seq_count(wc, e) < wc->seq_count;
 791}
 792
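/*
 * Descriptive note (not in the original source): writecache_flush() commits
 * all uncommitted entries.  It flushes their metadata (and, in PMEM mode,
 * their data blocks), commits, then bumps the sequence counter and persists
 * it in the superblock.  Finally, older entries that cache the same original
 * sectors as the just-committed ones are released back to the free list.
 */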
 793static void writecache_flush(struct dm_writecache *wc)
 794{
 795        struct wc_entry *e, *e2;
 796        bool need_flush_after_free;
 797
 798        wc->uncommitted_blocks = 0;
 799        del_timer(&wc->autocommit_timer);
 800
 801        if (list_empty(&wc->lru))
 802                return;
 803
 804        e = container_of(wc->lru.next, struct wc_entry, lru);
 805        if (writecache_entry_is_committed(wc, e)) {
 806                if (wc->overwrote_committed) {
 807                        writecache_wait_for_ios(wc, WRITE);
 808                        writecache_disk_flush(wc, wc->ssd_dev);
 809                        wc->overwrote_committed = false;
 810                }
 811                return;
 812        }
 813        while (1) {
 814                writecache_flush_entry(wc, e);
 815                if (unlikely(e->lru.next == &wc->lru))
 816                        break;
 817                e2 = container_of(e->lru.next, struct wc_entry, lru);
 818                if (writecache_entry_is_committed(wc, e2))
 819                        break;
 820                e = e2;
 821                cond_resched();
 822        }
 823        writecache_commit_flushed(wc, true);
 824
 825        wc->seq_count++;
 826        pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
 827        if (WC_MODE_PMEM(wc))
 828                writecache_commit_flushed(wc, false);
 829        else
 830                ssd_commit_superblock(wc);
 831
 832        wc->overwrote_committed = false;
 833
 834        need_flush_after_free = false;
 835        while (1) {
 836                /* Free another committed entry with lower seq-count */
 837                struct rb_node *rb_node = rb_prev(&e->rb_node);
 838
 839                if (rb_node) {
 840                        e2 = container_of(rb_node, struct wc_entry, rb_node);
 841                        if (read_original_sector(wc, e2) == read_original_sector(wc, e) &&
 842                            likely(!e2->write_in_progress)) {
 843                                writecache_free_entry(wc, e2);
 844                                need_flush_after_free = true;
 845                        }
 846                }
 847                if (unlikely(e->lru.prev == &wc->lru))
 848                        break;
 849                e = container_of(e->lru.prev, struct wc_entry, lru);
 850                cond_resched();
 851        }
 852
 853        if (need_flush_after_free)
 854                writecache_commit_flushed(wc, false);
 855}
 856
 857static void writecache_flush_work(struct work_struct *work)
 858{
 859        struct dm_writecache *wc = container_of(work, struct dm_writecache, flush_work);
 860
 861        wc_lock(wc);
 862        writecache_flush(wc);
 863        wc_unlock(wc);
 864}
 865
 866static void writecache_autocommit_timer(struct timer_list *t)
 867{
 868        struct dm_writecache *wc = from_timer(wc, t, autocommit_timer);
 869        if (!writecache_has_error(wc))
 870                queue_work(wc->writeback_wq, &wc->flush_work);
 871}
 872
 873static void writecache_schedule_autocommit(struct dm_writecache *wc)
 874{
 875        if (!timer_pending(&wc->autocommit_timer))
 876                mod_timer(&wc->autocommit_timer, jiffies + wc->autocommit_jiffies);
 877}
 878
 879static void writecache_discard(struct dm_writecache *wc, sector_t start, sector_t end)
 880{
 881        struct wc_entry *e;
 882        bool discarded_something = false;
 883
 884        e = writecache_find_entry(wc, start, WFE_RETURN_FOLLOWING | WFE_LOWEST_SEQ);
 885        if (unlikely(!e))
 886                return;
 887
 888        while (read_original_sector(wc, e) < end) {
 889                struct rb_node *node = rb_next(&e->rb_node);
 890
 891                if (likely(!e->write_in_progress)) {
 892                        if (!discarded_something) {
 893                                if (!WC_MODE_PMEM(wc)) {
 894                                        writecache_wait_for_ios(wc, READ);
 895                                        writecache_wait_for_ios(wc, WRITE);
 896                                }
 897                                discarded_something = true;
 898                        }
 899                        if (!writecache_entry_is_committed(wc, e))
 900                                wc->uncommitted_blocks--;
 901                        writecache_free_entry(wc, e);
 902                }
 903
 904                if (unlikely(!node))
 905                        break;
 906
 907                e = container_of(node, struct wc_entry, rb_node);
 908        }
 909
 910        if (discarded_something)
 911                writecache_commit_flushed(wc, false);
 912}
 913
 914static bool writecache_wait_for_writeback(struct dm_writecache *wc)
 915{
 916        if (wc->writeback_size) {
 917                writecache_wait_on_freelist(wc);
 918                return true;
 919        }
 920        return false;
 921}
 922
 923static void writecache_suspend(struct dm_target *ti)
 924{
 925        struct dm_writecache *wc = ti->private;
 926        bool flush_on_suspend;
 927
 928        del_timer_sync(&wc->autocommit_timer);
 929        del_timer_sync(&wc->max_age_timer);
 930
 931        wc_lock(wc);
 932        writecache_flush(wc);
 933        flush_on_suspend = wc->flush_on_suspend;
 934        if (flush_on_suspend) {
 935                wc->flush_on_suspend = false;
 936                wc->writeback_all++;
 937                queue_work(wc->writeback_wq, &wc->writeback_work);
 938        }
 939        wc_unlock(wc);
 940
 941        drain_workqueue(wc->writeback_wq);
 942
 943        wc_lock(wc);
 944        if (flush_on_suspend)
 945                wc->writeback_all--;
 946        while (writecache_wait_for_writeback(wc));
 947
 948        if (WC_MODE_PMEM(wc))
 949                persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
 950
 951        writecache_poison_lists(wc);
 952
 953        wc_unlock(wc);
 954}
 955
 956static int writecache_alloc_entries(struct dm_writecache *wc)
 957{
 958        size_t b;
 959
 960        if (wc->entries)
 961                return 0;
 962        wc->entries = vmalloc(array_size(sizeof(struct wc_entry), wc->n_blocks));
 963        if (!wc->entries)
 964                return -ENOMEM;
 965        for (b = 0; b < wc->n_blocks; b++) {
 966                struct wc_entry *e = &wc->entries[b];
 967                e->index = b;
 968                e->write_in_progress = false;
 969                cond_resched();
 970        }
 971
 972        return 0;
 973}
 974
 975static int writecache_read_metadata(struct dm_writecache *wc, sector_t n_sectors)
 976{
 977        struct dm_io_region region;
 978        struct dm_io_request req;
 979
 980        region.bdev = wc->ssd_dev->bdev;
 981        region.sector = wc->start_sector;
 982        region.count = n_sectors;
 983        req.bi_op = REQ_OP_READ;
 984        req.bi_op_flags = REQ_SYNC;
 985        req.mem.type = DM_IO_VMA;
 986        req.mem.ptr.vma = (char *)wc->memory_map;
 987        req.client = wc->dm_io;
 988        req.notify.fn = NULL;
 989
 990        return dm_io(&req, 1, &region, NULL);
 991}
 992
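/*
 * Descriptive note (not in the original source): resume re-reads the metadata
 * from the SSD (or invalidates the CPU cache over the PMEM mapping), then
 * rebuilds the in-core rb-tree, LRU list and free list from the persistent
 * entries.  If two entries claim the same original sector, the one with the
 * higher sequence count wins and the other is freed.
 */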
 993static void writecache_resume(struct dm_target *ti)
 994{
 995        struct dm_writecache *wc = ti->private;
 996        size_t b;
 997        bool need_flush = false;
 998        __le64 sb_seq_count;
 999        int r;
1000
1001        wc_lock(wc);
1002
1003        wc->data_device_sectors = bdev_nr_sectors(wc->dev->bdev);
1004
1005        if (WC_MODE_PMEM(wc)) {
1006                persistent_memory_invalidate_cache(wc->memory_map, wc->memory_map_size);
1007        } else {
1008                r = writecache_read_metadata(wc, wc->metadata_sectors);
1009                if (r) {
1010                        size_t sb_entries_offset;
1011                        writecache_error(wc, r, "unable to read metadata: %d", r);
1012                        sb_entries_offset = offsetof(struct wc_memory_superblock, entries);
1013                        memset((char *)wc->memory_map + sb_entries_offset, -1,
1014                               (wc->metadata_sectors << SECTOR_SHIFT) - sb_entries_offset);
1015                }
1016        }
1017
1018        wc->tree = RB_ROOT;
1019        INIT_LIST_HEAD(&wc->lru);
1020        if (WC_MODE_SORT_FREELIST(wc)) {
1021                wc->freetree = RB_ROOT;
1022                wc->current_free = NULL;
1023        } else {
1024                INIT_LIST_HEAD(&wc->freelist);
1025        }
1026        wc->freelist_size = 0;
1027
1028        r = copy_mc_to_kernel(&sb_seq_count, &sb(wc)->seq_count,
1029                              sizeof(uint64_t));
1030        if (r) {
1031                writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
1032                sb_seq_count = cpu_to_le64(0);
1033        }
1034        wc->seq_count = le64_to_cpu(sb_seq_count);
1035
1036#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
1037        for (b = 0; b < wc->n_blocks; b++) {
1038                struct wc_entry *e = &wc->entries[b];
1039                struct wc_memory_entry wme;
1040                if (writecache_has_error(wc)) {
1041                        e->original_sector = -1;
1042                        e->seq_count = -1;
1043                        continue;
1044                }
1045                r = copy_mc_to_kernel(&wme, memory_entry(wc, e),
1046                                      sizeof(struct wc_memory_entry));
1047                if (r) {
1048                        writecache_error(wc, r, "hardware memory error when reading metadata entry %lu: %d",
1049                                         (unsigned long)b, r);
1050                        e->original_sector = -1;
1051                        e->seq_count = -1;
1052                } else {
1053                        e->original_sector = le64_to_cpu(wme.original_sector);
1054                        e->seq_count = le64_to_cpu(wme.seq_count);
1055                }
1056                cond_resched();
1057        }
1058#endif
1059        for (b = 0; b < wc->n_blocks; b++) {
1060                struct wc_entry *e = &wc->entries[b];
1061                if (!writecache_entry_is_committed(wc, e)) {
1062                        if (read_seq_count(wc, e) != -1) {
1063erase_this:
1064                                clear_seq_count(wc, e);
1065                                need_flush = true;
1066                        }
1067                        writecache_add_to_freelist(wc, e);
1068                } else {
1069                        struct wc_entry *old;
1070
1071                        old = writecache_find_entry(wc, read_original_sector(wc, e), 0);
1072                        if (!old) {
1073                                writecache_insert_entry(wc, e);
1074                        } else {
1075                                if (read_seq_count(wc, old) == read_seq_count(wc, e)) {
1076                                        writecache_error(wc, -EINVAL,
1077                                                 "two identical entries, position %llu, sector %llu, sequence %llu",
1078                                                 (unsigned long long)b, (unsigned long long)read_original_sector(wc, e),
1079                                                 (unsigned long long)read_seq_count(wc, e));
1080                                }
1081                                if (read_seq_count(wc, old) > read_seq_count(wc, e)) {
1082                                        goto erase_this;
1083                                } else {
1084                                        writecache_free_entry(wc, old);
1085                                        writecache_insert_entry(wc, e);
1086                                        need_flush = true;
1087                                }
1088                        }
1089                }
1090                cond_resched();
1091        }
1092
1093        if (need_flush) {
1094                writecache_flush_all_metadata(wc);
1095                writecache_commit_flushed(wc, false);
1096        }
1097
1098        writecache_verify_watermark(wc);
1099
1100        if (wc->max_age != MAX_AGE_UNSPECIFIED)
1101                mod_timer(&wc->max_age_timer, jiffies + wc->max_age / MAX_AGE_DIV);
1102
1103        wc_unlock(wc);
1104}
1105
1106static int process_flush_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1107{
1108        if (argc != 1)
1109                return -EINVAL;
1110
1111        wc_lock(wc);
1112        if (dm_suspended(wc->ti)) {
1113                wc_unlock(wc);
1114                return -EBUSY;
1115        }
1116        if (writecache_has_error(wc)) {
1117                wc_unlock(wc);
1118                return -EIO;
1119        }
1120
1121        writecache_flush(wc);
1122        wc->writeback_all++;
1123        queue_work(wc->writeback_wq, &wc->writeback_work);
1124        wc_unlock(wc);
1125
1126        flush_workqueue(wc->writeback_wq);
1127
1128        wc_lock(wc);
1129        wc->writeback_all--;
1130        if (writecache_has_error(wc)) {
1131                wc_unlock(wc);
1132                return -EIO;
1133        }
1134        wc_unlock(wc);
1135
1136        return 0;
1137}
1138
1139static int process_flush_on_suspend_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1140{
1141        if (argc != 1)
1142                return -EINVAL;
1143
1144        wc_lock(wc);
1145        wc->flush_on_suspend = true;
1146        wc_unlock(wc);
1147
1148        return 0;
1149}
1150
1151static void activate_cleaner(struct dm_writecache *wc)
1152{
1153        wc->flush_on_suspend = true;
1154        wc->cleaner = true;
1155        wc->freelist_high_watermark = wc->n_blocks;
1156        wc->freelist_low_watermark = wc->n_blocks;
1157}
1158
1159static int process_cleaner_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1160{
1161        if (argc != 1)
1162                return -EINVAL;
1163
1164        wc_lock(wc);
1165        activate_cleaner(wc);
1166        if (!dm_suspended(wc->ti))
1167                writecache_verify_watermark(wc);
1168        wc_unlock(wc);
1169
1170        return 0;
1171}
1172
1173static int process_clear_stats_mesg(unsigned argc, char **argv, struct dm_writecache *wc)
1174{
1175        if (argc != 1)
1176                return -EINVAL;
1177
1178        wc_lock(wc);
1179        memset(&wc->stats, 0, sizeof wc->stats);
1180        wc_unlock(wc);
1181
1182        return 0;
1183}
1184
1185static int writecache_message(struct dm_target *ti, unsigned argc, char **argv,
1186                              char *result, unsigned maxlen)
1187{
1188        int r = -EINVAL;
1189        struct dm_writecache *wc = ti->private;
1190
1191        if (!strcasecmp(argv[0], "flush"))
1192                r = process_flush_mesg(argc, argv, wc);
1193        else if (!strcasecmp(argv[0], "flush_on_suspend"))
1194                r = process_flush_on_suspend_mesg(argc, argv, wc);
1195        else if (!strcasecmp(argv[0], "cleaner"))
1196                r = process_cleaner_mesg(argc, argv, wc);
1197        else if (!strcasecmp(argv[0], "clear_stats"))
1198                r = process_clear_stats_mesg(argc, argv, wc);
1199        else
1200                DMERR("unrecognised message received: %s", argv[0]);
1201
1202        return r;
1203}
1204
1205static void memcpy_flushcache_optimized(void *dest, void *source, size_t size)
1206{
1207        /*
1208         * clflushopt performs better with block size 1024, 2048, 4096
1209         * non-temporal stores perform better with block size 512
1210         *
1211         * block size   512             1024            2048            4096
1212         * movnti       496 MB/s        642 MB/s        725 MB/s        744 MB/s
1213         * clflushopt   373 MB/s        688 MB/s        1.1 GB/s        1.2 GB/s
1214         *
1215         * We see that movnti performs better for 512-byte blocks, and
1216         * clflushopt performs better for 1024-byte and larger blocks. So, we
1217         * prefer clflushopt for sizes >= 768.
1218         *
1219         * NOTE: this happens to be the case now (with dm-writecache's single
1220         * threaded model) but re-evaluate this once memcpy_flushcache() is
1221         * enabled to use movdir64b which might invalidate this performance
1222         * advantage seen with cache-allocating-writes plus flushing.
1223         */
1224#ifdef CONFIG_X86
1225        if (static_cpu_has(X86_FEATURE_CLFLUSHOPT) &&
1226            likely(boot_cpu_data.x86_clflush_size == 64) &&
1227            likely(size >= 768)) {
1228                do {
1229                        memcpy((void *)dest, (void *)source, 64);
1230                        clflushopt((void *)dest);
1231                        dest += 64;
1232                        source += 64;
1233                        size -= 64;
1234                } while (size >= 64);
1235                return;
1236        }
1237#endif
1238        memcpy_flushcache(dest, source, size);
1239}
1240
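/*
 * Descriptive note (not in the original source): PMEM data path.  Copy the
 * bio payload to or from persistent memory one bio_vec at a time.  Reads go
 * through copy_mc_to_kernel() so that machine check faults on the persistent
 * memory are reported as I/O errors; writes use the flush-cache copy above.
 */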
1241static void bio_copy_block(struct dm_writecache *wc, struct bio *bio, void *data)
1242{
1243        void *buf;
1244        unsigned size;
1245        int rw = bio_data_dir(bio);
1246        unsigned remaining_size = wc->block_size;
1247
1248        do {
1249                struct bio_vec bv = bio_iter_iovec(bio, bio->bi_iter);
1250                buf = bvec_kmap_local(&bv);
1251                size = bv.bv_len;
1252                if (unlikely(size > remaining_size))
1253                        size = remaining_size;
1254
1255                if (rw == READ) {
1256                        int r;
1257                        r = copy_mc_to_kernel(buf, data, size);
1258                        flush_dcache_page(bio_page(bio));
1259                        if (unlikely(r)) {
1260                                writecache_error(wc, r, "hardware memory error when reading data: %d", r);
1261                                bio->bi_status = BLK_STS_IOERR;
1262                        }
1263                } else {
1264                        flush_dcache_page(bio_page(bio));
1265                        memcpy_flushcache_optimized(data, buf, size);
1266                }
1267
1268                kunmap_local(buf);
1269
1270                data = (char *)data + size;
1271                remaining_size -= size;
1272                bio_advance(bio, size);
1273        } while (unlikely(remaining_size));
1274}
1275
1276static int writecache_flush_thread(void *data)
1277{
1278        struct dm_writecache *wc = data;
1279
1280        while (1) {
1281                struct bio *bio;
1282
1283                wc_lock(wc);
1284                bio = bio_list_pop(&wc->flush_list);
1285                if (!bio) {
1286                        set_current_state(TASK_INTERRUPTIBLE);
1287                        wc_unlock(wc);
1288
1289                        if (unlikely(kthread_should_stop())) {
1290                                set_current_state(TASK_RUNNING);
1291                                break;
1292                        }
1293
1294                        schedule();
1295                        continue;
1296                }
1297
1298                if (bio_op(bio) == REQ_OP_DISCARD) {
1299                        writecache_discard(wc, bio->bi_iter.bi_sector,
1300                                           bio_end_sector(bio));
1301                        wc_unlock(wc);
1302                        bio_set_dev(bio, wc->dev->bdev);
1303                        submit_bio_noacct(bio);
1304                } else {
1305                        writecache_flush(wc);
1306                        wc_unlock(wc);
1307                        if (writecache_has_error(wc))
1308                                bio->bi_status = BLK_STS_IOERR;
1309                        bio_endio(bio);
1310                }
1311        }
1312
1313        return 0;
1314}
1315
1316static void writecache_offload_bio(struct dm_writecache *wc, struct bio *bio)
1317{
1318        if (bio_list_empty(&wc->flush_list))
1319                wake_up_process(wc->flush_thread);
1320        bio_list_add(&wc->flush_list, bio);
1321}
1322
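/*
 * Descriptive note (not in the original source): verdicts returned by the map
 * helpers below:
 *   WC_MAP_SUBMIT       - the bio was serviced here, end it
 *   WC_MAP_REMAP        - the bio was remapped to the cache (SSD) device
 *   WC_MAP_REMAP_ORIGIN - pass the bio through to the origin device
 *   WC_MAP_RETURN       - the bio was queued elsewhere (flush thread)
 *   WC_MAP_ERROR        - fail the bio
 */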
1323enum wc_map_op {
1324        WC_MAP_SUBMIT,
1325        WC_MAP_REMAP,
1326        WC_MAP_REMAP_ORIGIN,
1327        WC_MAP_RETURN,
1328        WC_MAP_ERROR,
1329};
1330
1331static enum wc_map_op writecache_map_remap_origin(struct dm_writecache *wc, struct bio *bio,
1332                                                  struct wc_entry *e)
1333{
1334        if (e) {
1335                sector_t next_boundary =
1336                        read_original_sector(wc, e) - bio->bi_iter.bi_sector;
1337                if (next_boundary < bio->bi_iter.bi_size >> SECTOR_SHIFT)
1338                        dm_accept_partial_bio(bio, next_boundary);
1339        }
1340
1341        return WC_MAP_REMAP_ORIGIN;
1342}
1343
1344static enum wc_map_op writecache_map_read(struct dm_writecache *wc, struct bio *bio)
1345{
1346        enum wc_map_op map_op;
1347        struct wc_entry *e;
1348
1349read_next_block:
1350        wc->stats.reads++;
1351        e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1352        if (e && read_original_sector(wc, e) == bio->bi_iter.bi_sector) {
1353                wc->stats.read_hits++;
1354                if (WC_MODE_PMEM(wc)) {
1355                        bio_copy_block(wc, bio, memory_data(wc, e));
1356                        if (bio->bi_iter.bi_size)
1357                                goto read_next_block;
1358                        map_op = WC_MAP_SUBMIT;
1359                } else {
1360                        dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
1361                        bio_set_dev(bio, wc->ssd_dev->bdev);
1362                        bio->bi_iter.bi_sector = cache_sector(wc, e);
1363                        if (!writecache_entry_is_committed(wc, e))
1364                                writecache_wait_for_ios(wc, WRITE);
1365                        map_op = WC_MAP_REMAP;
1366                }
1367        } else {
1368                map_op = writecache_map_remap_origin(wc, bio, e);
1369        }
1370
1371        return map_op;
1372}
1373
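/*
 * Descriptive note (not in the original source): SSD write path.  Starting
 * from entry e, try to extend the bio over consecutive cache blocks, either
 * freshly allocated entries taken from the sorted free list at the expected
 * cache sector or (when overwriting) neighbouring existing entries covering
 * the following original sectors, then remap the bio to that contiguous
 * range on the cache device.
 */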
1374static enum wc_map_op writecache_bio_copy_ssd(struct dm_writecache *wc, struct bio *bio,
1375                                              struct wc_entry *e, bool search_used)
1376{
1377        unsigned bio_size = wc->block_size;
1378        sector_t start_cache_sec = cache_sector(wc, e);
1379        sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
1380
1381        while (bio_size < bio->bi_iter.bi_size) {
1382                if (!search_used) {
1383                        struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
1384                        if (!f)
1385                                break;
1386                        write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector +
1387                                                        (bio_size >> SECTOR_SHIFT), wc->seq_count);
1388                        writecache_insert_entry(wc, f);
1389                        wc->uncommitted_blocks++;
1390                } else {
1391                        struct wc_entry *f;
1392                        struct rb_node *next = rb_next(&e->rb_node);
1393                        if (!next)
1394                                break;
1395                        f = container_of(next, struct wc_entry, rb_node);
1396                        if (f != e + 1)
1397                                break;
1398                        if (read_original_sector(wc, f) !=
1399                            read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1400                                break;
1401                        if (unlikely(f->write_in_progress))
1402                                break;
1403                        if (writecache_entry_is_committed(wc, f))
1404                                wc->overwrote_committed = true;
1405                        e = f;
1406                }
1407                bio_size += wc->block_size;
1408                current_cache_sec += wc->block_size >> SECTOR_SHIFT;
1409        }
1410
1411        bio_set_dev(bio, wc->ssd_dev->bdev);
1412        bio->bi_iter.bi_sector = start_cache_sec;
1413        dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
1414
1415        if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
1416                wc->uncommitted_blocks = 0;
1417                queue_work(wc->writeback_wq, &wc->flush_work);
1418        } else {
1419                writecache_schedule_autocommit(wc);
1420        }
1421
1422        return WC_MAP_REMAP;
1423}
1424
1425static enum wc_map_op writecache_map_write(struct dm_writecache *wc, struct bio *bio)
1426{
1427        struct wc_entry *e;
1428
1429        do {
1430                bool found_entry = false;
1431                bool search_used = false;
1432                wc->stats.writes++;
1433                if (writecache_has_error(wc))
1434                        return WC_MAP_ERROR;
1435                e = writecache_find_entry(wc, bio->bi_iter.bi_sector, 0);
1436                if (e) {
1437                        if (!writecache_entry_is_committed(wc, e)) {
1438                                wc->stats.write_hits_uncommitted++;
1439                                search_used = true;
1440                                goto bio_copy;
1441                        }
1442                        wc->stats.write_hits_committed++;
1443                        if (!WC_MODE_PMEM(wc) && !e->write_in_progress) {
1444                                wc->overwrote_committed = true;
1445                                search_used = true;
1446                                goto bio_copy;
1447                        }
1448                        found_entry = true;
1449                } else {
1450                        if (unlikely(wc->cleaner) ||
1451                            (wc->metadata_only && !(bio->bi_opf & REQ_META)))
1452                                goto direct_write;
1453                }
1454                e = writecache_pop_from_freelist(wc, (sector_t)-1);
1455                if (unlikely(!e)) {
1456                        if (!WC_MODE_PMEM(wc) && !found_entry) {
1457direct_write:
1458                                wc->stats.writes_around++;
1459                                e = writecache_find_entry(wc, bio->bi_iter.bi_sector, WFE_RETURN_FOLLOWING);
1460                                return writecache_map_remap_origin(wc, bio, e);
1461                        }
1462                        wc->stats.writes_blocked_on_freelist++;
1463                        writecache_wait_on_freelist(wc);
1464                        continue;
1465                }
1466                write_original_sector_seq_count(wc, e, bio->bi_iter.bi_sector, wc->seq_count);
1467                writecache_insert_entry(wc, e);
1468                wc->uncommitted_blocks++;
1469                wc->stats.writes_allocate++;
1470bio_copy:
1471                if (WC_MODE_PMEM(wc))
1472                        bio_copy_block(wc, bio, memory_data(wc, e));
1473                else
1474                        return writecache_bio_copy_ssd(wc, bio, e, search_used);
1475        } while (bio->bi_iter.bi_size);
1476
1477        if (unlikely(bio->bi_opf & REQ_FUA || wc->uncommitted_blocks >= wc->autocommit_blocks))
1478                writecache_flush(wc);
1479        else
1480                writecache_schedule_autocommit(wc);
1481
1482        return WC_MAP_SUBMIT;
1483}
1484
1485static enum wc_map_op writecache_map_flush(struct dm_writecache *wc, struct bio *bio)
1486{
1487        if (writecache_has_error(wc))
1488                return WC_MAP_ERROR;
1489
1490        if (WC_MODE_PMEM(wc)) {
1491                wc->stats.flushes++;
1492                writecache_flush(wc);
1493                if (writecache_has_error(wc))
1494                        return WC_MAP_ERROR;
1495                else if (unlikely(wc->cleaner) || unlikely(wc->metadata_only))
1496                        return WC_MAP_REMAP_ORIGIN;
1497                return WC_MAP_SUBMIT;
1498        }
1499        /* SSD: */
1500        if (dm_bio_get_target_bio_nr(bio))
1501                return WC_MAP_REMAP_ORIGIN;
1502        wc->stats.flushes++;
1503        writecache_offload_bio(wc, bio);
1504        return WC_MAP_RETURN;
1505}
1506
1507static enum wc_map_op writecache_map_discard(struct dm_writecache *wc, struct bio *bio)
1508{
1509        wc->stats.discards++;
1510
1511        if (writecache_has_error(wc))
1512                return WC_MAP_ERROR;
1513
1514        if (WC_MODE_PMEM(wc)) {
1515                writecache_discard(wc, bio->bi_iter.bi_sector, bio_end_sector(bio));
1516                return WC_MAP_REMAP_ORIGIN;
1517        }
1518        /* SSD: */
1519        writecache_offload_bio(wc, bio);
1520        return WC_MAP_RETURN;
1521}
1522
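/*
 * The map function: check alignment, dispatch flush, discard, read and
 * write bios to their handlers and translate the resulting wc_map_op into
 * a device-mapper return code.
 */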
1523static int writecache_map(struct dm_target *ti, struct bio *bio)
1524{
1525        struct dm_writecache *wc = ti->private;
1526        enum wc_map_op map_op;
1527
1528        bio->bi_private = NULL;
1529
1530        wc_lock(wc);
1531
1532        if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1533                map_op = writecache_map_flush(wc, bio);
1534                goto done;
1535        }
1536
1537        bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
1538
1539        if (unlikely((((unsigned)bio->bi_iter.bi_sector | bio_sectors(bio)) &
1540                                (wc->block_size / 512 - 1)) != 0)) {
1541                DMERR("I/O is not aligned, sector %llu, size %u, block size %u",
1542                      (unsigned long long)bio->bi_iter.bi_sector,
1543                      bio->bi_iter.bi_size, wc->block_size);
1544                map_op = WC_MAP_ERROR;
1545                goto done;
1546        }
1547
1548        if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
1549                map_op = writecache_map_discard(wc, bio);
1550                goto done;
1551        }
1552
1553        if (bio_data_dir(bio) == READ)
1554                map_op = writecache_map_read(wc, bio);
1555        else
1556                map_op = writecache_map_write(wc, bio);
1557done:
1558        switch (map_op) {
1559        case WC_MAP_REMAP_ORIGIN:
1560                if (likely(wc->pause != 0)) {
1561                        if (bio_op(bio) == REQ_OP_WRITE) {
1562                                dm_iot_io_begin(&wc->iot, 1);
1563                                bio->bi_private = (void *)2;
1564                        }
1565                }
1566                bio_set_dev(bio, wc->dev->bdev);
1567                wc_unlock(wc);
1568                return DM_MAPIO_REMAPPED;
1569
1570        case WC_MAP_REMAP:
1571                /* make sure that writecache_end_io decrements bio_in_progress: */
1572                bio->bi_private = (void *)1;
1573                atomic_inc(&wc->bio_in_progress[bio_data_dir(bio)]);
1574                wc_unlock(wc);
1575                return DM_MAPIO_REMAPPED;
1576
1577        case WC_MAP_SUBMIT:
1578                wc_unlock(wc);
1579                bio_endio(bio);
1580                return DM_MAPIO_SUBMITTED;
1581
1582        case WC_MAP_RETURN:
1583                wc_unlock(wc);
1584                return DM_MAPIO_SUBMITTED;
1585
1586        case WC_MAP_ERROR:
1587                wc_unlock(wc);
1588                bio_io_error(bio);
1589                return DM_MAPIO_SUBMITTED;
1590
1591        default:
1592                BUG();
1593                return -1;
1594        }
1595}
1596
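/*
 * Completion hook: bi_private == 1 marks a bio remapped to the cache device
 * whose bio_in_progress count must be dropped; bi_private == 2 marks a write
 * remapped to the origin that is tracked for idle-time accounting.
 */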
1597static int writecache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
1598{
1599        struct dm_writecache *wc = ti->private;
1600
1601        if (bio->bi_private == (void *)1) {
1602                int dir = bio_data_dir(bio);
1603                if (atomic_dec_and_test(&wc->bio_in_progress[dir]))
1604                        if (unlikely(waitqueue_active(&wc->bio_in_progress_wait[dir])))
1605                                wake_up(&wc->bio_in_progress_wait[dir]);
1606        } else if (bio->bi_private == (void *)2) {
1607                dm_iot_io_end(&wc->iot, 1);
1608        }
1609        return 0;
1610}
1611
1612static int writecache_iterate_devices(struct dm_target *ti,
1613                                      iterate_devices_callout_fn fn, void *data)
1614{
1615        struct dm_writecache *wc = ti->private;
1616
1617        return fn(ti, wc->dev, 0, ti->len, data);
1618}
1619
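/* Raise the queue limits so that no I/O is smaller than a cache block. */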
1620static void writecache_io_hints(struct dm_target *ti, struct queue_limits *limits)
1621{
1622        struct dm_writecache *wc = ti->private;
1623
1624        if (limits->logical_block_size < wc->block_size)
1625                limits->logical_block_size = wc->block_size;
1626
1627        if (limits->physical_block_size < wc->block_size)
1628                limits->physical_block_size = wc->block_size;
1629
1630        if (limits->io_min < wc->block_size)
1631                limits->io_min = wc->block_size;
1632}
1633
1634
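/*
 * Writeback bio completion (pmem mode): queue the writeback_struct on the
 * endio list and wake the endio thread if the list was empty.
 */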
1635static void writecache_writeback_endio(struct bio *bio)
1636{
1637        struct writeback_struct *wb = container_of(bio, struct writeback_struct, bio);
1638        struct dm_writecache *wc = wb->wc;
1639        unsigned long flags;
1640
1641        raw_spin_lock_irqsave(&wc->endio_list_lock, flags);
1642        if (unlikely(list_empty(&wc->endio_list)))
1643                wake_up_process(wc->endio_thread);
1644        list_add_tail(&wb->endio_entry, &wc->endio_list);
1645        raw_spin_unlock_irqrestore(&wc->endio_list_lock, flags);
1646}
1647
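/*
 * kcopyd completion (SSD mode): record any error and queue the copy_struct
 * on the endio list for the endio thread.
 */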
1648static void writecache_copy_endio(int read_err, unsigned long write_err, void *ptr)
1649{
1650        struct copy_struct *c = ptr;
1651        struct dm_writecache *wc = c->wc;
1652
1653        c->error = likely(!(read_err | write_err)) ? 0 : -EIO;
1654
1655        raw_spin_lock_irq(&wc->endio_list_lock);
1656        if (unlikely(list_empty(&wc->endio_list)))
1657                wake_up_process(wc->endio_thread);
1658        list_add_tail(&c->endio_entry, &wc->endio_list);
1659        raw_spin_unlock_irq(&wc->endio_list_lock);
1660}
1661
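/*
 * Process completed pmem writeback bios: clear write_in_progress, return
 * the entries to the freelist and periodically commit and drop the lock to
 * bound latency.
 */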
1662static void __writecache_endio_pmem(struct dm_writecache *wc, struct list_head *list)
1663{
1664        unsigned i;
1665        struct writeback_struct *wb;
1666        struct wc_entry *e;
1667        unsigned long n_walked = 0;
1668
1669        do {
1670                wb = list_entry(list->next, struct writeback_struct, endio_entry);
1671                list_del(&wb->endio_entry);
1672
1673                if (unlikely(wb->bio.bi_status != BLK_STS_OK))
1674                        writecache_error(wc, blk_status_to_errno(wb->bio.bi_status),
1675                                        "write error %d", wb->bio.bi_status);
1676                i = 0;
1677                do {
1678                        e = wb->wc_list[i];
1679                        BUG_ON(!e->write_in_progress);
1680                        e->write_in_progress = false;
1681                        INIT_LIST_HEAD(&e->lru);
1682                        if (!writecache_has_error(wc))
1683                                writecache_free_entry(wc, e);
1684                        BUG_ON(!wc->writeback_size);
1685                        wc->writeback_size--;
1686                        n_walked++;
1687                        if (unlikely(n_walked >= ENDIO_LATENCY)) {
1688                                writecache_commit_flushed(wc, false);
1689                                wc_unlock(wc);
1690                                wc_lock(wc);
1691                                n_walked = 0;
1692                        }
1693                } while (++i < wb->wc_list_n);
1694
1695                if (wb->wc_list != wb->wc_list_inline)
1696                        kfree(wb->wc_list);
1697                bio_put(&wb->bio);
1698        } while (!list_empty(list));
1699}
1700
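/*
 * Process completed kcopyd copies: free the contiguous run of entries
 * covered by each copy_struct and return it to the mempool.
 */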
1701static void __writecache_endio_ssd(struct dm_writecache *wc, struct list_head *list)
1702{
1703        struct copy_struct *c;
1704        struct wc_entry *e;
1705
1706        do {
1707                c = list_entry(list->next, struct copy_struct, endio_entry);
1708                list_del(&c->endio_entry);
1709
1710                if (unlikely(c->error))
1711                        writecache_error(wc, c->error, "copy error");
1712
1713                e = c->e;
1714                do {
1715                        BUG_ON(!e->write_in_progress);
1716                        e->write_in_progress = false;
1717                        INIT_LIST_HEAD(&e->lru);
1718                        if (!writecache_has_error(wc))
1719                                writecache_free_entry(wc, e);
1720
1721                        BUG_ON(!wc->writeback_size);
1722                        wc->writeback_size--;
1723                        e++;
1724                } while (--c->n_entries);
1725                mempool_free(c, &wc->copy_pool);
1726        } while (!list_empty(list));
1727}
1728
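/*
 * The endio thread: drain the endio list, flush the origin device (unless
 * FUA writeback is used), then free the written-back entries and commit
 * the metadata under the lock.
 */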
1729static int writecache_endio_thread(void *data)
1730{
1731        struct dm_writecache *wc = data;
1732
1733        while (1) {
1734                struct list_head list;
1735
1736                raw_spin_lock_irq(&wc->endio_list_lock);
1737                if (!list_empty(&wc->endio_list))
1738                        goto pop_from_list;
1739                set_current_state(TASK_INTERRUPTIBLE);
1740                raw_spin_unlock_irq(&wc->endio_list_lock);
1741
1742                if (unlikely(kthread_should_stop())) {
1743                        set_current_state(TASK_RUNNING);
1744                        break;
1745                }
1746
1747                schedule();
1748
1749                continue;
1750
1751pop_from_list:
1752                list = wc->endio_list;
1753                list.next->prev = list.prev->next = &list;
1754                INIT_LIST_HEAD(&wc->endio_list);
1755                raw_spin_unlock_irq(&wc->endio_list_lock);
1756
1757                if (!WC_MODE_FUA(wc))
1758                        writecache_disk_flush(wc, wc->dev);
1759
1760                wc_lock(wc);
1761
1762                if (WC_MODE_PMEM(wc)) {
1763                        __writecache_endio_pmem(wc, &list);
1764                } else {
1765                        __writecache_endio_ssd(wc, &list);
1766                        writecache_wait_for_ios(wc, READ);
1767                }
1768
1769                writecache_commit_flushed(wc, false);
1770
1771                wc_unlock(wc);
1772        }
1773
1774        return 0;
1775}
1776
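/*
 * Flush one cache block from persistent memory and add it to the writeback
 * bio; blocks beyond the end of the origin device are skipped.
 */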
1777static bool wc_add_block(struct writeback_struct *wb, struct wc_entry *e)
1778{
1779        struct dm_writecache *wc = wb->wc;
1780        unsigned block_size = wc->block_size;
1781        void *address = memory_data(wc, e);
1782
1783        persistent_memory_flush_cache(address, block_size);
1784
1785        if (unlikely(bio_end_sector(&wb->bio) >= wc->data_device_sectors))
1786                return true;
1787
1788        return bio_add_page(&wb->bio, persistent_memory_page(address),
1789                            block_size, persistent_memory_page_offset(address)) != 0;
1790}
1791
1792struct writeback_list {
1793        struct list_head list;
1794        size_t size;
1795};
1796
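/*
 * If max_writeback_jobs is set, wait on the freelist until the number of
 * in-flight writeback blocks drops below the limit.
 */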
1797static void __writeback_throttle(struct dm_writecache *wc, struct writeback_list *wbl)
1798{
1799        if (unlikely(wc->max_writeback_jobs)) {
1800                if (READ_ONCE(wc->writeback_size) - wbl->size >= wc->max_writeback_jobs) {
1801                        wc_lock(wc);
1802                        while (wc->writeback_size - wbl->size >= wc->max_writeback_jobs)
1803                                writecache_wait_on_freelist(wc);
1804                        wc_unlock(wc);
1805                }
1806        }
1807        cond_resched();
1808}
1809
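/*
 * Writeback for pmem mode: take entries from the tail of the list, merge
 * blocks that are contiguous on the origin device into a single bio and
 * submit it.
 */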
1810static void __writecache_writeback_pmem(struct dm_writecache *wc, struct writeback_list *wbl)
1811{
1812        struct wc_entry *e, *f;
1813        struct bio *bio;
1814        struct writeback_struct *wb;
1815        unsigned max_pages;
1816
1817        while (wbl->size) {
1818                wbl->size--;
1819                e = container_of(wbl->list.prev, struct wc_entry, lru);
1820                list_del(&e->lru);
1821
1822                max_pages = e->wc_list_contiguous;
1823
1824                bio = bio_alloc_bioset(GFP_NOIO, max_pages, &wc->bio_set);
1825                wb = container_of(bio, struct writeback_struct, bio);
1826                wb->wc = wc;
1827                bio->bi_end_io = writecache_writeback_endio;
1828                bio_set_dev(bio, wc->dev->bdev);
1829                bio->bi_iter.bi_sector = read_original_sector(wc, e);
1830                if (max_pages <= WB_LIST_INLINE ||
1831                    unlikely(!(wb->wc_list = kmalloc_array(max_pages, sizeof(struct wc_entry *),
1832                                                           GFP_NOIO | __GFP_NORETRY |
1833                                                           __GFP_NOMEMALLOC | __GFP_NOWARN)))) {
1834                        wb->wc_list = wb->wc_list_inline;
1835                        max_pages = WB_LIST_INLINE;
1836                }
1837
1838                BUG_ON(!wc_add_block(wb, e));
1839
1840                wb->wc_list[0] = e;
1841                wb->wc_list_n = 1;
1842
1843                while (wbl->size && wb->wc_list_n < max_pages) {
1844                        f = container_of(wbl->list.prev, struct wc_entry, lru);
1845                        if (read_original_sector(wc, f) !=
1846                            read_original_sector(wc, e) + (wc->block_size >> SECTOR_SHIFT))
1847                                break;
1848                        if (!wc_add_block(wb, f))
1849                                break;
1850                        wbl->size--;
1851                        list_del(&f->lru);
1852                        wb->wc_list[wb->wc_list_n++] = f;
1853                        e = f;
1854                }
1855                bio_set_op_attrs(bio, REQ_OP_WRITE, WC_MODE_FUA(wc) * REQ_FUA);
1856                if (writecache_has_error(wc)) {
1857                        bio->bi_status = BLK_STS_IOERR;
1858                        bio_endio(bio);
1859                } else if (unlikely(!bio_sectors(bio))) {
1860                        bio->bi_status = BLK_STS_OK;
1861                        bio_endio(bio);
1862                } else {
1863                        submit_bio(bio);
1864                }
1865
1866                __writeback_throttle(wc, wbl);
1867        }
1868}
1869
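/*
 * Writeback for SSD mode: hand each contiguous run of cache blocks to
 * dm-kcopyd to copy it back to the origin device, trimming runs that
 * extend past the end of the origin.
 */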
1870static void __writecache_writeback_ssd(struct dm_writecache *wc, struct writeback_list *wbl)
1871{
1872        struct wc_entry *e, *f;
1873        struct dm_io_region from, to;
1874        struct copy_struct *c;
1875
1876        while (wbl->size) {
1877                unsigned n_sectors;
1878
1879                wbl->size--;
1880                e = container_of(wbl->list.prev, struct wc_entry, lru);
1881                list_del(&e->lru);
1882
1883                n_sectors = e->wc_list_contiguous << (wc->block_size_bits - SECTOR_SHIFT);
1884
1885                from.bdev = wc->ssd_dev->bdev;
1886                from.sector = cache_sector(wc, e);
1887                from.count = n_sectors;
1888                to.bdev = wc->dev->bdev;
1889                to.sector = read_original_sector(wc, e);
1890                to.count = n_sectors;
1891
1892                c = mempool_alloc(&wc->copy_pool, GFP_NOIO);
1893                c->wc = wc;
1894                c->e = e;
1895                c->n_entries = e->wc_list_contiguous;
1896
1897                while ((n_sectors -= wc->block_size >> SECTOR_SHIFT)) {
1898                        wbl->size--;
1899                        f = container_of(wbl->list.prev, struct wc_entry, lru);
1900                        BUG_ON(f != e + 1);
1901                        list_del(&f->lru);
1902                        e = f;
1903                }
1904
1905                if (unlikely(to.sector + to.count > wc->data_device_sectors)) {
1906                        if (to.sector >= wc->data_device_sectors) {
1907                                writecache_copy_endio(0, 0, c);
1908                                continue;
1909                        }
1910                        from.count = to.count = wc->data_device_sectors - to.sector;
1911                }
1912
1913                dm_kcopyd_copy(wc->dm_kcopyd, &from, 1, &to, 0, writecache_copy_endio, c);
1914
1915                __writeback_throttle(wc, wbl);
1916        }
1917}
1918
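/*
 * The writeback work function: optionally wait for the origin device to
 * become idle, pick entries from the LRU (or walk the whole tree when
 * writeback_all is set), group contiguous blocks and pass them to the
 * pmem or SSD writeback routine.
 */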
1919static void writecache_writeback(struct work_struct *work)
1920{
1921        struct dm_writecache *wc = container_of(work, struct dm_writecache, writeback_work);
1922        struct blk_plug plug;
1923        struct wc_entry *f, *g, *e = NULL;
1924        struct rb_node *node, *next_node;
1925        struct list_head skipped;
1926        struct writeback_list wbl;
1927        unsigned long n_walked;
1928
1929        if (!WC_MODE_PMEM(wc)) {
1930                /* Wait for any active kcopyd work on behalf of ssd writeback */
1931                dm_kcopyd_client_flush(wc->dm_kcopyd);
1932        }
1933
1934        if (likely(wc->pause != 0)) {
1935                while (1) {
1936                        unsigned long idle;
1937                        if (unlikely(wc->cleaner) || unlikely(wc->writeback_all) ||
1938                            unlikely(dm_suspended(wc->ti)))
1939                                break;
1940                        idle = dm_iot_idle_time(&wc->iot);
1941                        if (idle >= wc->pause)
1942                                break;
1943                        idle = wc->pause - idle;
1944                        if (idle > HZ)
1945                                idle = HZ;
1946                        schedule_timeout_idle(idle);
1947                }
1948        }
1949
1950        wc_lock(wc);
1951restart:
1952        if (writecache_has_error(wc)) {
1953                wc_unlock(wc);
1954                return;
1955        }
1956
1957        if (unlikely(wc->writeback_all)) {
1958                if (writecache_wait_for_writeback(wc))
1959                        goto restart;
1960        }
1961
1962        if (wc->overwrote_committed) {
1963                writecache_wait_for_ios(wc, WRITE);
1964        }
1965
1966        n_walked = 0;
1967        INIT_LIST_HEAD(&skipped);
1968        INIT_LIST_HEAD(&wbl.list);
1969        wbl.size = 0;
1970        while (!list_empty(&wc->lru) &&
1971               (wc->writeback_all ||
1972                wc->freelist_size + wc->writeback_size <= wc->freelist_low_watermark ||
1973                (jiffies - container_of(wc->lru.prev, struct wc_entry, lru)->age >=
1974                 wc->max_age - wc->max_age / MAX_AGE_DIV))) {
1975
1976                n_walked++;
1977                if (unlikely(n_walked > WRITEBACK_LATENCY) &&
1978                    likely(!wc->writeback_all)) {
1979                        if (likely(!dm_suspended(wc->ti)))
1980                                queue_work(wc->writeback_wq, &wc->writeback_work);
1981                        break;
1982                }
1983
1984                if (unlikely(wc->writeback_all)) {
1985                        if (unlikely(!e)) {
1986                                writecache_flush(wc);
1987                                e = container_of(rb_first(&wc->tree), struct wc_entry, rb_node);
1988                        } else
1989                                e = g;
1990                } else
1991                        e = container_of(wc->lru.prev, struct wc_entry, lru);
1992                BUG_ON(e->write_in_progress);
1993                if (unlikely(!writecache_entry_is_committed(wc, e))) {
1994                        writecache_flush(wc);
1995                }
1996                node = rb_prev(&e->rb_node);
1997                if (node) {
1998                        f = container_of(node, struct wc_entry, rb_node);
1999                        if (unlikely(read_original_sector(wc, f) ==
2000                                     read_original_sector(wc, e))) {
2001                                BUG_ON(!f->write_in_progress);
2002                                list_move(&e->lru, &skipped);
2003                                cond_resched();
2004                                continue;
2005                        }
2006                }
2007                wc->writeback_size++;
2008                list_move(&e->lru, &wbl.list);
2009                wbl.size++;
2010                e->write_in_progress = true;
2011                e->wc_list_contiguous = 1;
2012
2013                f = e;
2014
2015                while (1) {
2016                        next_node = rb_next(&f->rb_node);
2017                        if (unlikely(!next_node))
2018                                break;
2019                        g = container_of(next_node, struct wc_entry, rb_node);
2020                        if (unlikely(read_original_sector(wc, g) ==
2021                            read_original_sector(wc, f))) {
2022                                f = g;
2023                                continue;
2024                        }
2025                        if (read_original_sector(wc, g) !=
2026                            read_original_sector(wc, f) + (wc->block_size >> SECTOR_SHIFT))
2027                                break;
2028                        if (unlikely(g->write_in_progress))
2029                                break;
2030                        if (unlikely(!writecache_entry_is_committed(wc, g)))
2031                                break;
2032
2033                        if (!WC_MODE_PMEM(wc)) {
2034                                if (g != f + 1)
2035                                        break;
2036                        }
2037
2038                        n_walked++;
2039                        //if (unlikely(n_walked > WRITEBACK_LATENCY) && likely(!wc->writeback_all))
2040                        //      break;
2041
2042                        wc->writeback_size++;
2043                        list_move(&g->lru, &wbl.list);
2044                        wbl.size++;
2045                        g->write_in_progress = true;
2046                        g->wc_list_contiguous = BIO_MAX_VECS;
2047                        f = g;
2048                        e->wc_list_contiguous++;
2049                        if (unlikely(e->wc_list_contiguous == BIO_MAX_VECS)) {
2050                                if (unlikely(wc->writeback_all)) {
2051                                        next_node = rb_next(&f->rb_node);
2052                                        if (likely(next_node))
2053                                                g = container_of(next_node, struct wc_entry, rb_node);
2054                                }
2055                                break;
2056                        }
2057                }
2058                cond_resched();
2059        }
2060
2061        if (!list_empty(&skipped)) {
2062                list_splice_tail(&skipped, &wc->lru);
2063                /*
2064                 * If we didn't make any progress, we must wait until some
2065                 * writeback finishes to avoid burning CPU in a loop.
2066                 */
2067                if (unlikely(!wbl.size))
2068                        writecache_wait_for_writeback(wc);
2069        }
2070
2071        wc_unlock(wc);
2072
2073        blk_start_plug(&plug);
2074
2075        if (WC_MODE_PMEM(wc))
2076                __writecache_writeback_pmem(wc, &wbl);
2077        else
2078                __writecache_writeback_ssd(wc, &wbl);
2079
2080        blk_finish_plug(&plug);
2081
2082        if (unlikely(wc->writeback_all)) {
2083                wc_lock(wc);
2084                while (writecache_wait_for_writeback(wc));
2085                wc_unlock(wc);
2086        }
2087}
2088
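/*
 * Work out how many cache blocks and how many metadata blocks fit into a
 * cache device of the given size.
 */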
2089static int calculate_memory_size(uint64_t device_size, unsigned block_size,
2090                                 size_t *n_blocks_p, size_t *n_metadata_blocks_p)
2091{
2092        uint64_t n_blocks, offset;
2093        struct wc_entry e;
2094
2095        n_blocks = device_size;
2096        do_div(n_blocks, block_size + sizeof(struct wc_memory_entry));
2097
2098        while (1) {
2099                if (!n_blocks)
2100                        return -ENOSPC;
2101                /* Verify that the offset of entries[n_blocks] won't overflow */
2102                if (n_blocks >= ((size_t)-sizeof(struct wc_memory_superblock) /
2103                                 sizeof(struct wc_memory_entry)))
2104                        return -EFBIG;
2105                offset = offsetof(struct wc_memory_superblock, entries[n_blocks]);
2106                offset = (offset + block_size - 1) & ~(uint64_t)(block_size - 1);
2107                if (offset + n_blocks * block_size <= device_size)
2108                        break;
2109                n_blocks--;
2110        }
2111
2112        /* check that n_blocks fits in the index bit field */
2113        e.index = n_blocks;
2114        if (e.index != n_blocks)
2115                return -EFBIG;
2116
2117        if (n_blocks_p)
2118                *n_blocks_p = n_blocks;
2119        if (n_metadata_blocks_p)
2120                *n_metadata_blocks_p = offset >> __ffs(block_size);
2121        return 0;
2122}
2123
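/*
 * Format the cache metadata: zero the superblock fields, invalidate all
 * entries and write the magic number last, so that an interrupted format
 * is not mistaken for a valid cache.
 */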
2124static int init_memory(struct dm_writecache *wc)
2125{
2126        size_t b;
2127        int r;
2128
2129        r = calculate_memory_size(wc->memory_map_size, wc->block_size, &wc->n_blocks, NULL);
2130        if (r)
2131                return r;
2132
2133        r = writecache_alloc_entries(wc);
2134        if (r)
2135                return r;
2136
2137        for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
2138                pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
2139        pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
2140        pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
2141        pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
2142        pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
2143
2144        for (b = 0; b < wc->n_blocks; b++) {
2145                write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
2146                cond_resched();
2147        }
2148
2149        writecache_flush_all_metadata(wc);
2150        writecache_commit_flushed(wc, false);
2151        pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
2152        writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
2153        writecache_commit_flushed(wc, false);
2154
2155        return 0;
2156}
2157
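/* Destructor: stop the helper threads and release all allocated resources. */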
2158static void writecache_dtr(struct dm_target *ti)
2159{
2160        struct dm_writecache *wc = ti->private;
2161
2162        if (!wc)
2163                return;
2164
2165        if (wc->endio_thread)
2166                kthread_stop(wc->endio_thread);
2167
2168        if (wc->flush_thread)
2169                kthread_stop(wc->flush_thread);
2170
2171        bioset_exit(&wc->bio_set);
2172
2173        mempool_exit(&wc->copy_pool);
2174
2175        if (wc->writeback_wq)
2176                destroy_workqueue(wc->writeback_wq);
2177
2178        if (wc->dev)
2179                dm_put_device(ti, wc->dev);
2180
2181        if (wc->ssd_dev)
2182                dm_put_device(ti, wc->ssd_dev);
2183
2184        vfree(wc->entries);
2185
2186        if (wc->memory_map) {
2187                if (WC_MODE_PMEM(wc))
2188                        persistent_memory_release(wc);
2189                else
2190                        vfree(wc->memory_map);
2191        }
2192
2193        if (wc->dm_kcopyd)
2194                dm_kcopyd_client_destroy(wc->dm_kcopyd);
2195
2196        if (wc->dm_io)
2197                dm_io_client_destroy(wc->dm_io);
2198
2199        vfree(wc->dirty_bitmap);
2200
2201        kfree(wc);
2202}
2203
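/*
 * Constructor. The table line is:
 *	writecache <p|s> <origin dev> <cache dev> <block size>
 *		<number of optional args> [<optional args>...]
 */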
2204static int writecache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2205{
2206        struct dm_writecache *wc;
2207        struct dm_arg_set as;
2208        const char *string;
2209        unsigned opt_params;
2210        size_t offset, data_size;
2211        int i, r;
2212        char dummy;
2213        int high_wm_percent = HIGH_WATERMARK;
2214        int low_wm_percent = LOW_WATERMARK;
2215        uint64_t x;
2216        struct wc_memory_superblock s;
2217
2218        static struct dm_arg _args[] = {
2219                {0, 18, "Invalid number of feature args"},
2220        };
2221
2222        as.argc = argc;
2223        as.argv = argv;
2224
2225        wc = kzalloc(sizeof(struct dm_writecache), GFP_KERNEL);
2226        if (!wc) {
2227                ti->error = "Cannot allocate writecache structure";
2228                r = -ENOMEM;
2229                goto bad;
2230        }
2231        ti->private = wc;
2232        wc->ti = ti;
2233
2234        mutex_init(&wc->lock);
2235        wc->max_age = MAX_AGE_UNSPECIFIED;
2236        writecache_poison_lists(wc);
2237        init_waitqueue_head(&wc->freelist_wait);
2238        timer_setup(&wc->autocommit_timer, writecache_autocommit_timer, 0);
2239        timer_setup(&wc->max_age_timer, writecache_max_age_timer, 0);
2240
2241        for (i = 0; i < 2; i++) {
2242                atomic_set(&wc->bio_in_progress[i], 0);
2243                init_waitqueue_head(&wc->bio_in_progress_wait[i]);
2244        }
2245
2246        wc->dm_io = dm_io_client_create();
2247        if (IS_ERR(wc->dm_io)) {
2248                r = PTR_ERR(wc->dm_io);
2249                ti->error = "Unable to allocate dm-io client";
2250                wc->dm_io = NULL;
2251                goto bad;
2252        }
2253
2254        wc->writeback_wq = alloc_workqueue("writecache-writeback", WQ_MEM_RECLAIM, 1);
2255        if (!wc->writeback_wq) {
2256                r = -ENOMEM;
2257                ti->error = "Could not allocate writeback workqueue";
2258                goto bad;
2259        }
2260        INIT_WORK(&wc->writeback_work, writecache_writeback);
2261        INIT_WORK(&wc->flush_work, writecache_flush_work);
2262
2263        dm_iot_init(&wc->iot);
2264
2265        raw_spin_lock_init(&wc->endio_list_lock);
2266        INIT_LIST_HEAD(&wc->endio_list);
2267        wc->endio_thread = kthread_create(writecache_endio_thread, wc, "writecache_endio");
2268        if (IS_ERR(wc->endio_thread)) {
2269                r = PTR_ERR(wc->endio_thread);
2270                wc->endio_thread = NULL;
2271                ti->error = "Couldn't spawn endio thread";
2272                goto bad;
2273        }
2274        wake_up_process(wc->endio_thread);
2275
2276        /*
2277         * Parse the mode (pmem or ssd)
2278         */
2279        string = dm_shift_arg(&as);
2280        if (!string)
2281                goto bad_arguments;
2282
2283        if (!strcasecmp(string, "s")) {
2284                wc->pmem_mode = false;
2285        } else if (!strcasecmp(string, "p")) {
2286#ifdef DM_WRITECACHE_HAS_PMEM
2287                wc->pmem_mode = true;
2288                wc->writeback_fua = true;
2289#else
2290                /*
2291                 * If the architecture doesn't support persistent memory or
2292                 * the kernel doesn't support any DAX drivers, this driver can
2293                 * only be used in SSD-only mode.
2294                 */
2295                r = -EOPNOTSUPP;
2296                ti->error = "Persistent memory or DAX not supported on this system";
2297                goto bad;
2298#endif
2299        } else {
2300                goto bad_arguments;
2301        }
2302
2303        if (WC_MODE_PMEM(wc)) {
2304                r = bioset_init(&wc->bio_set, BIO_POOL_SIZE,
2305                                offsetof(struct writeback_struct, bio),
2306                                BIOSET_NEED_BVECS);
2307                if (r) {
2308                        ti->error = "Could not allocate bio set";
2309                        goto bad;
2310                }
2311        } else {
2312                wc->pause = PAUSE_WRITEBACK;
2313                r = mempool_init_kmalloc_pool(&wc->copy_pool, 1, sizeof(struct copy_struct));
2314                if (r) {
2315                        ti->error = "Could not allocate mempool";
2316                        goto bad;
2317                }
2318        }
2319
2320        /*
2321         * Parse the origin data device
2322         */
2323        string = dm_shift_arg(&as);
2324        if (!string)
2325                goto bad_arguments;
2326        r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->dev);
2327        if (r) {
2328                ti->error = "Origin data device lookup failed";
2329                goto bad;
2330        }
2331
2332        /*
2333         * Parse cache data device (be it pmem or ssd)
2334         */
2335        string = dm_shift_arg(&as);
2336        if (!string)
2337                goto bad_arguments;
2338
2339        r = dm_get_device(ti, string, dm_table_get_mode(ti->table), &wc->ssd_dev);
2340        if (r) {
2341                ti->error = "Cache data device lookup failed";
2342                goto bad;
2343        }
2344        wc->memory_map_size = i_size_read(wc->ssd_dev->bdev->bd_inode);
2345
2346        /*
2347         * Parse the cache block size
2348         */
2349        string = dm_shift_arg(&as);
2350        if (!string)
2351                goto bad_arguments;
2352        if (sscanf(string, "%u%c", &wc->block_size, &dummy) != 1 ||
2353            wc->block_size < 512 || wc->block_size > PAGE_SIZE ||
2354            (wc->block_size & (wc->block_size - 1))) {
2355                r = -EINVAL;
2356                ti->error = "Invalid block size";
2357                goto bad;
2358        }
2359        if (wc->block_size < bdev_logical_block_size(wc->dev->bdev) ||
2360            wc->block_size < bdev_logical_block_size(wc->ssd_dev->bdev)) {
2361                r = -EINVAL;
2362                ti->error = "Block size is smaller than device logical block size";
2363                goto bad;
2364        }
2365        wc->block_size_bits = __ffs(wc->block_size);
2366
2367        wc->max_writeback_jobs = MAX_WRITEBACK_JOBS;
2368        wc->autocommit_blocks = !WC_MODE_PMEM(wc) ? AUTOCOMMIT_BLOCKS_SSD : AUTOCOMMIT_BLOCKS_PMEM;
2369        wc->autocommit_jiffies = msecs_to_jiffies(AUTOCOMMIT_MSEC);
2370
2371        /*
2372         * Parse optional arguments
2373         */
2374        r = dm_read_arg_group(_args, &as, &opt_params, &ti->error);
2375        if (r)
2376                goto bad;
2377
2378        while (opt_params) {
2379                string = dm_shift_arg(&as), opt_params--;
2380                if (!strcasecmp(string, "start_sector") && opt_params >= 1) {
2381                        unsigned long long start_sector;
2382                        string = dm_shift_arg(&as), opt_params--;
2383                        if (sscanf(string, "%llu%c", &start_sector, &dummy) != 1)
2384                                goto invalid_optional;
2385                        wc->start_sector = start_sector;
2386                        wc->start_sector_set = true;
2387                        if (wc->start_sector != start_sector ||
2388                            wc->start_sector >= wc->memory_map_size >> SECTOR_SHIFT)
2389                                goto invalid_optional;
2390                } else if (!strcasecmp(string, "high_watermark") && opt_params >= 1) {
2391                        string = dm_shift_arg(&as), opt_params--;
2392                        if (sscanf(string, "%d%c", &high_wm_percent, &dummy) != 1)
2393                                goto invalid_optional;
2394                        if (high_wm_percent < 0 || high_wm_percent > 100)
2395                                goto invalid_optional;
2396                        wc->high_wm_percent_value = high_wm_percent;
2397                        wc->high_wm_percent_set = true;
2398                } else if (!strcasecmp(string, "low_watermark") && opt_params >= 1) {
2399                        string = dm_shift_arg(&as), opt_params--;
2400                        if (sscanf(string, "%d%c", &low_wm_percent, &dummy) != 1)
2401                                goto invalid_optional;
2402                        if (low_wm_percent < 0 || low_wm_percent > 100)
2403                                goto invalid_optional;
2404                        wc->low_wm_percent_value = low_wm_percent;
2405                        wc->low_wm_percent_set = true;
2406                } else if (!strcasecmp(string, "writeback_jobs") && opt_params >= 1) {
2407                        string = dm_shift_arg(&as), opt_params--;
2408                        if (sscanf(string, "%u%c", &wc->max_writeback_jobs, &dummy) != 1)
2409                                goto invalid_optional;
2410                        wc->max_writeback_jobs_set = true;
2411                } else if (!strcasecmp(string, "autocommit_blocks") && opt_params >= 1) {
2412                        string = dm_shift_arg(&as), opt_params--;
2413                        if (sscanf(string, "%u%c", &wc->autocommit_blocks, &dummy) != 1)
2414                                goto invalid_optional;
2415                        wc->autocommit_blocks_set = true;
2416                } else if (!strcasecmp(string, "autocommit_time") && opt_params >= 1) {
2417                        unsigned autocommit_msecs;
2418                        string = dm_shift_arg(&as), opt_params--;
2419                        if (sscanf(string, "%u%c", &autocommit_msecs, &dummy) != 1)
2420                                goto invalid_optional;
2421                        if (autocommit_msecs > 3600000)
2422                                goto invalid_optional;
2423                        wc->autocommit_jiffies = msecs_to_jiffies(autocommit_msecs);
2424                        wc->autocommit_time_value = autocommit_msecs;
2425                        wc->autocommit_time_set = true;
2426                } else if (!strcasecmp(string, "max_age") && opt_params >= 1) {
2427                        unsigned max_age_msecs;
2428                        string = dm_shift_arg(&as), opt_params--;
2429                        if (sscanf(string, "%u%c", &max_age_msecs, &dummy) != 1)
2430                                goto invalid_optional;
2431                        if (max_age_msecs > 86400000)
2432                                goto invalid_optional;
2433                        wc->max_age = msecs_to_jiffies(max_age_msecs);
2434                        wc->max_age_set = true;
2435                        wc->max_age_value = max_age_msecs;
2436                } else if (!strcasecmp(string, "cleaner")) {
2437                        wc->cleaner_set = true;
2438                        wc->cleaner = true;
2439                } else if (!strcasecmp(string, "fua")) {
2440                        if (WC_MODE_PMEM(wc)) {
2441                                wc->writeback_fua = true;
2442                                wc->writeback_fua_set = true;
2443                        } else goto invalid_optional;
2444                } else if (!strcasecmp(string, "nofua")) {
2445                        if (WC_MODE_PMEM(wc)) {
2446                                wc->writeback_fua = false;
2447                                wc->writeback_fua_set = true;
2448                        } else goto invalid_optional;
2449                } else if (!strcasecmp(string, "metadata_only")) {
2450                        wc->metadata_only = true;
2451                } else if (!strcasecmp(string, "pause_writeback") && opt_params >= 1) {
2452                        unsigned pause_msecs;
2453                        if (WC_MODE_PMEM(wc))
2454                                goto invalid_optional;
2455                        string = dm_shift_arg(&as), opt_params--;
2456                        if (sscanf(string, "%u%c", &pause_msecs, &dummy) != 1)
2457                                goto invalid_optional;
2458                        if (pause_msecs > 60000)
2459                                goto invalid_optional;
2460                        wc->pause = msecs_to_jiffies(pause_msecs);
2461                        wc->pause_set = true;
2462                        wc->pause_value = pause_msecs;
2463                } else {
2464invalid_optional:
2465                        r = -EINVAL;
2466                        ti->error = "Invalid optional argument";
2467                        goto bad;
2468                }
2469        }
2470
2471        if (high_wm_percent < low_wm_percent) {
2472                r = -EINVAL;
2473                ti->error = "High watermark must be greater than or equal to low watermark";
2474                goto bad;
2475        }
2476
2477        if (WC_MODE_PMEM(wc)) {
2478                if (!dax_synchronous(wc->ssd_dev->dax_dev)) {
2479                        r = -EOPNOTSUPP;
2480                        ti->error = "Asynchronous persistent memory not supported as pmem cache";
2481                        goto bad;
2482                }
2483
2484                r = persistent_memory_claim(wc);
2485                if (r) {
2486                        ti->error = "Unable to map persistent memory for cache";
2487                        goto bad;
2488                }
2489        } else {
2490                size_t n_blocks, n_metadata_blocks;
2491                uint64_t n_bitmap_bits;
2492
2493                wc->memory_map_size -= (uint64_t)wc->start_sector << SECTOR_SHIFT;
2494
2495                bio_list_init(&wc->flush_list);
2496                wc->flush_thread = kthread_create(writecache_flush_thread, wc, "dm_writecache_flush");
2497                if (IS_ERR(wc->flush_thread)) {
2498                        r = PTR_ERR(wc->flush_thread);
2499                        wc->flush_thread = NULL;
2500                        ti->error = "Couldn't spawn flush thread";
2501                        goto bad;
2502                }
2503                wake_up_process(wc->flush_thread);
2504
2505                r = calculate_memory_size(wc->memory_map_size, wc->block_size,
2506                                          &n_blocks, &n_metadata_blocks);
2507                if (r) {
2508                        ti->error = "Invalid device size";
2509                        goto bad;
2510                }
2511
2512                n_bitmap_bits = (((uint64_t)n_metadata_blocks << wc->block_size_bits) +
2513                                 BITMAP_GRANULARITY - 1) / BITMAP_GRANULARITY;
2514                /* this is a limitation of the test_bit functions */
2515                if (n_bitmap_bits > 1U << 31) {
2516                        r = -EFBIG;
2517                        ti->error = "Invalid device size";
2518                        goto bad;
2519                }
2520
2521                wc->memory_map = vmalloc(n_metadata_blocks << wc->block_size_bits);
2522                if (!wc->memory_map) {
2523                        r = -ENOMEM;
2524                        ti->error = "Unable to allocate memory for metadata";
2525                        goto bad;
2526                }
2527
2528                wc->dm_kcopyd = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2529                if (IS_ERR(wc->dm_kcopyd)) {
2530                        r = PTR_ERR(wc->dm_kcopyd);
2531                        ti->error = "Unable to allocate dm-kcopyd client";
2532                        wc->dm_kcopyd = NULL;
2533                        goto bad;
2534                }
2535
2536                wc->metadata_sectors = n_metadata_blocks << (wc->block_size_bits - SECTOR_SHIFT);
2537                wc->dirty_bitmap_size = (n_bitmap_bits + BITS_PER_LONG - 1) /
2538                        BITS_PER_LONG * sizeof(unsigned long);
2539                wc->dirty_bitmap = vzalloc(wc->dirty_bitmap_size);
2540                if (!wc->dirty_bitmap) {
2541                        r = -ENOMEM;
2542                        ti->error = "Unable to allocate dirty bitmap";
2543                        goto bad;
2544                }
2545
2546                r = writecache_read_metadata(wc, wc->block_size >> SECTOR_SHIFT);
2547                if (r) {
2548                        ti->error = "Unable to read first block of metadata";
2549                        goto bad;
2550                }
2551        }
2552
2553        r = copy_mc_to_kernel(&s, sb(wc), sizeof(struct wc_memory_superblock));
2554        if (r) {
2555                ti->error = "Hardware memory error when reading superblock";
2556                goto bad;
2557        }
2558        if (!le32_to_cpu(s.magic) && !le32_to_cpu(s.version)) {
2559                r = init_memory(wc);
2560                if (r) {
2561                        ti->error = "Unable to initialize device";
2562                        goto bad;
2563                }
2564                r = copy_mc_to_kernel(&s, sb(wc),
2565                                      sizeof(struct wc_memory_superblock));
2566                if (r) {
2567                        ti->error = "Hardware memory error when reading superblock";
2568                        goto bad;
2569                }
2570        }
2571
2572        if (le32_to_cpu(s.magic) != MEMORY_SUPERBLOCK_MAGIC) {
2573                ti->error = "Invalid magic in the superblock";
2574                r = -EINVAL;
2575                goto bad;
2576        }
2577
2578        if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
2579                ti->error = "Invalid version in the superblock";
2580                r = -EINVAL;
2581                goto bad;
2582        }
2583
2584        if (le32_to_cpu(s.block_size) != wc->block_size) {
2585                ti->error = "Block size does not match superblock";
2586                r = -EINVAL;
2587                goto bad;
2588        }
2589
2590        wc->n_blocks = le64_to_cpu(s.n_blocks);
2591
2592        offset = wc->n_blocks * sizeof(struct wc_memory_entry);
2593        if (offset / sizeof(struct wc_memory_entry) != le64_to_cpu(sb(wc)->n_blocks)) {
2594overflow:
2595                ti->error = "Overflow in size calculation";
2596                r = -EINVAL;
2597                goto bad;
2598        }
2599        offset += sizeof(struct wc_memory_superblock);
2600        if (offset < sizeof(struct wc_memory_superblock))
2601                goto overflow;
2602        offset = (offset + wc->block_size - 1) & ~(size_t)(wc->block_size - 1);
2603        data_size = wc->n_blocks * (size_t)wc->block_size;
2604        if (!offset || (data_size / wc->block_size != wc->n_blocks) ||
2605            (offset + data_size < offset))
2606                goto overflow;
2607        if (offset + data_size > wc->memory_map_size) {
2608                ti->error = "Memory area is too small";
2609                r = -EINVAL;
2610                goto bad;
2611        }
2612
2613        wc->metadata_sectors = offset >> SECTOR_SHIFT;
2614        wc->block_start = (char *)sb(wc) + offset;
2615
2616        x = (uint64_t)wc->n_blocks * (100 - high_wm_percent);
2617        x += 50;
2618        do_div(x, 100);
2619        wc->freelist_high_watermark = x;
2620        x = (uint64_t)wc->n_blocks * (100 - low_wm_percent);
2621        x += 50;
2622        do_div(x, 100);
2623        wc->freelist_low_watermark = x;
2624
2625        if (wc->cleaner)
2626                activate_cleaner(wc);
2627
2628        r = writecache_alloc_entries(wc);
2629        if (r) {
2630                ti->error = "Cannot allocate memory";
2631                goto bad;
2632        }
2633
2634        ti->num_flush_bios = WC_MODE_PMEM(wc) ? 1 : 2;
2635        ti->flush_supported = true;
2636        ti->num_discard_bios = 1;
2637
2638        if (WC_MODE_PMEM(wc))
2639                persistent_memory_flush_cache(wc->memory_map, wc->memory_map_size);
2640
2641        return 0;
2642
2643bad_arguments:
2644        r = -EINVAL;
2645        ti->error = "Bad arguments";
2646bad:
2647        writecache_dtr(ti);
2648        return r;
2649}
2650
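/*
 * Status: STATUSTYPE_INFO reports the error flag, block counts and I/O
 * statistics; STATUSTYPE_TABLE reconstructs the constructor arguments.
 */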
2651static void writecache_status(struct dm_target *ti, status_type_t type,
2652                              unsigned status_flags, char *result, unsigned maxlen)
2653{
2654        struct dm_writecache *wc = ti->private;
2655        unsigned extra_args;
2656        unsigned sz = 0;
2657
2658        switch (type) {
2659        case STATUSTYPE_INFO:
2660                DMEMIT("%ld %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
2661                       writecache_has_error(wc),
2662                       (unsigned long long)wc->n_blocks, (unsigned long long)wc->freelist_size,
2663                       (unsigned long long)wc->writeback_size,
2664                       wc->stats.reads,
2665                       wc->stats.read_hits,
2666                       wc->stats.writes,
2667                       wc->stats.write_hits_uncommitted,
2668                       wc->stats.write_hits_committed,
2669                       wc->stats.writes_around,
2670                       wc->stats.writes_allocate,
2671                       wc->stats.writes_blocked_on_freelist,
2672                       wc->stats.flushes,
2673                       wc->stats.discards);
2674                break;
2675        case STATUSTYPE_TABLE:
2676                DMEMIT("%c %s %s %u ", WC_MODE_PMEM(wc) ? 'p' : 's',
2677                                wc->dev->name, wc->ssd_dev->name, wc->block_size);
2678                extra_args = 0;
2679                if (wc->start_sector_set)
2680                        extra_args += 2;
2681                if (wc->high_wm_percent_set)
2682                        extra_args += 2;
2683                if (wc->low_wm_percent_set)
2684                        extra_args += 2;
2685                if (wc->max_writeback_jobs_set)
2686                        extra_args += 2;
2687                if (wc->autocommit_blocks_set)
2688                        extra_args += 2;
2689                if (wc->autocommit_time_set)
2690                        extra_args += 2;
2691                if (wc->max_age_set)
2692                        extra_args += 2;
2693                if (wc->cleaner_set)
2694                        extra_args++;
2695                if (wc->writeback_fua_set)
2696                        extra_args++;
2697                if (wc->metadata_only)
2698                        extra_args++;
2699                if (wc->pause_set)
2700                        extra_args += 2;
2701
2702                DMEMIT("%u", extra_args);
2703                if (wc->start_sector_set)
2704                        DMEMIT(" start_sector %llu", (unsigned long long)wc->start_sector);
2705                if (wc->high_wm_percent_set)
2706                        DMEMIT(" high_watermark %u", wc->high_wm_percent_value);
2707                if (wc->low_wm_percent_set)
2708                        DMEMIT(" low_watermark %u", wc->low_wm_percent_value);
2709                if (wc->max_writeback_jobs_set)
2710                        DMEMIT(" writeback_jobs %u", wc->max_writeback_jobs);
2711                if (wc->autocommit_blocks_set)
2712                        DMEMIT(" autocommit_blocks %u", wc->autocommit_blocks);
2713                if (wc->autocommit_time_set)
2714                        DMEMIT(" autocommit_time %u", wc->autocommit_time_value);
2715                if (wc->max_age_set)
2716                        DMEMIT(" max_age %u", wc->max_age_value);
2717                if (wc->cleaner_set)
2718                        DMEMIT(" cleaner");
2719                if (wc->writeback_fua_set)
2720                        DMEMIT(" %sfua", wc->writeback_fua ? "" : "no");
2721                if (wc->metadata_only)
2722                        DMEMIT(" metadata_only");
2723                if (wc->pause_set)
2724                        DMEMIT(" pause_writeback %u", wc->pause_value);
2725                break;
2726        case STATUSTYPE_IMA:
2727                *result = '\0';
2728                break;
2729        }
2730}
2731
2732static struct target_type writecache_target = {
2733        .name                   = "writecache",
2734        .version                = {1, 6, 0},
2735        .module                 = THIS_MODULE,
2736        .ctr                    = writecache_ctr,
2737        .dtr                    = writecache_dtr,
2738        .status                 = writecache_status,
2739        .postsuspend            = writecache_suspend,
2740        .resume                 = writecache_resume,
2741        .message                = writecache_message,
2742        .map                    = writecache_map,
2743        .end_io                 = writecache_end_io,
2744        .iterate_devices        = writecache_iterate_devices,
2745        .io_hints               = writecache_io_hints,
2746};
2747
2748static int __init dm_writecache_init(void)
2749{
2750        int r;
2751
2752        r = dm_register_target(&writecache_target);
2753        if (r < 0) {
2754                DMERR("register failed %d", r);
2755                return r;
2756        }
2757
2758        return 0;
2759}
2760
2761static void __exit dm_writecache_exit(void)
2762{
2763        dm_unregister_target(&writecache_target);
2764}
2765
2766module_init(dm_writecache_init);
2767module_exit(dm_writecache_exit);
2768
2769MODULE_DESCRIPTION(DM_NAME " writecache target");
2770MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2771MODULE_LICENSE("GPL");
2772