#include "dm-cache-policy.h"
#include "dm.h"

#include <linux/hash.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "cache-policy-mq"

static struct kmem_cache *mq_entry_cache;

static unsigned next_power(unsigned n, unsigned min)
{
	return roundup_pow_of_two(max(n, min));
}

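/*----------------------------------------------------------------*/

/*
 * Large, sequential ios are probably best left on the origin device, since
 * spindles tend to have good sequential bandwidth.  The io_tracker tries to
 * spot when the io stream has become sequential so the policy can decline to
 * promote such blocks (see map()).  The two thresholds below control how
 * quickly it flips between the random and sequential patterns; they can be
 * tuned via the "random_threshold" and "sequential_threshold" config values.
 */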
#define RANDOM_THRESHOLD_DEFAULT 4
#define SEQUENTIAL_THRESHOLD_DEFAULT 512

enum io_pattern {
	PATTERN_SEQUENTIAL,
	PATTERN_RANDOM
};

struct io_tracker {
	enum io_pattern pattern;

	unsigned nr_seq_samples;
	unsigned nr_rand_samples;
	unsigned thresholds[2];

	dm_oblock_t last_end_oblock;
};

static void iot_init(struct io_tracker *t,
		     int sequential_threshold, int random_threshold)
{
	t->pattern = PATTERN_RANDOM;
	t->nr_seq_samples = 0;
	t->nr_rand_samples = 0;
	t->last_end_oblock = 0;
	t->thresholds[PATTERN_RANDOM] = random_threshold;
	t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold;
}

static enum io_pattern iot_pattern(struct io_tracker *t)
{
	return t->pattern;
}

static void iot_update_stats(struct io_tracker *t, struct bio *bio)
{
	if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1)
		t->nr_seq_samples++;
	else {
		/*
		 * Just one non-sequential io is enough to reset the
		 * counters.
		 */
		if (t->nr_seq_samples) {
			t->nr_seq_samples = 0;
			t->nr_rand_samples = 0;
		}

		t->nr_rand_samples++;
	}

	t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1);
}

static void iot_check_for_pattern_switch(struct io_tracker *t)
{
	switch (t->pattern) {
	case PATTERN_SEQUENTIAL:
		if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) {
			t->pattern = PATTERN_RANDOM;
			t->nr_seq_samples = t->nr_rand_samples = 0;
		}
		break;

	case PATTERN_RANDOM:
		if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) {
			t->pattern = PATTERN_SEQUENTIAL;
			t->nr_seq_samples = t->nr_rand_samples = 0;
		}
		break;
	}
}

static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
{
	iot_update_stats(t, bio);
	iot_check_for_pattern_switch(t);
}

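/*----------------------------------------------------------------*/

/*
 * A queue is a set of NR_QUEUE_LEVELS lru lists.  Entries live on the level
 * given by queue_level() below, so frequently hit blocks sit in higher
 * levels.  Each level also carries three sentinel nodes: one set is rotated
 * to the back of its level on every tick (queue_tick()), and two alternating
 * sets are rotated once per WRITEBACK_PERIOD
 * (queue_update_writeback_sentinels()).  The sentinels let us tell which
 * entries were pushed since a given point in time (see queue_pop_old() and
 * queue_iterate_tick()).
 */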
#define NR_QUEUE_LEVELS 16u
#define NR_SENTINELS (NR_QUEUE_LEVELS * 3)

#define WRITEBACK_PERIOD HZ

struct queue {
	unsigned nr_elts;
	bool current_writeback_sentinels;
	unsigned long next_writeback;
	struct list_head qs[NR_QUEUE_LEVELS];
	struct list_head sentinels[NR_SENTINELS];
};

static void queue_init(struct queue *q)
{
	unsigned i;

	q->nr_elts = 0;
	q->current_writeback_sentinels = false;
	q->next_writeback = 0;
	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
		INIT_LIST_HEAD(q->qs + i);
		INIT_LIST_HEAD(q->sentinels + i);
		INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i);
		INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i);
	}
}

static unsigned queue_size(struct queue *q)
{
	return q->nr_elts;
}

static bool queue_empty(struct queue *q)
{
	return q->nr_elts == 0;
}

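/*
 * Insert an entry at the back of the given level.
 */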
static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
{
	q->nr_elts++;
	list_add_tail(elt, q->qs + level);
}

static void queue_remove(struct queue *q, struct list_head *elt)
{
	q->nr_elts--;
	list_del(elt);
}

static bool is_sentinel(struct queue *q, struct list_head *h)
{
	return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS));
}

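/*
 * Returns the oldest non-sentinel entry from the lowest populated level, or
 * NULL if the queue is empty.
 */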
static struct list_head *queue_peek(struct queue *q)
{
	unsigned level;
	struct list_head *h;

	for (level = 0; level < NR_QUEUE_LEVELS; level++)
		list_for_each(h, q->qs + level)
			if (!is_sentinel(q, h))
				return h;

	return NULL;
}

static struct list_head *queue_pop(struct queue *q)
{
	struct list_head *r = queue_peek(q);

	if (r) {
		q->nr_elts--;
		list_del(r);
	}

	return r;
}

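/*
 * Pops an entry only if it sits in front of the first sentinel in its level,
 * ie. it has been in the queue since before that sentinel was last rotated to
 * the back.  Used to find entries that have been hanging around for a while
 * (eg. dirty blocks that are overdue for writeback).
 */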
static struct list_head *queue_pop_old(struct queue *q)
{
	unsigned level;
	struct list_head *h;

	for (level = 0; level < NR_QUEUE_LEVELS; level++)
		list_for_each(h, q->qs + level) {
			if (is_sentinel(q, h))
				break;

			q->nr_elts--;
			list_del(h);
			return h;
		}

	return NULL;
}

static struct list_head *list_pop(struct list_head *lh)
{
	struct list_head *r = lh->next;

	BUG_ON(!r);
	list_del_init(r);

	return r;
}

static struct list_head *writeback_sentinel(struct queue *q, unsigned level)
{
	if (q->current_writeback_sentinels)
		return q->sentinels + NR_QUEUE_LEVELS + level;
	else
		return q->sentinels + 2 * NR_QUEUE_LEVELS + level;
}

static void queue_update_writeback_sentinels(struct queue *q)
{
	unsigned i;
	struct list_head *h;

	if (time_after(jiffies, q->next_writeback)) {
		for (i = 0; i < NR_QUEUE_LEVELS; i++) {
			h = writeback_sentinel(q, i);
			list_del(h);
			list_add_tail(h, q->qs + i);
		}

		q->next_writeback = jiffies + WRITEBACK_PERIOD;
		q->current_writeback_sentinels = !q->current_writeback_sentinels;
	}
}

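/*
 * Called once per tick: rotates each level's tick sentinel to the back, so
 * entries pushed after this point can be distinguished from those already
 * present (queue_iterate_tick() walks backwards from the tail and stops at
 * the first sentinel it meets).
 */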
static void queue_tick(struct queue *q)
{
	unsigned i;

	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
		list_del(q->sentinels + i);
		list_add_tail(q->sentinels + i, q->qs + i);
	}
}

typedef void (*iter_fn)(struct list_head *, void *);
static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context)
{
	unsigned i;
	struct list_head *h;

	for (i = 0; i < NR_QUEUE_LEVELS; i++) {
		list_for_each_prev(h, q->qs + i) {
			if (is_sentinel(q, h))
				break;

			fn(h, context);
		}
	}
}

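/*----------------------------------------------------------------*/

/*
 * Describes a cache entry.  Used in both the cache and the pre_cache.  The
 * hlist node links the entry into the hash table, the list node into one of
 * the queues.
 */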
struct entry {
	struct hlist_node hlist;
	struct list_head list;
	dm_oblock_t oblock;

	bool dirty:1;
	unsigned hit_count;
};

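/*
 * An entry_pool manages a preallocated (vzalloc'd) array of entries.  Unused
 * entries are kept on a free list, and a cache entry's cblock is simply its
 * index within the array (see infer_cblock()).
 */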
struct entry_pool {
	struct entry *entries, *entries_end;
	struct list_head free;
	unsigned nr_allocated;
};

static int epool_init(struct entry_pool *ep, unsigned nr_entries)
{
	unsigned i;

	ep->entries = vzalloc(sizeof(struct entry) * nr_entries);
	if (!ep->entries)
		return -ENOMEM;

	ep->entries_end = ep->entries + nr_entries;

	INIT_LIST_HEAD(&ep->free);
	for (i = 0; i < nr_entries; i++)
		list_add(&ep->entries[i].list, &ep->free);

	ep->nr_allocated = 0;

	return 0;
}

static void epool_exit(struct entry_pool *ep)
{
	vfree(ep->entries);
}

static struct entry *alloc_entry(struct entry_pool *ep)
{
	struct entry *e;

	if (list_empty(&ep->free))
		return NULL;

	e = list_entry(list_pop(&ep->free), struct entry, list);
	INIT_LIST_HEAD(&e->list);
	INIT_HLIST_NODE(&e->hlist);
	ep->nr_allocated++;

	return e;
}

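/*
 * This assumes the cblock hasn't already been allocated.
 */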
static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
{
	struct entry *e = ep->entries + from_cblock(cblock);

	list_del_init(&e->list);
	INIT_HLIST_NODE(&e->hlist);
	ep->nr_allocated++;

	return e;
}

static void free_entry(struct entry_pool *ep, struct entry *e)
{
	BUG_ON(!ep->nr_allocated);
	ep->nr_allocated--;
	INIT_HLIST_NODE(&e->hlist);
	list_add(&e->list, &ep->free);
}

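/*
 * Returns NULL if the entry at @cblock appears to be free: an in-use entry
 * gets hashed when it is pushed onto a queue, so an unhashed hlist node
 * indicates a free slot.
 */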
static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock)
{
	struct entry *e = ep->entries + from_cblock(cblock);
	return !hlist_unhashed(&e->hlist) ? e : NULL;
}

static bool epool_empty(struct entry_pool *ep)
{
	return list_empty(&ep->free);
}

static bool in_pool(struct entry_pool *ep, struct entry *e)
{
	return e >= ep->entries && e < ep->entries_end;
}

static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e)
{
	return to_cblock(e - ep->entries);
}

struct mq_policy {
	struct dm_cache_policy policy;

	/* serialises all policy operations below */
	struct mutex lock;
	dm_cblock_t cache_size;
	struct io_tracker tracker;

	/*
	 * Entries come from two pools, one for the pre_cache and one for
	 * the cache proper.
	 */
	struct entry_pool pre_cache_pool;
	struct entry_pool cache_pool;

	/*
	 * We maintain three queues of entries.  The cache proper,
	 * consisting of a clean and a dirty queue, holds the currently
	 * mapped blocks.  The pre_cache tracks blocks that are being hit
	 * frequently and are candidates for promotion to the cache.
	 */
	struct queue pre_cache;
	struct queue cache_clean;
	struct queue cache_dirty;

	/*
	 * Keeps track of time, incremented by the core target.  We use
	 * this to avoid attributing multiple hits to an entry within the
	 * same tick.
	 *
	 * tick_protected is updated under tick_lock and copied into tick
	 * (under the mutex) at the start of the map function.
	 */
	spinlock_t tick_lock;
	unsigned tick_protected;
	unsigned tick;

	/*
	 * Roughly counts hits on cache entries; used to decide when a new
	 * generation starts.
	 */
	unsigned hit_count;

	/*
	 * A generation is a longish period used to trigger some occasional
	 * housekeeping (see check_generation()).
	 */
	unsigned generation;
	unsigned generation_period;

	unsigned discard_promote_adjustment;
	unsigned read_promote_adjustment;
	unsigned write_promote_adjustment;

	/*
	 * The hash table lets us find an entry quickly by origin block.
	 * Both pre_cache and cache entries live in here.
	 */
	unsigned nr_buckets;
	dm_block_t hash_bits;
	struct hlist_head *table;
};

#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128

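/*----------------------------------------------------------------*/

/*
 * Simple hash table keyed by origin block.  Both pre_cache and cache entries
 * live in here.  A successful lookup moves the entry to the front of its
 * bucket so hot entries are found quickly.
 */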
static void hash_insert(struct mq_policy *mq, struct entry *e)
{
	unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits);

	hlist_add_head(&e->hlist, mq->table + h);
}

static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock)
{
	unsigned h = hash_64(from_oblock(oblock), mq->hash_bits);
	struct hlist_head *bucket = mq->table + h;
	struct entry *e;

	hlist_for_each_entry(e, bucket, hlist)
		if (e->oblock == oblock) {
			hlist_del(&e->hlist);
			hlist_add_head(&e->hlist, bucket);
			return e;
		}

	return NULL;
}

static void hash_remove(struct entry *e)
{
	hlist_del(&e->hlist);
}

static bool any_free_cblocks(struct mq_policy *mq)
{
	return !epool_empty(&mq->cache_pool);
}

static bool any_clean_cblocks(struct mq_policy *mq)
{
	return !queue_empty(&mq->cache_clean);
}

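/*
 * Entries are kept at a queue level derived from the log2 of their hit count
 * (capped at NR_QUEUE_LEVELS - 1), so frequently hit blocks drift towards the
 * higher levels.
 */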
static unsigned queue_level(struct entry *e)
{
	return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
}

static bool in_cache(struct mq_policy *mq, struct entry *e)
{
	return in_pool(&mq->cache_pool, e);
}

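/*
 * Inserts the entry into the hash table and onto the back of its level in the
 * appropriate queue: the clean or dirty cache queue for cache entries, the
 * pre_cache queue otherwise.
 */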
static void push(struct mq_policy *mq, struct entry *e)
{
	hash_insert(mq, e);

	if (in_cache(mq, e))
		queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
			   queue_level(e), &e->list);
	else
		queue_push(&mq->pre_cache, queue_level(e), &e->list);
}

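/*
 * Removes an entry from its queue and the hash table.
 */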
static void del(struct mq_policy *mq, struct entry *e)
{
	if (in_cache(mq, e))
		queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list);
	else
		queue_remove(&mq->pre_cache, &e->list);

	hash_remove(e);
}

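/*
 * Like del, except it removes and unhashes the first (ie. coldest)
 * non-sentinel entry of the given queue, or returns NULL if the queue is
 * empty.  pop_old() below only considers entries older than the leading
 * sentinel of their level.
 */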
static struct entry *pop(struct mq_policy *mq, struct queue *q)
{
	struct entry *e;
	struct list_head *h = queue_pop(q);

	if (!h)
		return NULL;

	e = container_of(h, struct entry, list);
	hash_remove(e);

	return e;
}

static struct entry *pop_old(struct mq_policy *mq, struct queue *q)
{
	struct entry *e;
	struct list_head *h = queue_pop_old(q);

	if (!h)
		return NULL;

	e = container_of(h, struct entry, list);
	hash_remove(e);

	return e;
}

static struct entry *peek(struct queue *q)
{
	struct list_head *h = queue_peek(q);
	return h ? container_of(h, struct entry, list) : NULL;
}

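/*
 * Whenever a cached entry is hit its hit_count is bumped and it drifts up the
 * levels.  check_generation() runs once mq->hit_count reaches
 * generation_period (and only when the cache is full): it starts a new
 * generation and samples the hit counts of up to MAX_TO_AVERAGE entries from
 * the lowest populated levels of the clean and dirty queues.
 */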
#define MAX_TO_AVERAGE 20

static void check_generation(struct mq_policy *mq)
{
	unsigned total = 0, nr = 0, count = 0, level;
	struct list_head *head;
	struct entry *e;

	if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) {
		mq->hit_count = 0;
		mq->generation++;

		for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
			head = mq->cache_clean.qs + level;
			list_for_each_entry(e, head, list) {
				nr++;
				total += e->hit_count;

				if (++count >= MAX_TO_AVERAGE)
					break;
			}

			head = mq->cache_dirty.qs + level;
			list_for_each_entry(e, head, list) {
				nr++;
				total += e->hit_count;

				if (++count >= MAX_TO_AVERAGE)
					break;
			}
		}
	}
}

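/*
 * Rotates an entry to the back of its (possibly new) level, giving
 * check_generation() a chance to run first.
 */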
static void requeue(struct mq_policy *mq, struct entry *e)
{
	check_generation(mq);
	del(mq, e);
	push(mq, e);
}

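/*
 * Demote the coldest clean entry from the cache, returning the origin block
 * it was mapped to and freeing its cblock for reuse.  Only clean blocks are
 * considered: demoting a dirty block would make the triggering bio wait for
 * writeback, so if there's nothing clean we return -ENOSPC and the caller
 * falls back to a plain miss.  -EBUSY means the chosen victim is currently
 * locked by the core target.
 */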
static int demote_cblock(struct mq_policy *mq,
			 struct policy_locker *locker, dm_oblock_t *oblock)
{
	struct entry *demoted = peek(&mq->cache_clean);

	if (!demoted)
		/*
		 * No clean block available to demote; see the comment
		 * above.
		 */
		return -ENOSPC;

	if (locker->fn(locker, demoted->oblock))
		/*
		 * We couldn't lock the demoted block.
		 */
		return -EBUSY;

	del(mq, demoted);
	*oblock = demoted->oblock;
	free_entry(&mq->cache_pool, demoted);

	return 0;
}

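/*
 * The promotion threshold reflects how hard it is to free a cache block:
 * zero while free cblocks remain, the hit count of the coldest clean entry
 * once the cache is full, or the coldest dirty entry's hit count plus a
 * hefty penalty if everything is dirty.
 */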
static unsigned promote_threshold(struct mq_policy *mq)
{
	struct entry *e;

	if (any_free_cblocks(mq))
		return 0;

	e = peek(&mq->cache_clean);
	if (e)
		return e->hit_count;

	e = peek(&mq->cache_dirty);
	if (e)
		return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD;

	return 0;
}

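/*
 * The threshold is then biased by io type: reads are cheaper to promote than
 * writes, and a write to a discarded block can be promoted very cheaply
 * (nothing needs copying from the origin) as long as a free or clean cblock
 * can be found for it.
 */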
static unsigned adjusted_promote_threshold(struct mq_policy *mq,
					   bool discarded_oblock, int data_dir)
{
	if (data_dir == READ)
		return promote_threshold(mq) + mq->read_promote_adjustment;

	if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
		/*
		 * We don't need to do any copying at all, so give this a
		 * very low threshold.
		 */
		return mq->discard_promote_adjustment;
	}

	return promote_threshold(mq) + mq->write_promote_adjustment;
}

static bool should_promote(struct mq_policy *mq, struct entry *e,
			   bool discarded_oblock, int data_dir)
{
	return e->hit_count >=
		adjusted_promote_threshold(mq, discarded_oblock, data_dir);
}

static int cache_entry_found(struct mq_policy *mq,
			     struct entry *e,
			     struct policy_result *result)
{
	requeue(mq, e);

	if (in_cache(mq, e)) {
		result->op = POLICY_HIT;
		result->cblock = infer_cblock(&mq->cache_pool, e);
	}

	return 0;
}

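/*
 * Moves an entry from the pre_cache to the cache proper, either into a free
 * cblock (POLICY_NEW) or into one freed by demoting the coldest clean entry
 * (POLICY_REPLACE).  If demotion fails we downgrade to a plain miss.
 */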
static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
			      struct policy_locker *locker,
			      struct policy_result *result)
{
	int r;
	struct entry *new_e;

	/* Ensure there's a free cblock in the cache */
	if (epool_empty(&mq->cache_pool)) {
		result->op = POLICY_REPLACE;
		r = demote_cblock(mq, locker, &result->old_oblock);
		if (r) {
			result->op = POLICY_MISS;
			return 0;
		}

	} else
		result->op = POLICY_NEW;

	new_e = alloc_entry(&mq->cache_pool);
	BUG_ON(!new_e);

	new_e->oblock = e->oblock;
	new_e->dirty = false;
	new_e->hit_count = e->hit_count;

	del(mq, e);
	free_entry(&mq->pre_cache_pool, e);
	push(mq, new_e);

	result->cblock = infer_cblock(&mq->cache_pool, new_e);

	return 0;
}

static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
				 bool can_migrate, bool discarded_oblock,
				 int data_dir, struct policy_locker *locker,
				 struct policy_result *result)
{
	int r = 0;

	if (!should_promote(mq, e, discarded_oblock, data_dir)) {
		requeue(mq, e);
		result->op = POLICY_MISS;

	} else if (!can_migrate)
		r = -EWOULDBLOCK;

	else {
		requeue(mq, e);
		r = pre_cache_to_cache(mq, e, locker, result);
	}

	return r;
}

static void insert_in_pre_cache(struct mq_policy *mq,
				dm_oblock_t oblock)
{
	struct entry *e = alloc_entry(&mq->pre_cache_pool);

	if (!e)
		/*
		 * There's no spare entry structure, so we grab the least
		 * used one from the pre_cache.
		 */
		e = pop(mq, &mq->pre_cache);

	if (unlikely(!e)) {
		DMWARN("couldn't pop from pre cache");
		return;
	}

	e->dirty = false;
	e->oblock = oblock;
	e->hit_count = 1;
	push(mq, e);
}

static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
			    struct policy_locker *locker,
			    struct policy_result *result)
{
	int r;
	struct entry *e;

	if (epool_empty(&mq->cache_pool)) {
		result->op = POLICY_REPLACE;
		r = demote_cblock(mq, locker, &result->old_oblock);
		if (unlikely(r)) {
			result->op = POLICY_MISS;
			insert_in_pre_cache(mq, oblock);
			return;
		}

		/*
		 * This will always succeed, since we've just demoted.
		 */
		e = alloc_entry(&mq->cache_pool);
		BUG_ON(!e);

	} else {
		e = alloc_entry(&mq->cache_pool);
		result->op = POLICY_NEW;
	}

	e->oblock = oblock;
	e->dirty = false;
	e->hit_count = 1;
	push(mq, e);

	result->cblock = infer_cblock(&mq->cache_pool, e);
}

static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
			  bool can_migrate, bool discarded_oblock,
			  int data_dir, struct policy_locker *locker,
			  struct policy_result *result)
{
	if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
		if (can_migrate)
			insert_in_cache(mq, oblock, locker, result);
		else
			return -EWOULDBLOCK;
	} else {
		insert_in_pre_cache(mq, oblock);
		result->op = POLICY_MISS;
	}

	return 0;
}

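/*
 * Looks up the origin block and dispatches to one of the handlers above: a
 * straight cache hit, a forced miss while the io stream looks sequential, a
 * pre_cache hit that may trigger promotion, or a block we've never seen
 * before.
 */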
static int map(struct mq_policy *mq, dm_oblock_t oblock,
	       bool can_migrate, bool discarded_oblock,
	       int data_dir, struct policy_locker *locker,
	       struct policy_result *result)
{
	int r = 0;
	struct entry *e = hash_lookup(mq, oblock);

	if (e && in_cache(mq, e))
		r = cache_entry_found(mq, e, result);

	else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] &&
		 iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
		result->op = POLICY_MISS;

	else if (e)
		r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
					  data_dir, locker, result);

	else
		r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
				   data_dir, locker, result);

	if (r == -EWOULDBLOCK)
		result->op = POLICY_MISS;

	return r;
}

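/*----------------------------------------------------------------*/

/*
 * Public interface, via the policy struct.  See dm-cache-policy.h for a
 * description of these.
 */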
static struct mq_policy *to_mq_policy(struct dm_cache_policy *p)
{
	return container_of(p, struct mq_policy, policy);
}

static void mq_destroy(struct dm_cache_policy *p)
{
	struct mq_policy *mq = to_mq_policy(p);

	vfree(mq->table);
	epool_exit(&mq->cache_pool);
	epool_exit(&mq->pre_cache_pool);
	kfree(mq);
}

static void update_pre_cache_hits(struct list_head *h, void *context)
{
	struct entry *e = container_of(h, struct entry, list);
	e->hit_count++;
}

static void update_cache_hits(struct list_head *h, void *context)
{
	struct mq_policy *mq = context;
	struct entry *e = container_of(h, struct entry, list);
	e->hit_count++;
	mq->hit_count++;
}

static void copy_tick(struct mq_policy *mq)
{
	unsigned long flags, tick;

	spin_lock_irqsave(&mq->tick_lock, flags);
	tick = mq->tick_protected;
	if (tick != mq->tick) {
		queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq);
		queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq);
		queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq);
		mq->tick = tick;
	}

	queue_tick(&mq->pre_cache);
	queue_tick(&mq->cache_dirty);
	queue_tick(&mq->cache_clean);
	queue_update_writeback_sentinels(&mq->cache_dirty);
	spin_unlock_irqrestore(&mq->tick_lock, flags);
}

static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
		  bool can_block, bool can_migrate, bool discarded_oblock,
		  struct bio *bio, struct policy_locker *locker,
		  struct policy_result *result)
{
	int r;
	struct mq_policy *mq = to_mq_policy(p);

	result->op = POLICY_MISS;

	if (can_block)
		mutex_lock(&mq->lock);
	else if (!mutex_trylock(&mq->lock))
		return -EWOULDBLOCK;

	copy_tick(mq);

	iot_examine_bio(&mq->tracker, bio);
	r = map(mq, oblock, can_migrate, discarded_oblock,
		bio_data_dir(bio), locker, result);

	mutex_unlock(&mq->lock);

	return r;
}

static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
{
	int r;
	struct mq_policy *mq = to_mq_policy(p);
	struct entry *e;

	if (!mutex_trylock(&mq->lock))
		return -EWOULDBLOCK;

	e = hash_lookup(mq, oblock);
	if (e && in_cache(mq, e)) {
		*cblock = infer_cblock(&mq->cache_pool, e);
		r = 0;
	} else
		r = -ENOENT;

	mutex_unlock(&mq->lock);

	return r;
}

static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set)
{
	struct entry *e;

	e = hash_lookup(mq, oblock);
	BUG_ON(!e || !in_cache(mq, e));

	del(mq, e);
	e->dirty = set;
	push(mq, e);
}

static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	__mq_set_clear_dirty(mq, oblock, true);
	mutex_unlock(&mq->lock);
}

static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	__mq_set_clear_dirty(mq, oblock, false);
	mutex_unlock(&mq->lock);
}

static int mq_load_mapping(struct dm_cache_policy *p,
			   dm_oblock_t oblock, dm_cblock_t cblock,
			   uint32_t hint, bool hint_valid)
{
	struct mq_policy *mq = to_mq_policy(p);
	struct entry *e;

	e = alloc_particular_entry(&mq->cache_pool, cblock);
	e->oblock = oblock;
	e->dirty = false;
	e->hit_count = hint_valid ? hint : 1;
	push(mq, e);

	return 0;
}

static int mq_save_hints(struct mq_policy *mq, struct queue *q,
			 policy_walk_fn fn, void *context)
{
	int r;
	unsigned level;
	struct list_head *h;
	struct entry *e;

	for (level = 0; level < NR_QUEUE_LEVELS; level++)
		list_for_each(h, q->qs + level) {
			if (is_sentinel(q, h))
				continue;

			e = container_of(h, struct entry, list);
			r = fn(context, infer_cblock(&mq->cache_pool, e),
			       e->oblock, e->hit_count);
			if (r)
				return r;
		}

	return 0;
}

static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
			    void *context)
{
	struct mq_policy *mq = to_mq_policy(p);
	int r = 0;

	mutex_lock(&mq->lock);

	r = mq_save_hints(mq, &mq->cache_clean, fn, context);
	if (!r)
		r = mq_save_hints(mq, &mq->cache_dirty, fn, context);

	mutex_unlock(&mq->lock);

	return r;
}

static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
{
	struct entry *e;

	e = hash_lookup(mq, oblock);
	BUG_ON(!e || !in_cache(mq, e));

	del(mq, e);
	free_entry(&mq->cache_pool, e);
}

static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
{
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	__remove_mapping(mq, oblock);
	mutex_unlock(&mq->lock);
}

static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock)
{
	struct entry *e = epool_find(&mq->cache_pool, cblock);

	if (!e)
		return -ENODATA;

	del(mq, e);
	free_entry(&mq->cache_pool, e);

	return 0;
}

static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
{
	int r;
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	r = __remove_cblock(mq, cblock);
	mutex_unlock(&mq->lock);

	return r;
}

#define CLEAN_TARGET_PERCENTAGE 25

static bool clean_target_met(struct mq_policy *mq)
{
	/*
	 * Cache entries may not be populated, so we cannot rely on the
	 * size of the clean queue.
	 */
	unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty);
	unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100;

	return nr_clean >= target;
}

static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
			       dm_cblock_t *cblock)
{
	struct entry *e = pop_old(mq, &mq->cache_dirty);

	if (!e && !clean_target_met(mq))
		e = pop(mq, &mq->cache_dirty);

	if (!e)
		return -ENODATA;

	*oblock = e->oblock;
	*cblock = infer_cblock(&mq->cache_pool, e);
	e->dirty = false;
	push(mq, e);

	return 0;
}

static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
			     dm_cblock_t *cblock, bool critical_only)
{
	int r;
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	r = __mq_writeback_work(mq, oblock, cblock);
	mutex_unlock(&mq->lock);

	return r;
}

static void __force_mapping(struct mq_policy *mq,
			    dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
	struct entry *e = hash_lookup(mq, current_oblock);

	if (e && in_cache(mq, e)) {
		del(mq, e);
		e->oblock = new_oblock;
		e->dirty = true;
		push(mq, e);
	}
}

static void mq_force_mapping(struct dm_cache_policy *p,
			     dm_oblock_t current_oblock, dm_oblock_t new_oblock)
{
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	__force_mapping(mq, current_oblock, new_oblock);
	mutex_unlock(&mq->lock);
}

static dm_cblock_t mq_residency(struct dm_cache_policy *p)
{
	dm_cblock_t r;
	struct mq_policy *mq = to_mq_policy(p);

	mutex_lock(&mq->lock);
	r = to_cblock(mq->cache_pool.nr_allocated);
	mutex_unlock(&mq->lock);

	return r;
}

static void mq_tick(struct dm_cache_policy *p, bool can_block)
{
	struct mq_policy *mq = to_mq_policy(p);
	unsigned long flags;

	spin_lock_irqsave(&mq->tick_lock, flags);
	mq->tick_protected++;
	spin_unlock_irqrestore(&mq->tick_lock, flags);

	if (can_block) {
		mutex_lock(&mq->lock);
		copy_tick(mq);
		mutex_unlock(&mq->lock);
	}
}

static int mq_set_config_value(struct dm_cache_policy *p,
			       const char *key, const char *value)
{
	struct mq_policy *mq = to_mq_policy(p);
	unsigned long tmp;

	if (kstrtoul(value, 10, &tmp))
		return -EINVAL;

	if (!strcasecmp(key, "random_threshold")) {
		mq->tracker.thresholds[PATTERN_RANDOM] = tmp;

	} else if (!strcasecmp(key, "sequential_threshold")) {
		mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp;

	} else if (!strcasecmp(key, "discard_promote_adjustment"))
		mq->discard_promote_adjustment = tmp;

	else if (!strcasecmp(key, "read_promote_adjustment"))
		mq->read_promote_adjustment = tmp;

	else if (!strcasecmp(key, "write_promote_adjustment"))
		mq->write_promote_adjustment = tmp;

	else
		return -EINVAL;

	return 0;
}

static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
				 unsigned maxlen, ssize_t *sz_ptr)
{
	ssize_t sz = *sz_ptr;
	struct mq_policy *mq = to_mq_policy(p);

	DMEMIT("10 random_threshold %u "
	       "sequential_threshold %u "
	       "discard_promote_adjustment %u "
	       "read_promote_adjustment %u "
	       "write_promote_adjustment %u ",
	       mq->tracker.thresholds[PATTERN_RANDOM],
	       mq->tracker.thresholds[PATTERN_SEQUENTIAL],
	       mq->discard_promote_adjustment,
	       mq->read_promote_adjustment,
	       mq->write_promote_adjustment);

	*sz_ptr = sz;
	return 0;
}

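/* Init the policy plugin interface function pointers. */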
static void init_policy_functions(struct mq_policy *mq)
{
	mq->policy.destroy = mq_destroy;
	mq->policy.map = mq_map;
	mq->policy.lookup = mq_lookup;
	mq->policy.set_dirty = mq_set_dirty;
	mq->policy.clear_dirty = mq_clear_dirty;
	mq->policy.load_mapping = mq_load_mapping;
	mq->policy.walk_mappings = mq_walk_mappings;
	mq->policy.remove_mapping = mq_remove_mapping;
	mq->policy.remove_cblock = mq_remove_cblock;
	mq->policy.writeback_work = mq_writeback_work;
	mq->policy.force_mapping = mq_force_mapping;
	mq->policy.residency = mq_residency;
	mq->policy.tick = mq_tick;
	mq->policy.emit_config_values = mq_emit_config_values;
	mq->policy.set_config_value = mq_set_config_value;
}

static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
					 sector_t origin_size,
					 sector_t cache_block_size)
{
	struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);

	if (!mq)
		return NULL;

	init_policy_functions(mq);
	iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
	mq->cache_size = cache_size;

	if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) {
		DMERR("couldn't initialize pool of pre-cache entries");
		goto bad_pre_cache_init;
	}

	if (epool_init(&mq->cache_pool, from_cblock(cache_size))) {
		DMERR("couldn't initialize pool of cache entries");
		goto bad_cache_init;
	}

	mq->tick_protected = 0;
	mq->tick = 0;
	mq->hit_count = 0;
	mq->generation = 0;
	mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
	mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
	mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
	mutex_init(&mq->lock);
	spin_lock_init(&mq->tick_lock);

	queue_init(&mq->pre_cache);
	queue_init(&mq->cache_clean);
	queue_init(&mq->cache_dirty);

	mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);

	mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
	mq->hash_bits = ffs(mq->nr_buckets) - 1;
	mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
	if (!mq->table)
		goto bad_alloc_table;

	return &mq->policy;

bad_alloc_table:
	epool_exit(&mq->cache_pool);
bad_cache_init:
	epool_exit(&mq->pre_cache_pool);
bad_pre_cache_init:
	kfree(mq);

	return NULL;
}

static struct dm_cache_policy_type mq_policy_type = {
	.name = "mq",
	.version = {1, 4, 0},
	.hint_size = 4,
	.owner = THIS_MODULE,
	.create = mq_create
};

static int __init mq_init(void)
{
	int r;

	mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry",
					   sizeof(struct entry),
					   __alignof__(struct entry),
					   0, NULL);
	if (!mq_entry_cache)
		return -ENOMEM;

	r = dm_cache_policy_register(&mq_policy_type);
	if (r) {
		DMERR("register failed %d", r);
		kmem_cache_destroy(mq_entry_cache);
		return r;
	}

	return 0;
}

static void __exit mq_exit(void)
{
	dm_cache_policy_unregister(&mq_policy_type);

	kmem_cache_destroy(mq_entry_cache);
}

module_init(mq_init);
module_exit(mq_exit);

MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("mq cache policy");