linux/drivers/md/dm-cache-policy-mq.c
   1/*
   2 * Copyright (C) 2012 Red Hat. All rights reserved.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include "dm-cache-policy.h"
   8#include "dm.h"
   9
  10#include <linux/hash.h>
  11#include <linux/module.h>
  12#include <linux/mutex.h>
  13#include <linux/slab.h>
  14#include <linux/vmalloc.h>
  15
  16#define DM_MSG_PREFIX "cache-policy-mq"
  17
  18static struct kmem_cache *mq_entry_cache;
  19
  20/*----------------------------------------------------------------*/
  21
  22static unsigned next_power(unsigned n, unsigned min)
  23{
  24        return roundup_pow_of_two(max(n, min));
  25}
  26
  27/*----------------------------------------------------------------*/
  28
  29/*
  30 * Large, sequential ios are probably better left on the origin device since
  31 * spindles tend to have good bandwidth.
  32 *
  33 * The io_tracker tries to spot when the io is in one of these sequential
  34 * modes.
  35 *
   36 * The two thresholds for switching between random and sequential io mode
   37 * default as follows and can be adjusted via the constructor and message interfaces.
  38 */
  39#define RANDOM_THRESHOLD_DEFAULT 4
  40#define SEQUENTIAL_THRESHOLD_DEFAULT 512
  41
  42enum io_pattern {
  43        PATTERN_SEQUENTIAL,
  44        PATTERN_RANDOM
  45};
  46
  47struct io_tracker {
  48        enum io_pattern pattern;
  49
  50        unsigned nr_seq_samples;
  51        unsigned nr_rand_samples;
  52        unsigned thresholds[2];
  53
  54        dm_oblock_t last_end_oblock;
  55};
  56
  57static void iot_init(struct io_tracker *t,
  58                     int sequential_threshold, int random_threshold)
  59{
  60        t->pattern = PATTERN_RANDOM;
  61        t->nr_seq_samples = 0;
  62        t->nr_rand_samples = 0;
  63        t->last_end_oblock = 0;
  64        t->thresholds[PATTERN_RANDOM] = random_threshold;
  65        t->thresholds[PATTERN_SEQUENTIAL] = sequential_threshold;
  66}
  67
  68static enum io_pattern iot_pattern(struct io_tracker *t)
  69{
  70        return t->pattern;
  71}
  72
  73static void iot_update_stats(struct io_tracker *t, struct bio *bio)
  74{
  75        if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1)
  76                t->nr_seq_samples++;
  77        else {
  78                /*
  79                 * Just one non-sequential IO is enough to reset the
  80                 * counters.
  81                 */
  82                if (t->nr_seq_samples) {
  83                        t->nr_seq_samples = 0;
  84                        t->nr_rand_samples = 0;
  85                }
  86
  87                t->nr_rand_samples++;
  88        }
  89
  90        t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1);
  91}
  92
  93static void iot_check_for_pattern_switch(struct io_tracker *t)
  94{
  95        switch (t->pattern) {
  96        case PATTERN_SEQUENTIAL:
  97                if (t->nr_rand_samples >= t->thresholds[PATTERN_RANDOM]) {
  98                        t->pattern = PATTERN_RANDOM;
  99                        t->nr_seq_samples = t->nr_rand_samples = 0;
 100                }
 101                break;
 102
 103        case PATTERN_RANDOM:
 104                if (t->nr_seq_samples >= t->thresholds[PATTERN_SEQUENTIAL]) {
 105                        t->pattern = PATTERN_SEQUENTIAL;
 106                        t->nr_seq_samples = t->nr_rand_samples = 0;
 107                }
 108                break;
 109        }
 110}
 111
 112static void iot_examine_bio(struct io_tracker *t, struct bio *bio)
 113{
 114        iot_update_stats(t, bio);
 115        iot_check_for_pattern_switch(t);
 116}
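
/*
 * Illustrative sketch (not part of the driver): with the default
 * thresholds it takes 512 sequential samples to switch the tracker to
 * PATTERN_SEQUENTIAL, but only 4 random samples to drop it back to
 * PATTERN_RANDOM.  The sample counters are bumped directly here, rather
 * than via bios, purely to show the transitions.
 */
static void __maybe_unused iot_example(void)
{
        struct io_tracker t;

        iot_init(&t, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);

        t.nr_seq_samples = SEQUENTIAL_THRESHOLD_DEFAULT;
        iot_check_for_pattern_switch(&t);
        BUG_ON(iot_pattern(&t) != PATTERN_SEQUENTIAL);

        t.nr_rand_samples = RANDOM_THRESHOLD_DEFAULT;
        iot_check_for_pattern_switch(&t);
        BUG_ON(iot_pattern(&t) != PATTERN_RANDOM);
}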
 117
 118/*----------------------------------------------------------------*/
 119
 120
 121/*
  122 * This queue is divided up into different levels, allowing us to push
  123 * entries to the back of any of the levels.  Think of it as a partially
  124 * sorted queue.
 125 */
 126#define NR_QUEUE_LEVELS 16u
 127
 128struct queue {
 129        struct list_head qs[NR_QUEUE_LEVELS];
 130};
 131
 132static void queue_init(struct queue *q)
 133{
 134        unsigned i;
 135
 136        for (i = 0; i < NR_QUEUE_LEVELS; i++)
 137                INIT_LIST_HEAD(q->qs + i);
 138}
 139
 140/*
 141 * Checks to see if the queue is empty.
 142 * FIXME: reduce cpu usage.
 143 */
 144static bool queue_empty(struct queue *q)
 145{
 146        unsigned i;
 147
 148        for (i = 0; i < NR_QUEUE_LEVELS; i++)
 149                if (!list_empty(q->qs + i))
 150                        return false;
 151
 152        return true;
 153}
 154
 155/*
  156 * Inserts an entry at the back of the given level.
 157 */
 158static void queue_push(struct queue *q, unsigned level, struct list_head *elt)
 159{
 160        list_add_tail(elt, q->qs + level);
 161}
 162
 163static void queue_remove(struct list_head *elt)
 164{
 165        list_del(elt);
 166}
 167
 168/*
  169 * Shifts all entries down one level.  This has no effect on the order of
 170 * the queue.
 171 */
 172static void queue_shift_down(struct queue *q)
 173{
 174        unsigned level;
 175
 176        for (level = 1; level < NR_QUEUE_LEVELS; level++)
 177                list_splice_init(q->qs + level, q->qs + level - 1);
 178}
 179
 180/*
 181 * Gives us the oldest entry of the lowest popoulated level.  If the first
 182 * level is emptied then we shift down one level.
 183 */
 184static struct list_head *queue_peek(struct queue *q)
 185{
 186        unsigned level;
 187
 188        for (level = 0; level < NR_QUEUE_LEVELS; level++)
 189                if (!list_empty(q->qs + level))
 190                        return q->qs[level].next;
 191
 192        return NULL;
 193}
 194
 195static struct list_head *queue_pop(struct queue *q)
 196{
 197        struct list_head *r = queue_peek(q);
 198
 199        if (r) {
 200                list_del(r);
 201
 202                /* have we just emptied the bottom level? */
 203                if (list_empty(q->qs))
 204                        queue_shift_down(q);
 205        }
 206
 207        return r;
 208}
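
/*
 * Illustrative sketch (not part of the driver): queue_pop() always takes
 * from the lowest populated level and shifts everything down once level 0
 * empties, so an entry pushed to a higher level only surfaces after the
 * levels below it have drained.  'cold' and 'hot' are hypothetical list
 * heads standing in for entries.
 */
static void __maybe_unused queue_example(void)
{
        struct queue q;
        struct list_head cold, hot;

        queue_init(&q);
        queue_push(&q, 0, &cold);               /* level 0: popped first */
        queue_push(&q, 3, &hot);                /* level 3: popped later */

        BUG_ON(queue_pop(&q) != &cold);         /* empties level 0, shifts down */
        BUG_ON(queue_pop(&q) != &hot);
        BUG_ON(!queue_empty(&q));
}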
 209
 210static struct list_head *list_pop(struct list_head *lh)
 211{
 212        struct list_head *r = lh->next;
 213
 214        BUG_ON(!r);
 215        list_del_init(r);
 216
 217        return r;
 218}
 219
 220/*----------------------------------------------------------------*/
 221
 222/*
 223 * Describes a cache entry.  Used in both the cache and the pre_cache.
 224 */
 225struct entry {
 226        struct hlist_node hlist;
 227        struct list_head list;
 228        dm_oblock_t oblock;
 229
 230        /*
 231         * FIXME: pack these better
 232         */
 233        bool dirty:1;
 234        unsigned hit_count;
 235        unsigned generation;
 236        unsigned tick;
 237};
 238
 239/*
 240 * Rather than storing the cblock in an entry, we allocate all entries in
 241 * an array, and infer the cblock from the entry position.
 242 *
 243 * Free entries are linked together into a list.
 244 */
 245struct entry_pool {
 246        struct entry *entries, *entries_end;
 247        struct list_head free;
 248        unsigned nr_allocated;
 249};
 250
 251static int epool_init(struct entry_pool *ep, unsigned nr_entries)
 252{
 253        unsigned i;
 254
 255        ep->entries = vzalloc(sizeof(struct entry) * nr_entries);
 256        if (!ep->entries)
 257                return -ENOMEM;
 258
 259        ep->entries_end = ep->entries + nr_entries;
 260
 261        INIT_LIST_HEAD(&ep->free);
 262        for (i = 0; i < nr_entries; i++)
 263                list_add(&ep->entries[i].list, &ep->free);
 264
 265        ep->nr_allocated = 0;
 266
 267        return 0;
 268}
 269
 270static void epool_exit(struct entry_pool *ep)
 271{
 272        vfree(ep->entries);
 273}
 274
 275static struct entry *alloc_entry(struct entry_pool *ep)
 276{
 277        struct entry *e;
 278
 279        if (list_empty(&ep->free))
 280                return NULL;
 281
 282        e = list_entry(list_pop(&ep->free), struct entry, list);
 283        INIT_LIST_HEAD(&e->list);
 284        INIT_HLIST_NODE(&e->hlist);
 285        ep->nr_allocated++;
 286
 287        return e;
 288}
 289
 290/*
 291 * This assumes the cblock hasn't already been allocated.
 292 */
 293static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock)
 294{
 295        struct entry *e = ep->entries + from_cblock(cblock);
 296
 297        list_del_init(&e->list);
 298        INIT_HLIST_NODE(&e->hlist);
 299        ep->nr_allocated++;
 300
 301        return e;
 302}
 303
 304static void free_entry(struct entry_pool *ep, struct entry *e)
 305{
 306        BUG_ON(!ep->nr_allocated);
 307        ep->nr_allocated--;
 308        INIT_HLIST_NODE(&e->hlist);
 309        list_add(&e->list, &ep->free);
 310}
 311
 312/*
 313 * Returns NULL if the entry is free.
 314 */
 315static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock)
 316{
 317        struct entry *e = ep->entries + from_cblock(cblock);
 318        return !hlist_unhashed(&e->hlist) ? e : NULL;
 319}
 320
 321static bool epool_empty(struct entry_pool *ep)
 322{
 323        return list_empty(&ep->free);
 324}
 325
 326static bool in_pool(struct entry_pool *ep, struct entry *e)
 327{
 328        return e >= ep->entries && e < ep->entries_end;
 329}
 330
 331static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e)
 332{
 333        return to_cblock(e - ep->entries);
 334}
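
/*
 * Illustrative sketch (not part of the driver): the cblock <-> entry
 * mapping is purely positional, so no block number ever needs to be
 * stored in the entry itself.  'ep' is assumed to be an initialised pool.
 */
static void __maybe_unused epool_example(struct entry_pool *ep)
{
        struct entry *e = alloc_entry(ep);

        if (e) {
                /* the cblock is just the entry's index into the array */
                BUG_ON(ep->entries + from_cblock(infer_cblock(ep, e)) != e);
                free_entry(ep, e);
        }
}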
 335
 336/*----------------------------------------------------------------*/
 337
 338struct mq_policy {
 339        struct dm_cache_policy policy;
 340
 341        /* protects everything */
 342        struct mutex lock;
 343        dm_cblock_t cache_size;
 344        struct io_tracker tracker;
 345
 346        /*
 347         * Entries come from two pools, one of pre-cache entries, and one
 348         * for the cache proper.
 349         */
 350        struct entry_pool pre_cache_pool;
 351        struct entry_pool cache_pool;
 352
 353        /*
 354         * We maintain three queues of entries.  The cache proper,
 355         * consisting of a clean and dirty queue, contains the currently
  356         * active mappings, whereas the pre_cache tracks blocks that
  357         * are being hit frequently and are potential candidates for promotion
 358         * to the cache.
 359         */
 360        struct queue pre_cache;
 361        struct queue cache_clean;
 362        struct queue cache_dirty;
 363
 364        /*
 365         * Keeps track of time, incremented by the core.  We use this to
 366         * avoid attributing multiple hits within the same tick.
 367         *
 368         * Access to tick_protected should be done with the spin lock held.
 369         * It's copied to tick at the start of the map function (within the
 370         * mutex).
 371         */
 372        spinlock_t tick_lock;
 373        unsigned tick_protected;
 374        unsigned tick;
 375
 376        /*
 377         * A count of the number of times the map function has been called
 378         * and found an entry in the pre_cache or cache.  Currently used to
 379         * calculate the generation.
 380         */
 381        unsigned hit_count;
 382
 383        /*
 384         * A generation is a longish period that is used to trigger some
  385         * bookkeeping effects, eg. decrementing hit counts on entries.
 386         * This is needed to allow the cache to evolve as io patterns
 387         * change.
 388         */
 389        unsigned generation;
 390        unsigned generation_period; /* in lookups (will probably change) */
 391
 392        unsigned discard_promote_adjustment;
 393        unsigned read_promote_adjustment;
 394        unsigned write_promote_adjustment;
 395
 396        /*
 397         * The hash table allows us to quickly find an entry by origin
 398         * block.  Both pre_cache and cache entries are in here.
 399         */
 400        unsigned nr_buckets;
 401        dm_block_t hash_bits;
 402        struct hlist_head *table;
 403};
 404
 405#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1
 406#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4
 407#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8
 408#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128
 409
 410/*----------------------------------------------------------------*/
 411
 412/*
  413 * Simple hash table implementation.  It should be replaced with the
  414 * standard hash table that's making its way upstream.
 415 */
 416static void hash_insert(struct mq_policy *mq, struct entry *e)
 417{
 418        unsigned h = hash_64(from_oblock(e->oblock), mq->hash_bits);
 419
 420        hlist_add_head(&e->hlist, mq->table + h);
 421}
 422
 423static struct entry *hash_lookup(struct mq_policy *mq, dm_oblock_t oblock)
 424{
 425        unsigned h = hash_64(from_oblock(oblock), mq->hash_bits);
 426        struct hlist_head *bucket = mq->table + h;
 427        struct entry *e;
 428
 429        hlist_for_each_entry(e, bucket, hlist)
 430                if (e->oblock == oblock) {
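                        /* move-to-front: keep frequently hit blocks at the head of the bucket */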
 431                        hlist_del(&e->hlist);
 432                        hlist_add_head(&e->hlist, bucket);
 433                        return e;
 434                }
 435
 436        return NULL;
 437}
 438
 439static void hash_remove(struct entry *e)
 440{
 441        hlist_del(&e->hlist);
 442}
 443
 444/*----------------------------------------------------------------*/
 445
 446static bool any_free_cblocks(struct mq_policy *mq)
 447{
 448        return !epool_empty(&mq->cache_pool);
 449}
 450
 451static bool any_clean_cblocks(struct mq_policy *mq)
 452{
 453        return !queue_empty(&mq->cache_clean);
 454}
 455
 456/*----------------------------------------------------------------*/
 457
 458/*
 459 * Now we get to the meat of the policy.  This section deals with deciding
  460 * when to add entries to the pre_cache and cache, and when to move between
 461 * them.
 462 */
 463
 464/*
 465 * The queue level is based on the log2 of the hit count.
 466 */
 467static unsigned queue_level(struct entry *e)
 468{
 469        return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u);
 470}
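
/*
 * Worked example (illustrative only, 'dummy' is hypothetical): the log2
 * bucketing above maps hit counts onto the 16 queue levels, clamping at
 * the top level.
 */
static void __maybe_unused queue_level_example(void)
{
        struct entry dummy = { .hit_count = 1 };

        BUG_ON(queue_level(&dummy) != 0);       /* 1 hit      -> level 0 */
        dummy.hit_count = 7;
        BUG_ON(queue_level(&dummy) != 2);       /* 4..7 hits  -> level 2 */
        dummy.hit_count = 1 << 20;
        BUG_ON(queue_level(&dummy) != NR_QUEUE_LEVELS - 1); /* clamped */
}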
 471
 472static bool in_cache(struct mq_policy *mq, struct entry *e)
 473{
 474        return in_pool(&mq->cache_pool, e);
 475}
 476
 477/*
 478 * Inserts the entry into the pre_cache or the cache.  Ensures the cache
  479 * block is marked as allocated if necessary.  Inserts into the hash table.
 480 * Sets the tick which records when the entry was last moved about.
 481 */
 482static void push(struct mq_policy *mq, struct entry *e)
 483{
 484        e->tick = mq->tick;
 485        hash_insert(mq, e);
 486
 487        if (in_cache(mq, e))
 488                queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean,
 489                           queue_level(e), &e->list);
 490        else
 491                queue_push(&mq->pre_cache, queue_level(e), &e->list);
 492}
 493
 494/*
 495 * Removes an entry from pre_cache or cache.  Removes from the hash table.
 496 */
 497static void del(struct mq_policy *mq, struct entry *e)
 498{
 499        queue_remove(&e->list);
 500        hash_remove(e);
 501}
 502
 503/*
 504 * Like del, except it removes the first entry in the queue (ie. the least
 505 * recently used).
 506 */
 507static struct entry *pop(struct mq_policy *mq, struct queue *q)
 508{
 509        struct entry *e;
 510        struct list_head *h = queue_pop(q);
 511
 512        if (!h)
 513                return NULL;
 514
 515        e = container_of(h, struct entry, list);
 516        hash_remove(e);
 517
 518        return e;
 519}
 520
 521static struct entry *peek(struct queue *q)
 522{
 523        struct list_head *h = queue_peek(q);
 524        return h ? container_of(h, struct entry, list) : NULL;
 525}
 526
 527/*
  528 * Has this entry already been updated during this tick?
 529 */
 530static bool updated_this_tick(struct mq_policy *mq, struct entry *e)
 531{
 532        return mq->tick == e->tick;
 533}
 534
 535/*
  536 * The promotion threshold is adjusted every generation, as are the counts
  537 * of the entries.
 538 *
 539 * At the moment the threshold is taken by averaging the hit counts of some
 540 * of the entries in the cache (the first 20 entries across all levels in
 541 * ascending order, giving preference to the clean entries at each level).
 542 *
 543 * We can be much cleverer than this though.  For example, each promotion
 544 * could bump up the threshold helping to prevent churn.  Much more to do
 545 * here.
 546 */
 547
 548#define MAX_TO_AVERAGE 20
 549
 550static void check_generation(struct mq_policy *mq)
 551{
 552        unsigned total = 0, nr = 0, count = 0, level;
 553        struct list_head *head;
 554        struct entry *e;
 555
 556        if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) {
 557                mq->hit_count = 0;
 558                mq->generation++;
 559
 560                for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) {
 561                        head = mq->cache_clean.qs + level;
 562                        list_for_each_entry(e, head, list) {
 563                                nr++;
 564                                total += e->hit_count;
 565
 566                                if (++count >= MAX_TO_AVERAGE)
 567                                        break;
 568                        }
 569
 570                        head = mq->cache_dirty.qs + level;
 571                        list_for_each_entry(e, head, list) {
 572                                nr++;
 573                                total += e->hit_count;
 574
 575                                if (++count >= MAX_TO_AVERAGE)
 576                                        break;
 577                        }
 578                }
 579        }
 580}
 581
 582/*
  583 * Whenever we use an entry we bump up its hit counter and push it to the
  584 * back of its current level.
 585 */
 586static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e)
 587{
 588        if (updated_this_tick(mq, e))
 589                return;
 590
 591        e->hit_count++;
 592        mq->hit_count++;
 593        check_generation(mq);
 594
 595        /* generation adjustment, to stop the counts increasing forever. */
 596        /* FIXME: divide? */
 597        /* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */
 598        e->generation = mq->generation;
 599
 600        del(mq, e);
 601        push(mq, e);
 602}
 603
 604/*
  605 * Demotes the least recently used clean entry from the cache, returning
  606 * the origin block it was mapped to via *oblock (the caller allocates the
  607 * replacement cache entry).
 608 *
  609 * We currently just forget about the demoted entry completely (see the
  610 * comment in the function body), so it has to earn its way back in from a
  611 * hit count of zero.  There are various options here, and more
  612 * experimentation would be good:
  613 *
  614 * - re-insert the demoted entry into the pre_cache with a low hit count.
  615 * - divide the hit count rather than setting it to some hard coded value.
 616 * - set the hit count to a hard coded value other than 1, eg, is it better
 617 *   if it goes in at level 2?
 618 */
 619static int demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock)
 620{
 621        struct entry *demoted = pop(mq, &mq->cache_clean);
 622
 623        if (!demoted)
 624                /*
 625                 * We could get a block from mq->cache_dirty, but that
 626                 * would add extra latency to the triggering bio as it
 627                 * waits for the writeback.  Better to not promote this
 628                 * time and hope there's a clean block next time this block
 629                 * is hit.
 630                 */
 631                return -ENOSPC;
 632
 633        *oblock = demoted->oblock;
 634        free_entry(&mq->cache_pool, demoted);
 635
 636        /*
 637         * We used to put the demoted block into the pre-cache, but I think
  638         * it's simpler to just let it work its way up from zero again.
  639         * This stops blocks flickering in and out of the cache.
 640         */
 641
 642        return 0;
 643}
 644
 645/*
 646 * Entries in the pre_cache whose hit count passes the promotion
 647 * threshold move to the cache proper.  Working out the correct
 648 * value for the promotion_threshold is crucial to this policy.
 649 */
 650static unsigned promote_threshold(struct mq_policy *mq)
 651{
 652        struct entry *e;
 653
 654        if (any_free_cblocks(mq))
 655                return 0;
 656
 657        e = peek(&mq->cache_clean);
 658        if (e)
 659                return e->hit_count;
 660
 661        e = peek(&mq->cache_dirty);
 662        if (e)
 663                return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD;
 664
 665        /* This should never happen */
 666        return 0;
 667}
 668
 669/*
 670 * We modify the basic promotion_threshold depending on the specific io.
 671 *
 672 * If the origin block has been discarded then there's no cost to copy it
 673 * to the cache.
 674 *
 675 * We bias towards reads, since they can be demoted at no cost if they
 676 * haven't been dirtied.
 677 */
 678static unsigned adjusted_promote_threshold(struct mq_policy *mq,
 679                                           bool discarded_oblock, int data_dir)
 680{
 681        if (data_dir == READ)
 682                return promote_threshold(mq) + mq->read_promote_adjustment;
 683
 684        if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) {
 685                /*
 686                 * We don't need to do any copying at all, so give this a
 687                 * very low threshold.
 688                 */
 689                return mq->discard_promote_adjustment;
 690        }
 691
 692        return promote_threshold(mq) + mq->write_promote_adjustment;
 693}
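
/*
 * Worked example (illustrative, hypothetical numbers): suppose the cache
 * is full and the coldest clean entry has a hit count of 10, so
 * promote_threshold() returns 10.  With the default adjustments a
 * pre_cache entry is then promoted once:
 *
 *      reads:                          hit_count >= 10 + 4 = 14
 *      writes:                         hit_count >= 10 + 8 = 18
 *      writes to discarded blocks:     hit_count >= 1
 */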
 694
 695static bool should_promote(struct mq_policy *mq, struct entry *e,
 696                           bool discarded_oblock, int data_dir)
 697{
 698        return e->hit_count >=
 699                adjusted_promote_threshold(mq, discarded_oblock, data_dir);
 700}
 701
 702static int cache_entry_found(struct mq_policy *mq,
 703                             struct entry *e,
 704                             struct policy_result *result)
 705{
 706        requeue_and_update_tick(mq, e);
 707
 708        if (in_cache(mq, e)) {
 709                result->op = POLICY_HIT;
 710                result->cblock = infer_cblock(&mq->cache_pool, e);
 711        }
 712
 713        return 0;
 714}
 715
 716/*
 717 * Moves an entry from the pre_cache to the cache.  The main work is
 718 * finding which cache block to use.
 719 */
 720static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e,
 721                              struct policy_result *result)
 722{
 723        int r;
 724        struct entry *new_e;
 725
 726        /* Ensure there's a free cblock in the cache */
 727        if (epool_empty(&mq->cache_pool)) {
 728                result->op = POLICY_REPLACE;
 729                r = demote_cblock(mq, &result->old_oblock);
 730                if (r) {
 731                        result->op = POLICY_MISS;
 732                        return 0;
 733                }
 734        } else
 735                result->op = POLICY_NEW;
 736
 737        new_e = alloc_entry(&mq->cache_pool);
 738        BUG_ON(!new_e);
 739
 740        new_e->oblock = e->oblock;
 741        new_e->dirty = false;
 742        new_e->hit_count = e->hit_count;
 743        new_e->generation = e->generation;
 744        new_e->tick = e->tick;
 745
 746        del(mq, e);
 747        free_entry(&mq->pre_cache_pool, e);
 748        push(mq, new_e);
 749
 750        result->cblock = infer_cblock(&mq->cache_pool, new_e);
 751
 752        return 0;
 753}
 754
 755static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e,
 756                                 bool can_migrate, bool discarded_oblock,
 757                                 int data_dir, struct policy_result *result)
 758{
 759        int r = 0;
 760        bool updated = updated_this_tick(mq, e);
 761
 762        if ((!discarded_oblock && updated) ||
 763            !should_promote(mq, e, discarded_oblock, data_dir)) {
 764                requeue_and_update_tick(mq, e);
 765                result->op = POLICY_MISS;
 766
 767        } else if (!can_migrate)
 768                r = -EWOULDBLOCK;
 769
 770        else {
 771                requeue_and_update_tick(mq, e);
 772                r = pre_cache_to_cache(mq, e, result);
 773        }
 774
 775        return r;
 776}
 777
 778static void insert_in_pre_cache(struct mq_policy *mq,
 779                                dm_oblock_t oblock)
 780{
 781        struct entry *e = alloc_entry(&mq->pre_cache_pool);
 782
 783        if (!e)
 784                /*
 785                 * There's no spare entry structure, so we grab the least
 786                 * used one from the pre_cache.
 787                 */
 788                e = pop(mq, &mq->pre_cache);
 789
 790        if (unlikely(!e)) {
 791                DMWARN("couldn't pop from pre cache");
 792                return;
 793        }
 794
 795        e->dirty = false;
 796        e->oblock = oblock;
 797        e->hit_count = 1;
 798        e->generation = mq->generation;
 799        push(mq, e);
 800}
 801
 802static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock,
 803                            struct policy_result *result)
 804{
 805        int r;
 806        struct entry *e;
 807
 808        if (epool_empty(&mq->cache_pool)) {
 809                result->op = POLICY_REPLACE;
 810                r = demote_cblock(mq, &result->old_oblock);
 811                if (unlikely(r)) {
 812                        result->op = POLICY_MISS;
 813                        insert_in_pre_cache(mq, oblock);
 814                        return;
 815                }
 816
 817                /*
 818                 * This will always succeed, since we've just demoted.
 819                 */
 820                e = alloc_entry(&mq->cache_pool);
 821                BUG_ON(!e);
 822
 823        } else {
 824                e = alloc_entry(&mq->cache_pool);
 825                result->op = POLICY_NEW;
 826        }
 827
 828        e->oblock = oblock;
 829        e->dirty = false;
 830        e->hit_count = 1;
 831        e->generation = mq->generation;
 832        push(mq, e);
 833
 834        result->cblock = infer_cblock(&mq->cache_pool, e);
 835}
 836
 837static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock,
 838                          bool can_migrate, bool discarded_oblock,
 839                          int data_dir, struct policy_result *result)
 840{
 841        if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) {
 842                if (can_migrate)
 843                        insert_in_cache(mq, oblock, result);
 844                else
 845                        return -EWOULDBLOCK;
 846        } else {
 847                insert_in_pre_cache(mq, oblock);
 848                result->op = POLICY_MISS;
 849        }
 850
 851        return 0;
 852}
 853
 854/*
  855 * Looks the oblock up in the hash table, then decides whether to put it
  856 * in the pre_cache, the cache, or neither.
 857 */
 858static int map(struct mq_policy *mq, dm_oblock_t oblock,
 859               bool can_migrate, bool discarded_oblock,
 860               int data_dir, struct policy_result *result)
 861{
 862        int r = 0;
 863        struct entry *e = hash_lookup(mq, oblock);
 864
 865        if (e && in_cache(mq, e))
 866                r = cache_entry_found(mq, e, result);
 867
 868        else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] &&
 869                 iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL)
 870                result->op = POLICY_MISS;
 871
 872        else if (e)
 873                r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock,
 874                                          data_dir, result);
 875
 876        else
 877                r = no_entry_found(mq, oblock, can_migrate, discarded_oblock,
 878                                   data_dir, result);
 879
 880        if (r == -EWOULDBLOCK)
 881                result->op = POLICY_MISS;
 882
 883        return r;
 884}
 885
 886/*----------------------------------------------------------------*/
 887
 888/*
 889 * Public interface, via the policy struct.  See dm-cache-policy.h for a
 890 * description of these.
 891 */
 892
 893static struct mq_policy *to_mq_policy(struct dm_cache_policy *p)
 894{
 895        return container_of(p, struct mq_policy, policy);
 896}
 897
 898static void mq_destroy(struct dm_cache_policy *p)
 899{
 900        struct mq_policy *mq = to_mq_policy(p);
 901
 902        vfree(mq->table);
 903        epool_exit(&mq->cache_pool);
 904        epool_exit(&mq->pre_cache_pool);
 905        kfree(mq);
 906}
 907
 908static void copy_tick(struct mq_policy *mq)
 909{
 910        unsigned long flags;
 911
 912        spin_lock_irqsave(&mq->tick_lock, flags);
 913        mq->tick = mq->tick_protected;
 914        spin_unlock_irqrestore(&mq->tick_lock, flags);
 915}
 916
 917static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
 918                  bool can_block, bool can_migrate, bool discarded_oblock,
 919                  struct bio *bio, struct policy_result *result)
 920{
 921        int r;
 922        struct mq_policy *mq = to_mq_policy(p);
 923
 924        result->op = POLICY_MISS;
 925
 926        if (can_block)
 927                mutex_lock(&mq->lock);
 928        else if (!mutex_trylock(&mq->lock))
 929                return -EWOULDBLOCK;
 930
 931        copy_tick(mq);
 932
 933        iot_examine_bio(&mq->tracker, bio);
 934        r = map(mq, oblock, can_migrate, discarded_oblock,
 935                bio_data_dir(bio), result);
 936
 937        mutex_unlock(&mq->lock);
 938
 939        return r;
 940}
 941
 942static int mq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
 943{
 944        int r;
 945        struct mq_policy *mq = to_mq_policy(p);
 946        struct entry *e;
 947
 948        if (!mutex_trylock(&mq->lock))
 949                return -EWOULDBLOCK;
 950
 951        e = hash_lookup(mq, oblock);
 952        if (e && in_cache(mq, e)) {
 953                *cblock = infer_cblock(&mq->cache_pool, e);
 954                r = 0;
 955        } else
 956                r = -ENOENT;
 957
 958        mutex_unlock(&mq->lock);
 959
 960        return r;
 961}
 962
 963static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set)
 964{
 965        struct entry *e;
 966
 967        e = hash_lookup(mq, oblock);
 968        BUG_ON(!e || !in_cache(mq, e));
 969
 970        del(mq, e);
 971        e->dirty = set;
 972        push(mq, e);
 973}
 974
 975static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
 976{
 977        struct mq_policy *mq = to_mq_policy(p);
 978
 979        mutex_lock(&mq->lock);
 980        __mq_set_clear_dirty(mq, oblock, true);
 981        mutex_unlock(&mq->lock);
 982}
 983
 984static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
 985{
 986        struct mq_policy *mq = to_mq_policy(p);
 987
 988        mutex_lock(&mq->lock);
 989        __mq_set_clear_dirty(mq, oblock, false);
 990        mutex_unlock(&mq->lock);
 991}
 992
 993static int mq_load_mapping(struct dm_cache_policy *p,
 994                           dm_oblock_t oblock, dm_cblock_t cblock,
 995                           uint32_t hint, bool hint_valid)
 996{
 997        struct mq_policy *mq = to_mq_policy(p);
 998        struct entry *e;
 999
1000        e = alloc_particular_entry(&mq->cache_pool, cblock);
1001        e->oblock = oblock;
1002        e->dirty = false;       /* this gets corrected in a minute */
1003        e->hit_count = hint_valid ? hint : 1;
1004        e->generation = mq->generation;
1005        push(mq, e);
1006
1007        return 0;
1008}
1009
1010static int mq_save_hints(struct mq_policy *mq, struct queue *q,
1011                         policy_walk_fn fn, void *context)
1012{
1013        int r;
1014        unsigned level;
1015        struct entry *e;
1016
1017        for (level = 0; level < NR_QUEUE_LEVELS; level++)
1018                list_for_each_entry(e, q->qs + level, list) {
1019                        r = fn(context, infer_cblock(&mq->cache_pool, e),
1020                               e->oblock, e->hit_count);
1021                        if (r)
1022                                return r;
1023                }
1024
1025        return 0;
1026}
1027
1028static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
1029                            void *context)
1030{
1031        struct mq_policy *mq = to_mq_policy(p);
1032        int r = 0;
1033
1034        mutex_lock(&mq->lock);
1035
1036        r = mq_save_hints(mq, &mq->cache_clean, fn, context);
1037        if (!r)
1038                r = mq_save_hints(mq, &mq->cache_dirty, fn, context);
1039
1040        mutex_unlock(&mq->lock);
1041
1042        return r;
1043}
1044
1045static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock)
1046{
1047        struct entry *e;
1048
1049        e = hash_lookup(mq, oblock);
1050        BUG_ON(!e || !in_cache(mq, e));
1051
1052        del(mq, e);
1053        free_entry(&mq->cache_pool, e);
1054}
1055
1056static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
1057{
1058        struct mq_policy *mq = to_mq_policy(p);
1059
1060        mutex_lock(&mq->lock);
1061        __remove_mapping(mq, oblock);
1062        mutex_unlock(&mq->lock);
1063}
1064
1065static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock)
1066{
1067        struct entry *e = epool_find(&mq->cache_pool, cblock);
1068
1069        if (!e)
1070                return -ENODATA;
1071
1072        del(mq, e);
1073        free_entry(&mq->cache_pool, e);
1074
1075        return 0;
1076}
1077
1078static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
1079{
1080        int r;
1081        struct mq_policy *mq = to_mq_policy(p);
1082
1083        mutex_lock(&mq->lock);
1084        r = __remove_cblock(mq, cblock);
1085        mutex_unlock(&mq->lock);
1086
1087        return r;
1088}
1089
1090static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock,
1091                              dm_cblock_t *cblock)
1092{
1093        struct entry *e = pop(mq, &mq->cache_dirty);
1094
1095        if (!e)
1096                return -ENODATA;
1097
1098        *oblock = e->oblock;
1099        *cblock = infer_cblock(&mq->cache_pool, e);
1100        e->dirty = false;
1101        push(mq, e);
1102
1103        return 0;
1104}
1105
1106static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
1107                             dm_cblock_t *cblock)
1108{
1109        int r;
1110        struct mq_policy *mq = to_mq_policy(p);
1111
1112        mutex_lock(&mq->lock);
1113        r = __mq_writeback_work(mq, oblock, cblock);
1114        mutex_unlock(&mq->lock);
1115
1116        return r;
1117}
1118
1119static void __force_mapping(struct mq_policy *mq,
1120                            dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1121{
1122        struct entry *e = hash_lookup(mq, current_oblock);
1123
1124        if (e && in_cache(mq, e)) {
1125                del(mq, e);
1126                e->oblock = new_oblock;
1127                e->dirty = true;
1128                push(mq, e);
1129        }
1130}
1131
1132static void mq_force_mapping(struct dm_cache_policy *p,
1133                             dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1134{
1135        struct mq_policy *mq = to_mq_policy(p);
1136
1137        mutex_lock(&mq->lock);
1138        __force_mapping(mq, current_oblock, new_oblock);
1139        mutex_unlock(&mq->lock);
1140}
1141
1142static dm_cblock_t mq_residency(struct dm_cache_policy *p)
1143{
1144        dm_cblock_t r;
1145        struct mq_policy *mq = to_mq_policy(p);
1146
1147        mutex_lock(&mq->lock);
1148        r = to_cblock(mq->cache_pool.nr_allocated);
1149        mutex_unlock(&mq->lock);
1150
1151        return r;
1152}
1153
1154static void mq_tick(struct dm_cache_policy *p)
1155{
1156        struct mq_policy *mq = to_mq_policy(p);
1157        unsigned long flags;
1158
1159        spin_lock_irqsave(&mq->tick_lock, flags);
1160        mq->tick_protected++;
1161        spin_unlock_irqrestore(&mq->tick_lock, flags);
1162}
1163
1164static int mq_set_config_value(struct dm_cache_policy *p,
1165                               const char *key, const char *value)
1166{
1167        struct mq_policy *mq = to_mq_policy(p);
1168        unsigned long tmp;
1169
1170        if (kstrtoul(value, 10, &tmp))
1171                return -EINVAL;
1172
1173        if (!strcasecmp(key, "random_threshold")) {
1174                mq->tracker.thresholds[PATTERN_RANDOM] = tmp;
1175
1176        } else if (!strcasecmp(key, "sequential_threshold")) {
1177                mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp;
1178
1179        } else if (!strcasecmp(key, "discard_promote_adjustment"))
1180                mq->discard_promote_adjustment = tmp;
1181
1182        else if (!strcasecmp(key, "read_promote_adjustment"))
1183                mq->read_promote_adjustment = tmp;
1184
1185        else if (!strcasecmp(key, "write_promote_adjustment"))
1186                mq->write_promote_adjustment = tmp;
1187
1188        else
1189                return -EINVAL;
1190
1191        return 0;
1192}
1193
1194static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen)
1195{
1196        ssize_t sz = 0;
1197        struct mq_policy *mq = to_mq_policy(p);
1198
1199        DMEMIT("10 random_threshold %u "
1200               "sequential_threshold %u "
1201               "discard_promote_adjustment %u "
1202               "read_promote_adjustment %u "
1203               "write_promote_adjustment %u",
1204               mq->tracker.thresholds[PATTERN_RANDOM],
1205               mq->tracker.thresholds[PATTERN_SEQUENTIAL],
1206               mq->discard_promote_adjustment,
1207               mq->read_promote_adjustment,
1208               mq->write_promote_adjustment);
1209
1210        return 0;
1211}
1212
1213/* Init the policy plugin interface function pointers. */
1214static void init_policy_functions(struct mq_policy *mq)
1215{
1216        mq->policy.destroy = mq_destroy;
1217        mq->policy.map = mq_map;
1218        mq->policy.lookup = mq_lookup;
1219        mq->policy.set_dirty = mq_set_dirty;
1220        mq->policy.clear_dirty = mq_clear_dirty;
1221        mq->policy.load_mapping = mq_load_mapping;
1222        mq->policy.walk_mappings = mq_walk_mappings;
1223        mq->policy.remove_mapping = mq_remove_mapping;
1224        mq->policy.remove_cblock = mq_remove_cblock;
1225        mq->policy.writeback_work = mq_writeback_work;
1226        mq->policy.force_mapping = mq_force_mapping;
1227        mq->policy.residency = mq_residency;
1228        mq->policy.tick = mq_tick;
1229        mq->policy.emit_config_values = mq_emit_config_values;
1230        mq->policy.set_config_value = mq_set_config_value;
1231}
1232
1233static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1234                                         sector_t origin_size,
1235                                         sector_t cache_block_size)
1236{
1237        struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
1238
1239        if (!mq)
1240                return NULL;
1241
1242        init_policy_functions(mq);
1243        iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT);
1244        mq->cache_size = cache_size;
1245
1246        if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) {
1247                DMERR("couldn't initialize pool of pre-cache entries");
1248                goto bad_pre_cache_init;
1249        }
1250
1251        if (epool_init(&mq->cache_pool, from_cblock(cache_size))) {
1252                DMERR("couldn't initialize pool of cache entries");
1253                goto bad_cache_init;
1254        }
1255
1256        mq->tick_protected = 0;
1257        mq->tick = 0;
1258        mq->hit_count = 0;
1259        mq->generation = 0;
1260        mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT;
1261        mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT;
1262        mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT;
1263        mutex_init(&mq->lock);
1264        spin_lock_init(&mq->tick_lock);
1265
1266        queue_init(&mq->pre_cache);
1267        queue_init(&mq->cache_clean);
1268        queue_init(&mq->cache_dirty);
1269
1270        mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U);
1271
1272        mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16);
1273        mq->hash_bits = ffs(mq->nr_buckets) - 1;
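        /*
         * Worked example (illustrative): a cache of 65536 blocks gets
         * next_power(65536 / 2, 16) = 32768 buckets and ffs(32768) - 1 = 15
         * hash bits.
         */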
1274        mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets);
1275        if (!mq->table)
1276                goto bad_alloc_table;
1277
1278        return &mq->policy;
1279
1280bad_alloc_table:
1281        epool_exit(&mq->cache_pool);
1282bad_cache_init:
1283        epool_exit(&mq->pre_cache_pool);
1284bad_pre_cache_init:
1285        kfree(mq);
1286
1287        return NULL;
1288}
1289
1290/*----------------------------------------------------------------*/
1291
1292static struct dm_cache_policy_type mq_policy_type = {
1293        .name = "mq",
1294        .version = {1, 3, 0},
1295        .hint_size = 4,
1296        .owner = THIS_MODULE,
1297        .create = mq_create
1298};
1299
1300static struct dm_cache_policy_type default_policy_type = {
1301        .name = "default",
1302        .version = {1, 3, 0},
1303        .hint_size = 4,
1304        .owner = THIS_MODULE,
1305        .create = mq_create,
1306        .real = &mq_policy_type
1307};
1308
1309static int __init mq_init(void)
1310{
1311        int r;
1312
1313        mq_entry_cache = kmem_cache_create("dm_mq_policy_cache_entry",
1314                                           sizeof(struct entry),
1315                                           __alignof__(struct entry),
1316                                           0, NULL);
1317        if (!mq_entry_cache)
1318                goto bad;
1319
1320        r = dm_cache_policy_register(&mq_policy_type);
1321        if (r) {
1322                DMERR("register failed %d", r);
1323                goto bad_register_mq;
1324        }
1325
1326        r = dm_cache_policy_register(&default_policy_type);
1327        if (!r) {
1328                DMINFO("version %u.%u.%u loaded",
1329                       mq_policy_type.version[0],
1330                       mq_policy_type.version[1],
1331                       mq_policy_type.version[2]);
1332                return 0;
1333        }
1334
1335        DMERR("register failed (as default) %d", r);
1336
1337        dm_cache_policy_unregister(&mq_policy_type);
1338bad_register_mq:
1339        kmem_cache_destroy(mq_entry_cache);
1340bad:
1341        return -ENOMEM;
1342}
1343
1344static void __exit mq_exit(void)
1345{
1346        dm_cache_policy_unregister(&mq_policy_type);
1347        dm_cache_policy_unregister(&default_policy_type);
1348
1349        kmem_cache_destroy(mq_entry_cache);
1350}
1351
1352module_init(mq_init);
1353module_exit(mq_exit);
1354
1355MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
1356MODULE_LICENSE("GPL");
1357MODULE_DESCRIPTION("mq cache policy");
1358
1359MODULE_ALIAS("dm-cache-default");
1360