linux/drivers/md/dm-cache-policy-smq.c
   1/*
   2 * Copyright (C) 2015 Red Hat. All rights reserved.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include "dm-cache-policy.h"
   8#include "dm-cache-policy-internal.h"
   9#include "dm.h"
  10
  11#include <linux/hash.h>
  12#include <linux/jiffies.h>
  13#include <linux/module.h>
  14#include <linux/mutex.h>
  15#include <linux/vmalloc.h>
  16#include <linux/math64.h>
  17
  18#define DM_MSG_PREFIX "cache-policy-smq"
  19
  20/*----------------------------------------------------------------*/
  21
  22/*
  23 * Safe division functions that return zero on divide by zero.
  24 */
  25static unsigned safe_div(unsigned n, unsigned d)
  26{
  27        return d ? n / d : 0u;
  28}
  29
  30static unsigned safe_mod(unsigned n, unsigned d)
  31{
  32        return d ? n % d : 0u;
  33}
  34
  35/*----------------------------------------------------------------*/
  36
  37struct entry {
  38        unsigned hash_next:28;
  39        unsigned prev:28;
  40        unsigned next:28;
  41        unsigned level:7;
  42        bool dirty:1;
  43        bool allocated:1;
  44        bool sentinel:1;
  45
  46        dm_oblock_t oblock;
  47};
  48
  49/*----------------------------------------------------------------*/
  50
  51#define INDEXER_NULL ((1u << 28u) - 1u)
  52
  53/*
  54 * An entry_space manages a set of entries that we use for the queues.
  55 * The clean and dirty queues share entries, so this object is separate
  56 * from the queue itself.
  57 */
  58struct entry_space {
  59        struct entry *begin;
  60        struct entry *end;
  61};
  62
  63static int space_init(struct entry_space *es, unsigned nr_entries)
  64{
  65        if (!nr_entries) {
  66                es->begin = es->end = NULL;
  67                return 0;
  68        }
  69
  70        es->begin = vzalloc(sizeof(struct entry) * nr_entries);
  71        if (!es->begin)
  72                return -ENOMEM;
  73
  74        es->end = es->begin + nr_entries;
  75        return 0;
  76}
  77
  78static void space_exit(struct entry_space *es)
  79{
  80        vfree(es->begin);
  81}
  82
  83static struct entry *__get_entry(struct entry_space *es, unsigned block)
  84{
  85        struct entry *e;
  86
  87        e = es->begin + block;
  88        BUG_ON(e >= es->end);
  89
  90        return e;
  91}
  92
  93static unsigned to_index(struct entry_space *es, struct entry *e)
  94{
  95        BUG_ON(e < es->begin || e >= es->end);
  96        return e - es->begin;
  97}
  98
  99static struct entry *to_entry(struct entry_space *es, unsigned block)
 100{
 101        if (block == INDEXER_NULL)
 102                return NULL;
 103
 104        return __get_entry(es, block);
 105}
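
/*
 * A minimal, compiled-out sketch of the indexing scheme above: entries are
 * referred to by 28 bit indices rather than pointers, with INDEXER_NULL
 * standing in for a NULL pointer.  The helper name is hypothetical and it
 * assumes an entry_space initialised with at least 43 entries.
 */
#if 0
static void example_index_round_trip(struct entry_space *es)
{
        struct entry *e = __get_entry(es, 42);  /* the 43rd entry in the array */

        BUG_ON(to_index(es, e) != 42);          /* index -> entry -> index */
        BUG_ON(to_entry(es, INDEXER_NULL));     /* INDEXER_NULL maps back to NULL */
}
#endif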
 106
 107/*----------------------------------------------------------------*/
 108
 109struct ilist {
 110        unsigned nr_elts;       /* excluding sentinel entries */
 111        unsigned head, tail;
 112};
 113
 114static void l_init(struct ilist *l)
 115{
 116        l->nr_elts = 0;
 117        l->head = l->tail = INDEXER_NULL;
 118}
 119
 120static struct entry *l_head(struct entry_space *es, struct ilist *l)
 121{
 122        return to_entry(es, l->head);
 123}
 124
 125static struct entry *l_tail(struct entry_space *es, struct ilist *l)
 126{
 127        return to_entry(es, l->tail);
 128}
 129
 130static struct entry *l_next(struct entry_space *es, struct entry *e)
 131{
 132        return to_entry(es, e->next);
 133}
 134
 135static struct entry *l_prev(struct entry_space *es, struct entry *e)
 136{
 137        return to_entry(es, e->prev);
 138}
 139
 140static bool l_empty(struct ilist *l)
 141{
 142        return l->head == INDEXER_NULL;
 143}
 144
 145static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
 146{
 147        struct entry *head = l_head(es, l);
 148
 149        e->next = l->head;
 150        e->prev = INDEXER_NULL;
 151
 152        if (head)
 153                head->prev = l->head = to_index(es, e);
 154        else
 155                l->head = l->tail = to_index(es, e);
 156
 157        if (!e->sentinel)
 158                l->nr_elts++;
 159}
 160
 161static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
 162{
 163        struct entry *tail = l_tail(es, l);
 164
 165        e->next = INDEXER_NULL;
 166        e->prev = l->tail;
 167
 168        if (tail)
 169                tail->next = l->tail = to_index(es, e);
 170        else
 171                l->head = l->tail = to_index(es, e);
 172
 173        if (!e->sentinel)
 174                l->nr_elts++;
 175}
 176
 177static void l_add_before(struct entry_space *es, struct ilist *l,
 178                         struct entry *old, struct entry *e)
 179{
 180        struct entry *prev = l_prev(es, old);
 181
 182        if (!prev)
 183                l_add_head(es, l, e);
 184
 185        else {
 186                e->prev = old->prev;
 187                e->next = to_index(es, old);
 188                prev->next = old->prev = to_index(es, e);
 189
 190                if (!e->sentinel)
 191                        l->nr_elts++;
 192        }
 193}
 194
 195static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
 196{
 197        struct entry *prev = l_prev(es, e);
 198        struct entry *next = l_next(es, e);
 199
 200        if (prev)
 201                prev->next = e->next;
 202        else
 203                l->head = e->next;
 204
 205        if (next)
 206                next->prev = e->prev;
 207        else
 208                l->tail = e->prev;
 209
 210        if (!e->sentinel)
 211                l->nr_elts--;
 212}
 213
 214static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
 215{
 216        struct entry *e;
 217
 218        for (e = l_tail(es, l); e; e = l_prev(es, e))
 219                if (!e->sentinel) {
 220                        l_del(es, l, e);
 221                        return e;
 222                }
 223
 224        return NULL;
 225}
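
/*
 * A minimal, compiled-out sketch (hypothetical helper, hypothetical entry
 * indices) of the list operations above.  It assumes an initialised
 * entry_space whose first two entries are unused, non-sentinel entries.
 */
#if 0
static void example_ilist(struct entry_space *es)
{
        struct ilist l;
        struct entry *a = __get_entry(es, 0);
        struct entry *b = __get_entry(es, 1);

        l_init(&l);
        l_add_tail(es, &l, a);
        l_add_tail(es, &l, b);                  /* list is now a -> b */
        BUG_ON(l.nr_elts != 2);

        BUG_ON(l_pop_tail(es, &l) != b);        /* the tail comes off first */
        BUG_ON(l_pop_tail(es, &l) != a);
        BUG_ON(!l_empty(&l));
}
#endif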
 226
 227/*----------------------------------------------------------------*/
 228
 229/*
 230 * The stochastic-multi-queue is a set of lru lists stacked into levels.
 231 * Entries are moved up levels when they are used, which loosely orders the
 232 * most accessed entries in the top levels and least in the bottom.  This
 233 * structure is *much* better than a single lru list.
 234 */
 235#define MAX_LEVELS 64u
 236
 237struct queue {
 238        struct entry_space *es;
 239
 240        unsigned nr_elts;
 241        unsigned nr_levels;
 242        struct ilist qs[MAX_LEVELS];
 243
 244        /*
 245         * We maintain a count of the number of entries we would like in each
 246         * level.
 247         */
 248        unsigned last_target_nr_elts;
 249        unsigned nr_top_levels;
 250        unsigned nr_in_top_levels;
 251        unsigned target_count[MAX_LEVELS];
 252};
 253
 254static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
 255{
 256        unsigned i;
 257
 258        q->es = es;
 259        q->nr_elts = 0;
 260        q->nr_levels = nr_levels;
 261
 262        for (i = 0; i < q->nr_levels; i++) {
 263                l_init(q->qs + i);
 264                q->target_count[i] = 0u;
 265        }
 266
 267        q->last_target_nr_elts = 0u;
 268        q->nr_top_levels = 0u;
 269        q->nr_in_top_levels = 0u;
 270}
 271
 272static unsigned q_size(struct queue *q)
 273{
 274        return q->nr_elts;
 275}
 276
 277/*
 278 * Insert an entry to the back of the given level.
 279 */
 280static void q_push(struct queue *q, struct entry *e)
 281{
 282        if (!e->sentinel)
 283                q->nr_elts++;
 284
 285        l_add_tail(q->es, q->qs + e->level, e);
 286}
 287
 288static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
 289{
 290        if (!e->sentinel)
 291                q->nr_elts++;
 292
 293        l_add_before(q->es, q->qs + e->level, old, e);
 294}
 295
 296static void q_del(struct queue *q, struct entry *e)
 297{
 298        l_del(q->es, q->qs + e->level, e);
 299        if (!e->sentinel)
 300                q->nr_elts--;
 301}
 302
 303/*
 304 * Return the oldest entry of the lowest populated level.
 305 */
 306static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
 307{
 308        unsigned level;
 309        struct entry *e;
 310
 311        max_level = min(max_level, q->nr_levels);
 312
 313        for (level = 0; level < max_level; level++)
 314                for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
 315                        if (e->sentinel) {
 316                                if (can_cross_sentinel)
 317                                        continue;
 318                                else
 319                                        break;
 320                        }
 321
 322                        return e;
 323                }
 324
 325        return NULL;
 326}
 327
 328static struct entry *q_pop(struct queue *q)
 329{
 330        struct entry *e = q_peek(q, q->nr_levels, true);
 331
 332        if (e)
 333                q_del(q, e);
 334
 335        return e;
 336}
 337
 338/*
 339 * Pops an entry from a level that is not past a sentinel.
 340 */
 341static struct entry *q_pop_old(struct queue *q, unsigned max_level)
 342{
 343        struct entry *e = q_peek(q, max_level, false);
 344
 345        if (e)
 346                q_del(q, e);
 347
 348        return e;
 349}
 350
 351/*
 352 * This function assumes there is a non-sentinel entry to pop.  It's only
 353 * used by redistribute, so we know this is true.  It also doesn't adjust
 354 * the q->nr_elts count.
 355 */
 356static struct entry *__redist_pop_from(struct queue *q, unsigned level)
 357{
 358        struct entry *e;
 359
 360        for (; level < q->nr_levels; level++)
 361                for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
 362                        if (!e->sentinel) {
 363                                l_del(q->es, q->qs + e->level, e);
 364                                return e;
 365                        }
 366
 367        return NULL;
 368}
 369
 370static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
 371{
 372        unsigned level, nr_levels, entries_per_level, remainder;
 373
 374        BUG_ON(lbegin > lend);
 375        BUG_ON(lend > q->nr_levels);
 376        nr_levels = lend - lbegin;
 377        entries_per_level = safe_div(nr_elts, nr_levels);
 378        remainder = safe_mod(nr_elts, nr_levels);
 379
 380        for (level = lbegin; level < lend; level++)
 381                q->target_count[level] =
 382                        (level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
 383}
 384
 385/*
 386 * Typically we have fewer elements in the top few levels which allows us
 387 * to adjust the promote threshold nicely.
 388 */
 389static void q_set_targets(struct queue *q)
 390{
 391        if (q->last_target_nr_elts == q->nr_elts)
 392                return;
 393
 394        q->last_target_nr_elts = q->nr_elts;
 395
 396        if (q->nr_top_levels > q->nr_levels)
 397                q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
 398
 399        else {
 400                q_set_targets_subrange_(q, q->nr_in_top_levels,
 401                                        q->nr_levels - q->nr_top_levels, q->nr_levels);
 402
 403                if (q->nr_in_top_levels < q->nr_elts)
 404                        q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
 405                                                0, q->nr_levels - q->nr_top_levels);
 406                else
 407                        q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
 408        }
 409}
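
/*
 * A compiled-out worked example (hypothetical numbers, hypothetical helper)
 * of how a subrange split falls out; it assumes the queue has at least
 * four levels.
 */
#if 0
static void example_targets(struct queue *q)
{
        /* 10 entries over levels [0, 4): 10 / 4 = 2 each, remainder 2. */
        q_set_targets_subrange_(q, 10u, 0u, 4u);

        /* The remainder is given to the first levels: 3, 3, 2, 2. */
        BUG_ON(q->target_count[0] != 3u || q->target_count[1] != 3u);
        BUG_ON(q->target_count[2] != 2u || q->target_count[3] != 2u);
}
#endif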
 410
 411static void q_redistribute(struct queue *q)
 412{
 413        unsigned target, level;
 414        struct ilist *l, *l_above;
 415        struct entry *e;
 416
 417        q_set_targets(q);
 418
 419        for (level = 0u; level < q->nr_levels - 1u; level++) {
 420                l = q->qs + level;
 421                target = q->target_count[level];
 422
 423                /*
 424                 * Pull down some entries from the level above.
 425                 */
 426                while (l->nr_elts < target) {
 427                        e = __redist_pop_from(q, level + 1u);
 428                        if (!e) {
 429                                /* bug in nr_elts */
 430                                break;
 431                        }
 432
 433                        e->level = level;
 434                        l_add_tail(q->es, l, e);
 435                }
 436
 437                /*
 438                 * Push some entries up.
 439                 */
 440                l_above = q->qs + level + 1u;
 441                while (l->nr_elts > target) {
 442                        e = l_pop_tail(q->es, l);
 443
 444                        if (!e)
 445                                /* bug in nr_elts */
 446                                break;
 447
 448                        e->level = level + 1u;
 449                        l_add_head(q->es, l_above, e);
 450                }
 451        }
 452}
 453
 454static void q_requeue_before(struct queue *q, struct entry *dest, struct entry *e, unsigned extra_levels)
 455{
 456        struct entry *de;
 457        unsigned new_level;
 458
 459        q_del(q, e);
 460
 461        if (extra_levels && (e->level < q->nr_levels - 1u)) {
 462                new_level = min(q->nr_levels - 1u, e->level + extra_levels);
 463                for (de = l_head(q->es, q->qs + new_level); de; de = l_next(q->es, de)) {
 464                        if (de->sentinel)
 465                                continue;
 466
 467                        q_del(q, de);
 468                        de->level = e->level;
 469
 470                        if (dest)
 471                                q_push_before(q, dest, de);
 472                        else
 473                                q_push(q, de);
 474                        break;
 475                }
 476
 477                e->level = new_level;
 478        }
 479
 480        q_push(q, e);
 481}
 482
 483static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels)
 484{
 485        q_requeue_before(q, NULL, e, extra_levels);
 486}
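
/*
 * A minimal, compiled-out sketch (hypothetical helper) of the level
 * promotion described at the top of this section: a requeue with
 * extra_levels = 1 lifts a hit entry one level, demoting an entry from the
 * destination level if one is there.  It assumes a queue with at least two
 * levels and a non-sentinel entry that is not currently queued.
 */
#if 0
static void example_queue_promotion(struct queue *q, struct entry *e)
{
        e->level = 0u;
        q_push(q, e);                   /* starts at the bottom level */

        q_requeue(q, e, 1u);            /* a hit bumps it up a level */
        BUG_ON(e->level != 1u);

        q_del(q, e);
}
#endif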
 487
 488/*----------------------------------------------------------------*/
 489
 490#define FP_SHIFT 8
 491#define SIXTEENTH (1u << (FP_SHIFT - 4u))
 492#define EIGHTH (1u << (FP_SHIFT - 3u))
 493
 494struct stats {
 495        unsigned hit_threshold;
 496        unsigned hits;
 497        unsigned misses;
 498};
 499
 500enum performance {
 501        Q_POOR,
 502        Q_FAIR,
 503        Q_WELL
 504};
 505
 506static void stats_init(struct stats *s, unsigned nr_levels)
 507{
 508        s->hit_threshold = (nr_levels * 3u) / 4u;
 509        s->hits = 0u;
 510        s->misses = 0u;
 511}
 512
 513static void stats_reset(struct stats *s)
 514{
 515        s->hits = s->misses = 0u;
 516}
 517
 518static void stats_level_accessed(struct stats *s, unsigned level)
 519{
 520        if (level >= s->hit_threshold)
 521                s->hits++;
 522        else
 523                s->misses++;
 524}
 525
 526static void stats_miss(struct stats *s)
 527{
 528        s->misses++;
 529}
 530
 531/*
  532 * There are times when we don't have any confidence in the hotspot queue,
  533 * such as when a fresh cache is created and the blocks have been spread
  534 * out across the levels, or if the io load changes.  We detect this by
 535 * seeing how often a lookup is in the top levels of the hotspot queue.
 536 */
 537static enum performance stats_assess(struct stats *s)
 538{
 539        unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
 540
 541        if (confidence < SIXTEENTH)
 542                return Q_POOR;
 543
 544        else if (confidence < EIGHTH)
 545                return Q_FAIR;
 546
 547        else
 548                return Q_WELL;
 549}
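
/*
 * A compiled-out worked example (hypothetical numbers, hypothetical helper)
 * of the 8 bit fixed point thresholds above: confidence is
 * (hits << 8) / (hits + misses), so SIXTEENTH is 16 and EIGHTH is 32.
 */
#if 0
static void example_stats_assess(void)
{
        struct stats s = { .hit_threshold = 48u };

        s.hits = 1u;
        s.misses = 31u;                 /* 256 / 32 = 8  -> Q_POOR */
        BUG_ON(stats_assess(&s) != Q_POOR);

        s.hits = 1u;
        s.misses = 9u;                  /* 256 / 10 = 25 -> Q_FAIR */
        BUG_ON(stats_assess(&s) != Q_FAIR);

        s.hits = 1u;
        s.misses = 1u;                  /* 256 / 2 = 128 -> Q_WELL */
        BUG_ON(stats_assess(&s) != Q_WELL);
}
#endif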
 550
 551/*----------------------------------------------------------------*/
 552
 553struct hash_table {
 554        struct entry_space *es;
 555        unsigned long long hash_bits;
 556        unsigned *buckets;
 557};
 558
 559/*
 560 * All cache entries are stored in a chained hash table.  To save space we
 561 * use indexing again, and only store indexes to the next entry.
 562 */
 563static int h_init(struct hash_table *ht, struct entry_space *es, unsigned nr_entries)
 564{
 565        unsigned i, nr_buckets;
 566
 567        ht->es = es;
 568        nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
 569        ht->hash_bits = __ffs(nr_buckets);
 570
 571        ht->buckets = vmalloc(sizeof(*ht->buckets) * nr_buckets);
 572        if (!ht->buckets)
 573                return -ENOMEM;
 574
 575        for (i = 0; i < nr_buckets; i++)
 576                ht->buckets[i] = INDEXER_NULL;
 577
 578        return 0;
 579}
 580
 581static void h_exit(struct hash_table *ht)
 582{
 583        vfree(ht->buckets);
 584}
 585
 586static struct entry *h_head(struct hash_table *ht, unsigned bucket)
 587{
 588        return to_entry(ht->es, ht->buckets[bucket]);
 589}
 590
 591static struct entry *h_next(struct hash_table *ht, struct entry *e)
 592{
 593        return to_entry(ht->es, e->hash_next);
 594}
 595
 596static void __h_insert(struct hash_table *ht, unsigned bucket, struct entry *e)
 597{
 598        e->hash_next = ht->buckets[bucket];
 599        ht->buckets[bucket] = to_index(ht->es, e);
 600}
 601
 602static void h_insert(struct hash_table *ht, struct entry *e)
 603{
 604        unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
 605        __h_insert(ht, h, e);
 606}
 607
 608static struct entry *__h_lookup(struct hash_table *ht, unsigned h, dm_oblock_t oblock,
 609                                struct entry **prev)
 610{
 611        struct entry *e;
 612
 613        *prev = NULL;
 614        for (e = h_head(ht, h); e; e = h_next(ht, e)) {
 615                if (e->oblock == oblock)
 616                        return e;
 617
 618                *prev = e;
 619        }
 620
 621        return NULL;
 622}
 623
 624static void __h_unlink(struct hash_table *ht, unsigned h,
 625                       struct entry *e, struct entry *prev)
 626{
 627        if (prev)
 628                prev->hash_next = e->hash_next;
 629        else
 630                ht->buckets[h] = e->hash_next;
 631}
 632
 633/*
  634 * Also moves the looked-up entry to the front of its bucket.
 635 */
 636static struct entry *h_lookup(struct hash_table *ht, dm_oblock_t oblock)
 637{
 638        struct entry *e, *prev;
 639        unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
 640
 641        e = __h_lookup(ht, h, oblock, &prev);
 642        if (e && prev) {
 643                /*
 644                 * Move to the front because this entry is likely
 645                 * to be hit again.
 646                 */
 647                __h_unlink(ht, h, e, prev);
 648                __h_insert(ht, h, e);
 649        }
 650
 651        return e;
 652}
 653
 654static void h_remove(struct hash_table *ht, struct entry *e)
 655{
 656        unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
 657        struct entry *prev;
 658
 659        /*
 660         * The down side of using a singly linked list is we have to
 661         * iterate the bucket to remove an item.
 662         */
 663        e = __h_lookup(ht, h, e->oblock, &prev);
 664        if (e)
 665                __h_unlink(ht, h, e, prev);
 666}
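
/*
 * A minimal, compiled-out sketch (hypothetical helper, hypothetical origin
 * block number) of the chained, index based hash table in use.  It assumes
 * an initialised table, an unused entry, and that no other entry maps the
 * same oblock.
 */
#if 0
static void example_hash_table(struct hash_table *ht, struct entry *e)
{
        e->oblock = to_oblock(1234);

        h_insert(ht, e);
        BUG_ON(h_lookup(ht, to_oblock(1234)) != e);     /* found; moved to the bucket head if needed */

        h_remove(ht, e);
        BUG_ON(h_lookup(ht, to_oblock(1234)));          /* gone again */
}
#endif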
 667
 668/*----------------------------------------------------------------*/
 669
 670struct entry_alloc {
 671        struct entry_space *es;
 672        unsigned begin;
 673
 674        unsigned nr_allocated;
 675        struct ilist free;
 676};
 677
 678static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
 679                           unsigned begin, unsigned end)
 680{
 681        unsigned i;
 682
 683        ea->es = es;
 684        ea->nr_allocated = 0u;
 685        ea->begin = begin;
 686
 687        l_init(&ea->free);
 688        for (i = begin; i != end; i++)
 689                l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
 690}
 691
 692static void init_entry(struct entry *e)
 693{
 694        /*
 695         * We can't memset because that would clear the hotspot and
 696         * sentinel bits which remain constant.
 697         */
 698        e->hash_next = INDEXER_NULL;
 699        e->next = INDEXER_NULL;
 700        e->prev = INDEXER_NULL;
 701        e->level = 0u;
 702        e->allocated = true;
 703}
 704
 705static struct entry *alloc_entry(struct entry_alloc *ea)
 706{
 707        struct entry *e;
 708
 709        if (l_empty(&ea->free))
 710                return NULL;
 711
 712        e = l_pop_tail(ea->es, &ea->free);
 713        init_entry(e);
 714        ea->nr_allocated++;
 715
 716        return e;
 717}
 718
 719/*
 720 * This assumes the cblock hasn't already been allocated.
 721 */
 722static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
 723{
 724        struct entry *e = __get_entry(ea->es, ea->begin + i);
 725
 726        BUG_ON(e->allocated);
 727
 728        l_del(ea->es, &ea->free, e);
 729        init_entry(e);
 730        ea->nr_allocated++;
 731
 732        return e;
 733}
 734
 735static void free_entry(struct entry_alloc *ea, struct entry *e)
 736{
 737        BUG_ON(!ea->nr_allocated);
 738        BUG_ON(!e->allocated);
 739
 740        ea->nr_allocated--;
 741        e->allocated = false;
 742        l_add_tail(ea->es, &ea->free, e);
 743}
 744
 745static bool allocator_empty(struct entry_alloc *ea)
 746{
 747        return l_empty(&ea->free);
 748}
 749
 750static unsigned get_index(struct entry_alloc *ea, struct entry *e)
 751{
 752        return to_index(ea->es, e) - ea->begin;
 753}
 754
 755static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
 756{
 757        return __get_entry(ea->es, ea->begin + index);
 758}
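
/*
 * A minimal, compiled-out sketch (hypothetical helper) of the allocator:
 * entries come off the free list, can be turned back into an index relative
 * to the allocator's range, and return to the free list when released.
 */
#if 0
static void example_allocator(struct entry_alloc *ea)
{
        struct entry *e = alloc_entry(ea);

        if (e) {                        /* NULL once the free list is empty */
                BUG_ON(get_entry(ea, get_index(ea, e)) != e);
                free_entry(ea, e);
        }
}
#endif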
 759
 760/*----------------------------------------------------------------*/
 761
 762#define NR_HOTSPOT_LEVELS 64u
 763#define NR_CACHE_LEVELS 64u
 764
 765#define WRITEBACK_PERIOD (10 * HZ)
 766#define DEMOTE_PERIOD (60 * HZ)
 767
 768#define HOTSPOT_UPDATE_PERIOD (HZ)
 769#define CACHE_UPDATE_PERIOD (10u * HZ)
 770
 771struct smq_policy {
 772        struct dm_cache_policy policy;
 773
 774        /* protects everything */
 775        spinlock_t lock;
 776        dm_cblock_t cache_size;
 777        sector_t cache_block_size;
 778
 779        sector_t hotspot_block_size;
 780        unsigned nr_hotspot_blocks;
 781        unsigned cache_blocks_per_hotspot_block;
 782        unsigned hotspot_level_jump;
 783
 784        struct entry_space es;
 785        struct entry_alloc writeback_sentinel_alloc;
 786        struct entry_alloc demote_sentinel_alloc;
 787        struct entry_alloc hotspot_alloc;
 788        struct entry_alloc cache_alloc;
 789
 790        unsigned long *hotspot_hit_bits;
 791        unsigned long *cache_hit_bits;
 792
 793        /*
 794         * We maintain three queues of entries.  The cache proper,
 795         * consisting of a clean and dirty queue, containing the currently
 796         * active mappings.  The hotspot queue uses a larger block size to
 797         * track blocks that are being hit frequently and potential
 798         * candidates for promotion to the cache.
 799         */
 800        struct queue hotspot;
 801        struct queue clean;
 802        struct queue dirty;
 803
 804        struct stats hotspot_stats;
 805        struct stats cache_stats;
 806
 807        /*
 808         * Keeps track of time, incremented by the core.  We use this to
 809         * avoid attributing multiple hits within the same tick.
 810         */
 811        unsigned tick;
 812
 813        /*
  814         * The hash tables allow us to quickly find an entry by origin
 815         * block.
 816         */
 817        struct hash_table table;
 818        struct hash_table hotspot_table;
 819
 820        bool current_writeback_sentinels;
 821        unsigned long next_writeback_period;
 822
 823        bool current_demote_sentinels;
 824        unsigned long next_demote_period;
 825
 826        unsigned write_promote_level;
 827        unsigned read_promote_level;
 828
 829        unsigned long next_hotspot_period;
 830        unsigned long next_cache_period;
 831};
 832
 833/*----------------------------------------------------------------*/
 834
 835static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
 836{
 837        return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
 838}
 839
 840static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
 841{
 842        return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
 843}
 844
 845static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
 846{
 847        return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
 848}
 849
 850static void __update_writeback_sentinels(struct smq_policy *mq)
 851{
 852        unsigned level;
 853        struct queue *q = &mq->dirty;
 854        struct entry *sentinel;
 855
 856        for (level = 0; level < q->nr_levels; level++) {
 857                sentinel = writeback_sentinel(mq, level);
 858                q_del(q, sentinel);
 859                q_push(q, sentinel);
 860        }
 861}
 862
 863static void __update_demote_sentinels(struct smq_policy *mq)
 864{
 865        unsigned level;
 866        struct queue *q = &mq->clean;
 867        struct entry *sentinel;
 868
 869        for (level = 0; level < q->nr_levels; level++) {
 870                sentinel = demote_sentinel(mq, level);
 871                q_del(q, sentinel);
 872                q_push(q, sentinel);
 873        }
 874}
 875
 876static void update_sentinels(struct smq_policy *mq)
 877{
 878        if (time_after(jiffies, mq->next_writeback_period)) {
 879                __update_writeback_sentinels(mq);
 880                mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
 881                mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
 882        }
 883
 884        if (time_after(jiffies, mq->next_demote_period)) {
 885                __update_demote_sentinels(mq);
 886                mq->next_demote_period = jiffies + DEMOTE_PERIOD;
 887                mq->current_demote_sentinels = !mq->current_demote_sentinels;
 888        }
 889}
 890
 891static void __sentinels_init(struct smq_policy *mq)
 892{
 893        unsigned level;
 894        struct entry *sentinel;
 895
 896        for (level = 0; level < NR_CACHE_LEVELS; level++) {
 897                sentinel = writeback_sentinel(mq, level);
 898                sentinel->level = level;
 899                q_push(&mq->dirty, sentinel);
 900
 901                sentinel = demote_sentinel(mq, level);
 902                sentinel->level = level;
 903                q_push(&mq->clean, sentinel);
 904        }
 905}
 906
 907static void sentinels_init(struct smq_policy *mq)
 908{
 909        mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
 910        mq->next_demote_period = jiffies + DEMOTE_PERIOD;
 911
 912        mq->current_writeback_sentinels = false;
 913        mq->current_demote_sentinels = false;
 914        __sentinels_init(mq);
 915
 916        mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
 917        mq->current_demote_sentinels = !mq->current_demote_sentinels;
 918        __sentinels_init(mq);
 919}
 920
 921/*----------------------------------------------------------------*/
 922
 923/*
 924 * These methods tie together the dirty queue, clean queue and hash table.
 925 */
 926static void push_new(struct smq_policy *mq, struct entry *e)
 927{
 928        struct queue *q = e->dirty ? &mq->dirty : &mq->clean;
 929        h_insert(&mq->table, e);
 930        q_push(q, e);
 931}
 932
 933static void push(struct smq_policy *mq, struct entry *e)
 934{
 935        struct entry *sentinel;
 936
 937        h_insert(&mq->table, e);
 938
 939        /*
 940         * Punch this into the queue just in front of the sentinel, to
 941         * ensure it's cleaned straight away.
 942         */
 943        if (e->dirty) {
 944                sentinel = writeback_sentinel(mq, e->level);
 945                q_push_before(&mq->dirty, sentinel, e);
 946        } else {
 947                sentinel = demote_sentinel(mq, e->level);
 948                q_push_before(&mq->clean, sentinel, e);
 949        }
 950}
 951
 952/*
  953 * Removes an entry from its cache queue and from the hash table.
 954 */
 955static void __del(struct smq_policy *mq, struct queue *q, struct entry *e)
 956{
 957        q_del(q, e);
 958        h_remove(&mq->table, e);
 959}
 960
 961static void del(struct smq_policy *mq, struct entry *e)
 962{
 963        __del(mq, e->dirty ? &mq->dirty : &mq->clean, e);
 964}
 965
 966static struct entry *pop_old(struct smq_policy *mq, struct queue *q, unsigned max_level)
 967{
 968        struct entry *e = q_pop_old(q, max_level);
 969        if (e)
 970                h_remove(&mq->table, e);
 971        return e;
 972}
 973
 974static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
 975{
 976        return to_cblock(get_index(&mq->cache_alloc, e));
 977}
 978
 979static void requeue(struct smq_policy *mq, struct entry *e)
 980{
 981        struct entry *sentinel;
 982
 983        if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
 984                if (e->dirty) {
 985                        sentinel = writeback_sentinel(mq, e->level);
 986                        q_requeue_before(&mq->dirty, sentinel, e, 1u);
 987                } else {
 988                        sentinel = demote_sentinel(mq, e->level);
 989                        q_requeue_before(&mq->clean, sentinel, e, 1u);
 990                }
 991        }
 992}
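
/*
 * A minimal, compiled-out sketch (hypothetical helper, hypothetical origin
 * block) of how push() and del() keep a queue and the hash table in step.
 * It assumes an initialised policy, a freshly allocated cache entry, and
 * that oblock 99 is not already mapped.
 */
#if 0
static void example_push_del(struct smq_policy *mq, struct entry *e)
{
        e->oblock = to_oblock(99);
        e->dirty = false;

        push(mq, e);                    /* onto the clean queue and into the hash table */
        BUG_ON(h_lookup(&mq->table, to_oblock(99)) != e);

        del(mq, e);                     /* off both again */
        BUG_ON(h_lookup(&mq->table, to_oblock(99)));
}
#endif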
 993
 994static unsigned default_promote_level(struct smq_policy *mq)
 995{
 996        /*
 997         * The promote level depends on the current performance of the
 998         * cache.
 999         *
1000         * If the cache is performing badly, then we can't afford
1001         * to promote much without causing performance to drop below that
1002         * of the origin device.
1003         *
1004         * If the cache is performing well, then we don't need to promote
1005         * much.  If it isn't broken, don't fix it.
1006         *
1007         * If the cache is middling then we promote more.
1008         *
1009         * This scheme reminds me of a graph of entropy vs probability of a
1010         * binary variable.
1011         */
1012        static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};
1013
1014        unsigned hits = mq->cache_stats.hits;
1015        unsigned misses = mq->cache_stats.misses;
1016        unsigned index = safe_div(hits << 4u, hits + misses);
1017        return table[index];
1018}
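
/*
 * A compiled-out worked example (hypothetical numbers, hypothetical helper)
 * of how the hit ratio indexes the table above.
 */
#if 0
static void example_promote_level(struct smq_policy *mq)
{
        /*
         * 3 hits, 1 miss: index = (3 << 4) / 4 = 12, which selects the
         * middling value 3.  A perfect hit rate gives index 16 and the
         * final value 1: a cache doing well needs few promotions.
         */
        mq->cache_stats.hits = 3u;
        mq->cache_stats.misses = 1u;
        BUG_ON(default_promote_level(mq) != 3u);
}
#endif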
1019
1020static void update_promote_levels(struct smq_policy *mq)
1021{
1022        /*
1023         * If there are unused cache entries then we want to be really
1024         * eager to promote.
1025         */
1026        unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
1027                default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
1028
1029        /*
1030         * If the hotspot queue is performing badly then we have little
1031         * confidence that we know which blocks to promote.  So we cut down
1032         * the amount of promotions.
1033         */
1034        switch (stats_assess(&mq->hotspot_stats)) {
1035        case Q_POOR:
1036                threshold_level /= 4u;
1037                break;
1038
1039        case Q_FAIR:
1040                threshold_level /= 2u;
1041                break;
1042
1043        case Q_WELL:
1044                break;
1045        }
1046
1047        mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
1048        mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level) + 2u;
1049}
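
/*
 * Worked example (hypothetical numbers): with free cache entries the
 * threshold starts at NR_HOTSPOT_LEVELS / 2 = 32.  A Q_FAIR assessment
 * halves that to 16, giving read_promote_level = 48 and
 * write_promote_level = 50, i.e. only hotspot entries in roughly the top
 * quarter of the 64 levels trigger a read promotion.
 */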
1050
1051/*
1052 * If the hotspot queue is performing badly, then we try and move entries
1053 * around more quickly.
1054 */
1055static void update_level_jump(struct smq_policy *mq)
1056{
1057        switch (stats_assess(&mq->hotspot_stats)) {
1058        case Q_POOR:
1059                mq->hotspot_level_jump = 4u;
1060                break;
1061
1062        case Q_FAIR:
1063                mq->hotspot_level_jump = 2u;
1064                break;
1065
1066        case Q_WELL:
1067                mq->hotspot_level_jump = 1u;
1068                break;
1069        }
1070}
1071
1072static void end_hotspot_period(struct smq_policy *mq)
1073{
1074        clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
1075        update_promote_levels(mq);
1076
1077        if (time_after(jiffies, mq->next_hotspot_period)) {
1078                update_level_jump(mq);
1079                q_redistribute(&mq->hotspot);
1080                stats_reset(&mq->hotspot_stats);
1081                mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
1082        }
1083}
1084
1085static void end_cache_period(struct smq_policy *mq)
1086{
1087        if (time_after(jiffies, mq->next_cache_period)) {
1088                clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
1089
1090                q_redistribute(&mq->dirty);
1091                q_redistribute(&mq->clean);
1092                stats_reset(&mq->cache_stats);
1093
1094                mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
1095        }
1096}
1097
1098static int demote_cblock(struct smq_policy *mq,
1099                         struct policy_locker *locker,
1100                         dm_oblock_t *oblock)
1101{
1102        struct entry *demoted = q_peek(&mq->clean, mq->clean.nr_levels, false);
1103        if (!demoted)
1104                /*
1105                 * We could get a block from mq->dirty, but that
1106                 * would add extra latency to the triggering bio as it
1107                 * waits for the writeback.  Better to not promote this
1108                 * time and hope there's a clean block next time this block
1109                 * is hit.
1110                 */
1111                return -ENOSPC;
1112
1113        if (locker->fn(locker, demoted->oblock))
1114                /*
1115                 * We couldn't lock this block.
1116                 */
1117                return -EBUSY;
1118
1119        del(mq, demoted);
1120        *oblock = demoted->oblock;
1121        free_entry(&mq->cache_alloc, demoted);
1122
1123        return 0;
1124}
1125
1126enum promote_result {
1127        PROMOTE_NOT,
1128        PROMOTE_TEMPORARY,
1129        PROMOTE_PERMANENT
1130};
1131
1132/*
1133 * Converts a boolean into a promote result.
1134 */
1135static enum promote_result maybe_promote(bool promote)
1136{
1137        return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
1138}
1139
1140static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e, struct bio *bio,
1141                                          bool fast_promote)
1142{
1143        if (bio_data_dir(bio) == WRITE) {
1144                if (!allocator_empty(&mq->cache_alloc) && fast_promote)
1145                        return PROMOTE_TEMPORARY;
1146
1147                else
1148                        return maybe_promote(hs_e->level >= mq->write_promote_level);
1149        } else
1150                return maybe_promote(hs_e->level >= mq->read_promote_level);
1151}
1152
1153static void insert_in_cache(struct smq_policy *mq, dm_oblock_t oblock,
1154                            struct policy_locker *locker,
1155                            struct policy_result *result, enum promote_result pr)
1156{
1157        int r;
1158        struct entry *e;
1159
1160        if (allocator_empty(&mq->cache_alloc)) {
1161                result->op = POLICY_REPLACE;
1162                r = demote_cblock(mq, locker, &result->old_oblock);
1163                if (r) {
1164                        result->op = POLICY_MISS;
1165                        return;
1166                }
1167
1168        } else
1169                result->op = POLICY_NEW;
1170
1171        e = alloc_entry(&mq->cache_alloc);
1172        BUG_ON(!e);
1173        e->oblock = oblock;
1174
1175        if (pr == PROMOTE_TEMPORARY)
1176                push(mq, e);
1177        else
1178                push_new(mq, e);
1179
1180        result->cblock = infer_cblock(mq, e);
1181}
1182
1183static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
1184{
1185        sector_t r = from_oblock(b);
1186        (void) sector_div(r, mq->cache_blocks_per_hotspot_block);
1187        return to_oblock(r);
1188}
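
/*
 * A minimal, compiled-out sketch (hypothetical helper and geometry) of how
 * origin blocks collapse onto the larger hotspot blocks.
 */
#if 0
static void example_to_hblock(struct smq_policy *mq)
{
        /* With 16 cache blocks per hotspot block, blocks 0..15 share hotspot block 0. */
        if (mq->cache_blocks_per_hotspot_block == 16u) {
                BUG_ON(from_oblock(to_hblock(mq, to_oblock(15))) != 0);
                BUG_ON(from_oblock(to_hblock(mq, to_oblock(16))) != 1);
        }
}
#endif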
1189
1190static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b, struct bio *bio)
1191{
1192        unsigned hi;
1193        dm_oblock_t hb = to_hblock(mq, b);
1194        struct entry *e = h_lookup(&mq->hotspot_table, hb);
1195
1196        if (e) {
1197                stats_level_accessed(&mq->hotspot_stats, e->level);
1198
1199                hi = get_index(&mq->hotspot_alloc, e);
1200                q_requeue(&mq->hotspot, e,
1201                          test_and_set_bit(hi, mq->hotspot_hit_bits) ?
1202                          0u : mq->hotspot_level_jump);
1203
1204        } else {
1205                stats_miss(&mq->hotspot_stats);
1206
1207                e = alloc_entry(&mq->hotspot_alloc);
1208                if (!e) {
1209                        e = q_pop(&mq->hotspot);
1210                        if (e) {
1211                                h_remove(&mq->hotspot_table, e);
1212                                hi = get_index(&mq->hotspot_alloc, e);
1213                                clear_bit(hi, mq->hotspot_hit_bits);
1214                        }
1215
1216                }
1217
1218                if (e) {
1219                        e->oblock = hb;
1220                        q_push(&mq->hotspot, e);
1221                        h_insert(&mq->hotspot_table, e);
1222                }
1223        }
1224
1225        return e;
1226}
1227
1228/*
 1229 * Looks the oblock up in the hash table, then decides whether to promote
 1230 * it to the cache.
1231 */
1232static int map(struct smq_policy *mq, struct bio *bio, dm_oblock_t oblock,
1233               bool can_migrate, bool fast_promote,
1234               struct policy_locker *locker, struct policy_result *result)
1235{
1236        struct entry *e, *hs_e;
1237        enum promote_result pr;
1238
1239        hs_e = update_hotspot_queue(mq, oblock, bio);
1240
1241        e = h_lookup(&mq->table, oblock);
1242        if (e) {
1243                stats_level_accessed(&mq->cache_stats, e->level);
1244
1245                requeue(mq, e);
1246                result->op = POLICY_HIT;
1247                result->cblock = infer_cblock(mq, e);
1248
1249        } else {
1250                stats_miss(&mq->cache_stats);
1251
1252                pr = should_promote(mq, hs_e, bio, fast_promote);
1253                if (pr == PROMOTE_NOT)
1254                        result->op = POLICY_MISS;
1255
1256                else {
1257                        if (!can_migrate) {
1258                                result->op = POLICY_MISS;
1259                                return -EWOULDBLOCK;
1260                        }
1261
1262                        insert_in_cache(mq, oblock, locker, result, pr);
1263                }
1264        }
1265
1266        return 0;
1267}
1268
1269/*----------------------------------------------------------------*/
1270
1271/*
1272 * Public interface, via the policy struct.  See dm-cache-policy.h for a
1273 * description of these.
1274 */
1275
1276static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
1277{
1278        return container_of(p, struct smq_policy, policy);
1279}
1280
1281static void smq_destroy(struct dm_cache_policy *p)
1282{
1283        struct smq_policy *mq = to_smq_policy(p);
1284
1285        h_exit(&mq->hotspot_table);
1286        h_exit(&mq->table);
1287        free_bitset(mq->hotspot_hit_bits);
1288        free_bitset(mq->cache_hit_bits);
1289        space_exit(&mq->es);
1290        kfree(mq);
1291}
1292
1293static int smq_map(struct dm_cache_policy *p, dm_oblock_t oblock,
1294                   bool can_block, bool can_migrate, bool fast_promote,
1295                   struct bio *bio, struct policy_locker *locker,
1296                   struct policy_result *result)
1297{
1298        int r;
1299        unsigned long flags;
1300        struct smq_policy *mq = to_smq_policy(p);
1301
1302        result->op = POLICY_MISS;
1303
1304        spin_lock_irqsave(&mq->lock, flags);
1305        r = map(mq, bio, oblock, can_migrate, fast_promote, locker, result);
1306        spin_unlock_irqrestore(&mq->lock, flags);
1307
1308        return r;
1309}
1310
1311static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock)
1312{
1313        int r;
1314        unsigned long flags;
1315        struct smq_policy *mq = to_smq_policy(p);
1316        struct entry *e;
1317
1318        spin_lock_irqsave(&mq->lock, flags);
1319        e = h_lookup(&mq->table, oblock);
1320        if (e) {
1321                *cblock = infer_cblock(mq, e);
1322                r = 0;
1323        } else
1324                r = -ENOENT;
1325        spin_unlock_irqrestore(&mq->lock, flags);
1326
1327        return r;
1328}
1329
1330static void __smq_set_clear_dirty(struct smq_policy *mq, dm_oblock_t oblock, bool set)
1331{
1332        struct entry *e;
1333
1334        e = h_lookup(&mq->table, oblock);
1335        BUG_ON(!e);
1336
1337        del(mq, e);
1338        e->dirty = set;
1339        push(mq, e);
1340}
1341
1342static void smq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
1343{
1344        unsigned long flags;
1345        struct smq_policy *mq = to_smq_policy(p);
1346
1347        spin_lock_irqsave(&mq->lock, flags);
1348        __smq_set_clear_dirty(mq, oblock, true);
1349        spin_unlock_irqrestore(&mq->lock, flags);
1350}
1351
1352static void smq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock)
1353{
1354        struct smq_policy *mq = to_smq_policy(p);
1355        unsigned long flags;
1356
1357        spin_lock_irqsave(&mq->lock, flags);
1358        __smq_set_clear_dirty(mq, oblock, false);
1359        spin_unlock_irqrestore(&mq->lock, flags);
1360}
1361
1362static unsigned random_level(dm_cblock_t cblock)
1363{
1364        return hash_32_generic(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
1365}
1366
1367static int smq_load_mapping(struct dm_cache_policy *p,
1368                            dm_oblock_t oblock, dm_cblock_t cblock,
1369                            uint32_t hint, bool hint_valid)
1370{
1371        struct smq_policy *mq = to_smq_policy(p);
1372        struct entry *e;
1373
1374        e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
1375        e->oblock = oblock;
1376        e->dirty = false;       /* this gets corrected in a minute */
1377        e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
1378        push(mq, e);
1379
1380        return 0;
1381}
1382
1383static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
1384{
1385        struct smq_policy *mq = to_smq_policy(p);
1386        struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1387
1388        if (!e->allocated)
1389                return 0;
1390
1391        return e->level;
1392}
1393
1394static void __remove_mapping(struct smq_policy *mq, dm_oblock_t oblock)
1395{
1396        struct entry *e;
1397
1398        e = h_lookup(&mq->table, oblock);
1399        BUG_ON(!e);
1400
1401        del(mq, e);
1402        free_entry(&mq->cache_alloc, e);
1403}
1404
1405static void smq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock)
1406{
1407        struct smq_policy *mq = to_smq_policy(p);
1408        unsigned long flags;
1409
1410        spin_lock_irqsave(&mq->lock, flags);
1411        __remove_mapping(mq, oblock);
1412        spin_unlock_irqrestore(&mq->lock, flags);
1413}
1414
1415static int __remove_cblock(struct smq_policy *mq, dm_cblock_t cblock)
1416{
1417        struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1418
1419        if (!e || !e->allocated)
1420                return -ENODATA;
1421
1422        del(mq, e);
1423        free_entry(&mq->cache_alloc, e);
1424
1425        return 0;
1426}
1427
1428static int smq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock)
1429{
1430        int r;
1431        unsigned long flags;
1432        struct smq_policy *mq = to_smq_policy(p);
1433
1434        spin_lock_irqsave(&mq->lock, flags);
1435        r = __remove_cblock(mq, cblock);
1436        spin_unlock_irqrestore(&mq->lock, flags);
1437
1438        return r;
1439}
1440
1441
1442#define CLEAN_TARGET_CRITICAL 5u /* percent */
1443
1444static bool clean_target_met(struct smq_policy *mq, bool critical)
1445{
1446        if (critical) {
1447                /*
 1448                 * Cache entries may not be populated.  So we cannot rely on the
1449                 * size of the clean queue.
1450                 */
1451                unsigned nr_clean = from_cblock(mq->cache_size) - q_size(&mq->dirty);
1452                unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_CRITICAL / 100u;
1453
1454                return nr_clean >= target;
1455        } else
1456                return !q_size(&mq->dirty);
1457}
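
/*
 * Worked example (hypothetical numbers): with a 1000 block cache the
 * critical target is 1000 * 5 / 100 = 50 clean blocks.  If 960 blocks are
 * dirty only 40 are clean, the target is missed, and the writeback path
 * below will pull dirty blocks from every level rather than just the
 * bottom one.
 */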
1458
1459static int __smq_writeback_work(struct smq_policy *mq, dm_oblock_t *oblock,
1460                                dm_cblock_t *cblock, bool critical_only)
1461{
1462        struct entry *e = NULL;
1463        bool target_met = clean_target_met(mq, critical_only);
1464
1465        if (critical_only)
1466                /*
1467                 * Always try and keep the bottom level clean.
1468                 */
1469                e = pop_old(mq, &mq->dirty, target_met ? 1u : mq->dirty.nr_levels);
1470
1471        else
1472                e = pop_old(mq, &mq->dirty, mq->dirty.nr_levels);
1473
1474        if (!e)
1475                return -ENODATA;
1476
1477        *oblock = e->oblock;
1478        *cblock = infer_cblock(mq, e);
1479        e->dirty = false;
1480        push_new(mq, e);
1481
1482        return 0;
1483}
1484
1485static int smq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock,
1486                              dm_cblock_t *cblock, bool critical_only)
1487{
1488        int r;
1489        unsigned long flags;
1490        struct smq_policy *mq = to_smq_policy(p);
1491
1492        spin_lock_irqsave(&mq->lock, flags);
1493        r = __smq_writeback_work(mq, oblock, cblock, critical_only);
1494        spin_unlock_irqrestore(&mq->lock, flags);
1495
1496        return r;
1497}
1498
1499static void __force_mapping(struct smq_policy *mq,
1500                            dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1501{
1502        struct entry *e = h_lookup(&mq->table, current_oblock);
1503
1504        if (e) {
1505                del(mq, e);
1506                e->oblock = new_oblock;
1507                e->dirty = true;
1508                push(mq, e);
1509        }
1510}
1511
1512static void smq_force_mapping(struct dm_cache_policy *p,
1513                              dm_oblock_t current_oblock, dm_oblock_t new_oblock)
1514{
1515        unsigned long flags;
1516        struct smq_policy *mq = to_smq_policy(p);
1517
1518        spin_lock_irqsave(&mq->lock, flags);
1519        __force_mapping(mq, current_oblock, new_oblock);
1520        spin_unlock_irqrestore(&mq->lock, flags);
1521}
1522
1523static dm_cblock_t smq_residency(struct dm_cache_policy *p)
1524{
1525        dm_cblock_t r;
1526        unsigned long flags;
1527        struct smq_policy *mq = to_smq_policy(p);
1528
1529        spin_lock_irqsave(&mq->lock, flags);
1530        r = to_cblock(mq->cache_alloc.nr_allocated);
1531        spin_unlock_irqrestore(&mq->lock, flags);
1532
1533        return r;
1534}
1535
1536static void smq_tick(struct dm_cache_policy *p, bool can_block)
1537{
1538        struct smq_policy *mq = to_smq_policy(p);
1539        unsigned long flags;
1540
1541        spin_lock_irqsave(&mq->lock, flags);
1542        mq->tick++;
1543        update_sentinels(mq);
1544        end_hotspot_period(mq);
1545        end_cache_period(mq);
1546        spin_unlock_irqrestore(&mq->lock, flags);
1547}
1548
1549/*
1550 * smq has no config values, but the old mq policy did.  To avoid breaking
1551 * software we continue to accept these configurables for the mq policy,
1552 * but they have no effect.
1553 */
1554static int mq_set_config_value(struct dm_cache_policy *p,
1555                               const char *key, const char *value)
1556{
1557        unsigned long tmp;
1558
1559        if (kstrtoul(value, 10, &tmp))
1560                return -EINVAL;
1561
1562        if (!strcasecmp(key, "random_threshold") ||
1563            !strcasecmp(key, "sequential_threshold") ||
1564            !strcasecmp(key, "discard_promote_adjustment") ||
1565            !strcasecmp(key, "read_promote_adjustment") ||
1566            !strcasecmp(key, "write_promote_adjustment")) {
1567                DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
1568                return 0;
1569        }
1570
1571        return -EINVAL;
1572}
1573
1574static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
1575                                 unsigned maxlen, ssize_t *sz_ptr)
1576{
1577        ssize_t sz = *sz_ptr;
1578
1579        DMEMIT("10 random_threshold 0 "
1580               "sequential_threshold 0 "
1581               "discard_promote_adjustment 0 "
1582               "read_promote_adjustment 0 "
1583               "write_promote_adjustment 0 ");
1584
1585        *sz_ptr = sz;
1586        return 0;
1587}
1588
1589/* Init the policy plugin interface function pointers. */
1590static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
1591{
1592        mq->policy.destroy = smq_destroy;
1593        mq->policy.map = smq_map;
1594        mq->policy.lookup = smq_lookup;
1595        mq->policy.set_dirty = smq_set_dirty;
1596        mq->policy.clear_dirty = smq_clear_dirty;
1597        mq->policy.load_mapping = smq_load_mapping;
1598        mq->policy.get_hint = smq_get_hint;
1599        mq->policy.remove_mapping = smq_remove_mapping;
1600        mq->policy.remove_cblock = smq_remove_cblock;
1601        mq->policy.writeback_work = smq_writeback_work;
1602        mq->policy.force_mapping = smq_force_mapping;
1603        mq->policy.residency = smq_residency;
1604        mq->policy.tick = smq_tick;
1605
1606        if (mimic_mq) {
1607                mq->policy.set_config_value = mq_set_config_value;
1608                mq->policy.emit_config_values = mq_emit_config_values;
1609        }
1610}
1611
1612static bool too_many_hotspot_blocks(sector_t origin_size,
1613                                    sector_t hotspot_block_size,
1614                                    unsigned nr_hotspot_blocks)
1615{
1616        return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
1617}
1618
1619static void calc_hotspot_params(sector_t origin_size,
1620                                sector_t cache_block_size,
1621                                unsigned nr_cache_blocks,
1622                                sector_t *hotspot_block_size,
1623                                unsigned *nr_hotspot_blocks)
1624{
1625        *hotspot_block_size = cache_block_size * 16u;
1626        *nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);
1627
1628        while ((*hotspot_block_size > cache_block_size) &&
1629               too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
1630                *hotspot_block_size /= 2u;
1631}
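
/*
 * Worked example (hypothetical geometry): with 64 sector (32KiB) cache
 * blocks and 8192 cache blocks, the hotspot block size starts at
 * 64 * 16 = 1024 sectors and nr_hotspot_blocks = max(8192 / 4, 1024) = 2048.
 * Those hotspot blocks would span 2M sectors (1GiB); against a 512MiB
 * origin that is too many, so the block size is halved once to 512 sectors.
 */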
1632
1633static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
1634                                            sector_t origin_size,
1635                                            sector_t cache_block_size,
1636                                            bool mimic_mq)
1637{
1638        unsigned i;
1639        unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
1640        unsigned total_sentinels = 2u * nr_sentinels_per_queue;
1641        struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
1642
1643        if (!mq)
1644                return NULL;
1645
1646        init_policy_functions(mq, mimic_mq);
1647        mq->cache_size = cache_size;
1648        mq->cache_block_size = cache_block_size;
1649
1650        calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
1651                            &mq->hotspot_block_size, &mq->nr_hotspot_blocks);
1652
1653        mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
1654        mq->hotspot_level_jump = 1u;
1655        if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
1656                DMERR("couldn't initialize entry space");
1657                goto bad_pool_init;
1658        }
1659
1660        init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
1661        for (i = 0; i < nr_sentinels_per_queue; i++)
1662                get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
1663
1664        init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
1665        for (i = 0; i < nr_sentinels_per_queue; i++)
1666                get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
1667
1668        init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
1669                       total_sentinels + mq->nr_hotspot_blocks);
1670
1671        init_allocator(&mq->cache_alloc, &mq->es,
1672                       total_sentinels + mq->nr_hotspot_blocks,
1673                       total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));
1674
1675        mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
1676        if (!mq->hotspot_hit_bits) {
1677                DMERR("couldn't allocate hotspot hit bitset");
1678                goto bad_hotspot_hit_bits;
1679        }
1680        clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
1681
1682        if (from_cblock(cache_size)) {
1683                mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
1684                if (!mq->cache_hit_bits) {
1685                        DMERR("couldn't allocate cache hit bitset");
1686                        goto bad_cache_hit_bits;
1687                }
1688                clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
1689        } else
1690                mq->cache_hit_bits = NULL;
1691
1692        mq->tick = 0;
1693        spin_lock_init(&mq->lock);
1694
1695        q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
1696        mq->hotspot.nr_top_levels = 8;
1697        mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
1698                                           from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
1699
1700        q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
1701        q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);
1702
1703        stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
1704        stats_init(&mq->cache_stats, NR_CACHE_LEVELS);
1705
1706        if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
1707                goto bad_alloc_table;
1708
1709        if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
1710                goto bad_alloc_hotspot_table;
1711
1712        sentinels_init(mq);
1713        mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
1714
1715        mq->next_hotspot_period = jiffies;
1716        mq->next_cache_period = jiffies;
1717
1718        return &mq->policy;
1719
1720bad_alloc_hotspot_table:
1721        h_exit(&mq->table);
1722bad_alloc_table:
1723        free_bitset(mq->cache_hit_bits);
1724bad_cache_hit_bits:
1725        free_bitset(mq->hotspot_hit_bits);
1726bad_hotspot_hit_bits:
1727        space_exit(&mq->es);
1728bad_pool_init:
1729        kfree(mq);
1730
1731        return NULL;
1732}
1733
1734static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
1735                                          sector_t origin_size,
1736                                          sector_t cache_block_size)
1737{
1738        return __smq_create(cache_size, origin_size, cache_block_size, false);
1739}
1740
1741static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1742                                         sector_t origin_size,
1743                                         sector_t cache_block_size)
1744{
1745        return __smq_create(cache_size, origin_size, cache_block_size, true);
1746}
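
/*
 * A minimal, compiled-out sketch (hypothetical helper and sizes) of the
 * policy object as dm-cache's core sees it: create it, query it through the
 * function pointers installed by init_policy_functions(), destroy it.
 */
#if 0
static void example_policy_lifecycle(void)
{
        /* 4096 cache blocks of 128 sectors fronting a 1M sector origin. */
        struct dm_cache_policy *p = smq_create(to_cblock(4096), 1 << 20, 128);

        if (p) {
                /* Nothing has been mapped yet, so residency is zero. */
                BUG_ON(from_cblock(p->residency(p)) != 0);
                p->destroy(p);
        }
}
#endif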
1747
1748/*----------------------------------------------------------------*/
1749
1750static struct dm_cache_policy_type smq_policy_type = {
1751        .name = "smq",
1752        .version = {1, 5, 0},
1753        .hint_size = 4,
1754        .owner = THIS_MODULE,
1755        .create = smq_create
1756};
1757
1758static struct dm_cache_policy_type mq_policy_type = {
1759        .name = "mq",
1760        .version = {1, 5, 0},
1761        .hint_size = 4,
1762        .owner = THIS_MODULE,
1763        .create = mq_create,
1764};
1765
1766static struct dm_cache_policy_type default_policy_type = {
1767        .name = "default",
1768        .version = {1, 5, 0},
1769        .hint_size = 4,
1770        .owner = THIS_MODULE,
1771        .create = smq_create,
1772        .real = &smq_policy_type
1773};
1774
1775static int __init smq_init(void)
1776{
1777        int r;
1778
1779        r = dm_cache_policy_register(&smq_policy_type);
1780        if (r) {
1781                DMERR("register failed %d", r);
1782                return -ENOMEM;
1783        }
1784
1785        r = dm_cache_policy_register(&mq_policy_type);
1786        if (r) {
1787                DMERR("register failed (as mq) %d", r);
1788                dm_cache_policy_unregister(&smq_policy_type);
1789                return -ENOMEM;
1790        }
1791
1792        r = dm_cache_policy_register(&default_policy_type);
1793        if (r) {
1794                DMERR("register failed (as default) %d", r);
1795                dm_cache_policy_unregister(&mq_policy_type);
1796                dm_cache_policy_unregister(&smq_policy_type);
1797                return -ENOMEM;
1798        }
1799
1800        return 0;
1801}
1802
1803static void __exit smq_exit(void)
1804{
1805        dm_cache_policy_unregister(&smq_policy_type);
1806        dm_cache_policy_unregister(&mq_policy_type);
1807        dm_cache_policy_unregister(&default_policy_type);
1808}
1809
1810module_init(smq_init);
1811module_exit(smq_exit);
1812
1813MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
1814MODULE_LICENSE("GPL");
1815MODULE_DESCRIPTION("smq cache policy");
1816
1817MODULE_ALIAS("dm-cache-default");
1818MODULE_ALIAS("dm-cache-mq");
1819