linux/drivers/md/dm-cache-policy-smq.c
   1/*
   2 * Copyright (C) 2015 Red Hat. All rights reserved.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include "dm-cache-background-tracker.h"
   8#include "dm-cache-policy-internal.h"
   9#include "dm-cache-policy.h"
  10#include "dm.h"
  11
  12#include <linux/hash.h>
  13#include <linux/jiffies.h>
  14#include <linux/module.h>
  15#include <linux/mutex.h>
  16#include <linux/vmalloc.h>
  17#include <linux/math64.h>
  18
  19#define DM_MSG_PREFIX "cache-policy-smq"
  20
  21/*----------------------------------------------------------------*/
  22
  23/*
  24 * Safe division functions that return zero on divide by zero.
  25 */
  26static unsigned safe_div(unsigned n, unsigned d)
  27{
  28        return d ? n / d : 0u;
  29}
  30
  31static unsigned safe_mod(unsigned n, unsigned d)
  32{
  33        return d ? n % d : 0u;
  34}
  35
  36/*----------------------------------------------------------------*/
  37
  38struct entry {
  39        unsigned hash_next:28;
  40        unsigned prev:28;
  41        unsigned next:28;
  42        unsigned level:6;
  43        bool dirty:1;
  44        bool allocated:1;
  45        bool sentinel:1;
  46        bool pending_work:1;
  47
  48        dm_oblock_t oblock;
  49};
  50
  51/*----------------------------------------------------------------*/
  52
  53#define INDEXER_NULL ((1u << 28u) - 1u)
  54
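    /*
     * Entries are linked by 28-bit indices into the entry_space array
     * rather than by pointers; the all-ones value INDEXER_NULL stands in
     * for NULL.  This keeps struct entry small and caps the number of
     * usable entries at 2^28 - 1.
     */
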
  55/*
  56 * An entry_space manages a set of entries that we use for the queues.
  57 * The clean and dirty queues share entries, so this object is separate
  58 * from the queue itself.
  59 */
  60struct entry_space {
  61        struct entry *begin;
  62        struct entry *end;
  63};
  64
  65static int space_init(struct entry_space *es, unsigned nr_entries)
  66{
  67        if (!nr_entries) {
  68                es->begin = es->end = NULL;
  69                return 0;
  70        }
  71
  72        es->begin = vzalloc(array_size(nr_entries, sizeof(struct entry)));
  73        if (!es->begin)
  74                return -ENOMEM;
  75
  76        es->end = es->begin + nr_entries;
  77        return 0;
  78}
  79
  80static void space_exit(struct entry_space *es)
  81{
  82        vfree(es->begin);
  83}
  84
  85static struct entry *__get_entry(struct entry_space *es, unsigned block)
  86{
  87        struct entry *e;
  88
  89        e = es->begin + block;
  90        BUG_ON(e >= es->end);
  91
  92        return e;
  93}
  94
  95static unsigned to_index(struct entry_space *es, struct entry *e)
  96{
  97        BUG_ON(e < es->begin || e >= es->end);
  98        return e - es->begin;
  99}
 100
 101static struct entry *to_entry(struct entry_space *es, unsigned block)
 102{
 103        if (block == INDEXER_NULL)
 104                return NULL;
 105
 106        return __get_entry(es, block);
 107}
 108
 109/*----------------------------------------------------------------*/
 110
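    /*
     * An ilist is an intrusive doubly linked list built from the 28-bit
     * prev/next indices embedded in struct entry.  Sentinel entries may
     * sit on the list but are never counted in nr_elts, and the pop
     * helpers below skip over them.
     */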
 111struct ilist {
 112        unsigned nr_elts;       /* excluding sentinel entries */
 113        unsigned head, tail;
 114};
 115
 116static void l_init(struct ilist *l)
 117{
 118        l->nr_elts = 0;
 119        l->head = l->tail = INDEXER_NULL;
 120}
 121
 122static struct entry *l_head(struct entry_space *es, struct ilist *l)
 123{
 124        return to_entry(es, l->head);
 125}
 126
 127static struct entry *l_tail(struct entry_space *es, struct ilist *l)
 128{
 129        return to_entry(es, l->tail);
 130}
 131
 132static struct entry *l_next(struct entry_space *es, struct entry *e)
 133{
 134        return to_entry(es, e->next);
 135}
 136
 137static struct entry *l_prev(struct entry_space *es, struct entry *e)
 138{
 139        return to_entry(es, e->prev);
 140}
 141
 142static bool l_empty(struct ilist *l)
 143{
 144        return l->head == INDEXER_NULL;
 145}
 146
 147static void l_add_head(struct entry_space *es, struct ilist *l, struct entry *e)
 148{
 149        struct entry *head = l_head(es, l);
 150
 151        e->next = l->head;
 152        e->prev = INDEXER_NULL;
 153
 154        if (head)
 155                head->prev = l->head = to_index(es, e);
 156        else
 157                l->head = l->tail = to_index(es, e);
 158
 159        if (!e->sentinel)
 160                l->nr_elts++;
 161}
 162
 163static void l_add_tail(struct entry_space *es, struct ilist *l, struct entry *e)
 164{
 165        struct entry *tail = l_tail(es, l);
 166
 167        e->next = INDEXER_NULL;
 168        e->prev = l->tail;
 169
 170        if (tail)
 171                tail->next = l->tail = to_index(es, e);
 172        else
 173                l->head = l->tail = to_index(es, e);
 174
 175        if (!e->sentinel)
 176                l->nr_elts++;
 177}
 178
 179static void l_add_before(struct entry_space *es, struct ilist *l,
 180                         struct entry *old, struct entry *e)
 181{
 182        struct entry *prev = l_prev(es, old);
 183
 184        if (!prev)
 185                l_add_head(es, l, e);
 186
 187        else {
 188                e->prev = old->prev;
 189                e->next = to_index(es, old);
 190                prev->next = old->prev = to_index(es, e);
 191
 192                if (!e->sentinel)
 193                        l->nr_elts++;
 194        }
 195}
 196
 197static void l_del(struct entry_space *es, struct ilist *l, struct entry *e)
 198{
 199        struct entry *prev = l_prev(es, e);
 200        struct entry *next = l_next(es, e);
 201
 202        if (prev)
 203                prev->next = e->next;
 204        else
 205                l->head = e->next;
 206
 207        if (next)
 208                next->prev = e->prev;
 209        else
 210                l->tail = e->prev;
 211
 212        if (!e->sentinel)
 213                l->nr_elts--;
 214}
 215
 216static struct entry *l_pop_head(struct entry_space *es, struct ilist *l)
 217{
 218        struct entry *e;
 219
 220        for (e = l_head(es, l); e; e = l_next(es, e))
 221                if (!e->sentinel) {
 222                        l_del(es, l, e);
 223                        return e;
 224                }
 225
 226        return NULL;
 227}
 228
 229static struct entry *l_pop_tail(struct entry_space *es, struct ilist *l)
 230{
 231        struct entry *e;
 232
 233        for (e = l_tail(es, l); e; e = l_prev(es, e))
 234                if (!e->sentinel) {
 235                        l_del(es, l, e);
 236                        return e;
 237                }
 238
 239        return NULL;
 240}
 241
 242/*----------------------------------------------------------------*/
 243
 244/*
 245 * The stochastic-multi-queue is a set of LRU lists stacked into levels.
 246 * Entries are moved up a level when they are used, which loosely orders
 247 * the most frequently accessed entries into the top levels and the least
 248 * into the bottom.  This structure is *much* better than a single LRU list.
 249 */
 250#define MAX_LEVELS 64u
 251
 252struct queue {
 253        struct entry_space *es;
 254
 255        unsigned nr_elts;
 256        unsigned nr_levels;
 257        struct ilist qs[MAX_LEVELS];
 258
 259        /*
 260         * We maintain a count of the number of entries we would like in each
 261         * level.
 262         */
 263        unsigned last_target_nr_elts;
 264        unsigned nr_top_levels;
 265        unsigned nr_in_top_levels;
 266        unsigned target_count[MAX_LEVELS];
 267};
 268
 269static void q_init(struct queue *q, struct entry_space *es, unsigned nr_levels)
 270{
 271        unsigned i;
 272
 273        q->es = es;
 274        q->nr_elts = 0;
 275        q->nr_levels = nr_levels;
 276
 277        for (i = 0; i < q->nr_levels; i++) {
 278                l_init(q->qs + i);
 279                q->target_count[i] = 0u;
 280        }
 281
 282        q->last_target_nr_elts = 0u;
 283        q->nr_top_levels = 0u;
 284        q->nr_in_top_levels = 0u;
 285}
 286
 287static unsigned q_size(struct queue *q)
 288{
 289        return q->nr_elts;
 290}
 291
 292/*
 293 * Insert an entry at the back of the given level.
 294 */
 295static void q_push(struct queue *q, struct entry *e)
 296{
 297        BUG_ON(e->pending_work);
 298
 299        if (!e->sentinel)
 300                q->nr_elts++;
 301
 302        l_add_tail(q->es, q->qs + e->level, e);
 303}
 304
 305static void q_push_front(struct queue *q, struct entry *e)
 306{
 307        BUG_ON(e->pending_work);
 308
 309        if (!e->sentinel)
 310                q->nr_elts++;
 311
 312        l_add_head(q->es, q->qs + e->level, e);
 313}
 314
 315static void q_push_before(struct queue *q, struct entry *old, struct entry *e)
 316{
 317        BUG_ON(e->pending_work);
 318
 319        if (!e->sentinel)
 320                q->nr_elts++;
 321
 322        l_add_before(q->es, q->qs + e->level, old, e);
 323}
 324
 325static void q_del(struct queue *q, struct entry *e)
 326{
 327        l_del(q->es, q->qs + e->level, e);
 328        if (!e->sentinel)
 329                q->nr_elts--;
 330}
 331
 332/*
 333 * Return the oldest entry of the lowest populated level.
 334 */
 335static struct entry *q_peek(struct queue *q, unsigned max_level, bool can_cross_sentinel)
 336{
 337        unsigned level;
 338        struct entry *e;
 339
 340        max_level = min(max_level, q->nr_levels);
 341
 342        for (level = 0; level < max_level; level++)
 343                for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e)) {
 344                        if (e->sentinel) {
 345                                if (can_cross_sentinel)
 346                                        continue;
 347                                else
 348                                        break;
 349                        }
 350
 351                        return e;
 352                }
 353
 354        return NULL;
 355}
 356
 357static struct entry *q_pop(struct queue *q)
 358{
 359        struct entry *e = q_peek(q, q->nr_levels, true);
 360
 361        if (e)
 362                q_del(q, e);
 363
 364        return e;
 365}
 366
 367/*
 368 * This function assumes there is a non-sentinel entry to pop.  It's only
 369 * used by redistribute, so we know this is true.  It also doesn't adjust
 370 * the q->nr_elts count.
 371 */
 372static struct entry *__redist_pop_from(struct queue *q, unsigned level)
 373{
 374        struct entry *e;
 375
 376        for (; level < q->nr_levels; level++)
 377                for (e = l_head(q->es, q->qs + level); e; e = l_next(q->es, e))
 378                        if (!e->sentinel) {
 379                                l_del(q->es, q->qs + e->level, e);
 380                                return e;
 381                        }
 382
 383        return NULL;
 384}
 385
 386static void q_set_targets_subrange_(struct queue *q, unsigned nr_elts, unsigned lbegin, unsigned lend)
 387{
 388        unsigned level, nr_levels, entries_per_level, remainder;
 389
 390        BUG_ON(lbegin > lend);
 391        BUG_ON(lend > q->nr_levels);
 392        nr_levels = lend - lbegin;
 393        entries_per_level = safe_div(nr_elts, nr_levels);
 394        remainder = safe_mod(nr_elts, nr_levels);
 395
 396        for (level = lbegin; level < lend; level++)
 397                q->target_count[level] =
 398                        (level < (lbegin + remainder)) ? entries_per_level + 1u : entries_per_level;
 399}
 400
 401/*
 402 * Typically we have fewer elements in the top few levels, which allows us
 403 * to adjust the promote threshold nicely.
 404 */
 405static void q_set_targets(struct queue *q)
 406{
 407        if (q->last_target_nr_elts == q->nr_elts)
 408                return;
 409
 410        q->last_target_nr_elts = q->nr_elts;
 411
 412        if (q->nr_top_levels > q->nr_levels)
 413                q_set_targets_subrange_(q, q->nr_elts, 0, q->nr_levels);
 414
 415        else {
 416                q_set_targets_subrange_(q, q->nr_in_top_levels,
 417                                        q->nr_levels - q->nr_top_levels, q->nr_levels);
 418
 419                if (q->nr_in_top_levels < q->nr_elts)
 420                        q_set_targets_subrange_(q, q->nr_elts - q->nr_in_top_levels,
 421                                                0, q->nr_levels - q->nr_top_levels);
 422                else
 423                        q_set_targets_subrange_(q, 0, 0, q->nr_levels - q->nr_top_levels);
 424        }
 425}
 426
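    /*
     * Rebalance the queue so each level holds roughly target_count[]
     * entries: a level that is short of its target pulls entries down
     * from the levels above it, and a level that is over target pushes
     * entries up to the level above.
     */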
 427static void q_redistribute(struct queue *q)
 428{
 429        unsigned target, level;
 430        struct ilist *l, *l_above;
 431        struct entry *e;
 432
 433        q_set_targets(q);
 434
 435        for (level = 0u; level < q->nr_levels - 1u; level++) {
 436                l = q->qs + level;
 437                target = q->target_count[level];
 438
 439                /*
 440                 * Pull down some entries from the level above.
 441                 */
 442                while (l->nr_elts < target) {
 443                        e = __redist_pop_from(q, level + 1u);
 444                        if (!e) {
 445                                /* bug in nr_elts */
 446                                break;
 447                        }
 448
 449                        e->level = level;
 450                        l_add_tail(q->es, l, e);
 451                }
 452
 453                /*
 454                 * Push some entries up.
 455                 */
 456                l_above = q->qs + level + 1u;
 457                while (l->nr_elts > target) {
 458                        e = l_pop_tail(q->es, l);
 459
 460                        if (!e)
 461                                /* bug in nr_elts */
 462                                break;
 463
 464                        e->level = level + 1u;
 465                        l_add_tail(q->es, l_above, e);
 466                }
 467        }
 468}
 469
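    /*
     * Promote e by extra_levels, capped at the top level.  To keep the
     * level occupancies stable an entry from the destination level is
     * swapped down into e's old level; when writeback sentinels are
     * supplied (s1/s2, dirty queue only) the displaced entry is inserted
     * so that it roughly keeps its position relative to those sentinels.
     */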
 470static void q_requeue(struct queue *q, struct entry *e, unsigned extra_levels,
 471                      struct entry *s1, struct entry *s2)
 472{
 473        struct entry *de;
 474        unsigned sentinels_passed = 0;
 475        unsigned new_level = min(q->nr_levels - 1u, e->level + extra_levels);
 476
 477        /* try and find an entry to swap with */
 478        if (extra_levels && (e->level < q->nr_levels - 1u)) {
 479                for (de = l_head(q->es, q->qs + new_level); de && de->sentinel; de = l_next(q->es, de))
 480                        sentinels_passed++;
 481
 482                if (de) {
 483                        q_del(q, de);
 484                        de->level = e->level;
 485                        if (s1) {
 486                                switch (sentinels_passed) {
 487                                case 0:
 488                                        q_push_before(q, s1, de);
 489                                        break;
 490
 491                                case 1:
 492                                        q_push_before(q, s2, de);
 493                                        break;
 494
 495                                default:
 496                                        q_push(q, de);
 497                                }
 498                        } else
 499                                q_push(q, de);
 500                }
 501        }
 502
 503        q_del(q, e);
 504        e->level = new_level;
 505        q_push(q, e);
 506}
 507
 508/*----------------------------------------------------------------*/
 509
 510#define FP_SHIFT 8
 511#define SIXTEENTH (1u << (FP_SHIFT - 4u))
 512#define EIGHTH (1u << (FP_SHIFT - 3u))
 513
 514struct stats {
 515        unsigned hit_threshold;
 516        unsigned hits;
 517        unsigned misses;
 518};
 519
 520enum performance {
 521        Q_POOR,
 522        Q_FAIR,
 523        Q_WELL
 524};
 525
 526static void stats_init(struct stats *s, unsigned nr_levels)
 527{
 528        s->hit_threshold = (nr_levels * 3u) / 4u;
 529        s->hits = 0u;
 530        s->misses = 0u;
 531}
 532
 533static void stats_reset(struct stats *s)
 534{
 535        s->hits = s->misses = 0u;
 536}
 537
 538static void stats_level_accessed(struct stats *s, unsigned level)
 539{
 540        if (level >= s->hit_threshold)
 541                s->hits++;
 542        else
 543                s->misses++;
 544}
 545
 546static void stats_miss(struct stats *s)
 547{
 548        s->misses++;
 549}
 550
 551/*
 552 * There are times when we don't have any confidence in the hotspot queue,
 553 * such as when a fresh cache is created and the blocks have been spread
 554 * out across the levels, or when the IO load changes.  We detect this by
 555 * seeing how often a lookup hits the top levels of the hotspot queue.
 556 */
 557static enum performance stats_assess(struct stats *s)
 558{
 559        unsigned confidence = safe_div(s->hits << FP_SHIFT, s->hits + s->misses);
 560
 561        if (confidence < SIXTEENTH)
 562                return Q_POOR;
 563
 564        else if (confidence < EIGHTH)
 565                return Q_FAIR;
 566
 567        else
 568                return Q_WELL;
 569}
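
    /*
     * With FP_SHIFT == 8 the confidence value above is the hit fraction
     * in 1/256ths: below 16 (1/16) is Q_POOR, below 32 (1/8) is Q_FAIR,
     * anything higher is Q_WELL.  For example 20 hits out of 100 lookups
     * gives (20 << 8) / 100 = 51, which assesses as Q_WELL.
     */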
 570
 571/*----------------------------------------------------------------*/
 572
 573struct smq_hash_table {
 574        struct entry_space *es;
 575        unsigned long long hash_bits;
 576        unsigned *buckets;
 577};
 578
 579/*
 580 * All cache entries are stored in a chained hash table.  To save space we
 581 * use indexing again, and only store indexes to the next entry.
 582 */
 583static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned nr_entries)
 584{
 585        unsigned i, nr_buckets;
 586
 587        ht->es = es;
 588        nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
 589        ht->hash_bits = __ffs(nr_buckets);
 590
 591        ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets)));
 592        if (!ht->buckets)
 593                return -ENOMEM;
 594
 595        for (i = 0; i < nr_buckets; i++)
 596                ht->buckets[i] = INDEXER_NULL;
 597
 598        return 0;
 599}
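
    /*
     * nr_buckets is a power of two, so __ffs() above yields
     * log2(nr_buckets) and hash_64(..., hash_bits) maps an oblock to a
     * bucket in [0, nr_buckets).
     */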
 600
 601static void h_exit(struct smq_hash_table *ht)
 602{
 603        vfree(ht->buckets);
 604}
 605
 606static struct entry *h_head(struct smq_hash_table *ht, unsigned bucket)
 607{
 608        return to_entry(ht->es, ht->buckets[bucket]);
 609}
 610
 611static struct entry *h_next(struct smq_hash_table *ht, struct entry *e)
 612{
 613        return to_entry(ht->es, e->hash_next);
 614}
 615
 616static void __h_insert(struct smq_hash_table *ht, unsigned bucket, struct entry *e)
 617{
 618        e->hash_next = ht->buckets[bucket];
 619        ht->buckets[bucket] = to_index(ht->es, e);
 620}
 621
 622static void h_insert(struct smq_hash_table *ht, struct entry *e)
 623{
 624        unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
 625        __h_insert(ht, h, e);
 626}
 627
 628static struct entry *__h_lookup(struct smq_hash_table *ht, unsigned h, dm_oblock_t oblock,
 629                                struct entry **prev)
 630{
 631        struct entry *e;
 632
 633        *prev = NULL;
 634        for (e = h_head(ht, h); e; e = h_next(ht, e)) {
 635                if (e->oblock == oblock)
 636                        return e;
 637
 638                *prev = e;
 639        }
 640
 641        return NULL;
 642}
 643
 644static void __h_unlink(struct smq_hash_table *ht, unsigned h,
 645                       struct entry *e, struct entry *prev)
 646{
 647        if (prev)
 648                prev->hash_next = e->hash_next;
 649        else
 650                ht->buckets[h] = e->hash_next;
 651}
 652
 653/*
 654 * Also moves the looked-up entry to the front of its bucket.
 655 */
 656static struct entry *h_lookup(struct smq_hash_table *ht, dm_oblock_t oblock)
 657{
 658        struct entry *e, *prev;
 659        unsigned h = hash_64(from_oblock(oblock), ht->hash_bits);
 660
 661        e = __h_lookup(ht, h, oblock, &prev);
 662        if (e && prev) {
 663                /*
 664                 * Move to the front because this entry is likely
 665                 * to be hit again.
 666                 */
 667                __h_unlink(ht, h, e, prev);
 668                __h_insert(ht, h, e);
 669        }
 670
 671        return e;
 672}
 673
 674static void h_remove(struct smq_hash_table *ht, struct entry *e)
 675{
 676        unsigned h = hash_64(from_oblock(e->oblock), ht->hash_bits);
 677        struct entry *prev;
 678
 679        /*
 680         * The downside of using a singly linked list is that we have to
 681         * iterate the bucket to remove an item.
 682         */
 683        e = __h_lookup(ht, h, e->oblock, &prev);
 684        if (e)
 685                __h_unlink(ht, h, e, prev);
 686}
 687
 688/*----------------------------------------------------------------*/
 689
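    /*
     * An entry_alloc hands out entries from a free list over a contiguous
     * range [begin, end) of the entry_space.  The sentinels, hotspot
     * entries and cache entries each get their own allocator over a
     * disjoint range (see __smq_create()).
     */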
 690struct entry_alloc {
 691        struct entry_space *es;
 692        unsigned begin;
 693
 694        unsigned nr_allocated;
 695        struct ilist free;
 696};
 697
 698static void init_allocator(struct entry_alloc *ea, struct entry_space *es,
 699                           unsigned begin, unsigned end)
 700{
 701        unsigned i;
 702
 703        ea->es = es;
 704        ea->nr_allocated = 0u;
 705        ea->begin = begin;
 706
 707        l_init(&ea->free);
 708        for (i = begin; i != end; i++)
 709                l_add_tail(ea->es, &ea->free, __get_entry(ea->es, i));
 710}
 711
 712static void init_entry(struct entry *e)
 713{
 714        /*
 715         * We can't memset because that would clear the hotspot and
 716         * sentinel bits which remain constant.
 717         */
 718        e->hash_next = INDEXER_NULL;
 719        e->next = INDEXER_NULL;
 720        e->prev = INDEXER_NULL;
 721        e->level = 0u;
 722        e->dirty = true;        /* FIXME: audit */
 723        e->allocated = true;
 724        e->sentinel = false;
 725        e->pending_work = false;
 726}
 727
 728static struct entry *alloc_entry(struct entry_alloc *ea)
 729{
 730        struct entry *e;
 731
 732        if (l_empty(&ea->free))
 733                return NULL;
 734
 735        e = l_pop_head(ea->es, &ea->free);
 736        init_entry(e);
 737        ea->nr_allocated++;
 738
 739        return e;
 740}
 741
 742/*
 743 * This assumes the cblock hasn't already been allocated.
 744 */
 745static struct entry *alloc_particular_entry(struct entry_alloc *ea, unsigned i)
 746{
 747        struct entry *e = __get_entry(ea->es, ea->begin + i);
 748
 749        BUG_ON(e->allocated);
 750
 751        l_del(ea->es, &ea->free, e);
 752        init_entry(e);
 753        ea->nr_allocated++;
 754
 755        return e;
 756}
 757
 758static void free_entry(struct entry_alloc *ea, struct entry *e)
 759{
 760        BUG_ON(!ea->nr_allocated);
 761        BUG_ON(!e->allocated);
 762
 763        ea->nr_allocated--;
 764        e->allocated = false;
 765        l_add_tail(ea->es, &ea->free, e);
 766}
 767
 768static bool allocator_empty(struct entry_alloc *ea)
 769{
 770        return l_empty(&ea->free);
 771}
 772
 773static unsigned get_index(struct entry_alloc *ea, struct entry *e)
 774{
 775        return to_index(ea->es, e) - ea->begin;
 776}
 777
 778static struct entry *get_entry(struct entry_alloc *ea, unsigned index)
 779{
 780        return __get_entry(ea->es, ea->begin + index);
 781}
 782
 783/*----------------------------------------------------------------*/
 784
 785#define NR_HOTSPOT_LEVELS 64u
 786#define NR_CACHE_LEVELS 64u
 787
 788#define WRITEBACK_PERIOD (10ul * HZ)
 789#define DEMOTE_PERIOD (60ul * HZ)
 790
 791#define HOTSPOT_UPDATE_PERIOD (HZ)
 792#define CACHE_UPDATE_PERIOD (60ul * HZ)
 793
 794struct smq_policy {
 795        struct dm_cache_policy policy;
 796
 797        /* protects everything */
 798        spinlock_t lock;
 799        dm_cblock_t cache_size;
 800        sector_t cache_block_size;
 801
 802        sector_t hotspot_block_size;
 803        unsigned nr_hotspot_blocks;
 804        unsigned cache_blocks_per_hotspot_block;
 805        unsigned hotspot_level_jump;
 806
 807        struct entry_space es;
 808        struct entry_alloc writeback_sentinel_alloc;
 809        struct entry_alloc demote_sentinel_alloc;
 810        struct entry_alloc hotspot_alloc;
 811        struct entry_alloc cache_alloc;
 812
 813        unsigned long *hotspot_hit_bits;
 814        unsigned long *cache_hit_bits;
 815
 816        /*
 817         * We maintain three queues of entries.  The cache proper consists
 818         * of a clean and a dirty queue holding the currently active
 819         * mappings.  The hotspot queue uses a larger block size to track
 820         * blocks that are being hit frequently and are potential candidates
 821         * for promotion to the cache.
 822         */
 823        struct queue hotspot;
 824        struct queue clean;
 825        struct queue dirty;
 826
 827        struct stats hotspot_stats;
 828        struct stats cache_stats;
 829
 830        /*
 831         * Keeps track of time, incremented by the core.  We use this to
 832         * avoid attributing multiple hits within the same tick.
 833         */
 834        unsigned tick;
 835
 836        /*
 837         * The hash tables allow us to quickly find an entry by origin
 838         * block.
 839         */
 840        struct smq_hash_table table;
 841        struct smq_hash_table hotspot_table;
 842
 843        bool current_writeback_sentinels;
 844        unsigned long next_writeback_period;
 845
 846        bool current_demote_sentinels;
 847        unsigned long next_demote_period;
 848
 849        unsigned write_promote_level;
 850        unsigned read_promote_level;
 851
 852        unsigned long next_hotspot_period;
 853        unsigned long next_cache_period;
 854
 855        struct background_tracker *bg_work;
 856
 857        bool migrations_allowed;
 858};
 859
 860/*----------------------------------------------------------------*/
 861
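    /*
     * The dirty queue carries writeback sentinels and the clean queue
     * demote sentinels.  Each kind has two banks of NR_CACHE_LEVELS
     * sentinel entries; 'which' selects a bank.  update_sentinels() flips
     * the current bank each period and requeues it at the back of every
     * level, so the sentinels loosely separate entries by how long they
     * have been queued.
     */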
 862static struct entry *get_sentinel(struct entry_alloc *ea, unsigned level, bool which)
 863{
 864        return get_entry(ea, which ? level : NR_CACHE_LEVELS + level);
 865}
 866
 867static struct entry *writeback_sentinel(struct smq_policy *mq, unsigned level)
 868{
 869        return get_sentinel(&mq->writeback_sentinel_alloc, level, mq->current_writeback_sentinels);
 870}
 871
 872static struct entry *demote_sentinel(struct smq_policy *mq, unsigned level)
 873{
 874        return get_sentinel(&mq->demote_sentinel_alloc, level, mq->current_demote_sentinels);
 875}
 876
 877static void __update_writeback_sentinels(struct smq_policy *mq)
 878{
 879        unsigned level;
 880        struct queue *q = &mq->dirty;
 881        struct entry *sentinel;
 882
 883        for (level = 0; level < q->nr_levels; level++) {
 884                sentinel = writeback_sentinel(mq, level);
 885                q_del(q, sentinel);
 886                q_push(q, sentinel);
 887        }
 888}
 889
 890static void __update_demote_sentinels(struct smq_policy *mq)
 891{
 892        unsigned level;
 893        struct queue *q = &mq->clean;
 894        struct entry *sentinel;
 895
 896        for (level = 0; level < q->nr_levels; level++) {
 897                sentinel = demote_sentinel(mq, level);
 898                q_del(q, sentinel);
 899                q_push(q, sentinel);
 900        }
 901}
 902
 903static void update_sentinels(struct smq_policy *mq)
 904{
 905        if (time_after(jiffies, mq->next_writeback_period)) {
 906                mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
 907                mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
 908                __update_writeback_sentinels(mq);
 909        }
 910
 911        if (time_after(jiffies, mq->next_demote_period)) {
 912                mq->next_demote_period = jiffies + DEMOTE_PERIOD;
 913                mq->current_demote_sentinels = !mq->current_demote_sentinels;
 914                __update_demote_sentinels(mq);
 915        }
 916}
 917
 918static void __sentinels_init(struct smq_policy *mq)
 919{
 920        unsigned level;
 921        struct entry *sentinel;
 922
 923        for (level = 0; level < NR_CACHE_LEVELS; level++) {
 924                sentinel = writeback_sentinel(mq, level);
 925                sentinel->level = level;
 926                q_push(&mq->dirty, sentinel);
 927
 928                sentinel = demote_sentinel(mq, level);
 929                sentinel->level = level;
 930                q_push(&mq->clean, sentinel);
 931        }
 932}
 933
 934static void sentinels_init(struct smq_policy *mq)
 935{
 936        mq->next_writeback_period = jiffies + WRITEBACK_PERIOD;
 937        mq->next_demote_period = jiffies + DEMOTE_PERIOD;
 938
 939        mq->current_writeback_sentinels = false;
 940        mq->current_demote_sentinels = false;
 941        __sentinels_init(mq);
 942
 943        mq->current_writeback_sentinels = !mq->current_writeback_sentinels;
 944        mq->current_demote_sentinels = !mq->current_demote_sentinels;
 945        __sentinels_init(mq);
 946}
 947
 948/*----------------------------------------------------------------*/
 949
 950static void del_queue(struct smq_policy *mq, struct entry *e)
 951{
 952        q_del(e->dirty ? &mq->dirty : &mq->clean, e);
 953}
 954
 955static void push_queue(struct smq_policy *mq, struct entry *e)
 956{
 957        if (e->dirty)
 958                q_push(&mq->dirty, e);
 959        else
 960                q_push(&mq->clean, e);
 961}
 962
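    /*
     * The "h, q, a" annotations below describe entry state: present in the
     * hash table (h), on one of the queues (q), allocated (a).  "x -> y"
     * gives the state before and after the call.
     */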
 963// !h, !q, a -> h, q, a
 964static void push(struct smq_policy *mq, struct entry *e)
 965{
 966        h_insert(&mq->table, e);
 967        if (!e->pending_work)
 968                push_queue(mq, e);
 969}
 970
 971static void push_queue_front(struct smq_policy *mq, struct entry *e)
 972{
 973        if (e->dirty)
 974                q_push_front(&mq->dirty, e);
 975        else
 976                q_push_front(&mq->clean, e);
 977}
 978
 979static void push_front(struct smq_policy *mq, struct entry *e)
 980{
 981        h_insert(&mq->table, e);
 982        if (!e->pending_work)
 983                push_queue_front(mq, e);
 984}
 985
 986static dm_cblock_t infer_cblock(struct smq_policy *mq, struct entry *e)
 987{
 988        return to_cblock(get_index(&mq->cache_alloc, e));
 989}
 990
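    /*
     * A cache entry is only bumped up a level once per cache period; the
     * cache_hit_bits bitset (cleared in end_cache_period()) records which
     * cblocks have already been requeued since the last clear.
     */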
 991static void requeue(struct smq_policy *mq, struct entry *e)
 992{
 993        /*
 994         * Pending work has temporarily been taken out of the queues.
 995         */
 996        if (e->pending_work)
 997                return;
 998
 999        if (!test_and_set_bit(from_cblock(infer_cblock(mq, e)), mq->cache_hit_bits)) {
1000                if (!e->dirty) {
1001                        q_requeue(&mq->clean, e, 1u, NULL, NULL);
1002                        return;
1003                }
1004
1005                q_requeue(&mq->dirty, e, 1u,
1006                          get_sentinel(&mq->writeback_sentinel_alloc, e->level, !mq->current_writeback_sentinels),
1007                          get_sentinel(&mq->writeback_sentinel_alloc, e->level, mq->current_writeback_sentinels));
1008        }
1009}
1010
1011static unsigned default_promote_level(struct smq_policy *mq)
1012{
1013        /*
1014         * The promote level depends on the current performance of the
1015         * cache.
1016         *
1017         * If the cache is performing badly, then we can't afford
1018         * to promote much without causing performance to drop below that
1019         * of the origin device.
1020         *
1021         * If the cache is performing well, then we don't need to promote
1022         * much.  If it isn't broken, don't fix it.
1023         *
1024         * If the cache is middling then we promote more.
1025         *
1026         * This scheme reminds me of a graph of entropy vs probability of a
1027         * binary variable.
1028         */
1029        static unsigned table[] = {1, 1, 1, 2, 4, 6, 7, 8, 7, 6, 4, 4, 3, 3, 2, 2, 1};
1030
1031        unsigned hits = mq->cache_stats.hits;
1032        unsigned misses = mq->cache_stats.misses;
1033        unsigned index = safe_div(hits << 4u, hits + misses);
1034        return table[index];
1035}
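
    /*
     * The index above is the fraction of lookups, in sixteenths, that hit
     * the upper cache levels.  The table peaks (8) around the middle and
     * tails off to 1 when the cache is doing either very badly or very
     * well.
     */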
1036
1037static void update_promote_levels(struct smq_policy *mq)
1038{
1039        /*
1040         * If there are unused cache entries then we want to be really
1041         * eager to promote.
1042         */
1043        unsigned threshold_level = allocator_empty(&mq->cache_alloc) ?
1044                default_promote_level(mq) : (NR_HOTSPOT_LEVELS / 2u);
1045
1046        threshold_level = max(threshold_level, NR_HOTSPOT_LEVELS);
1047
1048        /*
1049         * If the hotspot queue is performing badly then we have little
1050         * confidence that we know which blocks to promote.  So we cut down
1051         * the amount of promotions.
1052         */
1053        switch (stats_assess(&mq->hotspot_stats)) {
1054        case Q_POOR:
1055                threshold_level /= 4u;
1056                break;
1057
1058        case Q_FAIR:
1059                threshold_level /= 2u;
1060                break;
1061
1062        case Q_WELL:
1063                break;
1064        }
1065
1066        mq->read_promote_level = NR_HOTSPOT_LEVELS - threshold_level;
1067        mq->write_promote_level = (NR_HOTSPOT_LEVELS - threshold_level);
1068}
1069
1070/*
1071 * If the hotspot queue is performing badly, then we try and move entries
1072 * around more quickly.
1073 */
1074static void update_level_jump(struct smq_policy *mq)
1075{
1076        switch (stats_assess(&mq->hotspot_stats)) {
1077        case Q_POOR:
1078                mq->hotspot_level_jump = 4u;
1079                break;
1080
1081        case Q_FAIR:
1082                mq->hotspot_level_jump = 2u;
1083                break;
1084
1085        case Q_WELL:
1086                mq->hotspot_level_jump = 1u;
1087                break;
1088        }
1089}
1090
1091static void end_hotspot_period(struct smq_policy *mq)
1092{
1093        clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
1094        update_promote_levels(mq);
1095
1096        if (time_after(jiffies, mq->next_hotspot_period)) {
1097                update_level_jump(mq);
1098                q_redistribute(&mq->hotspot);
1099                stats_reset(&mq->hotspot_stats);
1100                mq->next_hotspot_period = jiffies + HOTSPOT_UPDATE_PERIOD;
1101        }
1102}
1103
1104static void end_cache_period(struct smq_policy *mq)
1105{
1106        if (time_after(jiffies, mq->next_cache_period)) {
1107                clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
1108
1109                q_redistribute(&mq->dirty);
1110                q_redistribute(&mq->clean);
1111                stats_reset(&mq->cache_stats);
1112
1113                mq->next_cache_period = jiffies + CACHE_UPDATE_PERIOD;
1114        }
1115}
1116
1117/*----------------------------------------------------------------*/
1118
1119/*
1120 * Targets are given as a percentage.
1121 */
1122#define CLEAN_TARGET 25u
1123#define FREE_TARGET 25u
1124
1125static unsigned percent_to_target(struct smq_policy *mq, unsigned p)
1126{
1127        return from_cblock(mq->cache_size) * p / 100u;
1128}
1129
1130static bool clean_target_met(struct smq_policy *mq, bool idle)
1131{
1132        /*
1133         * Not all cache entries are necessarily populated, so we cannot
1134         * rely on the size of the clean queue.
1135         */
1136        if (idle) {
1137                /*
1138                 * We'd like to clean everything.
1139                 */
1140                return q_size(&mq->dirty) == 0u;
1141        }
1142
1143        /*
1144         * If we're busy we don't worry about cleaning at all.
1145         */
1146        return true;
1147}
1148
1149static bool free_target_met(struct smq_policy *mq)
1150{
1151        unsigned nr_free;
1152
1153        nr_free = from_cblock(mq->cache_size) - mq->cache_alloc.nr_allocated;
1154        return (nr_free + btracker_nr_demotions_queued(mq->bg_work)) >=
1155                percent_to_target(mq, FREE_TARGET);
1156}
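
    /*
     * i.e. at least FREE_TARGET percent of the cache must be either
     * unallocated or already queued for demotion, keeping some headroom
     * for new promotions.
     */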
1157
1158/*----------------------------------------------------------------*/
1159
1160static void mark_pending(struct smq_policy *mq, struct entry *e)
1161{
1162        BUG_ON(e->sentinel);
1163        BUG_ON(!e->allocated);
1164        BUG_ON(e->pending_work);
1165        e->pending_work = true;
1166}
1167
1168static void clear_pending(struct smq_policy *mq, struct entry *e)
1169{
1170        BUG_ON(!e->pending_work);
1171        e->pending_work = false;
1172}
1173
1174static void queue_writeback(struct smq_policy *mq, bool idle)
1175{
1176        int r;
1177        struct policy_work work;
1178        struct entry *e;
1179
1180        e = q_peek(&mq->dirty, mq->dirty.nr_levels, idle);
1181        if (e) {
1182                mark_pending(mq, e);
1183                q_del(&mq->dirty, e);
1184
1185                work.op = POLICY_WRITEBACK;
1186                work.oblock = e->oblock;
1187                work.cblock = infer_cblock(mq, e);
1188
1189                r = btracker_queue(mq->bg_work, &work, NULL);
1190                if (r) {
1191                        clear_pending(mq, e);
1192                        q_push_front(&mq->dirty, e);
1193                }
1194        }
1195}
1196
1197static void queue_demotion(struct smq_policy *mq)
1198{
1199        int r;
1200        struct policy_work work;
1201        struct entry *e;
1202
1203        if (WARN_ON_ONCE(!mq->migrations_allowed))
1204                return;
1205
1206        e = q_peek(&mq->clean, mq->clean.nr_levels / 2, true);
1207        if (!e) {
1208                if (!clean_target_met(mq, true))
1209                        queue_writeback(mq, false);
1210                return;
1211        }
1212
1213        mark_pending(mq, e);
1214        q_del(&mq->clean, e);
1215
1216        work.op = POLICY_DEMOTE;
1217        work.oblock = e->oblock;
1218        work.cblock = infer_cblock(mq, e);
1219        r = btracker_queue(mq->bg_work, &work, NULL);
1220        if (r) {
1221                clear_pending(mq, e);
1222                q_push_front(&mq->clean, e);
1223        }
1224}
1225
1226static void queue_promotion(struct smq_policy *mq, dm_oblock_t oblock,
1227                            struct policy_work **workp)
1228{
1229        int r;
1230        struct entry *e;
1231        struct policy_work work;
1232
1233        if (!mq->migrations_allowed)
1234                return;
1235
1236        if (allocator_empty(&mq->cache_alloc)) {
1237                /*
1238                 * We always claim to be 'idle' to ensure some demotions happen
1239                 * with continuous loads.
1240                 */
1241                if (!free_target_met(mq))
1242                        queue_demotion(mq);
1243                return;
1244        }
1245
1246        if (btracker_promotion_already_present(mq->bg_work, oblock))
1247                return;
1248
1249        /*
1250         * We allocate the entry now to reserve the cblock.  If the
1251         * background work is aborted we must remember to free it.
1252         */
1253        e = alloc_entry(&mq->cache_alloc);
1254        BUG_ON(!e);
1255        e->pending_work = true;
1256        work.op = POLICY_PROMOTE;
1257        work.oblock = oblock;
1258        work.cblock = infer_cblock(mq, e);
1259        r = btracker_queue(mq->bg_work, &work, workp);
1260        if (r)
1261                free_entry(&mq->cache_alloc, e);
1262}
1263
1264/*----------------------------------------------------------------*/
1265
1266enum promote_result {
1267        PROMOTE_NOT,
1268        PROMOTE_TEMPORARY,
1269        PROMOTE_PERMANENT
1270};
1271
1272/*
1273 * Converts a boolean into a promote result.
1274 */
1275static enum promote_result maybe_promote(bool promote)
1276{
1277        return promote ? PROMOTE_PERMANENT : PROMOTE_NOT;
1278}
1279
1280static enum promote_result should_promote(struct smq_policy *mq, struct entry *hs_e,
1281                                          int data_dir, bool fast_promote)
1282{
1283        if (data_dir == WRITE) {
1284                if (!allocator_empty(&mq->cache_alloc) && fast_promote)
1285                        return PROMOTE_TEMPORARY;
1286
1287                return maybe_promote(hs_e->level >= mq->write_promote_level);
1288        } else
1289                return maybe_promote(hs_e->level >= mq->read_promote_level);
1290}
1291
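    /*
     * A hotspot block covers cache_blocks_per_hotspot_block consecutive
     * origin blocks; to_hblock() maps an origin block to the hotspot
     * block containing it.
     */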
1292static dm_oblock_t to_hblock(struct smq_policy *mq, dm_oblock_t b)
1293{
1294        sector_t r = from_oblock(b);
1295        (void) sector_div(r, mq->cache_blocks_per_hotspot_block);
1296        return to_oblock(r);
1297}
1298
1299static struct entry *update_hotspot_queue(struct smq_policy *mq, dm_oblock_t b)
1300{
1301        unsigned hi;
1302        dm_oblock_t hb = to_hblock(mq, b);
1303        struct entry *e = h_lookup(&mq->hotspot_table, hb);
1304
1305        if (e) {
1306                stats_level_accessed(&mq->hotspot_stats, e->level);
1307
1308                hi = get_index(&mq->hotspot_alloc, e);
1309                q_requeue(&mq->hotspot, e,
1310                          test_and_set_bit(hi, mq->hotspot_hit_bits) ?
1311                          0u : mq->hotspot_level_jump,
1312                          NULL, NULL);
1313
1314        } else {
1315                stats_miss(&mq->hotspot_stats);
1316
1317                e = alloc_entry(&mq->hotspot_alloc);
1318                if (!e) {
1319                        e = q_pop(&mq->hotspot);
1320                        if (e) {
1321                                h_remove(&mq->hotspot_table, e);
1322                                hi = get_index(&mq->hotspot_alloc, e);
1323                                clear_bit(hi, mq->hotspot_hit_bits);
1324                        }
1325
1326                }
1327
1328                if (e) {
1329                        e->oblock = hb;
1330                        q_push(&mq->hotspot, e);
1331                        h_insert(&mq->hotspot_table, e);
1332                }
1333        }
1334
1335        return e;
1336}
1337
1338/*----------------------------------------------------------------*/
1339
1340/*
1341 * Public interface, via the policy struct.  See dm-cache-policy.h for a
1342 * description of these.
1343 */
1344
1345static struct smq_policy *to_smq_policy(struct dm_cache_policy *p)
1346{
1347        return container_of(p, struct smq_policy, policy);
1348}
1349
1350static void smq_destroy(struct dm_cache_policy *p)
1351{
1352        struct smq_policy *mq = to_smq_policy(p);
1353
1354        btracker_destroy(mq->bg_work);
1355        h_exit(&mq->hotspot_table);
1356        h_exit(&mq->table);
1357        free_bitset(mq->hotspot_hit_bits);
1358        free_bitset(mq->cache_hit_bits);
1359        space_exit(&mq->es);
1360        kfree(mq);
1361}
1362
1363/*----------------------------------------------------------------*/
1364
1365static int __lookup(struct smq_policy *mq, dm_oblock_t oblock, dm_cblock_t *cblock,
1366                    int data_dir, bool fast_copy,
1367                    struct policy_work **work, bool *background_work)
1368{
1369        struct entry *e, *hs_e;
1370        enum promote_result pr;
1371
1372        *background_work = false;
1373
1374        e = h_lookup(&mq->table, oblock);
1375        if (e) {
1376                stats_level_accessed(&mq->cache_stats, e->level);
1377
1378                requeue(mq, e);
1379                *cblock = infer_cblock(mq, e);
1380                return 0;
1381
1382        } else {
1383                stats_miss(&mq->cache_stats);
1384
1385                /*
1386                 * The hotspot queue only gets updated with misses.
1387                 */
1388                hs_e = update_hotspot_queue(mq, oblock);
1389
1390                pr = should_promote(mq, hs_e, data_dir, fast_copy);
1391                if (pr != PROMOTE_NOT) {
1392                        queue_promotion(mq, oblock, work);
1393                        *background_work = true;
1394                }
1395
1396                return -ENOENT;
1397        }
1398}
1399
1400static int smq_lookup(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t *cblock,
1401                      int data_dir, bool fast_copy,
1402                      bool *background_work)
1403{
1404        int r;
1405        unsigned long flags;
1406        struct smq_policy *mq = to_smq_policy(p);
1407
1408        spin_lock_irqsave(&mq->lock, flags);
1409        r = __lookup(mq, oblock, cblock,
1410                     data_dir, fast_copy,
1411                     NULL, background_work);
1412        spin_unlock_irqrestore(&mq->lock, flags);
1413
1414        return r;
1415}
1416
1417static int smq_lookup_with_work(struct dm_cache_policy *p,
1418                                dm_oblock_t oblock, dm_cblock_t *cblock,
1419                                int data_dir, bool fast_copy,
1420                                struct policy_work **work)
1421{
1422        int r;
1423        bool background_queued;
1424        unsigned long flags;
1425        struct smq_policy *mq = to_smq_policy(p);
1426
1427        spin_lock_irqsave(&mq->lock, flags);
1428        r = __lookup(mq, oblock, cblock, data_dir, fast_copy, work, &background_queued);
1429        spin_unlock_irqrestore(&mq->lock, flags);
1430
1431        return r;
1432}
1433
1434static int smq_get_background_work(struct dm_cache_policy *p, bool idle,
1435                                   struct policy_work **result)
1436{
1437        int r;
1438        unsigned long flags;
1439        struct smq_policy *mq = to_smq_policy(p);
1440
1441        spin_lock_irqsave(&mq->lock, flags);
1442        r = btracker_issue(mq->bg_work, result);
1443        if (r == -ENODATA) {
1444                if (!clean_target_met(mq, idle)) {
1445                        queue_writeback(mq, idle);
1446                        r = btracker_issue(mq->bg_work, result);
1447                }
1448        }
1449        spin_unlock_irqrestore(&mq->lock, flags);
1450
1451        return r;
1452}
1453
1454/*
1455 * We need to clear any pending work flags that have been set, and in the
1456 * case of promotion free the entry for the destination cblock.
1457 */
1458static void __complete_background_work(struct smq_policy *mq,
1459                                       struct policy_work *work,
1460                                       bool success)
1461{
1462        struct entry *e = get_entry(&mq->cache_alloc,
1463                                    from_cblock(work->cblock));
1464
1465        switch (work->op) {
1466        case POLICY_PROMOTE:
1467                // !h, !q, a
1468                clear_pending(mq, e);
1469                if (success) {
1470                        e->oblock = work->oblock;
1471                        e->level = NR_CACHE_LEVELS - 1;
1472                        push(mq, e);
1473                        // h, q, a
1474                } else {
1475                        free_entry(&mq->cache_alloc, e);
1476                        // !h, !q, !a
1477                }
1478                break;
1479
1480        case POLICY_DEMOTE:
1481                // h, !q, a
1482                if (success) {
1483                        h_remove(&mq->table, e);
1484                        free_entry(&mq->cache_alloc, e);
1485                        // !h, !q, !a
1486                } else {
1487                        clear_pending(mq, e);
1488                        push_queue(mq, e);
1489                        // h, q, a
1490                }
1491                break;
1492
1493        case POLICY_WRITEBACK:
1494                // h, !q, a
1495                clear_pending(mq, e);
1496                push_queue(mq, e);
1497                // h, q, a
1498                break;
1499        }
1500
1501        btracker_complete(mq->bg_work, work);
1502}
1503
1504static void smq_complete_background_work(struct dm_cache_policy *p,
1505                                         struct policy_work *work,
1506                                         bool success)
1507{
1508        unsigned long flags;
1509        struct smq_policy *mq = to_smq_policy(p);
1510
1511        spin_lock_irqsave(&mq->lock, flags);
1512        __complete_background_work(mq, work, success);
1513        spin_unlock_irqrestore(&mq->lock, flags);
1514}
1515
1516// in_hash(oblock) -> in_hash(oblock)
1517static void __smq_set_clear_dirty(struct smq_policy *mq, dm_cblock_t cblock, bool set)
1518{
1519        struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1520
1521        if (e->pending_work)
1522                e->dirty = set;
1523        else {
1524                del_queue(mq, e);
1525                e->dirty = set;
1526                push_queue(mq, e);
1527        }
1528}
1529
1530static void smq_set_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1531{
1532        unsigned long flags;
1533        struct smq_policy *mq = to_smq_policy(p);
1534
1535        spin_lock_irqsave(&mq->lock, flags);
1536        __smq_set_clear_dirty(mq, cblock, true);
1537        spin_unlock_irqrestore(&mq->lock, flags);
1538}
1539
1540static void smq_clear_dirty(struct dm_cache_policy *p, dm_cblock_t cblock)
1541{
1542        struct smq_policy *mq = to_smq_policy(p);
1543        unsigned long flags;
1544
1545        spin_lock_irqsave(&mq->lock, flags);
1546        __smq_set_clear_dirty(mq, cblock, false);
1547        spin_unlock_irqrestore(&mq->lock, flags);
1548}
1549
1550static unsigned random_level(dm_cblock_t cblock)
1551{
1552        return hash_32(from_cblock(cblock), 9) & (NR_CACHE_LEVELS - 1);
1553}
1554
1555static int smq_load_mapping(struct dm_cache_policy *p,
1556                            dm_oblock_t oblock, dm_cblock_t cblock,
1557                            bool dirty, uint32_t hint, bool hint_valid)
1558{
1559        struct smq_policy *mq = to_smq_policy(p);
1560        struct entry *e;
1561
1562        e = alloc_particular_entry(&mq->cache_alloc, from_cblock(cblock));
1563        e->oblock = oblock;
1564        e->dirty = dirty;
1565        e->level = hint_valid ? min(hint, NR_CACHE_LEVELS - 1) : random_level(cblock);
1566        e->pending_work = false;
1567
1568        /*
1569         * When we load mappings we push ahead of both sentinels in order to
1570         * allow demotions and cleaning to occur immediately.
1571         */
1572        push_front(mq, e);
1573
1574        return 0;
1575}
1576
1577static int smq_invalidate_mapping(struct dm_cache_policy *p, dm_cblock_t cblock)
1578{
1579        struct smq_policy *mq = to_smq_policy(p);
1580        struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1581
1582        if (!e->allocated)
1583                return -ENODATA;
1584
1585        // FIXME: what if this block has pending background work?
1586        del_queue(mq, e);
1587        h_remove(&mq->table, e);
1588        free_entry(&mq->cache_alloc, e);
1589        return 0;
1590}
1591
1592static uint32_t smq_get_hint(struct dm_cache_policy *p, dm_cblock_t cblock)
1593{
1594        struct smq_policy *mq = to_smq_policy(p);
1595        struct entry *e = get_entry(&mq->cache_alloc, from_cblock(cblock));
1596
1597        if (!e->allocated)
1598                return 0;
1599
1600        return e->level;
1601}
1602
1603static dm_cblock_t smq_residency(struct dm_cache_policy *p)
1604{
1605        dm_cblock_t r;
1606        unsigned long flags;
1607        struct smq_policy *mq = to_smq_policy(p);
1608
1609        spin_lock_irqsave(&mq->lock, flags);
1610        r = to_cblock(mq->cache_alloc.nr_allocated);
1611        spin_unlock_irqrestore(&mq->lock, flags);
1612
1613        return r;
1614}
1615
1616static void smq_tick(struct dm_cache_policy *p, bool can_block)
1617{
1618        struct smq_policy *mq = to_smq_policy(p);
1619        unsigned long flags;
1620
1621        spin_lock_irqsave(&mq->lock, flags);
1622        mq->tick++;
1623        update_sentinels(mq);
1624        end_hotspot_period(mq);
1625        end_cache_period(mq);
1626        spin_unlock_irqrestore(&mq->lock, flags);
1627}
1628
1629static void smq_allow_migrations(struct dm_cache_policy *p, bool allow)
1630{
1631        struct smq_policy *mq = to_smq_policy(p);
1632        mq->migrations_allowed = allow;
1633}
1634
1635/*
1636 * smq has no config values, but the old mq policy did.  To avoid breaking
1637 * software we continue to accept these configurables for the mq policy,
1638 * but they have no effect.
1639 */
1640static int mq_set_config_value(struct dm_cache_policy *p,
1641                               const char *key, const char *value)
1642{
1643        unsigned long tmp;
1644
1645        if (kstrtoul(value, 10, &tmp))
1646                return -EINVAL;
1647
1648        if (!strcasecmp(key, "random_threshold") ||
1649            !strcasecmp(key, "sequential_threshold") ||
1650            !strcasecmp(key, "discard_promote_adjustment") ||
1651            !strcasecmp(key, "read_promote_adjustment") ||
1652            !strcasecmp(key, "write_promote_adjustment")) {
1653                DMWARN("tunable '%s' no longer has any effect, mq policy is now an alias for smq", key);
1654                return 0;
1655        }
1656
1657        return -EINVAL;
1658}
1659
1660static int mq_emit_config_values(struct dm_cache_policy *p, char *result,
1661                                 unsigned maxlen, ssize_t *sz_ptr)
1662{
1663        ssize_t sz = *sz_ptr;
1664
1665        DMEMIT("10 random_threshold 0 "
1666               "sequential_threshold 0 "
1667               "discard_promote_adjustment 0 "
1668               "read_promote_adjustment 0 "
1669               "write_promote_adjustment 0 ");
1670
1671        *sz_ptr = sz;
1672        return 0;
1673}
1674
1675/* Init the policy plugin interface function pointers. */
1676static void init_policy_functions(struct smq_policy *mq, bool mimic_mq)
1677{
1678        mq->policy.destroy = smq_destroy;
1679        mq->policy.lookup = smq_lookup;
1680        mq->policy.lookup_with_work = smq_lookup_with_work;
1681        mq->policy.get_background_work = smq_get_background_work;
1682        mq->policy.complete_background_work = smq_complete_background_work;
1683        mq->policy.set_dirty = smq_set_dirty;
1684        mq->policy.clear_dirty = smq_clear_dirty;
1685        mq->policy.load_mapping = smq_load_mapping;
1686        mq->policy.invalidate_mapping = smq_invalidate_mapping;
1687        mq->policy.get_hint = smq_get_hint;
1688        mq->policy.residency = smq_residency;
1689        mq->policy.tick = smq_tick;
1690        mq->policy.allow_migrations = smq_allow_migrations;
1691
1692        if (mimic_mq) {
1693                mq->policy.set_config_value = mq_set_config_value;
1694                mq->policy.emit_config_values = mq_emit_config_values;
1695        }
1696}
1697
1698static bool too_many_hotspot_blocks(sector_t origin_size,
1699                                    sector_t hotspot_block_size,
1700                                    unsigned nr_hotspot_blocks)
1701{
1702        return (hotspot_block_size * nr_hotspot_blocks) > origin_size;
1703}
1704
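    /*
     * Hotspot blocks start out sixteen times the cache block size and are
     * halved (but never below the cache block size) until nr_hotspot_blocks
     * of them fit within the origin device.
     */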
1705static void calc_hotspot_params(sector_t origin_size,
1706                                sector_t cache_block_size,
1707                                unsigned nr_cache_blocks,
1708                                sector_t *hotspot_block_size,
1709                                unsigned *nr_hotspot_blocks)
1710{
1711        *hotspot_block_size = cache_block_size * 16u;
1712        *nr_hotspot_blocks = max(nr_cache_blocks / 4u, 1024u);
1713
1714        while ((*hotspot_block_size > cache_block_size) &&
1715               too_many_hotspot_blocks(origin_size, *hotspot_block_size, *nr_hotspot_blocks))
1716                *hotspot_block_size /= 2u;
1717}
1718
1719static struct dm_cache_policy *__smq_create(dm_cblock_t cache_size,
1720                                            sector_t origin_size,
1721                                            sector_t cache_block_size,
1722                                            bool mimic_mq,
1723                                            bool migrations_allowed)
1724{
1725        unsigned i;
1726        unsigned nr_sentinels_per_queue = 2u * NR_CACHE_LEVELS;
1727        unsigned total_sentinels = 2u * nr_sentinels_per_queue;
1728        struct smq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL);
1729
1730        if (!mq)
1731                return NULL;
1732
1733        init_policy_functions(mq, mimic_mq);
1734        mq->cache_size = cache_size;
1735        mq->cache_block_size = cache_block_size;
1736
1737        calc_hotspot_params(origin_size, cache_block_size, from_cblock(cache_size),
1738                            &mq->hotspot_block_size, &mq->nr_hotspot_blocks);
1739
1740        mq->cache_blocks_per_hotspot_block = div64_u64(mq->hotspot_block_size, mq->cache_block_size);
1741        mq->hotspot_level_jump = 1u;
1742        if (space_init(&mq->es, total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size))) {
1743                DMERR("couldn't initialize entry space");
1744                goto bad_pool_init;
1745        }
1746
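            /*
             * Each queue has two alternating sets of sentinels, one entry
             * per cache level, hence 2 * NR_CACHE_LEVELS per queue.
             */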
1747        init_allocator(&mq->writeback_sentinel_alloc, &mq->es, 0, nr_sentinels_per_queue);
1748        for (i = 0; i < nr_sentinels_per_queue; i++)
1749                get_entry(&mq->writeback_sentinel_alloc, i)->sentinel = true;
1750
1751        init_allocator(&mq->demote_sentinel_alloc, &mq->es, nr_sentinels_per_queue, total_sentinels);
1752        for (i = 0; i < nr_sentinels_per_queue; i++)
1753                get_entry(&mq->demote_sentinel_alloc, i)->sentinel = true;
1754
1755        init_allocator(&mq->hotspot_alloc, &mq->es, total_sentinels,
1756                       total_sentinels + mq->nr_hotspot_blocks);
1757
1758        init_allocator(&mq->cache_alloc, &mq->es,
1759                       total_sentinels + mq->nr_hotspot_blocks,
1760                       total_sentinels + mq->nr_hotspot_blocks + from_cblock(cache_size));
1761
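            /*
             * The hit bitsets record which blocks have already been hit in
             * the current period, so repeated hits don't keep bumping the
             * same entry up the levels.
             */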
1762        mq->hotspot_hit_bits = alloc_bitset(mq->nr_hotspot_blocks);
1763        if (!mq->hotspot_hit_bits) {
1764                DMERR("couldn't allocate hotspot hit bitset");
1765                goto bad_hotspot_hit_bits;
1766        }
1767        clear_bitset(mq->hotspot_hit_bits, mq->nr_hotspot_blocks);
1768
1769        if (from_cblock(cache_size)) {
1770                mq->cache_hit_bits = alloc_bitset(from_cblock(cache_size));
1771                if (!mq->cache_hit_bits) {
1772                        DMERR("couldn't allocate cache hit bitset");
1773                        goto bad_cache_hit_bits;
1774                }
1775                clear_bitset(mq->cache_hit_bits, from_cblock(mq->cache_size));
1776        } else {
1777                mq->cache_hit_bits = NULL;
            }
1778
1779        mq->tick = 0;
1780        spin_lock_init(&mq->lock);
1781
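            /*
             * Size the hotspot queue.  nr_in_top_levels limits how many
             * entries may occupy the top levels at once: the smaller of
             * the hotspot blocks per level and the cache size measured in
             * hotspot blocks.
             */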
1782        q_init(&mq->hotspot, &mq->es, NR_HOTSPOT_LEVELS);
1783        mq->hotspot.nr_top_levels = 8;
1784        mq->hotspot.nr_in_top_levels = min(mq->nr_hotspot_blocks / NR_HOTSPOT_LEVELS,
1785                                           from_cblock(mq->cache_size) / mq->cache_blocks_per_hotspot_block);
1786
1787        q_init(&mq->clean, &mq->es, NR_CACHE_LEVELS);
1788        q_init(&mq->dirty, &mq->es, NR_CACHE_LEVELS);
1789
1790        stats_init(&mq->hotspot_stats, NR_HOTSPOT_LEVELS);
1791        stats_init(&mq->cache_stats, NR_CACHE_LEVELS);
1792
1793        if (h_init(&mq->table, &mq->es, from_cblock(cache_size)))
1794                goto bad_alloc_table;
1795
1796        if (h_init(&mq->hotspot_table, &mq->es, mq->nr_hotspot_blocks))
1797                goto bad_alloc_hotspot_table;
1798
1799        sentinels_init(mq);
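            /*
             * Start the promote thresholds at the very top of the hotspot
             * queue; they are recalculated periodically as hit statistics
             * accumulate.
             */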
1800        mq->write_promote_level = mq->read_promote_level = NR_HOTSPOT_LEVELS;
1801
1802        mq->next_hotspot_period = jiffies;
1803        mq->next_cache_period = jiffies;
1804
1805        mq->bg_work = btracker_create(4096); /* FIXME: hard coded value */
1806        if (!mq->bg_work)
1807                goto bad_btracker;
1808
1809        mq->migrations_allowed = migrations_allowed;
1810
1811        return &mq->policy;
1812
1813bad_btracker:
1814        h_exit(&mq->hotspot_table);
1815bad_alloc_hotspot_table:
1816        h_exit(&mq->table);
1817bad_alloc_table:
1818        free_bitset(mq->cache_hit_bits);
1819bad_cache_hit_bits:
1820        free_bitset(mq->hotspot_hit_bits);
1821bad_hotspot_hit_bits:
1822        space_exit(&mq->es);
1823bad_pool_init:
1824        kfree(mq);
1825
1826        return NULL;
1827}
1828
1829static struct dm_cache_policy *smq_create(dm_cblock_t cache_size,
1830                                          sector_t origin_size,
1831                                          sector_t cache_block_size)
1832{
1833        return __smq_create(cache_size, origin_size, cache_block_size, false, true);
1834}
1835
1836static struct dm_cache_policy *mq_create(dm_cblock_t cache_size,
1837                                         sector_t origin_size,
1838                                         sector_t cache_block_size)
1839{
1840        return __smq_create(cache_size, origin_size, cache_block_size, true, true);
1841}
1842
1843static struct dm_cache_policy *cleaner_create(dm_cblock_t cache_size,
1844                                              sector_t origin_size,
1845                                              sector_t cache_block_size)
1846{
1847        return __smq_create(cache_size, origin_size, cache_block_size, false, false);
1848}
1849
1850/*----------------------------------------------------------------*/
1851
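    /*
     * Four policy names are registered, all backed by the same code:
     * "smq", "mq" (which accepts the legacy mq tunables), "cleaner"
     * (writes back dirty blocks without promoting new ones) and
     * "default", an alias for smq.
     */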
1852static struct dm_cache_policy_type smq_policy_type = {
1853        .name = "smq",
1854        .version = {2, 0, 0},
1855        .hint_size = 4,
1856        .owner = THIS_MODULE,
1857        .create = smq_create
1858};
1859
1860static struct dm_cache_policy_type mq_policy_type = {
1861        .name = "mq",
1862        .version = {2, 0, 0},
1863        .hint_size = 4,
1864        .owner = THIS_MODULE,
1865        .create = mq_create,
1866};
1867
1868static struct dm_cache_policy_type cleaner_policy_type = {
1869        .name = "cleaner",
1870        .version = {2, 0, 0},
1871        .hint_size = 4,
1872        .owner = THIS_MODULE,
1873        .create = cleaner_create,
1874};
1875
1876static struct dm_cache_policy_type default_policy_type = {
1877        .name = "default",
1878        .version = {2, 0, 0},
1879        .hint_size = 4,
1880        .owner = THIS_MODULE,
1881        .create = smq_create,
1882        .real = &smq_policy_type
1883};
1884
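    /*
     * Register the policy under all four names; on failure unregister
     * whichever names were already registered.
     */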
1885static int __init smq_init(void)
1886{
1887        int r;
1888
1889        r = dm_cache_policy_register(&smq_policy_type);
1890        if (r) {
1891                DMERR("register failed %d", r);
1892                return r;
1893        }
1894
1895        r = dm_cache_policy_register(&mq_policy_type);
1896        if (r) {
1897                DMERR("register failed (as mq) %d", r);
1898                goto out_mq;
1899        }
1900
1901        r = dm_cache_policy_register(&cleaner_policy_type);
1902        if (r) {
1903                DMERR("register failed (as cleaner) %d", r);
1904                goto out_cleaner;
1905        }
1906
1907        r = dm_cache_policy_register(&default_policy_type);
1908        if (r) {
1909                DMERR("register failed (as default) %d", r);
1910                goto out_default;
1911        }
1912
1913        return 0;
1914
1915out_default:
1916        dm_cache_policy_unregister(&cleaner_policy_type);
1917out_cleaner:
1918        dm_cache_policy_unregister(&mq_policy_type);
1919out_mq:
1920        dm_cache_policy_unregister(&smq_policy_type);
1921
1922        return r;
1923}
1924
1925static void __exit smq_exit(void)
1926{
1927        dm_cache_policy_unregister(&cleaner_policy_type);
1928        dm_cache_policy_unregister(&smq_policy_type);
1929        dm_cache_policy_unregister(&mq_policy_type);
1930        dm_cache_policy_unregister(&default_policy_type);
1931}
1932
1933module_init(smq_init);
1934module_exit(smq_exit);
1935
1936MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
1937MODULE_LICENSE("GPL");
1938MODULE_DESCRIPTION("smq cache policy");
1939
1940MODULE_ALIAS("dm-cache-default");
1941MODULE_ALIAS("dm-cache-mq");
1942MODULE_ALIAS("dm-cache-cleaner");
1943