linux/drivers/md/dm-ps-historical-service-time.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Historical Service Time
   4 *
   5 *  Keeps a time-weighted exponential moving average of the historical
   6 *  service time. Estimates future service time based on the historical
   7 *  service time and the number of outstanding requests.
   8 *
   9 *  Marks paths stale if they have not finished within hst *
  10 *  num_paths. If a path is stale and unused, we will send a single
  11 *  request to probe in case the path has improved. This situation
  12 *  generally arises if the path is so much worse than others that it
  13 *  will never have the best estimated service time, or if the entire
  14 *  multipath device is unused. If a path is stale and in use, limit the
  15 *  number of requests it can receive with the assumption that the path
  16 *  has become degraded.
  17 *
  18 *  To avoid repeatedly calculating exponents for time weighting, times
  19 *  are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT)
  20 *  ns, and the weighting is pre-calculated.
  21 *
  22 */
  23
  24#include "dm.h"
  25#include "dm-path-selector.h"
  26
  27#include <linux/blkdev.h>
  28#include <linux/slab.h>
  29#include <linux/module.h>
  30
  31
  32#define DM_MSG_PREFIX   "multipath historical-service-time"
  33#define HST_MIN_IO 1
  34#define HST_VERSION "0.1.1"
  35
  36#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
  37#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
  38#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
  39#define HST_FIXED_95 972
  40#define HST_MAX_INFLIGHT HST_FIXED_1
  41#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
  42#define HST_WEIGHT_COUNT 64ULL
  43
  44struct selector {
  45        struct list_head valid_paths;
  46        struct list_head failed_paths;
  47        int valid_count;
  48        spinlock_t lock;
  49
  50        unsigned int weights[HST_WEIGHT_COUNT];
  51        unsigned int threshold_multiplier;
  52};
  53
  54struct path_info {
  55        struct list_head list;
  56        struct dm_path *path;
  57        unsigned int repeat_count;
  58
  59        spinlock_t lock;
  60
  61        u64 historical_service_time; /* Fixed point */
  62
  63        u64 stale_after;
  64        u64 last_finish;
  65
  66        u64 outstanding;
  67};
  68
  69/**
  70 * fixed_power - compute: x^n, in O(log n) time
  71 *
  72 * @x:         base of the power
  73 * @frac_bits: fractional bits of @x
  74 * @n:         power to raise @x to.
  75 *
  76 * By exploiting the relation between the definition of the natural power
  77 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
  78 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
  79 * (where: n_i \elem {0, 1}, the binary vector representing n),
  80 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
  81 * of course trivially computable in O(log_2 n), the length of our binary
  82 * vector.
  83 *
  84 * (see: kernel/sched/loadavg.c)
  85 */
  86static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
  87{
  88        unsigned long result = 1UL << frac_bits;
  89
  90        if (n) {
  91                for (;;) {
  92                        if (n & 1) {
  93                                result *= x;
  94                                result += 1UL << (frac_bits - 1);
  95                                result >>= frac_bits;
  96                        }
  97                        n >>= 1;
  98                        if (!n)
  99                                break;
 100                        x *= x;
 101                        x += 1UL << (frac_bits - 1);
 102                        x >>= frac_bits;
 103                }
 104        }
 105
 106        return result;
 107}
 108
 109/*
 110 * Calculate the next value of an exponential moving average
 111 * a_1 = a_0 * e + a * (1 - e)
 112 *
 113 * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
 114 * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
 115 * @weight: [0, HST_FIXED_1]
 116 *
 117 * Note:
 118 *   To account for multiple periods in the same calculation,
 119 *   a_n = a_0 * e^n + a * (1 - e^n),
 120 *   so call fixed_ema(last, next, pow(weight, N))
 121 */
 122static u64 fixed_ema(u64 last, u64 next, u64 weight)
 123{
 124        last *= weight;
 125        last += next * (HST_FIXED_1 - weight);
 126        last += 1ULL << (HST_FIXED_SHIFT - 1);
 127        return last >> HST_FIXED_SHIFT;
 128}
 129
 130static struct selector *alloc_selector(void)
 131{
 132        struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
 133
 134        if (s) {
 135                INIT_LIST_HEAD(&s->valid_paths);
 136                INIT_LIST_HEAD(&s->failed_paths);
 137                spin_lock_init(&s->lock);
 138                s->valid_count = 0;
 139        }
 140
 141        return s;
 142}
 143
 144/*
 145 * Get the weight for a given time span.
 146 */
 147static u64 hst_weight(struct path_selector *ps, u64 delta)
 148{
 149        struct selector *s = ps->context;
 150        int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
 151                           HST_WEIGHT_COUNT - 1);
 152
 153        return s->weights[bucket];
 154}
 155
 156/*
 157 * Set up the weights array.
 158 *
 159 * weights[len-1] = 0
 160 * weights[n] = base ^ (n + 1)
 161 */
 162static void hst_set_weights(struct path_selector *ps, unsigned int base)
 163{
 164        struct selector *s = ps->context;
 165        int i;
 166
 167        if (base >= HST_FIXED_1)
 168                return;
 169
 170        for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
 171                s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
 172        s->weights[HST_WEIGHT_COUNT - 1] = 0;
 173}
 174
 175static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
 176{
 177        struct selector *s;
 178        unsigned int base_weight = HST_FIXED_95;
 179        unsigned int threshold_multiplier = 0;
 180        char dummy;
 181
 182        /*
 183         * Arguments: [<base_weight> [<threshold_multiplier>]]
 184         *   <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
 185         *                  value of 0 will completely ignore any history.
 186         *                  If not given, default (HST_FIXED_95) is used.
 187         *   <threshold_multiplier>: Minimum threshold multiplier for paths to
 188         *                  be considered different. That is, a path is
 189         *                  considered different iff (p1 > N * p2) where p1
 190         *                  is the path with higher service time. A threshold
 191         *                  of 1 or 0 has no effect. Defaults to 0.
 192         */
 193        if (argc > 2)
 194                return -EINVAL;
 195
 196        if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
 197             base_weight >= HST_FIXED_1)) {
 198                return -EINVAL;
 199        }
 200
 201        if (argc > 1 && (sscanf(argv[1], "%u%c",
 202                                &threshold_multiplier, &dummy) != 1)) {
 203                return -EINVAL;
 204        }
 205
 206        s = alloc_selector();
 207        if (!s)
 208                return -ENOMEM;
 209
 210        ps->context = s;
 211
 212        hst_set_weights(ps, base_weight);
 213        s->threshold_multiplier = threshold_multiplier;
 214        return 0;
 215}
 216
 217static void free_paths(struct list_head *paths)
 218{
 219        struct path_info *pi, *next;
 220
 221        list_for_each_entry_safe(pi, next, paths, list) {
 222                list_del(&pi->list);
 223                kfree(pi);
 224        }
 225}
 226
 227static void hst_destroy(struct path_selector *ps)
 228{
 229        struct selector *s = ps->context;
 230
 231        free_paths(&s->valid_paths);
 232        free_paths(&s->failed_paths);
 233        kfree(s);
 234        ps->context = NULL;
 235}
 236
 237static int hst_status(struct path_selector *ps, struct dm_path *path,
 238                     status_type_t type, char *result, unsigned int maxlen)
 239{
 240        unsigned int sz = 0;
 241        struct path_info *pi;
 242
 243        if (!path) {
 244                struct selector *s = ps->context;
 245
 246                DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
 247        } else {
 248                pi = path->pscontext;
 249
 250                switch (type) {
 251                case STATUSTYPE_INFO:
 252                        DMEMIT("%llu %llu %llu ", pi->historical_service_time,
 253                               pi->outstanding, pi->stale_after);
 254                        break;
 255                case STATUSTYPE_TABLE:
 256                        DMEMIT("0 ");
 257                        break;
 258                }
 259        }
 260
 261        return sz;
 262}
 263
 264static int hst_add_path(struct path_selector *ps, struct dm_path *path,
 265                       int argc, char **argv, char **error)
 266{
 267        struct selector *s = ps->context;
 268        struct path_info *pi;
 269        unsigned int repeat_count = HST_MIN_IO;
 270        char dummy;
 271        unsigned long flags;
 272
 273        /*
 274         * Arguments: [<repeat_count>]
 275         *   <repeat_count>: The number of I/Os before switching path.
 276         *                   If not given, default (HST_MIN_IO) is used.
 277         */
 278        if (argc > 1) {
 279                *error = "historical-service-time ps: incorrect number of arguments";
 280                return -EINVAL;
 281        }
 282
 283        if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
 284                *error = "historical-service-time ps: invalid repeat count";
 285                return -EINVAL;
 286        }
 287
 288        /* allocate the path */
 289        pi = kmalloc(sizeof(*pi), GFP_KERNEL);
 290        if (!pi) {
 291                *error = "historical-service-time ps: Error allocating path context";
 292                return -ENOMEM;
 293        }
 294
 295        pi->path = path;
 296        pi->repeat_count = repeat_count;
 297
 298        pi->historical_service_time = HST_FIXED_1;
 299
 300        spin_lock_init(&pi->lock);
 301        pi->outstanding = 0;
 302
 303        pi->stale_after = 0;
 304        pi->last_finish = 0;
 305
 306        path->pscontext = pi;
 307
 308        spin_lock_irqsave(&s->lock, flags);
 309        list_add_tail(&pi->list, &s->valid_paths);
 310        s->valid_count++;
 311        spin_unlock_irqrestore(&s->lock, flags);
 312
 313        return 0;
 314}
 315
 316static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
 317{
 318        struct selector *s = ps->context;
 319        struct path_info *pi = path->pscontext;
 320        unsigned long flags;
 321
 322        spin_lock_irqsave(&s->lock, flags);
 323        list_move(&pi->list, &s->failed_paths);
 324        s->valid_count--;
 325        spin_unlock_irqrestore(&s->lock, flags);
 326}
 327
 328static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
 329{
 330        struct selector *s = ps->context;
 331        struct path_info *pi = path->pscontext;
 332        unsigned long flags;
 333
 334        spin_lock_irqsave(&s->lock, flags);
 335        list_move_tail(&pi->list, &s->valid_paths);
 336        s->valid_count++;
 337        spin_unlock_irqrestore(&s->lock, flags);
 338
 339        return 0;
 340}
 341
 342static void hst_fill_compare(struct path_info *pi, u64 *hst,
 343                             u64 *out, u64 *stale)
 344{
 345        unsigned long flags;
 346
 347        spin_lock_irqsave(&pi->lock, flags);
 348        *hst = pi->historical_service_time;
 349        *out = pi->outstanding;
 350        *stale = pi->stale_after;
 351        spin_unlock_irqrestore(&pi->lock, flags);
 352}
 353
 354/*
 355 * Compare the estimated service time of 2 paths, pi1 and pi2,
 356 * for the incoming I/O.
 357 *
 358 * Returns:
 359 * < 0 : pi1 is better
 360 * 0   : no difference between pi1 and pi2
 361 * > 0 : pi2 is better
 362 *
 363 */
 364static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
 365                             u64 time_now, struct path_selector *ps)
 366{
 367        struct selector *s = ps->context;
 368        u64 hst1, hst2;
 369        long long out1, out2, stale1, stale2;
 370        int pi2_better, over_threshold;
 371
 372        hst_fill_compare(pi1, &hst1, &out1, &stale1);
 373        hst_fill_compare(pi2, &hst2, &out2, &stale2);
 374
 375        /* Check here if estimated latency for two paths are too similar.
 376         * If this is the case, we skip extra calculation and just compare
 377         * outstanding requests. In this case, any unloaded paths will
 378         * be preferred.
 379         */
 380        if (hst1 > hst2)
 381                over_threshold = hst1 > (s->threshold_multiplier * hst2);
 382        else
 383                over_threshold = hst2 > (s->threshold_multiplier * hst1);
 384
 385        if (!over_threshold)
 386                return out1 - out2;
 387
 388        /*
 389         * If an unloaded path is stale, choose it. If both paths are unloaded,
 390         * choose path that is the most stale.
 391         * (If one path is loaded, choose the other)
 392         */
 393        if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
 394            (!out1 && !out2))
 395                return (!out2 * stale1) - (!out1 * stale2);
 396
 397        /* Compare estimated service time. If outstanding is the same, we
 398         * don't need to multiply
 399         */
 400        if (out1 == out2) {
 401                pi2_better = hst1 > hst2;
 402        } else {
 403                /* Potential overflow with out >= 1024 */
 404                if (unlikely(out1 >= HST_MAX_INFLIGHT ||
 405                             out2 >= HST_MAX_INFLIGHT)) {
 406                        /* If over 1023 in-flights, we may overflow if hst
 407                         * is at max. (With this shift we still overflow at
 408                         * 1048576 in-flights, which is high enough).
 409                         */
 410                        hst1 >>= HST_FIXED_SHIFT;
 411                        hst2 >>= HST_FIXED_SHIFT;
 412                }
 413                pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
 414        }
 415
 416        /* In the case that the 'winner' is stale, limit to equal usage. */
 417        if (pi2_better) {
 418                if (stale2 < time_now)
 419                        return out1 - out2;
 420                return 1;
 421        }
 422        if (stale1 < time_now)
 423                return out1 - out2;
 424        return -1;
 425}
 426
 427static struct dm_path *hst_select_path(struct path_selector *ps,
 428                                       size_t nr_bytes)
 429{
 430        struct selector *s = ps->context;
 431        struct path_info *pi = NULL, *best = NULL;
 432        u64 time_now = sched_clock();
 433        struct dm_path *ret = NULL;
 434        unsigned long flags;
 435
 436        spin_lock_irqsave(&s->lock, flags);
 437        if (list_empty(&s->valid_paths))
 438                goto out;
 439
 440        list_for_each_entry(pi, &s->valid_paths, list) {
 441                if (!best || (hst_compare(pi, best, time_now, ps) < 0))
 442                        best = pi;
 443        }
 444
 445        if (!best)
 446                goto out;
 447
 448        /* Move last used path to end (least preferred in case of ties) */
 449        list_move_tail(&best->list, &s->valid_paths);
 450
 451        ret = best->path;
 452
 453out:
 454        spin_unlock_irqrestore(&s->lock, flags);
 455        return ret;
 456}
 457
 458static int hst_start_io(struct path_selector *ps, struct dm_path *path,
 459                        size_t nr_bytes)
 460{
 461        struct path_info *pi = path->pscontext;
 462        unsigned long flags;
 463
 464        spin_lock_irqsave(&pi->lock, flags);
 465        pi->outstanding++;
 466        spin_unlock_irqrestore(&pi->lock, flags);
 467
 468        return 0;
 469}
 470
 471static u64 path_service_time(struct path_info *pi, u64 start_time)
 472{
 473        u64 sched_now = ktime_get_ns();
 474
 475        /* if a previous disk request has finished after this IO was
 476         * sent to the hardware, pretend the submission happened
 477         * serially.
 478         */
 479        if (time_after64(pi->last_finish, start_time))
 480                start_time = pi->last_finish;
 481
 482        pi->last_finish = sched_now;
 483        if (time_before64(sched_now, start_time))
 484                return 0;
 485
 486        return sched_now - start_time;
 487}
 488
 489static int hst_end_io(struct path_selector *ps, struct dm_path *path,
 490                      size_t nr_bytes, u64 start_time)
 491{
 492        struct path_info *pi = path->pscontext;
 493        struct selector *s = ps->context;
 494        unsigned long flags;
 495        u64 st;
 496
 497        spin_lock_irqsave(&pi->lock, flags);
 498
 499        st = path_service_time(pi, start_time);
 500        pi->outstanding--;
 501        pi->historical_service_time =
 502                fixed_ema(pi->historical_service_time,
 503                          min(st * HST_FIXED_1, HST_FIXED_MAX),
 504                          hst_weight(ps, st));
 505
 506        /*
 507         * On request end, mark path as fresh. If a path hasn't
 508         * finished any requests within the fresh period, the estimated
 509         * service time is considered too optimistic and we limit the
 510         * maximum requests on that path.
 511         */
 512        pi->stale_after = pi->last_finish +
 513                (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));
 514
 515        spin_unlock_irqrestore(&pi->lock, flags);
 516
 517        return 0;
 518}
 519
 520static struct path_selector_type hst_ps = {
 521        .name           = "historical-service-time",
 522        .module         = THIS_MODULE,
 523        .table_args     = 1,
 524        .info_args      = 3,
 525        .create         = hst_create,
 526        .destroy        = hst_destroy,
 527        .status         = hst_status,
 528        .add_path       = hst_add_path,
 529        .fail_path      = hst_fail_path,
 530        .reinstate_path = hst_reinstate_path,
 531        .select_path    = hst_select_path,
 532        .start_io       = hst_start_io,
 533        .end_io         = hst_end_io,
 534};
 535
 536static int __init dm_hst_init(void)
 537{
 538        int r = dm_register_path_selector(&hst_ps);
 539
 540        if (r < 0)
 541                DMERR("register failed %d", r);
 542
 543        DMINFO("version " HST_VERSION " loaded");
 544
 545        return r;
 546}
 547
 548static void __exit dm_hst_exit(void)
 549{
 550        int r = dm_unregister_path_selector(&hst_ps);
 551
 552        if (r < 0)
 553                DMERR("unregister failed %d", r);
 554}
 555
 556module_init(dm_hst_init);
 557module_exit(dm_hst_exit);
 558
 559MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
 560MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>");
 561MODULE_LICENSE("GPL");
 562