linux/drivers/md/dm-service-time.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2007-2009 NEC Corporation.  All Rights Reserved.
   3 *
   4 * Module Author: Kiyoshi Ueda
   5 *
   6 * This file is released under the GPL.
   7 *
   8 * Throughput oriented path selector.
   9 */
  10
  11#include "dm.h"
  12#include "dm-path-selector.h"
  13
  14#include <linux/slab.h>
  15#include <linux/module.h>
  16
  17#define DM_MSG_PREFIX   "multipath service-time"
  18#define ST_MIN_IO       1
  19#define ST_MAX_RELATIVE_THROUGHPUT      100
  20#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT        7
  21#define ST_MAX_INFLIGHT_SIZE    ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
  22#define ST_VERSION      "0.2.0"
  23
  24struct selector {
  25        struct list_head valid_paths;
  26        struct list_head failed_paths;
  27};
  28
  29struct path_info {
  30        struct list_head list;
  31        struct dm_path *path;
  32        unsigned repeat_count;
  33        unsigned relative_throughput;
  34        atomic_t in_flight_size;        /* Total size of in-flight I/Os */
  35};
  36
  37static struct selector *alloc_selector(void)
  38{
  39        struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
  40
  41        if (s) {
  42                INIT_LIST_HEAD(&s->valid_paths);
  43                INIT_LIST_HEAD(&s->failed_paths);
  44        }
  45
  46        return s;
  47}
  48
  49static int st_create(struct path_selector *ps, unsigned argc, char **argv)
  50{
  51        struct selector *s = alloc_selector();
  52
  53        if (!s)
  54                return -ENOMEM;
  55
  56        ps->context = s;
  57        return 0;
  58}
  59
  60static void free_paths(struct list_head *paths)
  61{
  62        struct path_info *pi, *next;
  63
  64        list_for_each_entry_safe(pi, next, paths, list) {
  65                list_del(&pi->list);
  66                kfree(pi);
  67        }
  68}
  69
  70static void st_destroy(struct path_selector *ps)
  71{
  72        struct selector *s = ps->context;
  73
  74        free_paths(&s->valid_paths);
  75        free_paths(&s->failed_paths);
  76        kfree(s);
  77        ps->context = NULL;
  78}
  79
  80static int st_status(struct path_selector *ps, struct dm_path *path,
  81                     status_type_t type, char *result, unsigned maxlen)
  82{
  83        unsigned sz = 0;
  84        struct path_info *pi;
  85
  86        if (!path)
  87                DMEMIT("0 ");
  88        else {
  89                pi = path->pscontext;
  90
  91                switch (type) {
  92                case STATUSTYPE_INFO:
  93                        DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
  94                               pi->relative_throughput);
  95                        break;
  96                case STATUSTYPE_TABLE:
  97                        DMEMIT("%u %u ", pi->repeat_count,
  98                               pi->relative_throughput);
  99                        break;
 100                }
 101        }
 102
 103        return sz;
 104}
 105
 106static int st_add_path(struct path_selector *ps, struct dm_path *path,
 107                       int argc, char **argv, char **error)
 108{
 109        struct selector *s = ps->context;
 110        struct path_info *pi;
 111        unsigned repeat_count = ST_MIN_IO;
 112        unsigned relative_throughput = 1;
 113        char dummy;
 114
 115        /*
 116         * Arguments: [<repeat_count> [<relative_throughput>]]
 117         *      <repeat_count>: The number of I/Os before switching path.
 118         *                      If not given, default (ST_MIN_IO) is used.
 119         *      <relative_throughput>: The relative throughput value of
 120         *                      the path among all paths in the path-group.
 121         *                      The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
 122         *                      If not given, minimum value '1' is used.
 123         *                      If '0' is given, the path isn't selected while
 124         *                      other paths having a positive value are
 125         *                      available.
 126         */
 127        if (argc > 2) {
 128                *error = "service-time ps: incorrect number of arguments";
 129                return -EINVAL;
 130        }
 131
 132        if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
 133                *error = "service-time ps: invalid repeat count";
 134                return -EINVAL;
 135        }
 136
 137        if ((argc == 2) &&
 138            (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
 139             relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
 140                *error = "service-time ps: invalid relative_throughput value";
 141                return -EINVAL;
 142        }
 143
 144        /* allocate the path */
 145        pi = kmalloc(sizeof(*pi), GFP_KERNEL);
 146        if (!pi) {
 147                *error = "service-time ps: Error allocating path context";
 148                return -ENOMEM;
 149        }
 150
 151        pi->path = path;
 152        pi->repeat_count = repeat_count;
 153        pi->relative_throughput = relative_throughput;
 154        atomic_set(&pi->in_flight_size, 0);
 155
 156        path->pscontext = pi;
 157
 158        list_add_tail(&pi->list, &s->valid_paths);
 159
 160        return 0;
 161}
 162
 163static void st_fail_path(struct path_selector *ps, struct dm_path *path)
 164{
 165        struct selector *s = ps->context;
 166        struct path_info *pi = path->pscontext;
 167
 168        list_move(&pi->list, &s->failed_paths);
 169}
 170
 171static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
 172{
 173        struct selector *s = ps->context;
 174        struct path_info *pi = path->pscontext;
 175
 176        list_move_tail(&pi->list, &s->valid_paths);
 177
 178        return 0;
 179}
 180
 181/*
 182 * Compare the estimated service time of 2 paths, pi1 and pi2,
 183 * for the incoming I/O.
 184 *
 185 * Returns:
 186 * < 0 : pi1 is better
 187 * 0   : no difference between pi1 and pi2
 188 * > 0 : pi2 is better
 189 *
 190 * Description:
 191 * Basically, the service time is estimated by:
 192 *     ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
 193 * To reduce the calculation, some optimizations are made.
 194 * (See comments inline)
 195 */
 196static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
 197                           size_t incoming)
 198{
 199        size_t sz1, sz2, st1, st2;
 200
 201        sz1 = atomic_read(&pi1->in_flight_size);
 202        sz2 = atomic_read(&pi2->in_flight_size);
 203
 204        /*
 205         * Case 1: Both have same throughput value. Choose less loaded path.
 206         */
 207        if (pi1->relative_throughput == pi2->relative_throughput)
 208                return sz1 - sz2;
 209
 210        /*
 211         * Case 2a: Both have same load. Choose higher throughput path.
 212         * Case 2b: One path has no throughput value. Choose the other one.
 213         */
 214        if (sz1 == sz2 ||
 215            !pi1->relative_throughput || !pi2->relative_throughput)
 216                return pi2->relative_throughput - pi1->relative_throughput;
 217
 218        /*
 219         * Case 3: Calculate service time. Choose faster path.
 220         *         Service time using pi1:
 221         *             st1 = (sz1 + incoming) / pi1->relative_throughput
 222         *         Service time using pi2:
 223         *             st2 = (sz2 + incoming) / pi2->relative_throughput
 224         *
 225         *         To avoid the division, transform the expression to use
 226         *         multiplication.
 227         *         Because ->relative_throughput > 0 here, if st1 < st2,
 228         *         the expressions below are the same meaning:
 229         *             (sz1 + incoming) / pi1->relative_throughput <
 230         *                 (sz2 + incoming) / pi2->relative_throughput
 231         *             (sz1 + incoming) * pi2->relative_throughput <
 232         *                 (sz2 + incoming) * pi1->relative_throughput
 233         *         So use the later one.
 234         */
 235        sz1 += incoming;
 236        sz2 += incoming;
 237        if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
 238                     sz2 >= ST_MAX_INFLIGHT_SIZE)) {
 239                /*
 240                 * Size may be too big for multiplying pi->relative_throughput
 241                 * and overflow.
 242                 * To avoid the overflow and mis-selection, shift down both.
 243                 */
 244                sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
 245                sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
 246        }
 247        st1 = sz1 * pi2->relative_throughput;
 248        st2 = sz2 * pi1->relative_throughput;
 249        if (st1 != st2)
 250                return st1 - st2;
 251
 252        /*
 253         * Case 4: Service time is equal. Choose higher throughput path.
 254         */
 255        return pi2->relative_throughput - pi1->relative_throughput;
 256}
 257
 258static struct dm_path *st_select_path(struct path_selector *ps,
 259                                      unsigned *repeat_count, size_t nr_bytes)
 260{
 261        struct selector *s = ps->context;
 262        struct path_info *pi = NULL, *best = NULL;
 263
 264        if (list_empty(&s->valid_paths))
 265                return NULL;
 266
 267        /* Change preferred (first in list) path to evenly balance. */
 268        list_move_tail(s->valid_paths.next, &s->valid_paths);
 269
 270        list_for_each_entry(pi, &s->valid_paths, list)
 271                if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
 272                        best = pi;
 273
 274        if (!best)
 275                return NULL;
 276
 277        *repeat_count = best->repeat_count;
 278
 279        return best->path;
 280}
 281
 282static int st_start_io(struct path_selector *ps, struct dm_path *path,
 283                       size_t nr_bytes)
 284{
 285        struct path_info *pi = path->pscontext;
 286
 287        atomic_add(nr_bytes, &pi->in_flight_size);
 288
 289        return 0;
 290}
 291
 292static int st_end_io(struct path_selector *ps, struct dm_path *path,
 293                     size_t nr_bytes)
 294{
 295        struct path_info *pi = path->pscontext;
 296
 297        atomic_sub(nr_bytes, &pi->in_flight_size);
 298
 299        return 0;
 300}
 301
 302static struct path_selector_type st_ps = {
 303        .name           = "service-time",
 304        .module         = THIS_MODULE,
 305        .table_args     = 2,
 306        .info_args      = 2,
 307        .create         = st_create,
 308        .destroy        = st_destroy,
 309        .status         = st_status,
 310        .add_path       = st_add_path,
 311        .fail_path      = st_fail_path,
 312        .reinstate_path = st_reinstate_path,
 313        .select_path    = st_select_path,
 314        .start_io       = st_start_io,
 315        .end_io         = st_end_io,
 316};
 317
 318static int __init dm_st_init(void)
 319{
 320        int r = dm_register_path_selector(&st_ps);
 321
 322        if (r < 0)
 323                DMERR("register failed %d", r);
 324
 325        DMINFO("version " ST_VERSION " loaded");
 326
 327        return r;
 328}
 329
 330static void __exit dm_st_exit(void)
 331{
 332        int r = dm_unregister_path_selector(&st_ps);
 333
 334        if (r < 0)
 335                DMERR("unregister failed %d", r);
 336}
 337
 338module_init(dm_st_init);
 339module_exit(dm_st_exit);
 340
 341MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
 342MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
 343MODULE_LICENSE("GPL");
 344