linux/block/blk-mq-tag.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Tag allocation using scalable bitmaps. Uses active queue tracking to support
 * fairer distribution of tags between multiple submitters when a shared tag map
 * is used.
 *
 * Copyright (C) 2013-2014 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>

#include <linux/blk-mq.h>
#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

/*
 * If a previously inactive queue goes active, bump the active user count.
 * We need to do this before trying to allocate a driver tag, so that even
 * if the first attempt to get a tag fails, the other shared-tag users can
 * reserve budget for it.
 */
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
        if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
            !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                atomic_inc(&hctx->tags->active_queues);

        return true;
}
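/*
 * Illustrative caller-side sketch (not part of this file): the inline
 * wrapper in blk-mq-tag.h is expected to gate this on the shared-tag flag,
 * roughly:
 *
 *	static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 *	{
 *		if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
 *			return false;
 *		return __blk_mq_tag_busy(hctx);
 *	}
 */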

/*
 * Wake up all tasks potentially sleeping on tags
 */
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
        sbitmap_queue_wake_all(&tags->bitmap_tags);
        if (include_reserve)
                sbitmap_queue_wake_all(&tags->breserved_tags);
}

/*
 * If a previously busy queue goes inactive, potential waiters could now
 * be allowed to queue. Wake them up and check.
 */
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
        struct blk_mq_tags *tags = hctx->tags;

        if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                return;

        atomic_dec(&tags->active_queues);

        blk_mq_tag_wakeup_all(tags, false);
}

/*
 * For shared tag users, we track the number of currently active users
 * and attempt to provide a fair share of the tag depth for each of them.
 */
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
                                  struct sbitmap_queue *bt)
{
        unsigned int depth, users;

        if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
                return true;
        if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
                return true;

        /*
         * Don't try dividing an ant
         */
        if (bt->sb.depth == 1)
                return true;

        users = atomic_read(&hctx->tags->active_queues);
        if (!users)
                return true;

        /*
         * Allow at least some tags
         */
        depth = max((bt->sb.depth + users - 1) / users, 4U);
        return atomic_read(&hctx->nr_active) < depth;
}
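/*
 * Worked example for the fair-share limit above (illustrative numbers):
 * with bt->sb.depth == 128 and 3 active queues, each active queue may hold
 * at most max((128 + 3 - 1) / 3, 4U) == max(43, 4) == 43 driver tags before
 * hctx_may_queue() starts returning false.
 */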

static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
                            struct sbitmap_queue *bt)
{
        if (!(data->flags & BLK_MQ_REQ_INTERNAL) &&
            !hctx_may_queue(data->hctx, bt))
                return -1;
        if (data->shallow_depth)
                return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
        else
                return __sbitmap_queue_get(bt);
}
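/*
 * A note on the branches above (an interpretation of the callers, not from
 * the original comments): a non-zero data->shallow_depth caps how deep into
 * the bitmap this allocation may reach, which is how I/O schedulers throttle
 * some request classes, while BLK_MQ_REQ_INTERNAL allocations draw from the
 * scheduler tag space, so the hctx_may_queue() fairness check only applies
 * to driver tags.
 */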

unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
        struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
        struct sbitmap_queue *bt;
        struct sbq_wait_state *ws;
        DEFINE_SBQ_WAIT(wait);
        unsigned int tag_offset;
        int tag;

        if (data->flags & BLK_MQ_REQ_RESERVED) {
                if (unlikely(!tags->nr_reserved_tags)) {
                        WARN_ON_ONCE(1);
                        return BLK_MQ_TAG_FAIL;
                }
                bt = &tags->breserved_tags;
                tag_offset = 0;
        } else {
                bt = &tags->bitmap_tags;
                tag_offset = tags->nr_reserved_tags;
        }

        tag = __blk_mq_get_tag(data, bt);
        if (tag != -1)
                goto found_tag;

        if (data->flags & BLK_MQ_REQ_NOWAIT)
                return BLK_MQ_TAG_FAIL;

        ws = bt_wait_ptr(bt, data->hctx);
        do {
                struct sbitmap_queue *bt_prev;

                /*
                 * We're out of tags on this hardware queue, kick any
                 * pending IO submits before going to sleep waiting for
                 * some to complete.
                 */
                blk_mq_run_hw_queue(data->hctx, false);

                /*
                 * Retry tag allocation after running the hardware queue,
                 * as running the queue may also have found completions.
                 */
                tag = __blk_mq_get_tag(data, bt);
                if (tag != -1)
                        break;

                sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);

                tag = __blk_mq_get_tag(data, bt);
                if (tag != -1)
                        break;

                bt_prev = bt;
                io_schedule();

                sbitmap_finish_wait(bt, ws, &wait);

                data->ctx = blk_mq_get_ctx(data->q);
                data->hctx = blk_mq_map_queue(data->q, data->cmd_flags,
                                                data->ctx);
                tags = blk_mq_tags_from_data(data);
                if (data->flags & BLK_MQ_REQ_RESERVED)
                        bt = &tags->breserved_tags;
                else
                        bt = &tags->bitmap_tags;

                /*
                 * If the destination hw queue changed, issue a fake wake up
                 * on the previous queue to compensate for the missed wake up,
                 * so other allocations on the previous queue won't be starved.
                 */
                if (bt != bt_prev)
                        sbitmap_queue_wake_up(bt_prev);

                ws = bt_wait_ptr(bt, data->hctx);
        } while (1);

        sbitmap_finish_wait(bt, ws, &wait);

found_tag:
        return tag + tag_offset;
}
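/*
 * Tag numbering sketch (illustrative numbers): with nr_tags == 64 and
 * nr_reserved_tags == 1, a reserved allocation returns tag 0, while regular
 * allocations return 1..63, i.e. the bitmap_tags index plus
 * tag_offset == nr_reserved_tags. blk_mq_put_tag() below undoes the same
 * offset before clearing the bit.
 */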

void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
                    unsigned int tag)
{
        if (!blk_mq_tag_is_reserved(tags, tag)) {
                const int real_tag = tag - tags->nr_reserved_tags;

                BUG_ON(real_tag >= tags->nr_tags);
                sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
        } else {
                BUG_ON(tag >= tags->nr_reserved_tags);
                sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
        }
}

struct bt_iter_data {
        struct blk_mq_hw_ctx *hctx;
        busy_iter_fn *fn;
        void *data;
        bool reserved;
};

static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
        struct bt_iter_data *iter_data = data;
        struct blk_mq_hw_ctx *hctx = iter_data->hctx;
        struct blk_mq_tags *tags = hctx->tags;
        bool reserved = iter_data->reserved;
        struct request *rq;

        if (!reserved)
                bitnr += tags->nr_reserved_tags;
        rq = tags->rqs[bitnr];

        /*
         * We can hit rq == NULL here, because the tagging functions
         * test and set the bit before assigning ->rqs[].
         */
        if (rq && rq->q == hctx->queue)
                return iter_data->fn(hctx, rq, iter_data->data, reserved);
        return true;
}

/**
 * bt_for_each - iterate over the requests associated with a hardware queue
 * @hctx:       Hardware queue to examine.
 * @bt:         sbitmap to examine. This is either the breserved_tags member
 *              or the bitmap_tags member of struct blk_mq_tags.
 * @fn:         Pointer to the function that will be called for each request
 *              associated with @hctx that has been assigned a driver tag.
 *              @fn will be called as follows: @fn(@hctx, rq, @data, @reserved)
 *              where rq is a pointer to a request. Return true to continue
 *              iterating tags, false to stop.
 * @data:       Will be passed as third argument to @fn.
 * @reserved:   Indicates whether @bt is the breserved_tags member or the
 *              bitmap_tags member of struct blk_mq_tags.
 */
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
                        busy_iter_fn *fn, void *data, bool reserved)
{
        struct bt_iter_data iter_data = {
                .hctx = hctx,
                .fn = fn,
                .data = data,
                .reserved = reserved,
        };

        sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
}

struct bt_tags_iter_data {
        struct blk_mq_tags *tags;
        busy_tag_iter_fn *fn;
        void *data;
        bool reserved;
};

static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
        struct bt_tags_iter_data *iter_data = data;
        struct blk_mq_tags *tags = iter_data->tags;
        bool reserved = iter_data->reserved;
        struct request *rq;

        if (!reserved)
                bitnr += tags->nr_reserved_tags;

        /*
         * We can hit rq == NULL here, because the tagging functions
         * test and set the bit before assigning ->rqs[].
         */
        rq = tags->rqs[bitnr];
        if (rq && blk_mq_request_started(rq))
                return iter_data->fn(rq, iter_data->data, reserved);

        return true;
}

/**
 * bt_tags_for_each - iterate over the requests in a tag map
 * @tags:       Tag map to iterate over.
 * @bt:         sbitmap to examine. This is either the breserved_tags member
 *              or the bitmap_tags member of struct blk_mq_tags.
 * @fn:         Pointer to the function that will be called for each started
 *              request. @fn will be called as follows: @fn(rq, @data,
 *              @reserved) where rq is a pointer to a request. Return true
 *              to continue iterating tags, false to stop.
 * @data:       Will be passed as second argument to @fn.
 * @reserved:   Indicates whether @bt is the breserved_tags member or the
 *              bitmap_tags member of struct blk_mq_tags.
 */
static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
                             busy_tag_iter_fn *fn, void *data, bool reserved)
{
        struct bt_tags_iter_data iter_data = {
                .tags = tags,
                .fn = fn,
                .data = data,
                .reserved = reserved,
        };

        if (tags->rqs)
                sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
}

/**
 * blk_mq_all_tag_busy_iter - iterate over all started requests in a tag map
 * @tags:       Tag map to iterate over.
 * @fn:         Pointer to the function that will be called for each started
 *              request. @fn will be called as follows: @fn(rq, @priv,
 *              reserved) where rq is a pointer to a request. 'reserved'
 *              indicates whether or not @rq is a reserved request. Return
 *              true to continue iterating tags, false to stop.
 * @priv:       Will be passed as second argument to @fn.
 */
static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
                busy_tag_iter_fn *fn, void *priv)
{
        if (tags->nr_reserved_tags)
                bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true);
        bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false);
}

/**
 * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set
 * @tagset:     Tag set to iterate over.
 * @fn:         Pointer to the function that will be called for each started
 *              request. @fn will be called as follows: @fn(rq, @priv,
 *              reserved) where rq is a pointer to a request. 'reserved'
 *              indicates whether or not @rq is a reserved request. Return
 *              true to continue iterating tags, false to stop.
 * @priv:       Will be passed as second argument to @fn.
 */
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
                busy_tag_iter_fn *fn, void *priv)
{
        int i;

        for (i = 0; i < tagset->nr_hw_queues; i++) {
                if (tagset->tags && tagset->tags[i])
                        blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
        }
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
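/*
 * Hypothetical driver-side usage sketch (names below are illustrative and
 * not defined anywhere in the tree): counting in-flight requests across a
 * tag set with the iterator exported above.
 *
 *	static bool my_count_inflight(struct request *rq, void *data,
 *				      bool reserved)
 *	{
 *		unsigned int *inflight = data;
 *
 *		(*inflight)++;
 *		return true;
 *	}
 *
 *	unsigned int inflight = 0;
 *
 *	blk_mq_tagset_busy_iter(&my_tag_set, my_count_inflight, &inflight);
 */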

static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
                void *data, bool reserved)
{
        unsigned *count = data;

        if (blk_mq_request_completed(rq))
                (*count)++;
        return true;
}

/**
 * blk_mq_tagset_wait_completed_request - wait until the completion function
 * of all completed requests has run
 * @tagset:     Tag set whose completed requests should be drained
 *
 * Note: This function has to be run after all IO queues are shut down
 */
void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset)
{
        while (true) {
                unsigned count = 0;

                blk_mq_tagset_busy_iter(tagset,
                                blk_mq_tagset_count_completed_rqs, &count);
                if (!count)
                        break;
                msleep(5);
        }
}
EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
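/*
 * Typical teardown sequencing sketch (an assumption about intended use, not
 * taken from this file): a driver first cancels or completes outstanding
 * requests via blk_mq_tagset_busy_iter() and then calls
 * blk_mq_tagset_wait_completed_request() so that every completion it kicked
 * off has finished running before teardown continues.
 */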

/**
 * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
 * @q:          Request queue to examine.
 * @fn:         Pointer to the function that will be called for each request
 *              on @q. @fn will be called as follows: @fn(hctx, rq, @priv,
 *              reserved) where rq is a pointer to a request and hctx points
 *              to the hardware queue associated with the request. 'reserved'
 *              indicates whether or not @rq is a reserved request.
 * @priv:       Will be passed as third argument to @fn.
 *
 * Note: if @q->tag_set is shared with other request queues then @fn will be
 * called for all requests on all queues that share that tag set and not only
 * for requests associated with @q.
 */
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
                void *priv)
{
        struct blk_mq_hw_ctx *hctx;
        int i;

        /*
         * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
         * while the queue is frozen. So we can use q_usage_counter to avoid
         * racing with it. __blk_mq_update_nr_hw_queues() uses
         * synchronize_rcu() to ensure this function left the critical section
         * below.
         */
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return;

        queue_for_each_hw_ctx(q, hctx, i) {
                struct blk_mq_tags *tags = hctx->tags;

                /*
                 * If no software queues are currently mapped to this
                 * hardware queue, there's nothing to check
                 */
                if (!blk_mq_hw_queue_mapped(hctx))
                        continue;

                if (tags->nr_reserved_tags)
                        bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
                bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
        }
        blk_queue_exit(q);
}

static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
                    bool round_robin, int node)
{
        return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL,
                                       node);
}

static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
                                                   int node, int alloc_policy)
{
        unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
        bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;

        if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node))
                goto free_tags;
        if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin,
                     node))
                goto free_bitmap_tags;

        return tags;
free_bitmap_tags:
        sbitmap_queue_free(&tags->bitmap_tags);
free_tags:
        kfree(tags);
        return NULL;
}

struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
                                     unsigned int reserved_tags,
                                     int node, int alloc_policy)
{
        struct blk_mq_tags *tags;

        if (total_tags > BLK_MQ_TAG_MAX) {
                pr_err("blk-mq: tag depth too large\n");
                return NULL;
        }

        tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node);
        if (!tags)
                return NULL;

        tags->nr_tags = total_tags;
        tags->nr_reserved_tags = reserved_tags;

        return blk_mq_init_bitmap_tags(tags, node, alloc_policy);
}

void blk_mq_free_tags(struct blk_mq_tags *tags)
{
        sbitmap_queue_free(&tags->bitmap_tags);
        sbitmap_queue_free(&tags->breserved_tags);
        kfree(tags);
}

int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
                            struct blk_mq_tags **tagsptr, unsigned int tdepth,
                            bool can_grow)
{
        struct blk_mq_tags *tags = *tagsptr;

        if (tdepth <= tags->nr_reserved_tags)
                return -EINVAL;

        /*
         * If we are allowed to grow beyond the original size, allocate
         * a new set of tags before freeing the old one.
         */
        if (tdepth > tags->nr_tags) {
                struct blk_mq_tag_set *set = hctx->queue->tag_set;
                struct blk_mq_tags *new;
                bool ret;

                if (!can_grow)
                        return -EINVAL;

                /*
                 * We need some sort of upper limit, set it high enough that
                 * no valid use cases should require more.
                 */
                if (tdepth > 16 * BLKDEV_MAX_RQ)
                        return -EINVAL;

                new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
                                tags->nr_reserved_tags);
                if (!new)
                        return -ENOMEM;
                ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
                if (ret) {
                        blk_mq_free_rq_map(new);
                        return -ENOMEM;
                }

                blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
                blk_mq_free_rq_map(*tagsptr);
                *tagsptr = new;
        } else {
                /*
                 * Don't need (or can't) update reserved tags here, they
                 * remain static and should never need resizing.
                 */
                sbitmap_queue_resize(&tags->bitmap_tags,
                                tdepth - tags->nr_reserved_tags);
        }

        return 0;
}
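/*
 * Illustrative bound for the grow path above: assuming BLKDEV_MAX_RQ is 128
 * (its usual definition), blk_mq_tag_update_depth() refuses any tdepth
 * larger than 16 * 128 == 2048 tags per hardware queue.
 */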

/**
 * blk_mq_unique_tag() - return a tag that is unique queue-wide
 * @rq: request for which to compute a unique tag
 *
 * The tag field in struct request is unique per hardware queue but not over
 * all hardware queues. Hence this function that returns a tag with the
 * hardware context index in the upper bits and the per hardware queue tag in
 * the lower bits.
 *
 * Note: When called for a request that is queued on a non-multiqueue request
 * queue, the hardware context index is set to zero.
 */
u32 blk_mq_unique_tag(struct request *rq)
{
        return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) |
                (rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
}
EXPORT_SYMBOL(blk_mq_unique_tag);
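/*
 * Decoding sketch: the counterpart helpers in include/linux/blk-mq.h split
 * the value back apart; e.g. for queue_num == 2 and tag == 5 the unique tag
 * is 0x00020005 (with BLK_MQ_UNIQUE_TAG_BITS == 16):
 *
 *	u32 unique = blk_mq_unique_tag(rq);
 *	u16 hwq = blk_mq_unique_tag_to_hwq(unique);	// upper 16 bits
 *	u16 tag = blk_mq_unique_tag_to_tag(unique);	// lower 16 bits
 */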