/*
 * Block multiqueue core code
 *
 * Copyright (C) 2013-2014 Jens Axboe
 * Copyright (C) 2013-2014 Christoph Hellwig
 */
7#include <linux/kernel.h>
8#include <linux/module.h>
9#include <linux/backing-dev.h>
10#include <linux/bio.h>
11#include <linux/blkdev.h>
12#include <linux/kmemleak.h>
13#include <linux/mm.h>
14#include <linux/init.h>
15#include <linux/slab.h>
16#include <linux/workqueue.h>
17#include <linux/smp.h>
18#include <linux/llist.h>
19#include <linux/list_sort.h>
20#include <linux/cpu.h>
21#include <linux/cache.h>
22#include <linux/sched/sysctl.h>
23#include <linux/delay.h>
24#include <linux/crash_dump.h>
25
26#include <trace/events/block.h>
27
28#include <linux/blk-mq.h>
29#include "blk.h"
30#include "blk-mq.h"
31#include "blk-mq-debugfs.h"
32#include "blk-mq-tag.h"
33#include "blk-mq-sched.h"
34#include "blk-stat.h"
35
36static DEFINE_MUTEX(all_q_mutex);
37static LIST_HEAD(all_q_list);
38
39static void blk_mq_poll_stats_start(struct request_queue *q);
40static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
41
/*
 * Check if any of the ctx's have pending work in this hardware queue.
 */
45bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
46{
47 return sbitmap_any_bit_set(&hctx->ctx_map) ||
48 !list_empty_careful(&hctx->dispatch) ||
49 blk_mq_sched_has_work(hctx);
50}
51
/*
 * Mark this ctx as having pending work in this hardware queue.
 */
55static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
56 struct blk_mq_ctx *ctx)
57{
58 if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw))
59 sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw);
60}
61
62static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
63 struct blk_mq_ctx *ctx)
64{
65 sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw);
66}
67
68void blk_freeze_queue_start(struct request_queue *q)
69{
70 int freeze_depth;
71
72 freeze_depth = atomic_inc_return(&q->mq_freeze_depth);
73 if (freeze_depth == 1) {
74 percpu_ref_kill(&q->q_usage_counter);
75 if (q->mq_ops)
76 blk_mq_run_hw_queues(q, false);
77 }
78}
79EXPORT_SYMBOL_GPL(blk_freeze_queue_start);
80
81void blk_mq_freeze_queue_wait(struct request_queue *q)
82{
83 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
84}
85EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
86
87int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
88 unsigned long timeout)
89{
90 return wait_event_timeout(q->mq_freeze_wq,
91 percpu_ref_is_zero(&q->q_usage_counter),
92 timeout);
93}
94EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
95
/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
100void blk_freeze_queue(struct request_queue *q)
101{
	/*
	 * In the !blk_mq case we are only calling this to kill the
	 * q_usage_counter, otherwise this increases the freeze depth
	 * and waits for it to return to zero.  For this reason there is
	 * no blk_unfreeze_queue(), and blk_freeze_queue() is not
	 * exported to drivers as the only user for unfreeze is blk_mq.
	 */
109 blk_freeze_queue_start(q);
110 if (!q->mq_ops)
111 blk_drain_queue(q);
112 blk_mq_freeze_queue_wait(q);
113}
114
115void blk_mq_freeze_queue(struct request_queue *q)
116{
	/*
	 * In the blk-mq case this is just an alias of blk_freeze_queue(),
	 * kept as the exported entry point for drivers.
	 */
121 blk_freeze_queue(q);
122}
123EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
124
125void blk_mq_unfreeze_queue(struct request_queue *q)
126{
127 int freeze_depth;
128
129 freeze_depth = atomic_dec_return(&q->mq_freeze_depth);
130 WARN_ON_ONCE(freeze_depth < 0);
131 if (!freeze_depth) {
132 percpu_ref_reinit(&q->q_usage_counter);
133 wake_up_all(&q->mq_freeze_wq);
134 }
135}
136EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
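
/*
 * Freeze/unfreeze are reference counted via q->mq_freeze_depth and must be
 * paired. A typical caller (sketch, mirroring blk_mq_update_tag_set_depth()
 * later in this file) brackets a queue-wide update like this:
 *
 *	blk_mq_freeze_queue(q);
 *	... modify queue state while no requests are in flight ...
 *	blk_mq_unfreeze_queue(q);
 */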
137
/**
 * blk_mq_quiesce_queue() - wait until all ongoing ->queue_rq() calls have
 * finished
 * @q: request queue.
 *
 * Stops all hardware queues and then waits for any in-progress
 * ->queue_rq() calls to finish, synchronizing against RCU and, for
 * BLK_MQ_F_BLOCKING hardware contexts, SRCU.
 */
146void blk_mq_quiesce_queue(struct request_queue *q)
147{
148 struct blk_mq_hw_ctx *hctx;
149 unsigned int i;
150 bool rcu = false;
151
152 blk_mq_stop_hw_queues(q);
153
154 queue_for_each_hw_ctx(q, hctx, i) {
155 if (hctx->flags & BLK_MQ_F_BLOCKING)
156 synchronize_srcu(&hctx->queue_rq_srcu);
157 else
158 rcu = true;
159 }
160 if (rcu)
161 synchronize_rcu();
162}
163EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
164
165void blk_mq_wake_waiters(struct request_queue *q)
166{
167 struct blk_mq_hw_ctx *hctx;
168 unsigned int i;
169
170 queue_for_each_hw_ctx(q, hctx, i)
171 if (blk_mq_hw_queue_mapped(hctx))
172 blk_mq_tag_wakeup_all(hctx->tags, true);
173}
174
175bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
176{
177 return blk_mq_has_free_tags(hctx->tags);
178}
179EXPORT_SYMBOL(blk_mq_can_queue);
180
181void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
182 struct request *rq, unsigned int rw_flags)
183{
184 if (blk_queue_io_stat(q))
185 rw_flags |= REQ_IO_STAT;
186
187 INIT_LIST_HEAD(&rq->queuelist);
188
189 rq->q = q;
190 rq->mq_ctx = ctx;
191 rq->cmd_flags |= rw_flags;
192
193 rq->cpu = -1;
194 INIT_HLIST_NODE(&rq->hash);
195 RB_CLEAR_NODE(&rq->rb_node);
196 rq->rq_disk = NULL;
197 rq->part = NULL;
198 rq->start_time = jiffies;
199#ifdef CONFIG_BLK_CGROUP
200 rq->rl = NULL;
201 set_start_time_ns(rq);
202 rq->io_start_time_ns = 0;
203#endif
204 rq->nr_phys_segments = 0;
205#if defined(CONFIG_BLK_DEV_INTEGRITY)
206 rq->nr_integrity_segments = 0;
207#endif
208 rq->special = NULL;
209
210 rq->errors = 0;
211
212 rq->cmd = rq->__cmd;
213
214 rq->extra_len = 0;
215 rq->sense_len = 0;
216 rq->resid_len = 0;
217 rq->sense = NULL;
218
219 INIT_LIST_HEAD(&rq->timeout_list);
220 rq->timeout = 0;
221
222 rq->end_io = NULL;
223 rq->end_io_data = NULL;
224 rq->next_rq = NULL;
225
226 ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
227}
228EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init);
229
230struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
231{
232 struct request *rq;
233 unsigned int tag;
234
235 tag = blk_mq_get_tag(data);
236 if (tag != BLK_MQ_TAG_FAIL) {
237 struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
238
239 rq = tags->static_rqs[tag];
240
241 if (data->flags & BLK_MQ_REQ_INTERNAL) {
242 rq->tag = -1;
243 __rq_aux(rq, data->q)->internal_tag = tag;
244 } else {
245 if (blk_mq_tag_busy(data->hctx)) {
246 rq->cmd_flags = REQ_MQ_INFLIGHT;
247 atomic_inc(&data->hctx->nr_active);
248 }
249 rq->tag = tag;
250 __rq_aux(rq, data->q)->internal_tag = -1;
251 data->hctx->tags->rqs[rq->tag] = rq;
252 }
253
254 blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw);
255 if (data->flags & BLK_MQ_REQ_PREEMPT)
256 rq->cmd_flags |= REQ_PREEMPT;
257
258 return rq;
259 }
260
261 return NULL;
262}
263EXPORT_SYMBOL_GPL(__blk_mq_alloc_request);
264
265struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
266 unsigned int flags)
267{
268 struct blk_mq_alloc_data alloc_data = { .flags = flags };
269 struct request *rq;
270 int ret;
271
272 ret = blk_queue_enter(q, flags);
273 if (ret)
274 return ERR_PTR(ret);
275
276 rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
277
278 blk_mq_put_ctx(alloc_data.ctx);
279 blk_queue_exit(q);
280
281 if (!rq)
282 return ERR_PTR(-EWOULDBLOCK);
283 return rq;
284}
285EXPORT_SYMBOL(blk_mq_alloc_request);
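
/*
 * Callers must check the return value with IS_ERR(); on success the request
 * is handed back with blk_mq_free_request() when the caller is done with it.
 * A minimal sketch (hypothetical caller, not taken from this file):
 *
 *	rq = blk_mq_alloc_request(q, READ, 0);
 *	if (IS_ERR(rq))
 *		return PTR_ERR(rq);
 *	...
 *	blk_mq_free_request(rq);
 */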
286
287struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
288 unsigned int flags, unsigned int hctx_idx)
289{
290 struct blk_mq_alloc_data alloc_data = { .flags = flags };
291 struct request *rq;
292 unsigned int cpu;
293 int ret;
294
	/*
	 * If the tag allocator sleeps we could get an allocation for a
	 * different hardware context.  No need to complicate the low level
	 * allocator for this for the rare use case of a command tied to
	 * a specific queue.
	 */
301 if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
302 return ERR_PTR(-EINVAL);
303
304 if (hctx_idx >= q->nr_hw_queues)
305 return ERR_PTR(-EIO);
306
307 ret = blk_queue_enter(q, flags);
308 if (ret)
309 return ERR_PTR(ret);
310
	/*
	 * Check if the hardware context is actually mapped to anything.
	 * If not tell the caller that it should skip this queue.
	 */
315 alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
316 if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
317 blk_queue_exit(q);
318 return ERR_PTR(-EXDEV);
319 }
320 cpu = cpumask_first(alloc_data.hctx->cpumask);
321 alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
322
323 rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
324
325 blk_queue_exit(q);
326
327 if (!rq)
328 return ERR_PTR(-EWOULDBLOCK);
329
330 return rq;
331}
332EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
333
334static void
335blk_mq_sched_completed_request(struct request *rq)
336{
337 struct elevator_queue *e = rq->q->elevator;
338
339 if (e && e->aux->ops.mq.completed_request)
340 e->aux->ops.mq.completed_request(rq);
341}
342
343void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
344 struct request *rq)
345{
346 const int sched_tag = rq_aux(rq)->internal_tag;
347 struct request_queue *q = rq->q;
348
349 if (rq->cmd_flags & REQ_MQ_INFLIGHT)
350 atomic_dec(&hctx->nr_active);
351 rq->cmd_flags = 0;
352
353 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
354 if (rq->tag != -1)
355 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
356 if (sched_tag != -1)
357 blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
358 blk_mq_sched_restart(hctx);
359 blk_queue_exit(q);
360}
361
362static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx,
363 struct request *rq)
364{
365 struct blk_mq_ctx *ctx = rq->mq_ctx;
366
367 ctx->rq_completed[rq_is_sync(rq)]++;
368 __blk_mq_finish_request(hctx, ctx, rq);
369}

void blk_mq_finish_request(struct request *rq)
{
	blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq);
}
EXPORT_SYMBOL_GPL(blk_mq_finish_request);
376
377void blk_mq_free_request(struct request *rq)
378{
379 blk_mq_sched_put_request(rq);
380}
381EXPORT_SYMBOL_GPL(blk_mq_free_request);
382
383inline void __blk_mq_end_request(struct request *rq, int error)
384{
385 blk_account_io_done(rq);
386
387 if (rq->end_io) {
388 rq->end_io(rq, error);
389 } else {
390 if (unlikely(blk_bidi_rq(rq)))
391 blk_mq_free_request(rq->next_rq);
392 blk_mq_free_request(rq);
393 }
394}
395EXPORT_SYMBOL(__blk_mq_end_request);
396
397void blk_mq_end_request(struct request *rq, int error)
398{
399 if (blk_update_request(rq, error, blk_rq_bytes(rq)))
400 BUG();
401 __blk_mq_end_request(rq, error);
402}
403EXPORT_SYMBOL(blk_mq_end_request);
404
405static void __blk_mq_complete_request_remote(void *data)
406{
407 struct request *rq = data;
408
409 rq->q->softirq_done_fn(rq);
410}
411
412static void blk_mq_ipi_complete_request(struct request *rq)
413{
414 struct blk_mq_ctx *ctx = rq->mq_ctx;
415 bool shared = false;
416 int cpu;
417
418 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
419 rq->q->softirq_done_fn(rq);
420 return;
421 }
422
423 cpu = get_cpu();
424 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
425 shared = cpus_share_cache(cpu, ctx->cpu);
426
427 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
428 rq->csd.func = __blk_mq_complete_request_remote;
429 rq->csd.info = rq;
430 rq->csd.flags = 0;
431 smp_call_function_single_async(ctx->cpu, &rq->csd);
432 } else {
433 rq->q->softirq_done_fn(rq);
434 }
435 put_cpu();
436}
437
438static void blk_mq_stat_add(struct request *rq)
439{
440 if (rq->cmd_flags & REQ_STATS) {
441 blk_mq_poll_stats_start(rq->q);
442 blk_stat_add(rq);
443 }
444}
445
446static void __blk_mq_complete_request(struct request *rq)
447{
448 struct request_queue *q = rq->q;
449
450 if (rq_aux(rq)->internal_tag != -1)
451 blk_mq_sched_completed_request(rq);
452
453 blk_mq_stat_add(rq);
454
455 if (!q->softirq_done_fn)
456 blk_mq_end_request(rq, rq->errors);
457 else
458 blk_mq_ipi_complete_request(rq);
459}
460
/**
 * blk_mq_complete_request - end I/O on a request
 * @rq:		the request being processed
 * @error:	completion status
 *
 * Description:
 *	Ends all I/O on a request. It does not handle partial completions.
 *	The actual completion happens out-of-order, through an IPI handler.
 **/
469void blk_mq_complete_request(struct request *rq, int error)
470{
471 struct request_queue *q = rq->q;
472
473 if (unlikely(blk_should_fake_timeout(q)))
474 return;
475 if (!blk_mark_rq_complete(rq)) {
476 rq->errors = error;
477 __blk_mq_complete_request(rq);
478 }
479}
480EXPORT_SYMBOL(blk_mq_complete_request);
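
/*
 * Completion flow as implemented above: a driver calls
 * blk_mq_complete_request() from its completion handler; unless the request
 * was already marked complete (for example by the timeout path),
 * __blk_mq_complete_request() either invokes q->softirq_done_fn, possibly
 * via an IPI back to the submitting CPU when QUEUE_FLAG_SAME_COMP is set,
 * or falls back to blk_mq_end_request().
 */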
481
482int blk_mq_request_started(struct request *rq)
483{
484 return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
485}
486EXPORT_SYMBOL_GPL(blk_mq_request_started);
487
488void blk_mq_start_request(struct request *rq)
489{
490 struct request_queue *q = rq->q;
491
492 blk_mq_sched_started_request(rq);
493
494 trace_block_rq_issue(q, rq);
495
496 rq->resid_len = blk_rq_bytes(rq);
497 if (unlikely(blk_bidi_rq(rq)))
498 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
499
500 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
501 blk_stat_set_issue_time(&rq_aux(rq)->issue_stat);
502 rq->cmd_flags |= REQ_STATS;
503 }
504
505 blk_add_timer(rq);
506
	/*
	 * Ensure that ->deadline is visible before we set the started flag
	 * and clear the completed flag.
	 */
	smp_mb__before_atomic();

	/*
	 * Mark us as started and clear complete. Complete might have been
	 * set if requeue raced with timeout, which then marked it as
	 * complete. So be sure to clear complete again when we start
	 * the request, otherwise we'll ignore the completion event.
	 */
519 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
520 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
521 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
522 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
523
524 if (q->dma_drain_size && blk_rq_bytes(rq)) {
		/*
		 * Make sure space for the drain appears.  We know we can do
		 * this because max_hw_segments has been adjusted to be one
		 * fewer than the device can handle.
		 */
530 rq->nr_phys_segments++;
531 }
532}
533EXPORT_SYMBOL(blk_mq_start_request);
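
/*
 * Drivers call blk_mq_start_request() from ->queue_rq() before handing the
 * request to hardware: it arms the block layer timeout via blk_add_timer(),
 * records issue statistics when enabled, and sets REQ_ATOM_STARTED so the
 * timeout and requeue paths treat the request as in flight.
 */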
534
/*
 * The request is being taken back off the hardware queue: drop the driver
 * tag, notify the I/O scheduler, and if the request had already been
 * started, undo the extra drain segment accounted for in
 * blk_mq_start_request().
 */
544static void __blk_mq_requeue_request(struct request *rq)
545{
546 struct request_queue *q = rq->q;
547
548 blk_mq_put_driver_tag(rq);
549
550 trace_block_rq_requeue(q, rq);
551 blk_mq_sched_requeue_request(rq);
552
553 if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
554 if (q->dma_drain_size && blk_rq_bytes(rq))
555 rq->nr_phys_segments--;
556 }
557}
558
559void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
560{
561 __blk_mq_requeue_request(rq);
562
563 BUG_ON(blk_queued_rq(rq));
564 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
565}
566EXPORT_SYMBOL(blk_mq_requeue_request);
567
568static void blk_mq_requeue_work(struct work_struct *work)
569{
570 struct request_queue *q =
571 container_of(work, struct request_queue, requeue_work.work);
572 LIST_HEAD(rq_list);
573 struct request *rq, *next;
574 unsigned long flags;
575
576 spin_lock_irqsave(&q->requeue_lock, flags);
577 list_splice_init(&q->requeue_list, &rq_list);
578 spin_unlock_irqrestore(&q->requeue_lock, flags);
579
580 list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
581 if (!(rq->cmd_flags & REQ_SOFTBARRIER))
582 continue;
583
584 rq->cmd_flags &= ~REQ_SOFTBARRIER;
585 list_del_init(&rq->queuelist);
586 blk_mq_sched_insert_request(rq, true, false, false, true);
587 }
588
589 while (!list_empty(&rq_list)) {
590 rq = list_entry(rq_list.next, struct request, queuelist);
591 list_del_init(&rq->queuelist);
592 blk_mq_sched_insert_request(rq, false, false, false, true);
593 }
594
595 blk_mq_run_hw_queues(q, false);
596}
597
598void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
599 bool kick_requeue_list)
600{
601 struct request_queue *q = rq->q;
602 unsigned long flags;
603
	/*
	 * We abuse this flag that is otherwise used by the I/O scheduler to
	 * request head insertion from the workqueue.
	 */
608 BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
609
610 spin_lock_irqsave(&q->requeue_lock, flags);
611 if (at_head) {
612 rq->cmd_flags |= REQ_SOFTBARRIER;
613 list_add(&rq->queuelist, &q->requeue_list);
614 } else {
615 list_add_tail(&rq->queuelist, &q->requeue_list);
616 }
617 spin_unlock_irqrestore(&q->requeue_lock, flags);
618
619 if (kick_requeue_list)
620 blk_mq_kick_requeue_list(q);
621}
622EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
623
624void blk_mq_kick_requeue_list(struct request_queue *q)
625{
626 kblockd_schedule_delayed_work(&q->requeue_work, 0);
627}
628EXPORT_SYMBOL(blk_mq_kick_requeue_list);
629
630void blk_mq_delay_kick_requeue_list(struct request_queue *q,
631 unsigned long msecs)
632{
633 kblockd_schedule_delayed_work(&q->requeue_work,
634 msecs_to_jiffies(msecs));
635}
636EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
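
/*
 * Requeue machinery: blk_mq_requeue_request() and
 * blk_mq_add_to_requeue_list() park requests on q->requeue_list;
 * blk_mq_requeue_work() later splices that list and reinserts the requests
 * through the I/O scheduler, honouring REQ_SOFTBARRIER for head insertion,
 * before kicking the hardware queues.
 */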
637
638struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
639{
640 if (tag < tags->nr_tags)
641 return tags->rqs[tag];
642
643 return NULL;
644}
645EXPORT_SYMBOL(blk_mq_tag_to_rq);
646
647struct blk_mq_timeout_data {
648 unsigned long next;
649 unsigned int next_set;
650};
651
652void blk_mq_rq_timed_out(struct request *req, bool reserved)
653{
654 struct blk_mq_ops *ops = req->q->mq_ops;
655 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
	/*
	 * We know that complete is set at this point. If STARTED isn't set
	 * anymore, then the request isn't active and the "timeout" should
	 * just be ignored. This can happen due to the bitflag ordering.
	 * Timeout first checks if STARTED is set, and if it is, assumes
	 * the request is active. But if we race with completion, then
	 * both flags will get cleared. So check here again, and ignore
	 * a timeout event with a request that isn't active.
	 */
666 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
667 return;
668
669 if (ops->timeout)
670 ret = ops->timeout(req, reserved);
671
672 switch (ret) {
673 case BLK_EH_HANDLED:
674 __blk_mq_complete_request(req);
675 break;
676 case BLK_EH_RESET_TIMER:
677 blk_add_timer(req);
678 blk_clear_rq_complete(req);
679 break;
680 case BLK_EH_NOT_HANDLED:
681 break;
682 default:
683 printk(KERN_ERR "block: bad eh return: %d\n", ret);
684 break;
685 }
686}
687
688static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
689 struct request *rq, void *priv, bool reserved)
690{
691 struct blk_mq_timeout_data *data = priv;
692
693 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
694 return;
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709 if (time_after_eq(jiffies, rq->deadline)) {
710 if (!blk_mark_rq_complete(rq))
711 blk_mq_rq_timed_out(rq, reserved);
712 } else if (!data->next_set || time_after(data->next, rq->deadline)) {
713 data->next = rq->deadline;
714 data->next_set = 1;
715 }
716}
717
718static void blk_mq_timeout_work(struct work_struct *work)
719{
720 struct request_queue *q =
721 container_of(work, struct request_queue, timeout_work);
722 struct blk_mq_timeout_data data = {
723 .next = 0,
724 .next_set = 0,
725 };
726 int i;
727
	/*
	 * A deadlock might occur if a request is stuck requiring a
	 * timeout at the same time a queue freeze is waiting for
	 * completion, since the timeout code would not be able to
	 * acquire the queue reference here.
	 *
	 * That's why we don't use blk_queue_enter here; instead, we use
	 * percpu_ref_tryget directly, because we need to be able to
	 * obtain a reference even in the short window between the queue
	 * starting to freeze, by dropping the first reference in
	 * blk_freeze_queue_start, and the moment the last request is
	 * consumed, marked by the instant q_usage_counter reaches
	 * zero.
	 */
741 if (!percpu_ref_tryget(&q->q_usage_counter))
742 return;
743
744 blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
745
746 if (data.next_set) {
747 data.next = blk_rq_timeout(round_jiffies_up(data.next));
748 mod_timer(&q->timeout, data.next);
749 } else {
750 struct blk_mq_hw_ctx *hctx;
751
752 queue_for_each_hw_ctx(q, hctx, i) {
			/* the hctx may be unmapped, so check it here */
754 if (blk_mq_hw_queue_mapped(hctx))
755 blk_mq_tag_idle(hctx);
756 }
757 }
758 blk_queue_exit(q);
759}
760
/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-coded check in this routine.
 */
766static bool blk_mq_attempt_merge(struct request_queue *q,
767 struct blk_mq_ctx *ctx, struct bio *bio)
768{
769 struct request *rq;
770 int checked = 8;
771
772 list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
773 int el_ret;
774
775 if (!checked--)
776 break;
777
778 if (!blk_rq_merge_ok(rq, bio))
779 continue;
780
781 el_ret = blk_try_merge(rq, bio);
782 if (el_ret == ELEVATOR_NO_MERGE)
783 continue;
784
785 if (!blk_mq_sched_allow_merge(q, rq, bio))
786 break;
787
788 if (el_ret == ELEVATOR_BACK_MERGE) {
789 if (bio_attempt_back_merge(q, rq, bio)) {
790 ctx->rq_merged++;
791 return true;
792 }
793 break;
794 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
795 if (bio_attempt_front_merge(q, rq, bio)) {
796 ctx->rq_merged++;
797 return true;
798 }
799 break;
800 }
801 }
802
803 return false;
804}
805
806struct flush_busy_ctx_data {
807 struct blk_mq_hw_ctx *hctx;
808 struct list_head *list;
809};
810
811static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data)
812{
813 struct flush_busy_ctx_data *flush_data = data;
814 struct blk_mq_hw_ctx *hctx = flush_data->hctx;
815 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
816
817 sbitmap_clear_bit(sb, bitnr);
818 spin_lock(&ctx->lock);
819 list_splice_tail_init(&ctx->rq_list, flush_data->list);
820 spin_unlock(&ctx->lock);
821 return true;
822}
823
/*
 * Process software queues that have been marked busy, splicing them
 * to the for-dispatch list.
 */
828void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
829{
830 struct flush_busy_ctx_data data = {
831 .hctx = hctx,
832 .list = list,
833 };
834
835 sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data);
836}
837EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs);
838
839struct dispatch_rq_data {
840 struct blk_mq_hw_ctx *hctx;
841 struct request *rq;
842};
843
844static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr,
845 void *data)
846{
847 struct dispatch_rq_data *dispatch_data = data;
848 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx;
849 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr];
850
851 spin_lock(&ctx->lock);
852 if (unlikely(!list_empty(&ctx->rq_list))) {
853 dispatch_data->rq = list_entry_rq(ctx->rq_list.next);
854 list_del_init(&dispatch_data->rq->queuelist);
855 if (list_empty(&ctx->rq_list))
856 sbitmap_clear_bit(sb, bitnr);
857 }
858 spin_unlock(&ctx->lock);
859
860 return !dispatch_data->rq;
861}
862
863struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
864 struct blk_mq_ctx *start)
865{
866 unsigned off = start ? start->index_hw : 0;
867 struct dispatch_rq_data data = {
868 .hctx = hctx,
869 .rq = NULL,
870 };
871
872 __sbitmap_for_each_set(&hctx->ctx_map, off,
873 dispatch_rq_from_ctx, &data);
874
875 return data.rq;
876}
877
878static inline unsigned int queued_to_index(unsigned int queued)
879{
880 if (!queued)
881 return 0;
882
883 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
884}
885
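/*
 * Late driver tag allocation: when a request was allocated with only a
 * scheduler tag (internal_tag), grab a real driver tag from hctx->tags
 * before dispatch, accounting for shared-tag "active" counts.
 * Returns false if no tag is currently available.
 */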
886bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
887 bool wait)
888{
889 struct blk_mq_alloc_data data = {
890 .q = rq->q,
891 .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu),
892 .flags = wait ? 0 : BLK_MQ_REQ_NOWAIT,
893 };
894
895 if (rq->tag != -1)
896 goto done;
897
898 if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq_aux(rq)->internal_tag))
899 data.flags |= BLK_MQ_REQ_RESERVED;
900
901 rq->tag = blk_mq_get_tag(&data);
902 if (rq->tag >= 0) {
903 if (blk_mq_tag_busy(data.hctx)) {
904 rq->cmd_flags |= REQ_MQ_INFLIGHT;
905 atomic_inc(&data.hctx->nr_active);
906 }
907 data.hctx->tags->rqs[rq->tag] = rq;
908 }
909
910done:
911 if (hctx)
912 *hctx = data.hctx;
913 return rq->tag != -1;
914}
915
916static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode,
917 int flags, void *key)
918{
919 struct blk_mq_hw_ctx *hctx;
920
921 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait);
922
923 list_del_init(&wait->task_list);
924 blk_mq_run_hw_queue(hctx, true);
925 return 1;
926}
927
/*
 * Mark us waiting for a tag. For shared tags, this involves hooking us into
 * the tag wakeups. For non-shared tags, we can simply mark us needing a
 * restart. For both cases, take care to check the condition again after
 * marking us as waiting.
 */
934static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx **hctx,
935 struct request *rq)
936{
937 struct blk_mq_hw_ctx *this_hctx = *hctx;
938 bool shared_tags = (this_hctx->flags & BLK_MQ_F_TAG_SHARED) != 0;
939 struct sbq_wait_state *ws;
940 wait_queue_t *wait;
941 bool ret;
942
943 if (!shared_tags) {
944 if (!test_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state))
945 set_bit(BLK_MQ_S_SCHED_RESTART, &this_hctx->state);
946 } else {
947 wait = &this_hctx->dispatch_wait;
948 if (!list_empty_careful(&wait->task_list))
949 return false;
950
951 spin_lock(&this_hctx->lock);
952 if (!list_empty(&wait->task_list)) {
953 spin_unlock(&this_hctx->lock);
954 return false;
955 }
956
957 ws = bt_wait_ptr(&this_hctx->tags->bitmap_tags, this_hctx);
958 add_wait_queue(&ws->wait, wait);
959 }
960
961
962
963
964
965
966 ret = blk_mq_get_driver_tag(rq, hctx, false);
967
968 if (!shared_tags) {
969
970
971
972
973 return ret;
974 } else {
975 if (!ret) {
976 spin_unlock(&this_hctx->lock);
977 return false;
978 }
979
980
981
982
983
984 spin_lock_irq(&ws->wait.lock);
985 list_del_init(&wait->task_list);
986 spin_unlock_irq(&ws->wait.lock);
987 spin_unlock(&this_hctx->lock);
988 return true;
989 }
990}
991
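/*
 * Dispatch a list of requests to the driver via ->queue_rq(). Requests the
 * driver reports as BLK_MQ_RQ_QUEUE_BUSY (or that fail to get a driver tag
 * or dispatch budget) are put back on hctx->dispatch for a later run.
 * Returns true if any request was queued or errored.
 */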
992bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
993 bool got_budget)
994{
995 struct blk_mq_hw_ctx *hctx;
996 bool no_tag = false;
997 struct request *rq, *nxt;
998 LIST_HEAD(driver_list);
999 struct list_head *dptr;
1000 int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK;
1001
1002 if (list_empty(list))
1003 return false;
1004
1005 WARN_ON(!list_is_singular(list) && got_budget);
1006
1007
1008
1009
1010
1011 dptr = NULL;
1012
1013
1014
1015
1016 errors = queued = 0;
1017 do {
1018 struct blk_mq_queue_data bd;
1019
1020 rq = list_first_entry(list, struct request, queuelist);
1021 if (!blk_mq_get_driver_tag(rq, &hctx, false)) {
1022
1023
1024
1025
1026
1027
1028
1029 if (!blk_mq_mark_tag_wait(&hctx, rq)) {
1030 if (got_budget)
1031 blk_mq_put_dispatch_budget(hctx);
1032
1033
1034
1035
1036 if (hctx->flags & BLK_MQ_F_TAG_SHARED)
1037 no_tag = true;
1038 break;
1039 }
1040 }
1041
1042 if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
1043 blk_mq_put_driver_tag(rq);
1044 break;
1045 }
1046
1047 list_del_init(&rq->queuelist);
1048
1049 bd.rq = rq;
1050 bd.list = dptr;
1051
1052
1053
1054
1055
1056 if (list_empty(list))
1057 bd.last = true;
1058 else {
1059 nxt = list_first_entry(list, struct request, queuelist);
1060 bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
1061 }
1062
1063 ret = q->mq_ops->queue_rq(hctx, &bd);
1064 switch (ret) {
1065 case BLK_MQ_RQ_QUEUE_OK:
1066 queued++;
1067 break;
1068 case BLK_MQ_RQ_QUEUE_BUSY:
1069
1070
1071
1072
1073 if (!list_empty(list)) {
1074 nxt = list_first_entry(list, struct request, queuelist);
1075 blk_mq_put_driver_tag(nxt);
1076 }
1077 list_add(&rq->queuelist, list);
1078 __blk_mq_requeue_request(rq);
1079 break;
		default:
			pr_err("blk-mq: bad return on queue: %d\n", ret);
			/* fall through */
		case BLK_MQ_RQ_QUEUE_ERROR:
1083 errors++;
1084 rq->errors = -EIO;
1085 blk_mq_end_request(rq, rq->errors);
1086 break;
1087 }
1088
1089 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
1090 break;
1091
1092
1093
1094
1095
1096 if (!dptr && list->next != list->prev)
1097 dptr = &driver_list;
1098 } while (!list_empty(list));
1099
1100 hctx->dispatched[queued_to_index(queued)]++;
1101
	/*
	 * Any items that need requeuing? Stuff them into hctx->dispatch,
	 * that is where we will continue on next queue run.
	 */
1106 if (!list_empty(list)) {
1107 spin_lock(&hctx->lock);
1108 list_splice_init(list, &hctx->dispatch);
1109 spin_unlock(&hctx->lock);
1110
		/*
		 * If SCHED_RESTART was set by the caller of this function and
		 * it is no longer set that means that it was cleared by another
		 * thread and hence that a queue rerun is needed.
		 *
		 * If 'no_tag' is set, that means that we failed getting
		 * a driver tag with an I/O scheduler attached. If our dispatch
		 * waitqueue is no longer active, ensure that we run the queue
		 * AFTER adding our entries back to the list.
		 *
		 * If no I/O scheduler has been configured it is possible that
		 * the hardware queue got stopped and restarted before requests
		 * were pushed back onto the dispatch list. Rerun the queue to
		 * avoid starvation.
		 */
1128 if (!blk_mq_sched_needs_restart(hctx) ||
1129 (no_tag && list_empty_careful(&hctx->dispatch_wait.task_list)))
1130 blk_mq_run_hw_queue(hctx, true);
1131 }
1132
1133 return (queued + errors) != 0;
1134}
1135
1136static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
1137{
1138 int srcu_idx;
1139
1140 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
1141 cpu_online(hctx->next_cpu));
1142
1143 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
1144 rcu_read_lock();
1145 blk_mq_sched_dispatch_requests(hctx);
1146 rcu_read_unlock();
1147 } else {
1148 might_sleep();
1149
1150 srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
1151 blk_mq_sched_dispatch_requests(hctx);
1152 srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
1153 }
1154}
1155
/*
 * It'd be great if the workqueue API had a way to pass
 * in a mask and had some smarts for more clever placement.
 * For now we just round-robin here, switching for every
 * BLK_MQ_CPU_WORK_BATCH queued items.
 */
1162static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
1163{
1164 if (hctx->queue->nr_hw_queues == 1)
1165 return WORK_CPU_UNBOUND;
1166
1167 if (--hctx->next_cpu_batch <= 0) {
1168 int next_cpu;
1169
1170 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
1171 if (next_cpu >= nr_cpu_ids)
1172 next_cpu = cpumask_first(hctx->cpumask);
1173
1174 hctx->next_cpu = next_cpu;
1175 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1176 }
1177
1178 return hctx->next_cpu;
1179}
1180
1181static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
1182 unsigned long msecs)
1183{
1184 if (unlikely(blk_mq_hctx_stopped(hctx) ||
1185 !blk_mq_hw_queue_mapped(hctx)))
1186 return;
1187
1188 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
1189 int cpu = get_cpu();
1190 if (cpumask_test_cpu(cpu, hctx->cpumask)) {
1191 __blk_mq_run_hw_queue(hctx);
1192 put_cpu();
1193 return;
1194 }
1195
1196 put_cpu();
1197 }
1198
1199 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
1200 &hctx->run_work, msecs);
1201}
1202
1203void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1204{
1205 __blk_mq_delay_run_hw_queue(hctx, true, msecs);
1206}
1207EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
1208
1209void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
1210{
1211 __blk_mq_delay_run_hw_queue(hctx, async, 0);
1212}
1213EXPORT_SYMBOL(blk_mq_run_hw_queue);
1214
1215void blk_mq_run_hw_queues(struct request_queue *q, bool async)
1216{
1217 struct blk_mq_hw_ctx *hctx;
1218 int i;
1219
1220 queue_for_each_hw_ctx(q, hctx, i) {
1221 if (!blk_mq_hctx_has_pending(hctx) ||
1222 blk_mq_hctx_stopped(hctx))
1223 continue;
1224
1225 blk_mq_run_hw_queue(hctx, async);
1226 }
1227}
1228EXPORT_SYMBOL(blk_mq_run_hw_queues);
1229
/**
 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
 * @q: request queue.
 *
 * The caller is responsible for serializing this function against
 * blk_mq_{start,stop}_hw_queue().
 */
1237bool blk_mq_queue_stopped(struct request_queue *q)
1238{
1239 struct blk_mq_hw_ctx *hctx;
1240 int i;
1241
1242 queue_for_each_hw_ctx(q, hctx, i)
1243 if (blk_mq_hctx_stopped(hctx))
1244 return true;
1245
1246 return false;
1247}
1248EXPORT_SYMBOL(blk_mq_queue_stopped);
1249
1250void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
1251{
1252 cancel_delayed_work(&hctx->run_work);
1253 cancel_delayed_work(&hctx->delay_work);
1254 set_bit(BLK_MQ_S_STOPPED, &hctx->state);
1255}
1256EXPORT_SYMBOL(blk_mq_stop_hw_queue);
1257
1258void blk_mq_stop_hw_queues(struct request_queue *q)
1259{
1260 struct blk_mq_hw_ctx *hctx;
1261 int i;
1262
1263 queue_for_each_hw_ctx(q, hctx, i)
1264 blk_mq_stop_hw_queue(hctx);
1265}
1266EXPORT_SYMBOL(blk_mq_stop_hw_queues);
1267
1268void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
1269{
1270 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1271
1272 blk_mq_run_hw_queue(hctx, false);
1273}
1274EXPORT_SYMBOL(blk_mq_start_hw_queue);
1275
1276void blk_mq_start_hw_queues(struct request_queue *q)
1277{
1278 struct blk_mq_hw_ctx *hctx;
1279 int i;
1280
1281 queue_for_each_hw_ctx(q, hctx, i)
1282 blk_mq_start_hw_queue(hctx);
1283}
1284EXPORT_SYMBOL(blk_mq_start_hw_queues);
1285
1286void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
1287{
1288 struct blk_mq_hw_ctx *hctx;
1289 int i;
1290
1291 queue_for_each_hw_ctx(q, hctx, i) {
1292 if (!blk_mq_hctx_stopped(hctx))
1293 continue;
1294
1295 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
1296 blk_mq_run_hw_queue(hctx, async);
1297 }
1298}
1299EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
1300
1301static void blk_mq_run_work_fn(struct work_struct *work)
1302{
1303 struct blk_mq_hw_ctx *hctx;
1304
1305 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
1306
1307 __blk_mq_run_hw_queue(hctx);
1308}
1309
1310static void blk_mq_delay_work_fn(struct work_struct *work)
1311{
1312 struct blk_mq_hw_ctx *hctx;
1313
1314 hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
1315
1316 if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
1317 __blk_mq_run_hw_queue(hctx);
1318}
1319
1320void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
1321{
1322 if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
1323 return;
1324
1325 blk_mq_stop_hw_queue(hctx);
1326 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
1327 &hctx->delay_work, msecs_to_jiffies(msecs));
1328}
1329EXPORT_SYMBOL(blk_mq_delay_queue);
1330
1331static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
1332 struct request *rq,
1333 bool at_head)
1334{
1335 struct blk_mq_ctx *ctx = rq->mq_ctx;
1336
1337 trace_block_rq_insert(hctx->queue, rq);
1338
1339 if (at_head)
1340 list_add(&rq->queuelist, &ctx->rq_list);
1341 else
1342 list_add_tail(&rq->queuelist, &ctx->rq_list);
1343}
1344
1345void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
1346 bool at_head)
1347{
1348 struct blk_mq_ctx *ctx = rq->mq_ctx;
1349
1350 __blk_mq_insert_req_list(hctx, rq, at_head);
1351 blk_mq_hctx_mark_pending(hctx, ctx);
1352}
1353
/*
 * Should only be used carefully, when the caller knows we want to
 * bypass a potential IO scheduler on the target device.
 */
1358void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
1359{
1360 struct blk_mq_ctx *ctx = rq->mq_ctx;
1361 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu);
1362
1363 spin_lock(&hctx->lock);
1364 list_add_tail(&rq->queuelist, &hctx->dispatch);
1365 spin_unlock(&hctx->lock);
1366
1367 if (run_queue)
1368 blk_mq_run_hw_queue(hctx, false);
1369}
1370
1371void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
			    struct list_head *list)
{
1375
1376
1377
1378
1379 spin_lock(&ctx->lock);
1380 while (!list_empty(list)) {
1381 struct request *rq;
1382
1383 rq = list_first_entry(list, struct request, queuelist);
1384 BUG_ON(rq->mq_ctx != ctx);
1385 list_del_init(&rq->queuelist);
1386 __blk_mq_insert_req_list(hctx, rq, false);
1387 }
1388 blk_mq_hctx_mark_pending(hctx, ctx);
1389 spin_unlock(&ctx->lock);
1390}
1391
1392static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
1393{
1394 struct request *rqa = container_of(a, struct request, queuelist);
1395 struct request *rqb = container_of(b, struct request, queuelist);
1396
1397 return !(rqa->mq_ctx < rqb->mq_ctx ||
1398 (rqa->mq_ctx == rqb->mq_ctx &&
1399 blk_rq_pos(rqa) < blk_rq_pos(rqb)));
1400}
1401
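/*
 * Flush a task's plug list: sort the plugged requests by software context
 * and submit them to the scheduler in per-ctx batches via
 * blk_mq_sched_insert_requests().
 */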
1402void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
1403{
1404 struct blk_mq_ctx *this_ctx;
1405 struct request_queue *this_q;
1406 struct request *rq;
1407 LIST_HEAD(list);
1408 LIST_HEAD(ctx_list);
1409 unsigned int depth;
1410
1411 list_splice_init(&plug->mq_list, &list);
1412
1413 list_sort(NULL, &list, plug_ctx_cmp);
1414
1415 this_q = NULL;
1416 this_ctx = NULL;
1417 depth = 0;
1418
1419 while (!list_empty(&list)) {
1420 rq = list_entry_rq(list.next);
1421 list_del_init(&rq->queuelist);
1422 BUG_ON(!rq->q);
1423 if (rq->mq_ctx != this_ctx) {
1424 if (this_ctx) {
1425 trace_block_unplug(this_q, depth, from_schedule);
1426 blk_mq_sched_insert_requests(this_q, this_ctx,
1427 &ctx_list,
1428 from_schedule);
1429 }
1430
1431 this_ctx = rq->mq_ctx;
1432 this_q = rq->q;
1433 depth = 0;
1434 }
1435
1436 depth++;
1437 list_add_tail(&rq->queuelist, &ctx_list);
1438 }
1439
1440
1441
1442
1443
1444 if (this_ctx) {
1445 trace_block_unplug(this_q, depth, from_schedule);
1446 blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list,
1447 from_schedule);
1448 }
1449}
1450
1451static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1452{
1453 init_request_from_bio(rq, bio);
1454
1455 if (blk_do_io_stat(rq))
1456 blk_account_io_start(rq, true);
1457}
1458
1459static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
1460{
1461 return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
1462 !blk_queue_nomerges(hctx->queue);
1463}
1464
1465
1466static inline bool blk_mq_merge_bio(struct request_queue *q, struct bio *bio)
1467{
1468 bool ret = false;
1469 struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
1470 struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
1471
1472 if (hctx_allow_merges(hctx) && bio_mergeable(bio)) {
1473 spin_lock(&ctx->lock);
1474 ret = blk_mq_attempt_merge(q, ctx, bio);
1475 spin_unlock(&ctx->lock);
1476 }
1477
1478 blk_mq_put_ctx(ctx);
1479 return ret;
1480}
1481
1482static inline void blk_mq_queue_io(struct blk_mq_hw_ctx *hctx,
1483 struct blk_mq_ctx *ctx,
1484 struct request *rq)
1485{
1486 spin_lock(&ctx->lock);
1487 __blk_mq_insert_request(hctx, rq, false);
1488 spin_unlock(&ctx->lock);
1489}
1490
1491static void __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1492 struct request *rq, bool may_sleep)
1493{
1494 struct request_queue *q = rq->q;
1495 struct blk_mq_queue_data bd = {
1496 .rq = rq,
1497 .list = NULL,
1498 .last = true,
1499 };
1500 int ret;
1501 bool run_queue = true;
1502
1503 if (blk_mq_hctx_stopped(hctx)) {
1504 run_queue = false;
1505 goto insert;
1506 }
1507
1508 if (q->elevator)
1509 goto insert;
1510
1511 if (!blk_mq_get_driver_tag(rq, NULL, false))
1512 goto insert;
1513
1514 if (!blk_mq_get_dispatch_budget(hctx)) {
1515 blk_mq_put_driver_tag(rq);
1516 goto insert;
1517 }
1518
1519
1520
1521
1522
1523
1524 ret = q->mq_ops->queue_rq(hctx, &bd);
1525 if (ret == BLK_MQ_RQ_QUEUE_OK)
1526 return;
1527
1528 if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1529 rq->errors = -EIO;
1530 blk_mq_end_request(rq, rq->errors);
1531 return;
1532 }
1533
1534 __blk_mq_requeue_request(rq);
1535insert:
1536 blk_mq_sched_insert_request(rq, false, run_queue, false, may_sleep);
1537}
1538
1539static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
1540 struct request *rq)
1541{
1542 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
1543 rcu_read_lock();
1544 __blk_mq_try_issue_directly(hctx, rq, false);
1545 rcu_read_unlock();
1546 } else {
1547 unsigned int srcu_idx;
1548
1549 might_sleep();
1550
1551 srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu);
1552 __blk_mq_try_issue_directly(hctx, rq, true);
1553 srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx);
1554 }
1555}
1556
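/*
 * Entry point for bios submitted to a blk-mq queue. After bounce/integrity
 * handling and merge attempts, the bio is turned into a request which is
 * either inserted for a flush, added to the task's plug list, issued
 * directly to the driver, or handed to the I/O scheduler, depending on the
 * queue configuration.
 */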
1557static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1558{
1559 const int is_sync = rw_is_sync(bio->bi_rw);
1560 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1561 struct blk_mq_alloc_data data = { .flags = 0 };
1562 struct request *rq;
1563 unsigned int request_count = 0;
1564 struct blk_plug *plug;
1565 struct request *same_queue_rq = NULL;
1566
1567 blk_queue_bounce(q, &bio);
1568
1569 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1570 bio_endio(bio, -EIO);
1571 return;
1572 }
1573
1574 if (!is_flush_fua && !blk_queue_nomerges(q) &&
1575 blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
1576 return;
1577
1578 if (blk_mq_sched_bio_merge(q, bio))
1579 return;
1580
1581 if (blk_mq_merge_bio(q, bio))
1582 return;
1583
1584 trace_block_getrq(q, bio, bio->bi_rw);
1585
1586 rq = blk_mq_sched_get_request(q, bio, bio->bi_rw, &data);
1587 if (unlikely(!rq))
1588 return;
1589
1590 plug = current->plug;
1591 if (unlikely(is_flush_fua)) {
1592 blk_mq_put_ctx(data.ctx);
1593 blk_mq_bio_to_request(rq, bio);
1594
1595
1596 blk_insert_flush(rq);
1597 blk_mq_run_hw_queue(data.hctx, true);
1598 } else if (plug && q->nr_hw_queues == 1) {
1599 struct request *last = NULL;
1600
1601 blk_mq_put_ctx(data.ctx);
1602 blk_mq_bio_to_request(rq, bio);
1603
1604
1605
1606
1607
1608 if (list_empty(&plug->mq_list))
1609 request_count = 0;
1610 else if (blk_queue_nomerges(q))
1611 request_count = blk_plug_queued_count(q);
1612
1613 if (!request_count)
1614 trace_block_plug(q);
1615 else
1616 last = list_entry_rq(plug->mq_list.prev);
1617
1618 if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
1619 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
1620 blk_flush_plug_list(plug, false);
1621 trace_block_plug(q);
1622 }
1623
1624 list_add_tail(&rq->queuelist, &plug->mq_list);
1625 } else if (plug && !blk_queue_nomerges(q)) {
1626 blk_mq_bio_to_request(rq, bio);
1627
1628
1629
1630
1631
1632
1633
1634
1635 if (list_empty(&plug->mq_list))
1636 same_queue_rq = NULL;
1637 if (same_queue_rq)
1638 list_del_init(&same_queue_rq->queuelist);
1639 list_add_tail(&rq->queuelist, &plug->mq_list);
1640
1641 blk_mq_put_ctx(data.ctx);
1642
1643 if (same_queue_rq) {
1644 data.hctx = blk_mq_map_queue(q,
1645 same_queue_rq->mq_ctx->cpu);
1646 blk_mq_try_issue_directly(data.hctx, same_queue_rq);
1647 }
1648 } else if (q->nr_hw_queues > 1 && is_sync) {
1649 blk_mq_put_ctx(data.ctx);
1650 blk_mq_bio_to_request(rq, bio);
1651 blk_mq_try_issue_directly(data.hctx, rq);
1652 } else if (q->elevator) {
1653 blk_mq_put_ctx(data.ctx);
1654 blk_mq_bio_to_request(rq, bio);
1655 blk_mq_sched_insert_request(rq, false, true, true, true);
1656 } else {
1657 blk_mq_put_ctx(data.ctx);
1658 blk_mq_bio_to_request(rq, bio);
1659 blk_mq_queue_io(data.hctx, data.ctx, rq);
1660 blk_mq_run_hw_queue(data.hctx, true);
1661 }
1662}
1663
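/*
 * Free the requests and backing pages of one hardware queue's tag map,
 * invoking ->exit_request() for every static request that is still
 * allocated.
 */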
1664void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
1665 unsigned int hctx_idx)
1666{
1667 struct page *page;
1668
1669 if (tags->rqs && set->ops->exit_request) {
1670 int i;
1671
1672 for (i = 0; i < tags->nr_tags; i++) {
1673 struct request *rq = tags->static_rqs[i];
1674
1675 if (!rq)
1676 continue;
1677 set->ops->exit_request(set->driver_data, rq,
1678 hctx_idx, i);
1679 tags->static_rqs[i] = NULL;
1680 }
1681 }
1682
1683 while (!list_empty(&tags->page_list)) {
1684 page = list_first_entry(&tags->page_list, struct page, lru);
1685 list_del_init(&page->lru);
1686
1687
1688
1689
1690 kmemleak_free(page_address(page));
1691 __free_pages(page, page->private);
1692 }
1693}
1694
1695void blk_mq_free_rq_map(struct blk_mq_tags *tags)
1696{
1697 kfree(tags->rqs);
1698 tags->rqs = NULL;
1699 kfree(tags->static_rqs);
1700 tags->static_rqs = NULL;
1701
1702 blk_mq_free_tags(tags);
1703}
1704
1705struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
1706 unsigned int hctx_idx,
1707 unsigned int nr_tags,
1708 unsigned int reserved_tags)
1709{
1710 struct blk_mq_tags *tags;
1711
1712 tags = blk_mq_init_tags(nr_tags, reserved_tags,
1713 set->numa_node,
1714 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
1715 if (!tags)
1716 return NULL;
1717
1718 tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
1719 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
1720 set->numa_node);
1721 if (!tags->rqs) {
1722 blk_mq_free_tags(tags);
1723 return NULL;
1724 }
1725
1726 tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *),
1727 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
1728 set->numa_node);
1729 if (!tags->static_rqs) {
1730 kfree(tags->rqs);
1731 blk_mq_free_tags(tags);
1732 return NULL;
1733 }
1734
1735 return tags;
1736}
1737
1738static size_t order_to_size(unsigned int order)
1739{
1740 return (size_t)PAGE_SIZE << order;
1741}
1742
1743int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
1744 unsigned int hctx_idx, unsigned int depth)
1745{
1746 unsigned int i, j, entries_per_page, max_order = 4;
1747 size_t rq_size, left;
1748
1749 INIT_LIST_HEAD(&tags->page_list);
1750
1751
1752
1753
1754
1755 rq_size = round_up(sizeof(struct request) + set->cmd_size +
1756 sizeof(struct request_aux), cache_line_size());
1757 left = rq_size * depth;
1758
1759 for (i = 0; i < depth; ) {
1760 int this_order = max_order;
1761 struct page *page;
1762 int to_do;
1763 void *p;
1764
1765 while (this_order && left < order_to_size(this_order - 1))
1766 this_order--;
1767
1768 do {
1769 page = alloc_pages_node(set->numa_node,
1770 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
1771 this_order);
1772 if (page)
1773 break;
1774 if (!this_order--)
1775 break;
1776 if (order_to_size(this_order) < rq_size)
1777 break;
1778 } while (1);
1779
1780 if (!page)
1781 goto fail;
1782
1783 page->private = this_order;
1784 list_add_tail(&page->lru, &tags->page_list);
1785
1786 p = page_address(page);
1787
1788
1789
1790
1791 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO);
1792 entries_per_page = order_to_size(this_order) / rq_size;
1793 to_do = min(entries_per_page, depth - i);
1794 left -= to_do * rq_size;
1795 for (j = 0; j < to_do; j++) {
1796 struct request *rq = p;
1797
1798 tags->static_rqs[i] = rq;
1799 if (set->ops->init_request) {
1800 if (set->ops->init_request(set->driver_data,
1801 rq, hctx_idx, i,
1802 set->numa_node)) {
1803 tags->static_rqs[i] = NULL;
1804 goto fail;
1805 }
1806 }
1807
1808 p += rq_size;
1809 i++;
1810 }
1811 }
1812 return 0;
1813
1814fail:
1815 blk_mq_free_rqs(set, tags, hctx_idx);
1816 return -ENOMEM;
1817}
1818
/*
 * 'cpu' is going away and this software context may have pending requests
 * on it. Splice any existing ctx->rq_list entries onto the hardware queue's
 * dispatch list and make sure the hw queue gets run.
 */
1824static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
1825{
1826 struct blk_mq_ctx *ctx;
1827 LIST_HEAD(tmp);
1828
1829 ctx = __blk_mq_get_ctx(hctx->queue, cpu);
1830
1831 spin_lock(&ctx->lock);
1832 if (!list_empty(&ctx->rq_list)) {
1833 list_splice_init(&ctx->rq_list, &tmp);
1834 blk_mq_hctx_clear_pending(hctx, ctx);
1835 }
1836 spin_unlock(&ctx->lock);
1837
1838 if (list_empty(&tmp))
1839 return NOTIFY_OK;
1840
1841 spin_lock(&hctx->lock);
1842 list_splice_tail_init(&tmp, &hctx->dispatch);
1843 spin_unlock(&hctx->lock);
1844
1845 blk_mq_run_hw_queue(hctx, true);
1846 return NOTIFY_OK;
1847}
1848
1849static int blk_mq_hctx_notify(void *data, unsigned long action,
1850 unsigned int cpu)
1851{
1852 struct blk_mq_hw_ctx *hctx = data;
1853
1854 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1855 return blk_mq_hctx_cpu_offline(hctx, cpu);
1856
1857
1858
1859
1860
1861
1862 return NOTIFY_OK;
1863}
1864
1865
1866static void blk_mq_exit_hctx(struct request_queue *q,
1867 struct blk_mq_tag_set *set,
1868 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
1869{
1870 unsigned flush_start_tag = set->queue_depth;
1871
1872 blk_mq_debugfs_unregister_hctx(hctx);
1873
1874 if (blk_mq_hw_queue_mapped(hctx))
1875 blk_mq_tag_idle(hctx);
1876
1877 if (set->ops->exit_request)
1878 set->ops->exit_request(set->driver_data,
1879 hctx->fq->flush_rq, hctx_idx,
1880 flush_start_tag + hctx_idx);
1881
1882 blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
1883
1884 if (set->ops->exit_hctx)
1885 set->ops->exit_hctx(hctx, hctx_idx);
1886
1887 if (hctx->flags & BLK_MQ_F_BLOCKING)
1888 cleanup_srcu_struct(&hctx->queue_rq_srcu);
1889
1890 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1891 blk_free_flush_queue(hctx->fq);
1892 sbitmap_free(&hctx->ctx_map);
1893}
1894
1895static void blk_mq_exit_hw_queues(struct request_queue *q,
1896 struct blk_mq_tag_set *set, int nr_queue)
1897{
1898 struct blk_mq_hw_ctx *hctx;
1899 unsigned int i;
1900
1901 queue_for_each_hw_ctx(q, hctx, i) {
1902 if (i == nr_queue)
1903 break;
1904 blk_mq_exit_hctx(q, set, hctx, i);
1905 }
1906}
1907
1908static void blk_mq_free_hw_queues(struct request_queue *q,
1909 struct blk_mq_tag_set *set)
1910{
1911 struct blk_mq_hw_ctx *hctx;
1912 unsigned int i;
1913
1914 queue_for_each_hw_ctx(q, hctx, i)
1915 free_cpumask_var(hctx->cpumask);
1916}
1917
1918static int blk_mq_init_hctx(struct request_queue *q,
1919 struct blk_mq_tag_set *set,
1920 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
1921{
1922 int node;
1923 unsigned flush_start_tag = set->queue_depth;
1924
1925 node = hctx->numa_node;
1926 if (node == NUMA_NO_NODE)
1927 node = hctx->numa_node = set->numa_node;
1928
1929 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1930 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1931 spin_lock_init(&hctx->lock);
1932 INIT_LIST_HEAD(&hctx->dispatch);
1933 hctx->queue = q;
1934 hctx->queue_num = hctx_idx;
1935 hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
1936
1937 blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1938 blk_mq_hctx_notify, hctx);
1939 blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1940
1941 hctx->tags = set->tags[hctx_idx];
1942
	/*
	 * Allocate space for all possible cpus to avoid allocation at
	 * runtime.
	 */
1947 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1948 GFP_KERNEL, node);
1949 if (!hctx->ctxs)
1950 goto unregister_cpu_notifier;
1951
1952 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL,
1953 node))
1954 goto free_ctxs;
1955
1956 hctx->nr_ctx = 0;
1957
1958 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
1959 INIT_LIST_HEAD(&hctx->dispatch_wait.task_list);
1960
1961 if (set->ops->init_hctx &&
1962 set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
1963 goto free_bitmap;
1964
1965 if (blk_mq_sched_init_hctx(q, hctx, hctx_idx))
1966 goto exit_hctx;
1967
1968 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size +
1969 sizeof(struct request_aux));
1970 if (!hctx->fq)
1971 goto sched_exit_hctx;
1972
1973 if (set->ops->init_request &&
1974 set->ops->init_request(set->driver_data,
1975 hctx->fq->flush_rq, hctx_idx,
1976 flush_start_tag + hctx_idx, node))
1977 goto free_fq;
1978
1979 if (hctx->flags & BLK_MQ_F_BLOCKING)
1980 init_srcu_struct(&hctx->queue_rq_srcu);
1981
1982 blk_mq_debugfs_register_hctx(q, hctx);
1983
1984 return 0;
1985
1986 free_fq:
1987 kfree(hctx->fq);
1988 sched_exit_hctx:
1989 blk_mq_sched_exit_hctx(q, hctx, hctx_idx);
1990 exit_hctx:
1991 if (set->ops->exit_hctx)
1992 set->ops->exit_hctx(hctx, hctx_idx);
1993 free_bitmap:
1994 sbitmap_free(&hctx->ctx_map);
1995 free_ctxs:
1996 kfree(hctx->ctxs);
1997 unregister_cpu_notifier:
1998 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1999
2000 return -1;
2001}
2002
2003static void blk_mq_init_cpu_queues(struct request_queue *q,
2004 unsigned int nr_hw_queues)
2005{
2006 unsigned int i;
2007
2008 for_each_possible_cpu(i) {
2009 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
2010 struct blk_mq_hw_ctx *hctx;
2011
2012 memset(__ctx, 0, sizeof(*__ctx));
2013 __ctx->cpu = i;
2014 spin_lock_init(&__ctx->lock);
2015 INIT_LIST_HEAD(&__ctx->rq_list);
2016 __ctx->queue = q;
2017
2018
2019 if (!cpu_online(i))
2020 continue;
2021
2022 hctx = blk_mq_map_queue(q, i);
2023
2024
2025
2026
2027
2028 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
2029 hctx->numa_node = local_memory_node(cpu_to_node(i));
2030 }
2031}
2032
2033static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
2034{
2035 int ret = 0;
2036
2037 set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
2038 set->queue_depth, set->reserved_tags);
2039 if (!set->tags[hctx_idx])
2040 return false;
2041
2042 ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
2043 set->queue_depth);
2044 if (!ret)
2045 return true;
2046
2047 blk_mq_free_rq_map(set->tags[hctx_idx]);
2048 set->tags[hctx_idx] = NULL;
2049 return false;
2050}
2051
2052static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
2053 unsigned int hctx_idx)
2054{
2055 if (set->tags[hctx_idx]) {
2056 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
2057 blk_mq_free_rq_map(set->tags[hctx_idx]);
2058 set->tags[hctx_idx] = NULL;
2059 }
2060}
2061
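/*
 * (Re)build the software to hardware queue mapping: clear the per-hctx
 * cpumasks, then walk all possible CPUs in @online_mask, assign each ctx to
 * the hctx given by q->mq_map (allocating that hctx's tag map on demand),
 * and finally resize the pending-work bitmaps and release tag maps of
 * hardware queues that ended up with no software contexts.
 */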
2062static void blk_mq_map_swqueue(struct request_queue *q,
2063 const struct cpumask *online_mask)
2064{
2065 unsigned int i, hctx_idx;
2066 struct blk_mq_hw_ctx *hctx;
2067 struct blk_mq_ctx *ctx;
2068 struct blk_mq_tag_set *set = q->tag_set;
2069
2070
2071
2072
2073 mutex_lock(&q->sysfs_lock);
2074
2075 queue_for_each_hw_ctx(q, hctx, i) {
2076 cpumask_clear(hctx->cpumask);
2077 hctx->nr_ctx = 0;
2078 }
2079
2080
2081
2082
2083 for_each_possible_cpu(i) {
2084
2085 if (!cpumask_test_cpu(i, online_mask))
2086 continue;
2087
2088 hctx_idx = q->mq_map[i];
2089
2090 if (!set->tags[hctx_idx] &&
2091 !__blk_mq_alloc_rq_map(set, hctx_idx)) {
2092
2093
2094
2095
2096
2097
2098 q->mq_map[i] = 0;
2099 }
2100
2101 ctx = per_cpu_ptr(q->queue_ctx, i);
2102 hctx = blk_mq_map_queue(q, i);
2103
2104 cpumask_set_cpu(i, hctx->cpumask);
2105 ctx->index_hw = hctx->nr_ctx;
2106 hctx->ctxs[hctx->nr_ctx++] = ctx;
2107 }
2108
2109 mutex_unlock(&q->sysfs_lock);
2110
2111 queue_for_each_hw_ctx(q, hctx, i) {
2112
2113
2114
2115
2116 if (!hctx->nr_ctx) {
2117
2118
2119
2120
2121 if (i && set->tags[i])
2122 blk_mq_free_map_and_requests(set, i);
2123
2124 hctx->tags = NULL;
2125 continue;
2126 }
2127
2128 hctx->tags = set->tags[i];
2129 WARN_ON(!hctx->tags);
2130
2131
2132
2133
2134
2135
2136 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);
2137
2138
2139
2140
2141 hctx->next_cpu = cpumask_first(hctx->cpumask);
2142 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
2143 }
2144}
2145
2146
2147
2148
2149
2150static void queue_set_hctx_shared(struct request_queue *q, bool shared)
2151{
2152 struct blk_mq_hw_ctx *hctx;
2153 int i;
2154
2155 queue_for_each_hw_ctx(q, hctx, i) {
2156 if (shared) {
2157 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
2158 atomic_inc(&q->shared_hctx_restart);
2159 hctx->flags |= BLK_MQ_F_TAG_SHARED;
2160 } else {
2161 if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
2162 atomic_dec(&q->shared_hctx_restart);
2163 hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
2164 }
2165 }
2166}
2167
2168static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
2169 bool shared)
2170{
2171 struct request_queue *q;
2172
2173 lockdep_assert_held(&set->tag_list_lock);
2174
2175 list_for_each_entry(q, &set->tag_list, tag_set_list) {
2176 blk_mq_freeze_queue(q);
2177 queue_set_hctx_shared(q, shared);
2178 blk_mq_unfreeze_queue(q);
2179 }
2180}
2181
2182static void blk_mq_del_queue_tag_set(struct request_queue *q)
2183{
2184 struct blk_mq_tag_set *set = q->tag_set;
2185
2186 mutex_lock(&set->tag_list_lock);
2187 list_del_rcu(&q->tag_set_list);
2188 INIT_LIST_HEAD(&q->tag_set_list);
2189 if (list_is_singular(&set->tag_list)) {
2190
2191 set->flags &= ~BLK_MQ_F_TAG_SHARED;
2192
2193 blk_mq_update_tag_set_depth(set, false);
2194 }
2195 mutex_unlock(&set->tag_list_lock);
2196
2197 synchronize_rcu();
2198}
2199
2200static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
2201 struct request_queue *q)
2202{
2203 q->tag_set = set;
2204
2205 mutex_lock(&set->tag_list_lock);
2206
2207
2208 if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) {
2209 set->flags |= BLK_MQ_F_TAG_SHARED;
2210
2211 blk_mq_update_tag_set_depth(set, true);
2212 }
2213 if (set->flags & BLK_MQ_F_TAG_SHARED)
2214 queue_set_hctx_shared(q, true);
2215 list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
2216
2217 mutex_unlock(&set->tag_list_lock);
2218}
2219
/*
 * It is the actual release handler for mq, but we do it from
 * request queue's release handler for avoiding use-after-free
 * and headache because q->mq_kobj shouldn't be freed earlier
 * than kobject's release handler.
 */
2226void blk_mq_release(struct request_queue *q)
2227{
2228 struct blk_mq_hw_ctx *hctx;
2229 unsigned int i;
2230
2231
2232 queue_for_each_hw_ctx(q, hctx, i) {
2233 if (!hctx)
2234 continue;
2235 kfree(hctx->ctxs);
2236 kfree(hctx);
2237 }
2238
2239 q->mq_map = NULL;
2240
2241 kfree(q->queue_hw_ctx);
2242
2243
2244 free_percpu(q->queue_ctx);
2245}
2246
2247struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
2248{
2249 struct request_queue *uninit_q, *q;
2250
2251 uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
2252 if (!uninit_q)
2253 return ERR_PTR(-ENOMEM);
2254
2255 q = blk_mq_init_allocated_queue(set, uninit_q);
2256 if (IS_ERR(q))
2257 blk_cleanup_queue(uninit_q);
2258
2259 return q;
2260}
2261EXPORT_SYMBOL(blk_mq_init_queue);
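
/*
 * Typical driver bring-up (sketch assuming a single hardware queue; names
 * other than the blk-mq calls are illustrative only):
 *
 *	struct blk_mq_tag_set *set = &drv->tag_set;
 *	int ret;
 *
 *	set->ops = &drv_mq_ops;
 *	set->nr_hw_queues = 1;
 *	set->queue_depth = 64;
 *	set->numa_node = NUMA_NO_NODE;
 *	set->cmd_size = sizeof(struct drv_cmd);
 *	ret = blk_mq_alloc_tag_set(set);
 *	if (ret)
 *		return ret;
 *	q = blk_mq_init_queue(set);
 *	if (IS_ERR(q))
 *		...
 *
 * Teardown goes through blk_cleanup_queue(); the mq-specific part of it is
 * blk_mq_free_queue() below.
 */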
2262
2263static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
2264 struct request_queue *q)
2265{
2266 int i, j;
2267 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
2268
2269 blk_mq_sysfs_unregister(q);
2270 for (i = 0; i < set->nr_hw_queues; i++) {
2271 int node;
2272
2273 if (hctxs[i])
2274 continue;
2275
2276 node = blk_mq_hw_queue_to_node(q->mq_map, i);
2277 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
2278 GFP_KERNEL, node);
2279 if (!hctxs[i])
2280 break;
2281
2282 if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
2283 node)) {
2284 kfree(hctxs[i]);
2285 hctxs[i] = NULL;
2286 break;
2287 }
2288
2289 atomic_set(&hctxs[i]->nr_active, 0);
2290 hctxs[i]->numa_node = node;
2291 hctxs[i]->queue_num = i;
2292
2293 if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
2294 free_cpumask_var(hctxs[i]->cpumask);
2295 kfree(hctxs[i]);
2296 hctxs[i] = NULL;
2297 break;
2298 }
2299 blk_mq_hctx_kobj_init(hctxs[i]);
2300 }
2301 for (j = i; j < q->nr_hw_queues; j++) {
2302 struct blk_mq_hw_ctx *hctx = hctxs[j];
2303
2304 if (hctx) {
2305 if (hctx->tags)
2306 blk_mq_free_map_and_requests(set, j);
2307 blk_mq_exit_hctx(q, set, hctx, j);
2308 free_cpumask_var(hctx->cpumask);
2309 kobject_put(&hctx->kobj);
2310 kfree(hctx->ctxs);
2311 kfree(hctx);
2312 hctxs[j] = NULL;
2313
2314 }
2315 }
2316 q->nr_hw_queues = i;
2317 blk_mq_sysfs_register(q);
2318}
2319
2320struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
2321 struct request_queue *q)
2322{
2323
2324 q->mq_ops = set->ops;
2325
2326 q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
2327 blk_stat_rq_ddir, 2, q);
2328 if (!q->poll_cb)
2329 goto err_exit;
2330
2331 q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
2332 if (!q->queue_ctx)
2333 goto err_exit;
2334
2335 q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
2336 GFP_KERNEL, set->numa_node);
2337 if (!q->queue_hw_ctx)
2338 goto err_percpu;
2339
2340 q->mq_map = set->mq_map;
2341
2342 blk_mq_realloc_hw_ctxs(set, q);
2343 if (!q->nr_hw_queues)
2344 goto err_hctxs;
2345
2346 INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
2347 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);
2348
2349 q->nr_queues = nr_cpu_ids;
2350
2351 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
2352
2353 if (!(set->flags & BLK_MQ_F_SG_MERGE))
2354 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
2355
2356 q->sg_reserved_size = INT_MAX;
2357
2358 INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
2359 INIT_LIST_HEAD(&q->requeue_list);
2360 spin_lock_init(&q->requeue_lock);
2361
2362 blk_queue_make_request(q, blk_mq_make_request);
2363
2364
2365
2366
2367 q->nr_requests = set->queue_depth;
2368
2369 if (set->ops->complete)
2370 blk_queue_softirq_done(q, set->ops->complete);
2371
2372 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
2373
2374 get_online_cpus();
2375 mutex_lock(&all_q_mutex);
2376
2377 list_add_tail(&q->all_q_node, &all_q_list);
2378 blk_mq_add_queue_tag_set(set, q);
2379 blk_mq_map_swqueue(q, cpu_online_mask);
2380
2381 mutex_unlock(&all_q_mutex);
2382 put_online_cpus();
2383
2384 if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
2385 int ret;
2386
2387 ret = blk_mq_sched_init(q);
2388 if (ret)
2389 return ERR_PTR(ret);
2390 }
2391
2392 return q;
2393
2394err_hctxs:
2395 kfree(q->queue_hw_ctx);
2396err_percpu:
2397 free_percpu(q->queue_ctx);
2398err_exit:
2399 q->mq_ops = NULL;
2400 return ERR_PTR(-ENOMEM);
2401}
2402EXPORT_SYMBOL(blk_mq_init_allocated_queue);
2403
2404void blk_mq_free_queue(struct request_queue *q)
2405{
2406 struct blk_mq_tag_set *set = q->tag_set;
2407
2408 mutex_lock(&all_q_mutex);
2409 list_del_init(&q->all_q_node);
2410 mutex_unlock(&all_q_mutex);
2411
2412 blk_mq_del_queue_tag_set(q);
2413
2414 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
2415 blk_mq_free_hw_queues(q, set);
2416}
2417
2418
2419static void blk_mq_queue_reinit(struct request_queue *q,
2420 const struct cpumask *online_mask)
2421{
2422 WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
2423
2424 blk_mq_debugfs_unregister_hctxs(q);
2425 blk_mq_sysfs_unregister(q);
2426
2427
2428
2429
2430
2431
2432
2433 blk_mq_map_swqueue(q, online_mask);
2434
2435 blk_mq_sysfs_register(q);
2436 blk_mq_debugfs_register_hctxs(q);
2437}
2438
2439static void blk_mq_freeze_queue_list(struct list_head *list)
2440{
2441 struct request_queue *q;
2442
	/*
	 * Freezing a queue involves a synchronous wait for an RCU grace
	 * period; doing that one queue at a time can take a long time.
	 * Start freezing all queues in one swoop and then wait for the
	 * completions, so the freezes can proceed in parallel.
	 */
2450 list_for_each_entry(q, list, all_q_node)
2451 blk_freeze_queue_start(q);
2452 list_for_each_entry(q, list, all_q_node) {
2453 blk_mq_freeze_queue_wait(q);
		/*
		 * The timeout handler can't touch a frozen hw queue, but
		 * timeout work may already have been queued; make sure it
		 * is not running once we return.
		 */
2459 del_timer_sync(&q->timeout);
2460 }
2461}
2462
/*
 * Freeze every queue on all_q_list. Queues flagged as front_queue are
 * frozen first and queues flagged as tail_queue are frozen last, with the
 * remaining queues in between; afterwards the temporary front/tail lists
 * are spliced back onto all_q_list.
 */
2481static void __blk_mq_freeze_all_queue_list(void)
2482{
2483 struct request_queue *q, *next;
2484 LIST_HEAD(front);
2485 LIST_HEAD(tail);
2486
2487 list_for_each_entry_safe(q, next, &all_q_list, all_q_node) {
2488 if (q->front_queue)
2489 list_move(&q->all_q_node, &front);
2490 else if (q->tail_queue)
2491 list_move(&q->all_q_node, &tail);
2492 }
2493
2494 blk_mq_freeze_queue_list(&front);
2495 blk_mq_freeze_queue_list(&all_q_list);
2496 blk_mq_freeze_queue_list(&tail);
2497
2498 list_splice(&front, &all_q_list);
2499 list_splice_tail(&tail, &all_q_list);
2500}
2501
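/*
 * CPU hotplug notifier: on CPU_UP_PREPARE, or after a CPU has gone away
 * (CPU_DEAD/CPU_UP_CANCELED), freeze all blk-mq queues, remap the software
 * queues against the resulting online mask, and unfreeze everything again.
 */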
static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
				      unsigned long action, void *hcpu)
{
	struct request_queue *q;
	int cpu = (unsigned long)hcpu;
	/*
	 * New online cpumask which is going to be set in this hotplug event.
	 * Declare it static: hotplug notifications are serialized, and a
	 * dynamic allocation here could fail.
	 */
	static struct cpumask online_new;

	/*
	 * Before a hotadded cpu starts handling requests, new mappings must
	 * be established.  Otherwise, requests in its hw queue might never
	 * be dispatched.
	 *
	 * For example, there is a single hw queue (hctx) and two CPU queues
	 * (ctx0 for CPU0, and ctx1 for CPU1).
	 *
	 * Now CPU1 is just onlined and a request is inserted into
	 * ctx1->rq_list, setting bit0 in the pending bitmap, as
	 * ctx1->index_hw is still zero.
	 *
	 * Then, while running the hw queue, flush_busy_ctxs() finds bit0 set
	 * in the pending bitmap and tries to retrieve requests from
	 * hctx->ctxs[0]->rq_list.  But hctx->ctxs[0] points to ctx0, so the
	 * request in ctx1->rq_list is ignored.
	 */
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		cpumask_copy(&online_new, cpu_online_mask);
		break;
	case CPU_UP_PREPARE:
		cpumask_copy(&online_new, cpu_online_mask);
		cpumask_set_cpu(cpu, &online_new);
		break;
	default:
		return NOTIFY_OK;
	}

	mutex_lock(&all_q_mutex);

	__blk_mq_freeze_all_queue_list();

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_queue_reinit(q, &online_new);

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_unfreeze_queue(q);

	mutex_unlock(&all_q_mutex);
	return NOTIFY_OK;
}

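/*
 * Allocate a request map for every hardware queue in the set, undoing the
 * ones already allocated if any allocation fails.
 */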
static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
	int i;

	for (i = 0; i < set->nr_hw_queues; i++)
		if (!__blk_mq_alloc_rq_map(set, i))
			goto out_unwind;

	return 0;

out_unwind:
	while (--i >= 0)
		blk_mq_free_rq_map(set->tags[i]);

	return -ENOMEM;
}

/*
 * Allocate the request maps associated with this tag_set. Note that this
 * may reduce the depth asked for, if memory is tight. set->queue_depth
 * will be updated to reflect the allocated depth.
 */
static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
	unsigned int depth;
	int err;

	depth = set->queue_depth;
	do {
		err = __blk_mq_alloc_rq_maps(set);
		if (!err)
			break;

		set->queue_depth >>= 1;
		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
			err = -ENOMEM;
			break;
		}
	} while (set->queue_depth);

	if (!set->queue_depth || err) {
		pr_err("blk-mq: failed to allocate request map\n");
		return -ENOMEM;
	}

	if (depth != set->queue_depth)
		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
			depth, set->queue_depth);

	return 0;
}

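/*
 * Build the CPU-to-hardware-queue map, using the driver's map_queues
 * callback (exposed via aux_ops here) when one is provided, otherwise
 * falling back to the default blk_mq_map_queues() spread.
 */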
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
	if (set->ops->aux_ops && set->ops->aux_ops->map_queues)
		return set->ops->aux_ops->map_queues(set);
	else
		return blk_mq_map_queues(set);
}

/*
 * Alloc a tag set to be associated with one or more request queues.
 * May fail with EINVAL for various error conditions. May adjust the
 * requested depth down, if it's too large. In that case, the set
 * value will be stored in set->queue_depth.
 */
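/*
 * Typical usage (a sketch, not code from this file; "my_mq_ops" and the
 * chosen numbers are only placeholders): a driver fills in a tag set and
 * pairs this call with blk_mq_init_queue() and blk_mq_free_tag_set():
 *
 *	set->ops = &my_mq_ops;		// must provide at least .queue_rq
 *	set->nr_hw_queues = 1;
 *	set->queue_depth = 64;
 *	set->numa_node = NUMA_NO_NODE;
 *	set->flags = BLK_MQ_F_SHOULD_MERGE;
 *	if (blk_mq_alloc_tag_set(set))
 *		goto out;
 *	q = blk_mq_init_queue(set);	// calls blk_mq_init_allocated_queue()
 *	if (IS_ERR(q)) {
 *		blk_mq_free_tag_set(set);
 *		goto out;
 *	}
 */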
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
	int ret;

	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);

	if (!set->nr_hw_queues)
		return -EINVAL;
	if (!set->queue_depth)
		return -EINVAL;
	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
		return -EINVAL;

	if (!set->ops->queue_rq)
		return -EINVAL;

	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
		pr_info("blk-mq: reduced tag depth to %u\n",
			BLK_MQ_MAX_DEPTH);
		set->queue_depth = BLK_MQ_MAX_DEPTH;
	}

	/*
	 * If a crashdump is active, then we are potentially in a very
	 * memory constrained environment. Limit us to 1 queue and
	 * 64 tags to prevent using too much memory.
	 */
	if (is_kdump_kernel()) {
		set->nr_hw_queues = 1;
		set->queue_depth = min(64U, set->queue_depth);
	}

	/*
	 * There is no use for more h/w queues than cpus.
	 */
	if (set->nr_hw_queues > nr_cpu_ids)
		set->nr_hw_queues = nr_cpu_ids;

	set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
				 GFP_KERNEL, set->numa_node);
	if (!set->tags)
		return -ENOMEM;

	ret = -ENOMEM;
	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
				   GFP_KERNEL, set->numa_node);
	if (!set->mq_map)
		goto out_free_tags;

	ret = blk_mq_update_queue_map(set);
	if (ret)
		goto out_free_mq_map;

	ret = blk_mq_alloc_rq_maps(set);
	if (ret)
		goto out_free_mq_map;

	mutex_init(&set->tag_list_lock);
	INIT_LIST_HEAD(&set->tag_list);

	return 0;

out_free_mq_map:
	kfree(set->mq_map);
	set->mq_map = NULL;
out_free_tags:
	kfree(set->tags);
	set->tags = NULL;
	return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);

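/*
 * Release everything allocated by blk_mq_alloc_tag_set(): the per-hardware-
 * queue tags and requests, the CPU mapping table and the tags pointer array.
 */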
void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
	int i;

	for (i = 0; i < nr_cpu_ids; i++)
		blk_mq_free_map_and_requests(set, i);

	kfree(set->mq_map);
	set->mq_map = NULL;

	kfree(set->tags);
	set->tags = NULL;
}
EXPORT_SYMBOL(blk_mq_free_tag_set);

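/*
 * Adjust the queue depth (nr_requests) of a live queue.  The queue is
 * frozen and quiesced while the tag depths are updated; without an I/O
 * scheduler the driver tags themselves are resized (capped at the tag
 * set's depth), with a scheduler only the scheduler tags are.
 */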
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int i, ret;

	if (!set)
		return -EINVAL;

	blk_mq_freeze_queue(q);
	blk_mq_quiesce_queue(q);

	ret = 0;
	queue_for_each_hw_ctx(q, hctx, i) {
		if (!hctx->tags)
			continue;

		/*
		 * If we're using an MQ scheduler, just update the scheduler
		 * queue depth. This is similar to what the old code would do.
		 */
		if (!hctx->sched_tags) {
			ret = blk_mq_tag_update_depth(hctx, &hctx->tags,
							min(nr, set->queue_depth),
							false);
		} else {
			ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
							nr, true);
		}
		if (ret)
			break;
	}

	if (!ret)
		q->nr_requests = nr;

	blk_mq_unfreeze_queue(q);
	blk_mq_start_stopped_hw_queues(q, true);

	return ret;
}

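/*
 * Change the number of hardware queues for every request queue sharing
 * this tag set: freeze all of them, rebuild the CPU mapping, reallocate
 * hardware contexts as needed, then remap and unfreeze.  Called with
 * set->tag_list_lock held.
 */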
static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
					 int nr_hw_queues)
{
	struct request_queue *q;

	lockdep_assert_held(&set->tag_list_lock);

	if (nr_hw_queues > nr_cpu_ids)
		nr_hw_queues = nr_cpu_ids;
	if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
		return;

	list_for_each_entry(q, &set->tag_list, tag_set_list)
		blk_mq_freeze_queue(q);

	set->nr_hw_queues = nr_hw_queues;
	blk_mq_update_queue_map(set);
	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		blk_mq_realloc_hw_ctxs(set, q);
		blk_mq_queue_reinit(q, cpu_online_mask);
	}

	list_for_each_entry(q, &set->tag_list, tag_set_list)
		blk_mq_unfreeze_queue(q);
}

void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
	mutex_lock(&set->tag_list_lock);
	__blk_mq_update_nr_hw_queues(set, nr_hw_queues);
	mutex_unlock(&set->tag_list_lock);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);

static void blk_mq_poll_stats_start(struct request_queue *q)
{
	/*
	 * We don't arm the callback if polling stats are not enabled or the
	 * callback is already active.
	 */
	if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
	    blk_stat_is_active(q->poll_cb))
		return;

	blk_stat_activate_msecs(q->poll_cb, 100);
}

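/*
 * Callback run when the poll-statistics collection window expires: copy the
 * completed READ and WRITE latency statistics into q->poll_stat for later
 * consumption.
 */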
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
{
	struct request_queue *q = cb->data;

	if (cb->stat[READ].nr_samples)
		q->poll_stat[READ] = cb->stat[READ];
	if (cb->stat[WRITE].nr_samples)
		q->poll_stat[WRITE] = cb->stat[WRITE];
}

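/*
 * blk_mq_disable_hotplug()/blk_mq_enable_hotplug() take and release
 * all_q_mutex, the same lock the CPU hotplug notifier holds while remapping
 * queues, so callers can temporarily exclude that remapping.
 */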
void blk_mq_disable_hotplug(void)
{
	mutex_lock(&all_q_mutex);
}

void blk_mq_enable_hotplug(void)
{
	mutex_unlock(&all_q_mutex);
}

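/*
 * Early init: set up the per-CPU notifier infrastructure used by blk-mq and
 * register the CPU hotplug notifier that remaps queues when CPUs come and go.
 */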
static int __init blk_mq_init(void)
{
	blk_mq_cpu_init();

	hotcpu_notifier(blk_mq_queue_reinit_notify, 0);

	return 0;
}
subsys_initcall(blk_mq_init);
