linux/drivers/nvme/host/rdma.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * NVMe over Fabrics RDMA host code.
   4 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
   5 */
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7#include <linux/module.h>
   8#include <linux/init.h>
   9#include <linux/slab.h>
  10#include <rdma/mr_pool.h>
  11#include <linux/err.h>
  12#include <linux/string.h>
  13#include <linux/atomic.h>
  14#include <linux/blk-mq.h>
  15#include <linux/blk-mq-rdma.h>
  16#include <linux/types.h>
  17#include <linux/list.h>
  18#include <linux/mutex.h>
  19#include <linux/scatterlist.h>
  20#include <linux/nvme.h>
  21#include <asm/unaligned.h>
  22
  23#include <rdma/ib_verbs.h>
  24#include <rdma/rdma_cm.h>
  25#include <linux/nvme-rdma.h>
  26
  27#include "nvme.h"
  28#include "fabrics.h"
  29
  30
   31#define NVME_RDMA_CONNECT_TIMEOUT_MS    3000            /* 3 seconds */
  32
  33#define NVME_RDMA_MAX_SEGMENTS          256
  34
  35#define NVME_RDMA_MAX_INLINE_SEGMENTS   4
  36
  37struct nvme_rdma_device {
  38        struct ib_device        *dev;
  39        struct ib_pd            *pd;
  40        struct kref             ref;
  41        struct list_head        entry;
  42        unsigned int            num_inline_segments;
  43};
  44
  45struct nvme_rdma_qe {
  46        struct ib_cqe           cqe;
  47        void                    *data;
  48        u64                     dma;
  49};
  50
  51struct nvme_rdma_queue;
  52struct nvme_rdma_request {
  53        struct nvme_request     req;
  54        struct ib_mr            *mr;
  55        struct nvme_rdma_qe     sqe;
  56        union nvme_result       result;
  57        __le16                  status;
  58        refcount_t              ref;
  59        struct ib_sge           sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
  60        u32                     num_sge;
  61        int                     nents;
  62        struct ib_reg_wr        reg_wr;
  63        struct ib_cqe           reg_cqe;
  64        struct nvme_rdma_queue  *queue;
  65        struct sg_table         sg_table;
  66        struct scatterlist      first_sgl[];
  67};
  68
  69enum nvme_rdma_queue_flags {
  70        NVME_RDMA_Q_ALLOCATED           = 0,
  71        NVME_RDMA_Q_LIVE                = 1,
  72        NVME_RDMA_Q_TR_READY            = 2,
  73};
  74
  75struct nvme_rdma_queue {
  76        struct nvme_rdma_qe     *rsp_ring;
  77        int                     queue_size;
  78        size_t                  cmnd_capsule_len;
  79        struct nvme_rdma_ctrl   *ctrl;
  80        struct nvme_rdma_device *device;
  81        struct ib_cq            *ib_cq;
  82        struct ib_qp            *qp;
  83
  84        unsigned long           flags;
  85        struct rdma_cm_id       *cm_id;
  86        int                     cm_error;
  87        struct completion       cm_done;
  88};
  89
  90struct nvme_rdma_ctrl {
  91        /* read only in the hot path */
  92        struct nvme_rdma_queue  *queues;
  93
  94        /* other member variables */
  95        struct blk_mq_tag_set   tag_set;
  96        struct work_struct      err_work;
  97
  98        struct nvme_rdma_qe     async_event_sqe;
  99
 100        struct delayed_work     reconnect_work;
 101
 102        struct list_head        list;
 103
 104        struct blk_mq_tag_set   admin_tag_set;
 105        struct nvme_rdma_device *device;
 106
 107        u32                     max_fr_pages;
 108
 109        struct sockaddr_storage addr;
 110        struct sockaddr_storage src_addr;
 111
 112        struct nvme_ctrl        ctrl;
 113        bool                    use_inline_data;
 114        u32                     io_queues[HCTX_MAX_TYPES];
 115};
 116
 117static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
 118{
 119        return container_of(ctrl, struct nvme_rdma_ctrl, ctrl);
 120}
 121
 122static LIST_HEAD(device_list);
 123static DEFINE_MUTEX(device_list_mutex);
 124
 125static LIST_HEAD(nvme_rdma_ctrl_list);
 126static DEFINE_MUTEX(nvme_rdma_ctrl_mutex);
 127
 128/*
  129 * Disabling this option makes small I/O go faster, but is fundamentally
 130 * unsafe.  With it turned off we will have to register a global rkey that
 131 * allows read and write access to all physical memory.
 132 */
 133static bool register_always = true;
 134module_param(register_always, bool, 0444);
 135MODULE_PARM_DESC(register_always,
 136         "Use memory registration even for contiguous memory regions");
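     /*
      * Usage note (illustrative, not part of the original source): the
      * parameter is read-only at runtime (perm 0444), so it can only be set
      * at module load time, e.g. "modprobe nvme_rdma register_always=N".
      * Leaving it at the default of Y keeps per-I/O memory registration and
      * avoids exposing the unsafe global rkey described above.
      */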
 137
 138static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
 139                struct rdma_cm_event *event);
 140static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
 141
 142static const struct blk_mq_ops nvme_rdma_mq_ops;
 143static const struct blk_mq_ops nvme_rdma_admin_mq_ops;
 144
 145/* XXX: really should move to a generic header sooner or later.. */
 146static inline void put_unaligned_le24(u32 val, u8 *p)
 147{
 148        *p++ = val;
 149        *p++ = val >> 8;
 150        *p++ = val >> 16;
 151}
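     /*
      * Example (illustrative): put_unaligned_le24(0x123456, p) stores 0x56,
      * 0x34, 0x12 at p[0], p[1], p[2], i.e. a little-endian 24-bit value.
      * It is used below to fill the 3-byte length field of keyed SGL
      * descriptors.
      */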
 152
 153static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
 154{
 155        return queue - queue->ctrl->queues;
 156}
 157
 158static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue)
 159{
 160        return nvme_rdma_queue_idx(queue) >
 161                queue->ctrl->io_queues[HCTX_TYPE_DEFAULT] +
 162                queue->ctrl->io_queues[HCTX_TYPE_READ];
 163}
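     /*
      * Illustrative example (numbers are not from the original source): with
      * io_queues[HCTX_TYPE_DEFAULT] = 4 and io_queues[HCTX_TYPE_READ] = 2,
      * queue indices 1..6 are default/read queues and any higher index makes
      * the check above return true, i.e. it is treated as a poll queue.
      */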
 164
 165static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
 166{
 167        return queue->cmnd_capsule_len - sizeof(struct nvme_command);
 168}
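     /*
      * Example (illustrative): an I/O queue negotiated with ioccsz = 260 has
      * a 260 * 16 = 4160 byte command capsule, leaving 4096 bytes of
      * in-capsule (inline) data after the 64-byte SQE.
      */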
 169
 170static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
 171                size_t capsule_size, enum dma_data_direction dir)
 172{
 173        ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir);
 174        kfree(qe->data);
 175}
 176
 177static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
 178                size_t capsule_size, enum dma_data_direction dir)
 179{
 180        qe->data = kzalloc(capsule_size, GFP_KERNEL);
 181        if (!qe->data)
 182                return -ENOMEM;
 183
 184        qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir);
 185        if (ib_dma_mapping_error(ibdev, qe->dma)) {
 186                kfree(qe->data);
 187                qe->data = NULL;
 188                return -ENOMEM;
 189        }
 190
 191        return 0;
 192}
 193
 194static void nvme_rdma_free_ring(struct ib_device *ibdev,
 195                struct nvme_rdma_qe *ring, size_t ib_queue_size,
 196                size_t capsule_size, enum dma_data_direction dir)
 197{
 198        int i;
 199
 200        for (i = 0; i < ib_queue_size; i++)
 201                nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir);
 202        kfree(ring);
 203}
 204
 205static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
 206                size_t ib_queue_size, size_t capsule_size,
 207                enum dma_data_direction dir)
 208{
 209        struct nvme_rdma_qe *ring;
 210        int i;
 211
 212        ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL);
 213        if (!ring)
 214                return NULL;
 215
 216        /*
 217         * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
  218         * lifetime. It's safe, since any change in the underlying RDMA device
 219         * will issue error recovery and queue re-creation.
 220         */
 221        for (i = 0; i < ib_queue_size; i++) {
 222                if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
 223                        goto out_free_ring;
 224        }
 225
 226        return ring;
 227
 228out_free_ring:
 229        nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir);
 230        return NULL;
 231}
 232
 233static void nvme_rdma_qp_event(struct ib_event *event, void *context)
 234{
 235        pr_debug("QP event %s (%d)\n",
 236                 ib_event_msg(event->event), event->event);
 237
 238}
 239
 240static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
 241{
 242        int ret;
 243
 244        ret = wait_for_completion_interruptible_timeout(&queue->cm_done,
 245                        msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
 246        if (ret < 0)
 247                return ret;
 248        if (ret == 0)
 249                return -ETIMEDOUT;
 250        WARN_ON_ONCE(queue->cm_error > 0);
 251        return queue->cm_error;
 252}
 253
 254static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
 255{
 256        struct nvme_rdma_device *dev = queue->device;
 257        struct ib_qp_init_attr init_attr;
 258        int ret;
 259
 260        memset(&init_attr, 0, sizeof(init_attr));
 261        init_attr.event_handler = nvme_rdma_qp_event;
 262        /* +1 for drain */
 263        init_attr.cap.max_send_wr = factor * queue->queue_size + 1;
 264        /* +1 for drain */
 265        init_attr.cap.max_recv_wr = queue->queue_size + 1;
 266        init_attr.cap.max_recv_sge = 1;
 267        init_attr.cap.max_send_sge = 1 + dev->num_inline_segments;
 268        init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 269        init_attr.qp_type = IB_QPT_RC;
 270        init_attr.send_cq = queue->ib_cq;
 271        init_attr.recv_cq = queue->ib_cq;
 272
 273        ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
 274
 275        queue->qp = queue->cm_id->qp;
 276        return ret;
 277}
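     /*
      * Sizing note (illustrative): nvme_rdma_create_queue_ib() passes
      * factor = 3 (up to one MR, one SEND and one INV work request per
      * command), so a queue of depth N gets a send queue of 3 * N + 1 WRs
      * and a receive queue of N + 1 WRs, the extra WR in each case being
      * reserved for ib_drain_qp().
      */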
 278
 279static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
 280                struct request *rq, unsigned int hctx_idx)
 281{
 282        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 283
 284        kfree(req->sqe.data);
 285}
 286
 287static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
 288                struct request *rq, unsigned int hctx_idx,
 289                unsigned int numa_node)
 290{
 291        struct nvme_rdma_ctrl *ctrl = set->driver_data;
 292        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 293        int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
 294        struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
 295
 296        nvme_req(rq)->ctrl = &ctrl->ctrl;
 297        req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL);
 298        if (!req->sqe.data)
 299                return -ENOMEM;
 300
 301        req->queue = queue;
 302
 303        return 0;
 304}
 305
 306static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 307                unsigned int hctx_idx)
 308{
 309        struct nvme_rdma_ctrl *ctrl = data;
 310        struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1];
 311
 312        BUG_ON(hctx_idx >= ctrl->ctrl.queue_count);
 313
 314        hctx->driver_data = queue;
 315        return 0;
 316}
 317
 318static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 319                unsigned int hctx_idx)
 320{
 321        struct nvme_rdma_ctrl *ctrl = data;
 322        struct nvme_rdma_queue *queue = &ctrl->queues[0];
 323
 324        BUG_ON(hctx_idx != 0);
 325
 326        hctx->driver_data = queue;
 327        return 0;
 328}
 329
 330static void nvme_rdma_free_dev(struct kref *ref)
 331{
 332        struct nvme_rdma_device *ndev =
 333                container_of(ref, struct nvme_rdma_device, ref);
 334
 335        mutex_lock(&device_list_mutex);
 336        list_del(&ndev->entry);
 337        mutex_unlock(&device_list_mutex);
 338
 339        ib_dealloc_pd(ndev->pd);
 340        kfree(ndev);
 341}
 342
 343static void nvme_rdma_dev_put(struct nvme_rdma_device *dev)
 344{
 345        kref_put(&dev->ref, nvme_rdma_free_dev);
 346}
 347
 348static int nvme_rdma_dev_get(struct nvme_rdma_device *dev)
 349{
 350        return kref_get_unless_zero(&dev->ref);
 351}
 352
 353static struct nvme_rdma_device *
 354nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
 355{
 356        struct nvme_rdma_device *ndev;
 357
 358        mutex_lock(&device_list_mutex);
 359        list_for_each_entry(ndev, &device_list, entry) {
 360                if (ndev->dev->node_guid == cm_id->device->node_guid &&
 361                    nvme_rdma_dev_get(ndev))
 362                        goto out_unlock;
 363        }
 364
 365        ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
 366        if (!ndev)
 367                goto out_err;
 368
 369        ndev->dev = cm_id->device;
 370        kref_init(&ndev->ref);
 371
 372        ndev->pd = ib_alloc_pd(ndev->dev,
 373                register_always ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY);
 374        if (IS_ERR(ndev->pd))
 375                goto out_free_dev;
 376
 377        if (!(ndev->dev->attrs.device_cap_flags &
 378              IB_DEVICE_MEM_MGT_EXTENSIONS)) {
 379                dev_err(&ndev->dev->dev,
 380                        "Memory registrations not supported.\n");
 381                goto out_free_pd;
 382        }
 383
 384        ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS,
 385                                        ndev->dev->attrs.max_send_sge - 1);
 386        list_add(&ndev->entry, &device_list);
 387out_unlock:
 388        mutex_unlock(&device_list_mutex);
 389        return ndev;
 390
 391out_free_pd:
 392        ib_dealloc_pd(ndev->pd);
 393out_free_dev:
 394        kfree(ndev);
 395out_err:
 396        mutex_unlock(&device_list_mutex);
 397        return NULL;
 398}
 399
 400static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
 401{
 402        struct nvme_rdma_device *dev;
 403        struct ib_device *ibdev;
 404
 405        if (!test_and_clear_bit(NVME_RDMA_Q_TR_READY, &queue->flags))
 406                return;
 407
 408        dev = queue->device;
 409        ibdev = dev->dev;
 410
 411        ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
 412
 413        /*
 414         * The cm_id object might have been destroyed during RDMA connection
 415         * establishment error flow to avoid getting other cma events, thus
 416         * the destruction of the QP shouldn't use rdma_cm API.
 417         */
 418        ib_destroy_qp(queue->qp);
 419        ib_free_cq(queue->ib_cq);
 420
 421        nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
 422                        sizeof(struct nvme_completion), DMA_FROM_DEVICE);
 423
 424        nvme_rdma_dev_put(dev);
 425}
 426
 427static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
 428{
 429        return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
 430                     ibdev->attrs.max_fast_reg_page_list_len - 1);
 431}
 432
 433static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
 434{
 435        struct ib_device *ibdev;
 436        const int send_wr_factor = 3;                   /* MR, SEND, INV */
 437        const int cq_factor = send_wr_factor + 1;       /* + RECV */
 438        int comp_vector, idx = nvme_rdma_queue_idx(queue);
 439        enum ib_poll_context poll_ctx;
 440        int ret, pages_per_mr;
 441
 442        queue->device = nvme_rdma_find_get_device(queue->cm_id);
 443        if (!queue->device) {
 444                dev_err(queue->cm_id->device->dev.parent,
 445                        "no client data found!\n");
 446                return -ECONNREFUSED;
 447        }
 448        ibdev = queue->device->dev;
 449
 450        /*
  451         * Spread I/O queue completion vectors according to their queue index.
  452         * The admin queue can always go on completion vector 0.
 453         */
 454        comp_vector = idx == 0 ? idx : idx - 1;
 455
 456        /* Polling queues need direct cq polling context */
 457        if (nvme_rdma_poll_queue(queue))
 458                poll_ctx = IB_POLL_DIRECT;
 459        else
 460                poll_ctx = IB_POLL_SOFTIRQ;
 461
 462        /* +1 for ib_stop_cq */
 463        queue->ib_cq = ib_alloc_cq(ibdev, queue,
 464                                cq_factor * queue->queue_size + 1,
 465                                comp_vector, poll_ctx);
 466        if (IS_ERR(queue->ib_cq)) {
 467                ret = PTR_ERR(queue->ib_cq);
 468                goto out_put_dev;
 469        }
 470
 471        ret = nvme_rdma_create_qp(queue, send_wr_factor);
 472        if (ret)
 473                goto out_destroy_ib_cq;
 474
 475        queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size,
 476                        sizeof(struct nvme_completion), DMA_FROM_DEVICE);
 477        if (!queue->rsp_ring) {
 478                ret = -ENOMEM;
 479                goto out_destroy_qp;
 480        }
 481
 482        /*
  483         * Currently we don't use SG_GAPS MRs, so a misaligned first entry
  484         * can make a single data page span two MR entries; one additional
  485         * entry is therefore required.
 486         */
 487        pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev) + 1;
 488        ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
 489                              queue->queue_size,
 490                              IB_MR_TYPE_MEM_REG,
 491                              pages_per_mr, 0);
 492        if (ret) {
 493                dev_err(queue->ctrl->ctrl.device,
 494                        "failed to initialize MR pool sized %d for QID %d\n",
 495                        queue->queue_size, idx);
 496                goto out_destroy_ring;
 497        }
 498
 499        set_bit(NVME_RDMA_Q_TR_READY, &queue->flags);
 500
 501        return 0;
 502
 503out_destroy_ring:
 504        nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
 505                            sizeof(struct nvme_completion), DMA_FROM_DEVICE);
 506out_destroy_qp:
 507        rdma_destroy_qp(queue->cm_id);
 508out_destroy_ib_cq:
 509        ib_free_cq(queue->ib_cq);
 510out_put_dev:
 511        nvme_rdma_dev_put(queue->device);
 512        return ret;
 513}
 514
 515static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
 516                int idx, size_t queue_size)
 517{
 518        struct nvme_rdma_queue *queue;
 519        struct sockaddr *src_addr = NULL;
 520        int ret;
 521
 522        queue = &ctrl->queues[idx];
 523        queue->ctrl = ctrl;
 524        init_completion(&queue->cm_done);
 525
 526        if (idx > 0)
 527                queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
 528        else
 529                queue->cmnd_capsule_len = sizeof(struct nvme_command);
 530
 531        queue->queue_size = queue_size;
 532
 533        queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
 534                        RDMA_PS_TCP, IB_QPT_RC);
 535        if (IS_ERR(queue->cm_id)) {
 536                dev_info(ctrl->ctrl.device,
 537                        "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id));
 538                return PTR_ERR(queue->cm_id);
 539        }
 540
 541        if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
 542                src_addr = (struct sockaddr *)&ctrl->src_addr;
 543
 544        queue->cm_error = -ETIMEDOUT;
 545        ret = rdma_resolve_addr(queue->cm_id, src_addr,
 546                        (struct sockaddr *)&ctrl->addr,
 547                        NVME_RDMA_CONNECT_TIMEOUT_MS);
 548        if (ret) {
 549                dev_info(ctrl->ctrl.device,
 550                        "rdma_resolve_addr failed (%d).\n", ret);
 551                goto out_destroy_cm_id;
 552        }
 553
 554        ret = nvme_rdma_wait_for_cm(queue);
 555        if (ret) {
 556                dev_info(ctrl->ctrl.device,
 557                        "rdma connection establishment failed (%d)\n", ret);
 558                goto out_destroy_cm_id;
 559        }
 560
 561        set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags);
 562
 563        return 0;
 564
 565out_destroy_cm_id:
 566        rdma_destroy_id(queue->cm_id);
 567        nvme_rdma_destroy_queue_ib(queue);
 568        return ret;
 569}
 570
 571static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
 572{
 573        rdma_disconnect(queue->cm_id);
 574        ib_drain_qp(queue->qp);
 575}
 576
 577static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
 578{
 579        if (!test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
 580                return;
 581        __nvme_rdma_stop_queue(queue);
 582}
 583
 584static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
 585{
 586        if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
 587                return;
 588
 589        nvme_rdma_destroy_queue_ib(queue);
 590        rdma_destroy_id(queue->cm_id);
 591}
 592
 593static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
 594{
 595        int i;
 596
 597        for (i = 1; i < ctrl->ctrl.queue_count; i++)
 598                nvme_rdma_free_queue(&ctrl->queues[i]);
 599}
 600
 601static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl)
 602{
 603        int i;
 604
 605        for (i = 1; i < ctrl->ctrl.queue_count; i++)
 606                nvme_rdma_stop_queue(&ctrl->queues[i]);
 607}
 608
 609static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
 610{
 611        struct nvme_rdma_queue *queue = &ctrl->queues[idx];
 612        bool poll = nvme_rdma_poll_queue(queue);
 613        int ret;
 614
 615        if (idx)
 616                ret = nvmf_connect_io_queue(&ctrl->ctrl, idx, poll);
 617        else
 618                ret = nvmf_connect_admin_queue(&ctrl->ctrl);
 619
 620        if (!ret) {
 621                set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
 622        } else {
 623                if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
 624                        __nvme_rdma_stop_queue(queue);
 625                dev_info(ctrl->ctrl.device,
 626                        "failed to connect queue: %d ret=%d\n", idx, ret);
 627        }
 628        return ret;
 629}
 630
 631static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl)
 632{
 633        int i, ret = 0;
 634
 635        for (i = 1; i < ctrl->ctrl.queue_count; i++) {
 636                ret = nvme_rdma_start_queue(ctrl, i);
 637                if (ret)
 638                        goto out_stop_queues;
 639        }
 640
 641        return 0;
 642
 643out_stop_queues:
 644        for (i--; i >= 1; i--)
 645                nvme_rdma_stop_queue(&ctrl->queues[i]);
 646        return ret;
 647}
 648
 649static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
 650{
 651        struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
 652        struct ib_device *ibdev = ctrl->device->dev;
 653        unsigned int nr_io_queues, nr_default_queues;
 654        unsigned int nr_read_queues, nr_poll_queues;
 655        int i, ret;
 656
 657        nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors,
 658                                min(opts->nr_io_queues, num_online_cpus()));
 659        nr_default_queues =  min_t(unsigned int, ibdev->num_comp_vectors,
 660                                min(opts->nr_write_queues, num_online_cpus()));
 661        nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus());
 662        nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues;
 663
 664        ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
 665        if (ret)
 666                return ret;
 667
 668        ctrl->ctrl.queue_count = nr_io_queues + 1;
 669        if (ctrl->ctrl.queue_count < 2)
 670                return 0;
 671
 672        dev_info(ctrl->ctrl.device,
 673                "creating %d I/O queues.\n", nr_io_queues);
 674
 675        if (opts->nr_write_queues && nr_read_queues < nr_io_queues) {
 676                /*
  677                 * Separate read/write queues:
  678                 * hand out dedicated default queues only after we have
  679                 * sufficient read queues.
 680                 */
 681                ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues;
 682                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
 683                ctrl->io_queues[HCTX_TYPE_DEFAULT] =
 684                        min(nr_default_queues, nr_io_queues);
 685                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
 686        } else {
 687                /*
  688                 * Shared read/write queues:
  689                 * either no write queues were requested, or we don't have
  690                 * sufficient queue count to have dedicated default queues.
 691                 */
 692                ctrl->io_queues[HCTX_TYPE_DEFAULT] =
 693                        min(nr_read_queues, nr_io_queues);
 694                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
 695        }
 696
 697        if (opts->nr_poll_queues && nr_io_queues) {
 698                /* map dedicated poll queues only if we have queues left */
 699                ctrl->io_queues[HCTX_TYPE_POLL] =
 700                        min(nr_poll_queues, nr_io_queues);
 701        }
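             /*
              * Worked example (illustrative, values are not from the original
              * source): with opts->nr_io_queues = 4, opts->nr_write_queues = 4,
              * opts->nr_poll_queues = 2, enough CPUs and completion vectors,
              * and the controller granting all 10 I/O queues, the logic above
              * yields HCTX_TYPE_READ = 4, HCTX_TYPE_DEFAULT = 4 and
              * HCTX_TYPE_POLL = 2.
              */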
 702
 703        for (i = 1; i < ctrl->ctrl.queue_count; i++) {
 704                ret = nvme_rdma_alloc_queue(ctrl, i,
 705                                ctrl->ctrl.sqsize + 1);
 706                if (ret)
 707                        goto out_free_queues;
 708        }
 709
 710        return 0;
 711
 712out_free_queues:
 713        for (i--; i >= 1; i--)
 714                nvme_rdma_free_queue(&ctrl->queues[i]);
 715
 716        return ret;
 717}
 718
 719static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
 720                bool admin)
 721{
 722        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
 723        struct blk_mq_tag_set *set;
 724        int ret;
 725
 726        if (admin) {
 727                set = &ctrl->admin_tag_set;
 728                memset(set, 0, sizeof(*set));
 729                set->ops = &nvme_rdma_admin_mq_ops;
 730                set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
 731                set->reserved_tags = 2; /* connect + keep-alive */
 732                set->numa_node = nctrl->numa_node;
 733                set->cmd_size = sizeof(struct nvme_rdma_request) +
 734                        NVME_INLINE_SG_CNT * sizeof(struct scatterlist);
 735                set->driver_data = ctrl;
 736                set->nr_hw_queues = 1;
 737                set->timeout = ADMIN_TIMEOUT;
 738                set->flags = BLK_MQ_F_NO_SCHED;
 739        } else {
 740                set = &ctrl->tag_set;
 741                memset(set, 0, sizeof(*set));
 742                set->ops = &nvme_rdma_mq_ops;
 743                set->queue_depth = nctrl->sqsize + 1;
 744                set->reserved_tags = 1; /* fabric connect */
 745                set->numa_node = nctrl->numa_node;
 746                set->flags = BLK_MQ_F_SHOULD_MERGE;
 747                set->cmd_size = sizeof(struct nvme_rdma_request) +
 748                        NVME_INLINE_SG_CNT * sizeof(struct scatterlist);
 749                set->driver_data = ctrl;
 750                set->nr_hw_queues = nctrl->queue_count - 1;
 751                set->timeout = NVME_IO_TIMEOUT;
 752                set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
 753        }
 754
 755        ret = blk_mq_alloc_tag_set(set);
 756        if (ret)
 757                return ERR_PTR(ret);
 758
 759        return set;
 760}
 761
 762static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
 763                bool remove)
 764{
 765        if (remove) {
 766                blk_cleanup_queue(ctrl->ctrl.admin_q);
 767                blk_cleanup_queue(ctrl->ctrl.fabrics_q);
 768                blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
 769        }
 770        if (ctrl->async_event_sqe.data) {
 771                nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
 772                                sizeof(struct nvme_command), DMA_TO_DEVICE);
 773                ctrl->async_event_sqe.data = NULL;
 774        }
 775        nvme_rdma_free_queue(&ctrl->queues[0]);
 776}
 777
 778static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 779                bool new)
 780{
 781        int error;
 782
 783        error = nvme_rdma_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
 784        if (error)
 785                return error;
 786
 787        ctrl->device = ctrl->queues[0].device;
 788        ctrl->ctrl.numa_node = dev_to_node(ctrl->device->dev->dma_device);
 789
 790        ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev);
 791
 792        /*
 793         * Bind the async event SQE DMA mapping to the admin queue lifetime.
  794 * It's safe, since any change in the underlying RDMA device will issue
 795         * error recovery and queue re-creation.
 796         */
 797        error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
 798                        sizeof(struct nvme_command), DMA_TO_DEVICE);
 799        if (error)
 800                goto out_free_queue;
 801
 802        if (new) {
 803                ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
 804                if (IS_ERR(ctrl->ctrl.admin_tagset)) {
 805                        error = PTR_ERR(ctrl->ctrl.admin_tagset);
 806                        goto out_free_async_qe;
 807                }
 808
 809                ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
 810                if (IS_ERR(ctrl->ctrl.fabrics_q)) {
 811                        error = PTR_ERR(ctrl->ctrl.fabrics_q);
 812                        goto out_free_tagset;
 813                }
 814
 815                ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
 816                if (IS_ERR(ctrl->ctrl.admin_q)) {
 817                        error = PTR_ERR(ctrl->ctrl.admin_q);
 818                        goto out_cleanup_fabrics_q;
 819                }
 820        }
 821
 822        error = nvme_rdma_start_queue(ctrl, 0);
 823        if (error)
 824                goto out_cleanup_queue;
 825
 826        error = nvme_enable_ctrl(&ctrl->ctrl);
 827        if (error)
 828                goto out_stop_queue;
 829
 830        ctrl->ctrl.max_segments = ctrl->max_fr_pages;
 831        ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9);
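             /*
              * E.g. (illustrative) with max_fr_pages = 256 this allows up to
              * 256 4K pages per transfer, i.e. max_hw_sectors = 2048
              * 512-byte sectors (1 MiB).
              */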
 832
 833        blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
 834
 835        error = nvme_init_identify(&ctrl->ctrl);
 836        if (error)
 837                goto out_stop_queue;
 838
 839        return 0;
 840
 841out_stop_queue:
 842        nvme_rdma_stop_queue(&ctrl->queues[0]);
 843out_cleanup_queue:
 844        if (new)
 845                blk_cleanup_queue(ctrl->ctrl.admin_q);
 846out_cleanup_fabrics_q:
 847        if (new)
 848                blk_cleanup_queue(ctrl->ctrl.fabrics_q);
 849out_free_tagset:
 850        if (new)
 851                blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
 852out_free_async_qe:
 853        nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
 854                sizeof(struct nvme_command), DMA_TO_DEVICE);
 855        ctrl->async_event_sqe.data = NULL;
 856out_free_queue:
 857        nvme_rdma_free_queue(&ctrl->queues[0]);
 858        return error;
 859}
 860
 861static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl,
 862                bool remove)
 863{
 864        if (remove) {
 865                blk_cleanup_queue(ctrl->ctrl.connect_q);
 866                blk_mq_free_tag_set(ctrl->ctrl.tagset);
 867        }
 868        nvme_rdma_free_io_queues(ctrl);
 869}
 870
 871static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
 872{
 873        int ret;
 874
 875        ret = nvme_rdma_alloc_io_queues(ctrl);
 876        if (ret)
 877                return ret;
 878
 879        if (new) {
 880                ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, false);
 881                if (IS_ERR(ctrl->ctrl.tagset)) {
 882                        ret = PTR_ERR(ctrl->ctrl.tagset);
 883                        goto out_free_io_queues;
 884                }
 885
 886                ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
 887                if (IS_ERR(ctrl->ctrl.connect_q)) {
 888                        ret = PTR_ERR(ctrl->ctrl.connect_q);
 889                        goto out_free_tag_set;
 890                }
 891        } else {
 892                blk_mq_update_nr_hw_queues(&ctrl->tag_set,
 893                        ctrl->ctrl.queue_count - 1);
 894        }
 895
 896        ret = nvme_rdma_start_io_queues(ctrl);
 897        if (ret)
 898                goto out_cleanup_connect_q;
 899
 900        return 0;
 901
 902out_cleanup_connect_q:
 903        if (new)
 904                blk_cleanup_queue(ctrl->ctrl.connect_q);
 905out_free_tag_set:
 906        if (new)
 907                blk_mq_free_tag_set(ctrl->ctrl.tagset);
 908out_free_io_queues:
 909        nvme_rdma_free_io_queues(ctrl);
 910        return ret;
 911}
 912
 913static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
 914                bool remove)
 915{
 916        blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
 917        nvme_rdma_stop_queue(&ctrl->queues[0]);
 918        if (ctrl->ctrl.admin_tagset) {
 919                blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset,
 920                        nvme_cancel_request, &ctrl->ctrl);
 921                blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset);
 922        }
 923        if (remove)
 924                blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
 925        nvme_rdma_destroy_admin_queue(ctrl, remove);
 926}
 927
 928static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
 929                bool remove)
 930{
 931        if (ctrl->ctrl.queue_count > 1) {
 932                nvme_stop_queues(&ctrl->ctrl);
 933                nvme_rdma_stop_io_queues(ctrl);
 934                if (ctrl->ctrl.tagset) {
 935                        blk_mq_tagset_busy_iter(ctrl->ctrl.tagset,
 936                                nvme_cancel_request, &ctrl->ctrl);
 937                        blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset);
 938                }
 939                if (remove)
 940                        nvme_start_queues(&ctrl->ctrl);
 941                nvme_rdma_destroy_io_queues(ctrl, remove);
 942        }
 943}
 944
 945static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
 946{
 947        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
 948
 949        if (list_empty(&ctrl->list))
 950                goto free_ctrl;
 951
 952        mutex_lock(&nvme_rdma_ctrl_mutex);
 953        list_del(&ctrl->list);
 954        mutex_unlock(&nvme_rdma_ctrl_mutex);
 955
 956        nvmf_free_options(nctrl->opts);
 957free_ctrl:
 958        kfree(ctrl->queues);
 959        kfree(ctrl);
 960}
 961
 962static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
 963{
 964        /* If we are resetting/deleting then do nothing */
 965        if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) {
 966                WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
 967                        ctrl->ctrl.state == NVME_CTRL_LIVE);
 968                return;
 969        }
 970
 971        if (nvmf_should_reconnect(&ctrl->ctrl)) {
 972                dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n",
 973                        ctrl->ctrl.opts->reconnect_delay);
 974                queue_delayed_work(nvme_wq, &ctrl->reconnect_work,
 975                                ctrl->ctrl.opts->reconnect_delay * HZ);
 976        } else {
 977                nvme_delete_ctrl(&ctrl->ctrl);
 978        }
 979}
 980
 981static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
 982{
 983        int ret = -EINVAL;
 984        bool changed;
 985
 986        ret = nvme_rdma_configure_admin_queue(ctrl, new);
 987        if (ret)
 988                return ret;
 989
 990        if (ctrl->ctrl.icdoff) {
 991                dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
 992                goto destroy_admin;
 993        }
 994
 995        if (!(ctrl->ctrl.sgls & (1 << 2))) {
 996                dev_err(ctrl->ctrl.device,
 997                        "Mandatory keyed sgls are not supported!\n");
 998                goto destroy_admin;
 999        }
1000
1001        if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) {
1002                dev_warn(ctrl->ctrl.device,
1003                        "queue_size %zu > ctrl sqsize %u, clamping down\n",
1004                        ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
1005        }
1006
1007        if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
1008                dev_warn(ctrl->ctrl.device,
1009                        "sqsize %u > ctrl maxcmd %u, clamping down\n",
1010                        ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
1011                ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
1012        }
1013
1014        if (ctrl->ctrl.sgls & (1 << 20))
1015                ctrl->use_inline_data = true;
1016
1017        if (ctrl->ctrl.queue_count > 1) {
1018                ret = nvme_rdma_configure_io_queues(ctrl, new);
1019                if (ret)
1020                        goto destroy_admin;
1021        }
1022
1023        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
1024        if (!changed) {
1025                /* state change failure is ok if we're in DELETING state */
1026                WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
1027                ret = -EINVAL;
1028                goto destroy_io;
1029        }
1030
1031        nvme_start_ctrl(&ctrl->ctrl);
1032        return 0;
1033
1034destroy_io:
1035        if (ctrl->ctrl.queue_count > 1)
1036                nvme_rdma_destroy_io_queues(ctrl, new);
1037destroy_admin:
1038        nvme_rdma_stop_queue(&ctrl->queues[0]);
1039        nvme_rdma_destroy_admin_queue(ctrl, new);
1040        return ret;
1041}
1042
1043static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
1044{
1045        struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
1046                        struct nvme_rdma_ctrl, reconnect_work);
1047
1048        ++ctrl->ctrl.nr_reconnects;
1049
1050        if (nvme_rdma_setup_ctrl(ctrl, false))
1051                goto requeue;
1052
1053        dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
1054                        ctrl->ctrl.nr_reconnects);
1055
1056        ctrl->ctrl.nr_reconnects = 0;
1057
1058        return;
1059
1060requeue:
1061        dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
1062                        ctrl->ctrl.nr_reconnects);
1063        nvme_rdma_reconnect_or_remove(ctrl);
1064}
1065
1066static void nvme_rdma_error_recovery_work(struct work_struct *work)
1067{
1068        struct nvme_rdma_ctrl *ctrl = container_of(work,
1069                        struct nvme_rdma_ctrl, err_work);
1070
1071        nvme_stop_keep_alive(&ctrl->ctrl);
1072        nvme_rdma_teardown_io_queues(ctrl, false);
1073        nvme_start_queues(&ctrl->ctrl);
1074        nvme_rdma_teardown_admin_queue(ctrl, false);
1075        blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
1076
1077        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
1078                /* state change failure is ok if we're in DELETING state */
1079                WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
1080                return;
1081        }
1082
1083        nvme_rdma_reconnect_or_remove(ctrl);
1084}
1085
1086static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
1087{
1088        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
1089                return;
1090
1091        queue_work(nvme_wq, &ctrl->err_work);
1092}
1093
1094static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
1095                const char *op)
1096{
1097        struct nvme_rdma_queue *queue = cq->cq_context;
1098        struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1099
1100        if (ctrl->ctrl.state == NVME_CTRL_LIVE)
1101                dev_info(ctrl->ctrl.device,
1102                             "%s for CQE 0x%p failed with status %s (%d)\n",
1103                             op, wc->wr_cqe,
1104                             ib_wc_status_msg(wc->status), wc->status);
1105        nvme_rdma_error_recovery(ctrl);
1106}
1107
1108static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
1109{
1110        if (unlikely(wc->status != IB_WC_SUCCESS))
1111                nvme_rdma_wr_error(cq, wc, "MEMREG");
1112}
1113
1114static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
1115{
1116        struct nvme_rdma_request *req =
1117                container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe);
1118        struct request *rq = blk_mq_rq_from_pdu(req);
1119
1120        if (unlikely(wc->status != IB_WC_SUCCESS)) {
1121                nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
1122                return;
1123        }
1124
1125        if (refcount_dec_and_test(&req->ref))
1126                nvme_end_request(rq, req->status, req->result);
1127
1128}
1129
1130static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
1131                struct nvme_rdma_request *req)
1132{
1133        struct ib_send_wr wr = {
1134                .opcode             = IB_WR_LOCAL_INV,
1135                .next               = NULL,
1136                .num_sge            = 0,
1137                .send_flags         = IB_SEND_SIGNALED,
1138                .ex.invalidate_rkey = req->mr->rkey,
1139        };
1140
1141        req->reg_cqe.done = nvme_rdma_inv_rkey_done;
1142        wr.wr_cqe = &req->reg_cqe;
1143
1144        return ib_post_send(queue->qp, &wr, NULL);
1145}
1146
1147static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
1148                struct request *rq)
1149{
1150        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1151        struct nvme_rdma_device *dev = queue->device;
1152        struct ib_device *ibdev = dev->dev;
1153
1154        if (!blk_rq_nr_phys_segments(rq))
1155                return;
1156
1157        if (req->mr) {
1158                ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
1159                req->mr = NULL;
1160        }
1161
1162        ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
1163        sg_free_table_chained(&req->sg_table, NVME_INLINE_SG_CNT);
1164}
1165
1166static int nvme_rdma_set_sg_null(struct nvme_command *c)
1167{
1168        struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
1169
1170        sg->addr = 0;
1171        put_unaligned_le24(0, sg->length);
1172        put_unaligned_le32(0, sg->key);
1173        sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
1174        return 0;
1175}
1176
1177static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
1178                struct nvme_rdma_request *req, struct nvme_command *c,
1179                int count)
1180{
1181        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1182        struct scatterlist *sgl = req->sg_table.sgl;
1183        struct ib_sge *sge = &req->sge[1];
1184        u32 len = 0;
1185        int i;
1186
1187        for (i = 0; i < count; i++, sgl++, sge++) {
1188                sge->addr = sg_dma_address(sgl);
1189                sge->length = sg_dma_len(sgl);
1190                sge->lkey = queue->device->pd->local_dma_lkey;
1191                len += sge->length;
1192        }
1193
1194        sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
1195        sg->length = cpu_to_le32(len);
1196        sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
1197
1198        req->num_sge += count;
1199        return 0;
1200}
1201
1202static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue,
1203                struct nvme_rdma_request *req, struct nvme_command *c)
1204{
1205        struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
1206
1207        sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl));
1208        put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length);
1209        put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key);
1210        sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
1211        return 0;
1212}
1213
1214static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
1215                struct nvme_rdma_request *req, struct nvme_command *c,
1216                int count)
1217{
1218        struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
1219        int nr;
1220
1221        req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs);
1222        if (WARN_ON_ONCE(!req->mr))
1223                return -EAGAIN;
1224
1225        /*
1226         * Align the MR to a 4K page size to match the ctrl page size and
1227         * the block virtual boundary.
1228         */
1229        nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
1230        if (unlikely(nr < count)) {
1231                ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
1232                req->mr = NULL;
1233                if (nr < 0)
1234                        return nr;
1235                return -EINVAL;
1236        }
1237
1238        ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
1239
1240        req->reg_cqe.done = nvme_rdma_memreg_done;
1241        memset(&req->reg_wr, 0, sizeof(req->reg_wr));
1242        req->reg_wr.wr.opcode = IB_WR_REG_MR;
1243        req->reg_wr.wr.wr_cqe = &req->reg_cqe;
1244        req->reg_wr.wr.num_sge = 0;
1245        req->reg_wr.mr = req->mr;
1246        req->reg_wr.key = req->mr->rkey;
1247        req->reg_wr.access = IB_ACCESS_LOCAL_WRITE |
1248                             IB_ACCESS_REMOTE_READ |
1249                             IB_ACCESS_REMOTE_WRITE;
1250
1251        sg->addr = cpu_to_le64(req->mr->iova);
1252        put_unaligned_le24(req->mr->length, sg->length);
1253        put_unaligned_le32(req->mr->rkey, sg->key);
1254        sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) |
1255                        NVME_SGL_FMT_INVALIDATE;
1256
1257        return 0;
1258}
1259
1260static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
1261                struct request *rq, struct nvme_command *c)
1262{
1263        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1264        struct nvme_rdma_device *dev = queue->device;
1265        struct ib_device *ibdev = dev->dev;
1266        int count, ret;
1267
1268        req->num_sge = 1;
1269        refcount_set(&req->ref, 2); /* send and recv completions */
1270
1271        c->common.flags |= NVME_CMD_SGL_METABUF;
1272
1273        if (!blk_rq_nr_phys_segments(rq))
1274                return nvme_rdma_set_sg_null(c);
1275
1276        req->sg_table.sgl = req->first_sgl;
1277        ret = sg_alloc_table_chained(&req->sg_table,
1278                        blk_rq_nr_phys_segments(rq), req->sg_table.sgl,
1279                        NVME_INLINE_SG_CNT);
1280        if (ret)
1281                return -ENOMEM;
1282
1283        req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
1284
1285        count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
1286                              rq_dma_dir(rq));
1287        if (unlikely(count <= 0)) {
1288                ret = -EIO;
1289                goto out_free_table;
1290        }
1291
1292        if (count <= dev->num_inline_segments) {
1293                if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
1294                    queue->ctrl->use_inline_data &&
1295                    blk_rq_payload_bytes(rq) <=
1296                                nvme_rdma_inline_data_size(queue)) {
1297                        ret = nvme_rdma_map_sg_inline(queue, req, c, count);
1298                        goto out;
1299                }
1300
1301                if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
1302                        ret = nvme_rdma_map_sg_single(queue, req, c);
1303                        goto out;
1304                }
1305        }
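             /*
              * Summary (not in the original comments): reaching this point
              * means the request could not be sent as inline data (a write on
              * an I/O queue that fits in the in-capsule data area) and could
              * not use the single-segment unsafe global rkey path, so it is
              * mapped below with a fast-registration MR from the queue's MR
              * pool.
              */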
1306
1307        ret = nvme_rdma_map_sg_fr(queue, req, c, count);
1308out:
1309        if (unlikely(ret))
1310                goto out_unmap_sg;
1311
1312        return 0;
1313
1314out_unmap_sg:
1315        ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq));
1316out_free_table:
1317        sg_free_table_chained(&req->sg_table, NVME_INLINE_SG_CNT);
1318        return ret;
1319}
1320
1321static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
1322{
1323        struct nvme_rdma_qe *qe =
1324                container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
1325        struct nvme_rdma_request *req =
1326                container_of(qe, struct nvme_rdma_request, sqe);
1327        struct request *rq = blk_mq_rq_from_pdu(req);
1328
1329        if (unlikely(wc->status != IB_WC_SUCCESS)) {
1330                nvme_rdma_wr_error(cq, wc, "SEND");
1331                return;
1332        }
1333
1334        if (refcount_dec_and_test(&req->ref))
1335                nvme_end_request(rq, req->status, req->result);
1336}
1337
1338static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
1339                struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
1340                struct ib_send_wr *first)
1341{
1342        struct ib_send_wr wr;
1343        int ret;
1344
1345        sge->addr   = qe->dma;
 1346        sge->length = sizeof(struct nvme_command);
1347        sge->lkey   = queue->device->pd->local_dma_lkey;
1348
1349        wr.next       = NULL;
1350        wr.wr_cqe     = &qe->cqe;
1351        wr.sg_list    = sge;
1352        wr.num_sge    = num_sge;
1353        wr.opcode     = IB_WR_SEND;
1354        wr.send_flags = IB_SEND_SIGNALED;
1355
1356        if (first)
1357                first->next = &wr;
1358        else
1359                first = &wr;
1360
1361        ret = ib_post_send(queue->qp, first, NULL);
1362        if (unlikely(ret)) {
1363                dev_err(queue->ctrl->ctrl.device,
1364                             "%s failed with error code %d\n", __func__, ret);
1365        }
1366        return ret;
1367}
1368
1369static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue,
1370                struct nvme_rdma_qe *qe)
1371{
1372        struct ib_recv_wr wr;
1373        struct ib_sge list;
1374        int ret;
1375
1376        list.addr   = qe->dma;
1377        list.length = sizeof(struct nvme_completion);
1378        list.lkey   = queue->device->pd->local_dma_lkey;
1379
1380        qe->cqe.done = nvme_rdma_recv_done;
1381
1382        wr.next     = NULL;
1383        wr.wr_cqe   = &qe->cqe;
1384        wr.sg_list  = &list;
1385        wr.num_sge  = 1;
1386
1387        ret = ib_post_recv(queue->qp, &wr, NULL);
1388        if (unlikely(ret)) {
1389                dev_err(queue->ctrl->ctrl.device,
1390                        "%s failed with error code %d\n", __func__, ret);
1391        }
1392        return ret;
1393}
1394
1395static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
1396{
1397        u32 queue_idx = nvme_rdma_queue_idx(queue);
1398
1399        if (queue_idx == 0)
1400                return queue->ctrl->admin_tag_set.tags[queue_idx];
1401        return queue->ctrl->tag_set.tags[queue_idx - 1];
1402}
1403
1404static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc)
1405{
1406        if (unlikely(wc->status != IB_WC_SUCCESS))
1407                nvme_rdma_wr_error(cq, wc, "ASYNC");
1408}
1409
1410static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
1411{
1412        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
1413        struct nvme_rdma_queue *queue = &ctrl->queues[0];
1414        struct ib_device *dev = queue->device->dev;
1415        struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe;
1416        struct nvme_command *cmd = sqe->data;
1417        struct ib_sge sge;
1418        int ret;
1419
1420        ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE);
1421
1422        memset(cmd, 0, sizeof(*cmd));
1423        cmd->common.opcode = nvme_admin_async_event;
1424        cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
1425        cmd->common.flags |= NVME_CMD_SGL_METABUF;
1426        nvme_rdma_set_sg_null(cmd);
1427
1428        sqe->cqe.done = nvme_rdma_async_done;
1429
1430        ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
1431                        DMA_TO_DEVICE);
1432
1433        ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL);
1434        WARN_ON_ONCE(ret);
1435}
1436
1437static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1438                struct nvme_completion *cqe, struct ib_wc *wc)
1439{
1440        struct request *rq;
1441        struct nvme_rdma_request *req;
1442
1443        rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
1444        if (!rq) {
1445                dev_err(queue->ctrl->ctrl.device,
1446                        "tag 0x%x on QP %#x not found\n",
1447                        cqe->command_id, queue->qp->qp_num);
1448                nvme_rdma_error_recovery(queue->ctrl);
1449                return;
1450        }
1451        req = blk_mq_rq_to_pdu(rq);
1452
1453        req->status = cqe->status;
1454        req->result = cqe->result;
1455
1456        if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
1457                if (unlikely(wc->ex.invalidate_rkey != req->mr->rkey)) {
1458                        dev_err(queue->ctrl->ctrl.device,
1459                                "Bogus remote invalidation for rkey %#x\n",
1460                                req->mr->rkey);
1461                        nvme_rdma_error_recovery(queue->ctrl);
1462                }
1463        } else if (req->mr) {
1464                int ret;
1465
1466                ret = nvme_rdma_inv_rkey(queue, req);
1467                if (unlikely(ret < 0)) {
1468                        dev_err(queue->ctrl->ctrl.device,
1469                                "Queueing INV WR for rkey %#x failed (%d)\n",
1470                                req->mr->rkey, ret);
1471                        nvme_rdma_error_recovery(queue->ctrl);
1472                }
1473                /* the local invalidation completion will end the request */
1474                return;
1475        }
1476
1477        if (refcount_dec_and_test(&req->ref))
1478                nvme_end_request(rq, req->status, req->result);
1479}
1480
1481static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
1482{
1483        struct nvme_rdma_qe *qe =
1484                container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
1485        struct nvme_rdma_queue *queue = cq->cq_context;
1486        struct ib_device *ibdev = queue->device->dev;
1487        struct nvme_completion *cqe = qe->data;
1488        const size_t len = sizeof(struct nvme_completion);
1489
1490        if (unlikely(wc->status != IB_WC_SUCCESS)) {
1491                nvme_rdma_wr_error(cq, wc, "RECV");
1492                return;
1493        }
1494
1495        ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1496        /*
1497         * AEN requests are special as they don't time out and can
1498         * survive any kind of queue freeze and often don't respond to
1499         * aborts.  We don't even bother to allocate a struct request
1500         * for them but rather special case them here.
1501         */
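             /*
              * Note (illustrative): nvme_rdma_submit_async_event() above posts
              * the AEN command with command_id = NVME_AQ_BLK_MQ_DEPTH, outside
              * the range of tags blk-mq allocates for the admin queue, which
              * is how nvme_is_aen_req() recognizes it.
              */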
1502        if (unlikely(nvme_is_aen_req(nvme_rdma_queue_idx(queue),
1503                                     cqe->command_id)))
1504                nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
1505                                &cqe->result);
1506        else
1507                nvme_rdma_process_nvme_rsp(queue, cqe, wc);
1508        ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1509
1510        nvme_rdma_post_recv(queue, qe);
1511}
1512
1513static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
1514{
1515        int ret, i;
1516
1517        for (i = 0; i < queue->queue_size; i++) {
1518                ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]);
1519                if (ret)
1520                        goto out_destroy_queue_ib;
1521        }
1522
1523        return 0;
1524
1525out_destroy_queue_ib:
1526        nvme_rdma_destroy_queue_ib(queue);
1527        return ret;
1528}
1529
1530static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
1531                struct rdma_cm_event *ev)
1532{
1533        struct rdma_cm_id *cm_id = queue->cm_id;
1534        int status = ev->status;
1535        const char *rej_msg;
1536        const struct nvme_rdma_cm_rej *rej_data;
1537        u8 rej_data_len;
1538
1539        rej_msg = rdma_reject_msg(cm_id, status);
1540        rej_data = rdma_consumer_reject_data(cm_id, ev, &rej_data_len);
1541
1542        if (rej_data && rej_data_len >= sizeof(u16)) {
1543                u16 sts = le16_to_cpu(rej_data->sts);
1544
1545                dev_err(queue->ctrl->ctrl.device,
1546                      "Connect rejected: status %d (%s) nvme status %d (%s).\n",
1547                      status, rej_msg, sts, nvme_rdma_cm_msg(sts));
1548        } else {
1549                dev_err(queue->ctrl->ctrl.device,
1550                        "Connect rejected: status %d (%s).\n", status, rej_msg);
1551        }
1552
1553        return -ECONNRESET;
1554}
1555
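/*
 * RDMA_CM_EVENT_ADDR_RESOLVED: allocate the queue's IB resources via
 * nvme_rdma_create_queue_ib(), apply opts->tos as the CM service type
 * when one is configured, and kick off route resolution.
 */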
1556static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
1557{
1558        struct nvme_ctrl *ctrl = &queue->ctrl->ctrl;
1559        int ret;
1560
1561        ret = nvme_rdma_create_queue_ib(queue);
1562        if (ret)
1563                return ret;
1564
1565        if (ctrl->opts->tos >= 0)
1566                rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
1567        ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
1568        if (ret) {
1569                dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
1570                        ret);
1571                goto out_destroy_queue;
1572        }
1573
1574        return 0;
1575
1576out_destroy_queue:
1577        nvme_rdma_destroy_queue_ib(queue);
1578        return ret;
1579}
1580
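/*
 * RDMA_CM_EVENT_ROUTE_RESOLVED: build the rdma_conn_param and the NVMe
 * over RDMA CM request private data (record format 1.0, queue id, host
 * receive and send queue sizes), then issue rdma_connect().  The admin
 * queue (qid 0) advertises the fixed NVME_AQ_DEPTH; I/O queues advertise
 * their own queue_size/sqsize.
 */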
1581static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
1582{
1583        struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1584        struct rdma_conn_param param = { };
1585        struct nvme_rdma_cm_req priv = { };
1586        int ret;
1587
1588        param.qp_num = queue->qp->qp_num;
1589        param.flow_control = 1;
1590
1591        param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom;
1592        /* maximum retry count */
1593        param.retry_count = 7;
1594        param.rnr_retry_count = 7;
1595        param.private_data = &priv;
1596        param.private_data_len = sizeof(priv);
1597
1598        priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1599        priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue));
1600        /*
1601         * set the admin queue depth to the minimum size
1602         * specified by the Fabrics standard.
1603         */
1604        if (priv.qid == 0) {
1605                priv.hrqsize = cpu_to_le16(NVME_AQ_DEPTH);
1606                priv.hsqsize = cpu_to_le16(NVME_AQ_DEPTH - 1);
1607        } else {
1608                /*
1609                 * The current interpretation of the Fabrics spec is that,
1610                 * at a minimum, hrqsize is sqsize + 1, i.e. the 1's-based
1611                 * representation of sqsize.
1612                 */
1613                priv.hrqsize = cpu_to_le16(queue->queue_size);
1614                priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize);
1615        }
1616
1617        ret = rdma_connect(queue->cm_id, &param);
1618        if (ret) {
1619                dev_err(ctrl->ctrl.device,
1620                        "rdma_connect failed (%d).\n", ret);
1621                goto out_destroy_queue_ib;
1622        }
1623
1624        return 0;
1625
1626out_destroy_queue_ib:
1627        nvme_rdma_destroy_queue_ib(queue);
1628        return ret;
1629}
1630
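/*
 * Central RDMA CM event dispatcher.  Address/route resolution and
 * connection establishment feed their status into queue->cm_error and
 * complete cm_done so the queue setup path waiting on the CM handshake
 * can proceed; connection-setup errors tear down the IB resources first,
 * while events on an established connection trigger error recovery.
 */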
1631static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
1632                struct rdma_cm_event *ev)
1633{
1634        struct nvme_rdma_queue *queue = cm_id->context;
1635        int cm_error = 0;
1636
1637        dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n",
1638                rdma_event_msg(ev->event), ev->event,
1639                ev->status, cm_id);
1640
1641        switch (ev->event) {
1642        case RDMA_CM_EVENT_ADDR_RESOLVED:
1643                cm_error = nvme_rdma_addr_resolved(queue);
1644                break;
1645        case RDMA_CM_EVENT_ROUTE_RESOLVED:
1646                cm_error = nvme_rdma_route_resolved(queue);
1647                break;
1648        case RDMA_CM_EVENT_ESTABLISHED:
1649                queue->cm_error = nvme_rdma_conn_established(queue);
1650                /* complete cm_done regardless of success/failure */
1651                complete(&queue->cm_done);
1652                return 0;
1653        case RDMA_CM_EVENT_REJECTED:
1654                nvme_rdma_destroy_queue_ib(queue);
1655                cm_error = nvme_rdma_conn_rejected(queue, ev);
1656                break;
1657        case RDMA_CM_EVENT_ROUTE_ERROR:
1658        case RDMA_CM_EVENT_CONNECT_ERROR:
1659        case RDMA_CM_EVENT_UNREACHABLE:
1660                nvme_rdma_destroy_queue_ib(queue);
1661                /* fall through */
1662        case RDMA_CM_EVENT_ADDR_ERROR:
1663                dev_dbg(queue->ctrl->ctrl.device,
1664                        "CM error event %d\n", ev->event);
1665                cm_error = -ECONNRESET;
1666                break;
1667        case RDMA_CM_EVENT_DISCONNECTED:
1668        case RDMA_CM_EVENT_ADDR_CHANGE:
1669        case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1670                dev_dbg(queue->ctrl->ctrl.device,
1671                        "disconnect received - connection closed\n");
1672                nvme_rdma_error_recovery(queue->ctrl);
1673                break;
1674        case RDMA_CM_EVENT_DEVICE_REMOVAL:
1675                /* device removal is handled via the ib_client API */
1676                break;
1677        default:
1678                dev_err(queue->ctrl->ctrl.device,
1679                        "Unexpected RDMA CM event (%d)\n", ev->event);
1680                nvme_rdma_error_recovery(queue->ctrl);
1681                break;
1682        }
1683
1684        if (cm_error) {
1685                queue->cm_error = cm_error;
1686                complete(&queue->cm_done);
1687        }
1688
1689        return 0;
1690}
1691
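/*
 * blk-mq timeout handler.  The action depends on controller state: a
 * RESETTING controller just gets the timer rearmed, a controller that is
 * not LIVE is torn down immediately (outstanding requests complete as
 * part of the teardown), and a live controller has error recovery
 * scheduled while the timer is rearmed.
 */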
1692static enum blk_eh_timer_return
1693nvme_rdma_timeout(struct request *rq, bool reserved)
1694{
1695        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1696        struct nvme_rdma_queue *queue = req->queue;
1697        struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1698
1699        dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
1700                 rq->tag, nvme_rdma_queue_idx(queue));
1701
1702        /*
1703         * Restart the timer if a controller reset is already scheduled. Any
1704         * timed out commands would be handled before entering the connecting
1705         * state.
1706         */
1707        if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
1708                return BLK_EH_RESET_TIMER;
1709
1710        if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
1711                /*
1712                 * Teardown immediately if the controller times out while starting
1713                 * or if we have already started error recovery. All outstanding
1714                 * requests are completed on shutdown, so we return BLK_EH_DONE.
1715                 */
1716                flush_work(&ctrl->err_work);
1717                nvme_rdma_teardown_io_queues(ctrl, false);
1718                nvme_rdma_teardown_admin_queue(ctrl, false);
1719                return BLK_EH_DONE;
1720        }
1721
1722        dev_warn(ctrl->ctrl.device, "starting error recovery\n");
1723        nvme_rdma_error_recovery(ctrl);
1724
1725        return BLK_EH_RESET_TIMER;
1726}
1727
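/*
 * I/O submission path: let the fabrics helpers handle commands on a
 * queue that is not live, DMA-map and build the SQE, map the data
 * (inline SGEs or an MR registration), then post the SEND - chained to
 * the REG_MR WR when one was used.  -ENOMEM and -EAGAIN are reported as
 * BLK_STS_RESOURCE so the request gets retried.
 */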
1728static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
1729                const struct blk_mq_queue_data *bd)
1730{
1731        struct nvme_ns *ns = hctx->queue->queuedata;
1732        struct nvme_rdma_queue *queue = hctx->driver_data;
1733        struct request *rq = bd->rq;
1734        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1735        struct nvme_rdma_qe *sqe = &req->sqe;
1736        struct nvme_command *c = sqe->data;
1737        struct ib_device *dev;
1738        bool queue_ready = test_bit(NVME_RDMA_Q_LIVE, &queue->flags);
1739        blk_status_t ret;
1740        int err;
1741
1742        WARN_ON_ONCE(rq->tag < 0);
1743
1744        if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
1745                return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
1746
1747        dev = queue->device->dev;
1748
1749        req->sqe.dma = ib_dma_map_single(dev, req->sqe.data,
1750                                         sizeof(struct nvme_command),
1751                                         DMA_TO_DEVICE);
1752        err = ib_dma_mapping_error(dev, req->sqe.dma);
1753        if (unlikely(err))
1754                return BLK_STS_RESOURCE;
1755
1756        ib_dma_sync_single_for_cpu(dev, sqe->dma,
1757                        sizeof(struct nvme_command), DMA_TO_DEVICE);
1758
1759        ret = nvme_setup_cmd(ns, rq, c);
1760        if (ret)
1761                goto unmap_qe;
1762
1763        blk_mq_start_request(rq);
1764
1765        err = nvme_rdma_map_data(queue, rq, c);
1766        if (unlikely(err < 0)) {
1767                dev_err(queue->ctrl->ctrl.device,
1768                             "Failed to map data (%d)\n", err);
1769                goto err;
1770        }
1771
1772        sqe->cqe.done = nvme_rdma_send_done;
1773
1774        ib_dma_sync_single_for_device(dev, sqe->dma,
1775                        sizeof(struct nvme_command), DMA_TO_DEVICE);
1776
1777        err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
1778                        req->mr ? &req->reg_wr.wr : NULL);
1779        if (unlikely(err))
1780                goto err_unmap;
1781
1782        return BLK_STS_OK;
1783
1784err_unmap:
1785        nvme_rdma_unmap_data(queue, rq);
1786err:
1787        if (err == -ENOMEM || err == -EAGAIN)
1788                ret = BLK_STS_RESOURCE;
1789        else
1790                ret = BLK_STS_IOERR;
1791        nvme_cleanup_cmd(rq);
1792unmap_qe:
1793        ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command),
1794                            DMA_TO_DEVICE);
1795        return ret;
1796}
1797
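/* blk-mq ->poll: reap completions directly from the queue's CQ. */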
1798static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
1799{
1800        struct nvme_rdma_queue *queue = hctx->driver_data;
1801
1802        return ib_process_cq_direct(queue->ib_cq, -1);
1803}
1804
1805static void nvme_rdma_complete_rq(struct request *rq)
1806{
1807        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1808        struct nvme_rdma_queue *queue = req->queue;
1809        struct ib_device *ibdev = queue->device->dev;
1810
1811        nvme_rdma_unmap_data(queue, rq);
1812        ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command),
1813                            DMA_TO_DEVICE);
1814        nvme_complete_rq(rq);
1815}
1816
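/*
 * Spread blk-mq hardware contexts over the RDMA device's completion
 * vector affinity via blk_mq_rdma_map_queues().  With nr_write_queues
 * set, reads and writes get separate queue ranges; otherwise both types
 * share the default queues.  Poll queues, if any, are mapped last with
 * the generic blk_mq_map_queues() helper.
 */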
1817static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
1818{
1819        struct nvme_rdma_ctrl *ctrl = set->driver_data;
1820        struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
1821
1822        if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
1823                /* separate read/write queues */
1824                set->map[HCTX_TYPE_DEFAULT].nr_queues =
1825                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
1826                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
1827                set->map[HCTX_TYPE_READ].nr_queues =
1828                        ctrl->io_queues[HCTX_TYPE_READ];
1829                set->map[HCTX_TYPE_READ].queue_offset =
1830                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
1831        } else {
1832                /* shared read/write queues */
1833                set->map[HCTX_TYPE_DEFAULT].nr_queues =
1834                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
1835                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
1836                set->map[HCTX_TYPE_READ].nr_queues =
1837                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
1838                set->map[HCTX_TYPE_READ].queue_offset = 0;
1839        }
1840        blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
1841                        ctrl->device->dev, 0);
1842        blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
1843                        ctrl->device->dev, 0);
1844
1845        if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
1846                /* map dedicated poll queues only if we have queues left */
1847                set->map[HCTX_TYPE_POLL].nr_queues =
1848                                ctrl->io_queues[HCTX_TYPE_POLL];
1849                set->map[HCTX_TYPE_POLL].queue_offset =
1850                        ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1851                        ctrl->io_queues[HCTX_TYPE_READ];
1852                blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
1853        }
1854
1855        dev_info(ctrl->ctrl.device,
1856                "mapped %d/%d/%d default/read/poll queues.\n",
1857                ctrl->io_queues[HCTX_TYPE_DEFAULT],
1858                ctrl->io_queues[HCTX_TYPE_READ],
1859                ctrl->io_queues[HCTX_TYPE_POLL]);
1860
1861        return 0;
1862}
1863
1864static const struct blk_mq_ops nvme_rdma_mq_ops = {
1865        .queue_rq       = nvme_rdma_queue_rq,
1866        .complete       = nvme_rdma_complete_rq,
1867        .init_request   = nvme_rdma_init_request,
1868        .exit_request   = nvme_rdma_exit_request,
1869        .init_hctx      = nvme_rdma_init_hctx,
1870        .timeout        = nvme_rdma_timeout,
1871        .map_queues     = nvme_rdma_map_queues,
1872        .poll           = nvme_rdma_poll,
1873};
1874
1875static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
1876        .queue_rq       = nvme_rdma_queue_rq,
1877        .complete       = nvme_rdma_complete_rq,
1878        .init_request   = nvme_rdma_init_request,
1879        .exit_request   = nvme_rdma_exit_request,
1880        .init_hctx      = nvme_rdma_init_admin_hctx,
1881        .timeout        = nvme_rdma_timeout,
1882};
1883
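/*
 * Common teardown: stop any pending error recovery and reconnect work
 * first, then tear down the I/O queues, quiesce the admin queue, and
 * either shut down (delete path) or merely disable (reset path) the
 * controller before the admin queue itself is torn down.
 */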
1884static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
1885{
1886        cancel_work_sync(&ctrl->err_work);
1887        cancel_delayed_work_sync(&ctrl->reconnect_work);
1888
1889        nvme_rdma_teardown_io_queues(ctrl, shutdown);
1890        blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
1891        if (shutdown)
1892                nvme_shutdown_ctrl(&ctrl->ctrl);
1893        else
1894                nvme_disable_ctrl(&ctrl->ctrl);
1895        nvme_rdma_teardown_admin_queue(ctrl, shutdown);
1896}
1897
1898static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
1899{
1900        nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true);
1901}
1902
1903static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
1904{
1905        struct nvme_rdma_ctrl *ctrl =
1906                container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
1907
1908        nvme_stop_ctrl(&ctrl->ctrl);
1909        nvme_rdma_shutdown_ctrl(ctrl, false);
1910
1911        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
1912                /* state change failure should never happen */
1913                WARN_ON_ONCE(1);
1914                return;
1915        }
1916
1917        if (nvme_rdma_setup_ctrl(ctrl, false))
1918                goto out_fail;
1919
1920        return;
1921
1922out_fail:
1923        ++ctrl->ctrl.nr_reconnects;
1924        nvme_rdma_reconnect_or_remove(ctrl);
1925}
1926
1927static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
1928        .name                   = "rdma",
1929        .module                 = THIS_MODULE,
1930        .flags                  = NVME_F_FABRICS,
1931        .reg_read32             = nvmf_reg_read32,
1932        .reg_read64             = nvmf_reg_read64,
1933        .reg_write32            = nvmf_reg_write32,
1934        .free_ctrl              = nvme_rdma_free_ctrl,
1935        .submit_async_event     = nvme_rdma_submit_async_event,
1936        .delete_ctrl            = nvme_rdma_delete_ctrl,
1937        .get_address            = nvmf_get_address,
1938};
1939
1940/*
1941 * Fails a connection request if it matches an existing controller
1942 * (association) with the same tuple:
1943 * <Host NQN, Host ID, local address, remote address, remote port, SUBSYS NQN>
1944 *
1945 * If no local address is specified in the request, it will match an
1946 * existing controller that has all the other parameters the same and
1947 * likewise no local port address specified.
1948 *
1949 * The ports don't need to be compared as they are intrinsically
1950 * already matched by the port pointers supplied.
1951 */
1952static bool
1953nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts)
1954{
1955        struct nvme_rdma_ctrl *ctrl;
1956        bool found = false;
1957
1958        mutex_lock(&nvme_rdma_ctrl_mutex);
1959        list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
1960                found = nvmf_ip_options_match(&ctrl->ctrl, opts);
1961                if (found)
1962                        break;
1963        }
1964        mutex_unlock(&nvme_rdma_ctrl_mutex);
1965
1966        return found;
1967}
1968
1969static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
1970                struct nvmf_ctrl_options *opts)
1971{
1972        struct nvme_rdma_ctrl *ctrl;
1973        int ret;
1974        bool changed;
1975
1976        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
1977        if (!ctrl)
1978                return ERR_PTR(-ENOMEM);
1979        ctrl->ctrl.opts = opts;
1980        INIT_LIST_HEAD(&ctrl->list);
1981
1982        if (!(opts->mask & NVMF_OPT_TRSVCID)) {
1983                opts->trsvcid =
1984                        kstrdup(__stringify(NVME_RDMA_IP_PORT), GFP_KERNEL);
1985                if (!opts->trsvcid) {
1986                        ret = -ENOMEM;
1987                        goto out_free_ctrl;
1988                }
1989                opts->mask |= NVMF_OPT_TRSVCID;
1990        }
1991
1992        ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
1993                        opts->traddr, opts->trsvcid, &ctrl->addr);
1994        if (ret) {
1995                pr_err("malformed address passed: %s:%s\n",
1996                        opts->traddr, opts->trsvcid);
1997                goto out_free_ctrl;
1998        }
1999
2000        if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2001                ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2002                        opts->host_traddr, NULL, &ctrl->src_addr);
2003                if (ret) {
2004                        pr_err("malformed src address passed: %s\n",
2005                               opts->host_traddr);
2006                        goto out_free_ctrl;
2007                }
2008        }
2009
2010        if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) {
2011                ret = -EALREADY;
2012                goto out_free_ctrl;
2013        }
2014
2015        INIT_DELAYED_WORK(&ctrl->reconnect_work,
2016                        nvme_rdma_reconnect_ctrl_work);
2017        INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
2018        INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
2019
2020        ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2021                                opts->nr_poll_queues + 1;
2022        ctrl->ctrl.sqsize = opts->queue_size - 1;
2023        ctrl->ctrl.kato = opts->kato;
2024
2025        ret = -ENOMEM;
2026        ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2027                                GFP_KERNEL);
2028        if (!ctrl->queues)
2029                goto out_free_ctrl;
2030
2031        ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
2032                                0 /* no quirks, we're perfect! */);
2033        if (ret)
2034                goto out_kfree_queues;
2035
2036        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
2037        WARN_ON_ONCE(!changed);
2038
2039        ret = nvme_rdma_setup_ctrl(ctrl, true);
2040        if (ret)
2041                goto out_uninit_ctrl;
2042
2043        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
2044                ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2045
2046        nvme_get_ctrl(&ctrl->ctrl);
2047
2048        mutex_lock(&nvme_rdma_ctrl_mutex);
2049        list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
2050        mutex_unlock(&nvme_rdma_ctrl_mutex);
2051
2052        return &ctrl->ctrl;
2053
2054out_uninit_ctrl:
2055        nvme_uninit_ctrl(&ctrl->ctrl);
2056        nvme_put_ctrl(&ctrl->ctrl);
2057        if (ret > 0)
2058                ret = -EIO;
2059        return ERR_PTR(ret);
2060out_kfree_queues:
2061        kfree(ctrl->queues);
2062out_free_ctrl:
2063        kfree(ctrl);
2064        return ERR_PTR(ret);
2065}
2066
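/*
 * Usage sketch (an illustration, not part of this driver): a connection
 * is normally requested from user space through the fabrics control
 * device, e.g. with nvme-cli:
 *
 *   nvme connect -t rdma -a <traddr> -s <trsvcid> -n <subsysnqn>
 *
 * which ends up in nvme_rdma_create_ctrl() via the .create_ctrl callback
 * registered below.  When no trsvcid is given, the IANA-assigned
 * NVMe/RDMA port (NVME_RDMA_IP_PORT) is used.
 */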
2067static struct nvmf_transport_ops nvme_rdma_transport = {
2068        .name           = "rdma",
2069        .module         = THIS_MODULE,
2070        .required_opts  = NVMF_OPT_TRADDR,
2071        .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2072                          NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2073                          NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2074                          NVMF_OPT_TOS,
2075        .create_ctrl    = nvme_rdma_create_ctrl,
2076};
2077
2078static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
2079{
2080        struct nvme_rdma_ctrl *ctrl;
2081        struct nvme_rdma_device *ndev;
2082        bool found = false;
2083
2084        mutex_lock(&device_list_mutex);
2085        list_for_each_entry(ndev, &device_list, entry) {
2086                if (ndev->dev == ib_device) {
2087                        found = true;
2088                        break;
2089                }
2090        }
2091        mutex_unlock(&device_list_mutex);
2092
2093        if (!found)
2094                return;
2095
2096        /* Delete all controllers using this device */
2097        mutex_lock(&nvme_rdma_ctrl_mutex);
2098        list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
2099                if (ctrl->device->dev != ib_device)
2100                        continue;
2101                nvme_delete_ctrl(&ctrl->ctrl);
2102        }
2103        mutex_unlock(&nvme_rdma_ctrl_mutex);
2104
2105        flush_workqueue(nvme_delete_wq);
2106}
2107
2108static struct ib_client nvme_rdma_ib_client = {
2109        .name   = "nvme_rdma",
2110        .remove = nvme_rdma_remove_one
2111};
2112
2113static int __init nvme_rdma_init_module(void)
2114{
2115        int ret;
2116
2117        ret = ib_register_client(&nvme_rdma_ib_client);
2118        if (ret)
2119                return ret;
2120
2121        ret = nvmf_register_transport(&nvme_rdma_transport);
2122        if (ret)
2123                goto err_unreg_client;
2124
2125        return 0;
2126
2127err_unreg_client:
2128        ib_unregister_client(&nvme_rdma_ib_client);
2129        return ret;
2130}
2131
2132static void __exit nvme_rdma_cleanup_module(void)
2133{
2134        struct nvme_rdma_ctrl *ctrl;
2135
2136        nvmf_unregister_transport(&nvme_rdma_transport);
2137        ib_unregister_client(&nvme_rdma_ib_client);
2138
2139        mutex_lock(&nvme_rdma_ctrl_mutex);
2140        list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list)
2141                nvme_delete_ctrl(&ctrl->ctrl);
2142        mutex_unlock(&nvme_rdma_ctrl_mutex);
2143        flush_workqueue(nvme_delete_wq);
2144}
2145
2146module_init(nvme_rdma_init_module);
2147module_exit(nvme_rdma_cleanup_module);
2148
2149MODULE_LICENSE("GPL v2");
2150