linux/drivers/nvme/host/rdma.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * NVMe over Fabrics RDMA host code.
   4 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
   5 */
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7#include <linux/module.h>
   8#include <linux/init.h>
   9#include <linux/slab.h>
  10#include <rdma/mr_pool.h>
  11#include <linux/err.h>
  12#include <linux/string.h>
  13#include <linux/atomic.h>
  14#include <linux/blk-mq.h>
  15#include <linux/blk-mq-rdma.h>
  16#include <linux/types.h>
  17#include <linux/list.h>
  18#include <linux/mutex.h>
  19#include <linux/scatterlist.h>
  20#include <linux/nvme.h>
  21#include <asm/unaligned.h>
  22
  23#include <rdma/ib_verbs.h>
  24#include <rdma/rdma_cm.h>
  25#include <linux/nvme-rdma.h>
  26
  27#include "nvme.h"
  28#include "fabrics.h"
  29
  30
   31#define NVME_RDMA_CONNECT_TIMEOUT_MS    3000            /* 3 seconds */
  32
  33#define NVME_RDMA_MAX_SEGMENTS          256
  34
  35#define NVME_RDMA_MAX_INLINE_SEGMENTS   4
  36
  37#define NVME_RDMA_DATA_SGL_SIZE \
  38        (sizeof(struct scatterlist) * NVME_INLINE_SG_CNT)
  39#define NVME_RDMA_METADATA_SGL_SIZE \
  40        (sizeof(struct scatterlist) * NVME_INLINE_METADATA_SG_CNT)
  41
  42struct nvme_rdma_device {
  43        struct ib_device        *dev;
  44        struct ib_pd            *pd;
  45        struct kref             ref;
  46        struct list_head        entry;
  47        unsigned int            num_inline_segments;
  48};
  49
  50struct nvme_rdma_qe {
  51        struct ib_cqe           cqe;
  52        void                    *data;
  53        u64                     dma;
  54};
  55
  56struct nvme_rdma_sgl {
  57        int                     nents;
  58        struct sg_table         sg_table;
  59};
  60
  61struct nvme_rdma_queue;
  62struct nvme_rdma_request {
  63        struct nvme_request     req;
  64        struct ib_mr            *mr;
  65        struct nvme_rdma_qe     sqe;
  66        union nvme_result       result;
  67        __le16                  status;
  68        refcount_t              ref;
  69        struct ib_sge           sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
  70        u32                     num_sge;
  71        struct ib_reg_wr        reg_wr;
  72        struct ib_cqe           reg_cqe;
  73        struct nvme_rdma_queue  *queue;
  74        struct nvme_rdma_sgl    data_sgl;
  75        struct nvme_rdma_sgl    *metadata_sgl;
  76        bool                    use_sig_mr;
  77};
  78
  79enum nvme_rdma_queue_flags {
  80        NVME_RDMA_Q_ALLOCATED           = 0,
  81        NVME_RDMA_Q_LIVE                = 1,
  82        NVME_RDMA_Q_TR_READY            = 2,
  83};
  84
  85struct nvme_rdma_queue {
  86        struct nvme_rdma_qe     *rsp_ring;
  87        int                     queue_size;
  88        size_t                  cmnd_capsule_len;
  89        struct nvme_rdma_ctrl   *ctrl;
  90        struct nvme_rdma_device *device;
  91        struct ib_cq            *ib_cq;
  92        struct ib_qp            *qp;
  93
  94        unsigned long           flags;
  95        struct rdma_cm_id       *cm_id;
  96        int                     cm_error;
  97        struct completion       cm_done;
  98        bool                    pi_support;
  99        int                     cq_size;
 100        struct mutex            queue_lock;
 101};
 102
 103struct nvme_rdma_ctrl {
 104        /* read only in the hot path */
 105        struct nvme_rdma_queue  *queues;
 106
 107        /* other member variables */
 108        struct blk_mq_tag_set   tag_set;
 109        struct work_struct      err_work;
 110
 111        struct nvme_rdma_qe     async_event_sqe;
 112
 113        struct delayed_work     reconnect_work;
 114
 115        struct list_head        list;
 116
 117        struct blk_mq_tag_set   admin_tag_set;
 118        struct nvme_rdma_device *device;
 119
 120        u32                     max_fr_pages;
 121
 122        struct sockaddr_storage addr;
 123        struct sockaddr_storage src_addr;
 124
 125        struct nvme_ctrl        ctrl;
 126        bool                    use_inline_data;
 127        u32                     io_queues[HCTX_MAX_TYPES];
 128};
 129
 130static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
 131{
 132        return container_of(ctrl, struct nvme_rdma_ctrl, ctrl);
 133}
 134
 135static LIST_HEAD(device_list);
 136static DEFINE_MUTEX(device_list_mutex);
 137
 138static LIST_HEAD(nvme_rdma_ctrl_list);
 139static DEFINE_MUTEX(nvme_rdma_ctrl_mutex);
 140
 141/*
  142 * Disabling this option makes small I/O go faster, but is fundamentally
 143 * unsafe.  With it turned off we will have to register a global rkey that
 144 * allows read and write access to all physical memory.
 145 */
 146static bool register_always = true;
 147module_param(register_always, bool, 0444);
 148MODULE_PARM_DESC(register_always,
 149         "Use memory registration even for contiguous memory regions");
 150
 151static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
 152                struct rdma_cm_event *event);
 153static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
 154static void nvme_rdma_complete_rq(struct request *rq);
 155
 156static const struct blk_mq_ops nvme_rdma_mq_ops;
 157static const struct blk_mq_ops nvme_rdma_admin_mq_ops;
 158
 159static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
 160{
 161        return queue - queue->ctrl->queues;
 162}
 163
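/*
 * I/O queues are laid out as [default | read | poll] after the admin queue
 * (index 0), so any queue index beyond the default + read sets belongs to
 * the polling set.
 */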
 164static bool nvme_rdma_poll_queue(struct nvme_rdma_queue *queue)
 165{
 166        return nvme_rdma_queue_idx(queue) >
 167                queue->ctrl->io_queues[HCTX_TYPE_DEFAULT] +
 168                queue->ctrl->io_queues[HCTX_TYPE_READ];
 169}
 170
 171static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
 172{
 173        return queue->cmnd_capsule_len - sizeof(struct nvme_command);
 174}
 175
 176static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
 177                size_t capsule_size, enum dma_data_direction dir)
 178{
 179        ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir);
 180        kfree(qe->data);
 181}
 182
 183static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
 184                size_t capsule_size, enum dma_data_direction dir)
 185{
 186        qe->data = kzalloc(capsule_size, GFP_KERNEL);
 187        if (!qe->data)
 188                return -ENOMEM;
 189
 190        qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir);
 191        if (ib_dma_mapping_error(ibdev, qe->dma)) {
 192                kfree(qe->data);
 193                qe->data = NULL;
 194                return -ENOMEM;
 195        }
 196
 197        return 0;
 198}
 199
 200static void nvme_rdma_free_ring(struct ib_device *ibdev,
 201                struct nvme_rdma_qe *ring, size_t ib_queue_size,
 202                size_t capsule_size, enum dma_data_direction dir)
 203{
 204        int i;
 205
 206        for (i = 0; i < ib_queue_size; i++)
 207                nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir);
 208        kfree(ring);
 209}
 210
 211static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
 212                size_t ib_queue_size, size_t capsule_size,
 213                enum dma_data_direction dir)
 214{
 215        struct nvme_rdma_qe *ring;
 216        int i;
 217
 218        ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL);
 219        if (!ring)
 220                return NULL;
 221
 222        /*
 223         * Bind the CQEs (post recv buffers) DMA mapping to the RDMA queue
  224         * lifetime. It's safe, since any change in the underlying RDMA device
 225         * will issue error recovery and queue re-creation.
 226         */
 227        for (i = 0; i < ib_queue_size; i++) {
 228                if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
 229                        goto out_free_ring;
 230        }
 231
 232        return ring;
 233
 234out_free_ring:
 235        nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir);
 236        return NULL;
 237}
 238
 239static void nvme_rdma_qp_event(struct ib_event *event, void *context)
 240{
 241        pr_debug("QP event %s (%d)\n",
 242                 ib_event_msg(event->event), event->event);
 243
 244}
 245
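/*
 * Wait for the rdma_cm event handler to complete address/route resolution
 * or connection establishment; queue->cm_error carries the outcome set by
 * nvme_rdma_cm_handler().
 */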
 246static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
 247{
 248        int ret;
 249
 250        ret = wait_for_completion_interruptible_timeout(&queue->cm_done,
 251                        msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
 252        if (ret < 0)
 253                return ret;
 254        if (ret == 0)
 255                return -ETIMEDOUT;
 256        WARN_ON_ONCE(queue->cm_error > 0);
 257        return queue->cm_error;
 258}
 259
 260static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
 261{
 262        struct nvme_rdma_device *dev = queue->device;
 263        struct ib_qp_init_attr init_attr;
 264        int ret;
 265
 266        memset(&init_attr, 0, sizeof(init_attr));
 267        init_attr.event_handler = nvme_rdma_qp_event;
 268        /* +1 for drain */
 269        init_attr.cap.max_send_wr = factor * queue->queue_size + 1;
 270        /* +1 for drain */
 271        init_attr.cap.max_recv_wr = queue->queue_size + 1;
 272        init_attr.cap.max_recv_sge = 1;
 273        init_attr.cap.max_send_sge = 1 + dev->num_inline_segments;
 274        init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 275        init_attr.qp_type = IB_QPT_RC;
 276        init_attr.send_cq = queue->ib_cq;
 277        init_attr.recv_cq = queue->ib_cq;
 278        if (queue->pi_support)
 279                init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
 280        init_attr.qp_context = queue;
 281
 282        ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
 283
 284        queue->qp = queue->cm_id->qp;
 285        return ret;
 286}
 287
 288static void nvme_rdma_exit_request(struct blk_mq_tag_set *set,
 289                struct request *rq, unsigned int hctx_idx)
 290{
 291        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 292
 293        kfree(req->sqe.data);
 294}
 295
 296static int nvme_rdma_init_request(struct blk_mq_tag_set *set,
 297                struct request *rq, unsigned int hctx_idx,
 298                unsigned int numa_node)
 299{
 300        struct nvme_rdma_ctrl *ctrl = set->driver_data;
 301        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 302        int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
 303        struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
 304
 305        nvme_req(rq)->ctrl = &ctrl->ctrl;
 306        req->sqe.data = kzalloc(sizeof(struct nvme_command), GFP_KERNEL);
 307        if (!req->sqe.data)
 308                return -ENOMEM;
 309
 310        /* metadata nvme_rdma_sgl struct is located after command's data SGL */
 311        if (queue->pi_support)
 312                req->metadata_sgl = (void *)nvme_req(rq) +
 313                        sizeof(struct nvme_rdma_request) +
 314                        NVME_RDMA_DATA_SGL_SIZE;
 315
 316        req->queue = queue;
 317        nvme_req(rq)->cmd = req->sqe.data;
 318
 319        return 0;
 320}
 321
 322static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 323                unsigned int hctx_idx)
 324{
 325        struct nvme_rdma_ctrl *ctrl = data;
 326        struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1];
 327
 328        BUG_ON(hctx_idx >= ctrl->ctrl.queue_count);
 329
 330        hctx->driver_data = queue;
 331        return 0;
 332}
 333
 334static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 335                unsigned int hctx_idx)
 336{
 337        struct nvme_rdma_ctrl *ctrl = data;
 338        struct nvme_rdma_queue *queue = &ctrl->queues[0];
 339
 340        BUG_ON(hctx_idx != 0);
 341
 342        hctx->driver_data = queue;
 343        return 0;
 344}
 345
 346static void nvme_rdma_free_dev(struct kref *ref)
 347{
 348        struct nvme_rdma_device *ndev =
 349                container_of(ref, struct nvme_rdma_device, ref);
 350
 351        mutex_lock(&device_list_mutex);
 352        list_del(&ndev->entry);
 353        mutex_unlock(&device_list_mutex);
 354
 355        ib_dealloc_pd(ndev->pd);
 356        kfree(ndev);
 357}
 358
 359static void nvme_rdma_dev_put(struct nvme_rdma_device *dev)
 360{
 361        kref_put(&dev->ref, nvme_rdma_free_dev);
 362}
 363
 364static int nvme_rdma_dev_get(struct nvme_rdma_device *dev)
 365{
 366        return kref_get_unless_zero(&dev->ref);
 367}
 368
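/*
 * Find a cached nvme_rdma_device for the CM ID's IB device (matched by node
 * GUID) and take a reference, or allocate a new one along with its PD. The
 * PD gets an unsafe global rkey only when register_always is disabled.
 */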
 369static struct nvme_rdma_device *
 370nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
 371{
 372        struct nvme_rdma_device *ndev;
 373
 374        mutex_lock(&device_list_mutex);
 375        list_for_each_entry(ndev, &device_list, entry) {
 376                if (ndev->dev->node_guid == cm_id->device->node_guid &&
 377                    nvme_rdma_dev_get(ndev))
 378                        goto out_unlock;
 379        }
 380
 381        ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
 382        if (!ndev)
 383                goto out_err;
 384
 385        ndev->dev = cm_id->device;
 386        kref_init(&ndev->ref);
 387
 388        ndev->pd = ib_alloc_pd(ndev->dev,
 389                register_always ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY);
 390        if (IS_ERR(ndev->pd))
 391                goto out_free_dev;
 392
 393        if (!(ndev->dev->attrs.device_cap_flags &
 394              IB_DEVICE_MEM_MGT_EXTENSIONS)) {
 395                dev_err(&ndev->dev->dev,
 396                        "Memory registrations not supported.\n");
 397                goto out_free_pd;
 398        }
 399
 400        ndev->num_inline_segments = min(NVME_RDMA_MAX_INLINE_SEGMENTS,
 401                                        ndev->dev->attrs.max_send_sge - 1);
 402        list_add(&ndev->entry, &device_list);
 403out_unlock:
 404        mutex_unlock(&device_list_mutex);
 405        return ndev;
 406
 407out_free_pd:
 408        ib_dealloc_pd(ndev->pd);
 409out_free_dev:
 410        kfree(ndev);
 411out_err:
 412        mutex_unlock(&device_list_mutex);
 413        return NULL;
 414}
 415
 416static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue)
 417{
 418        if (nvme_rdma_poll_queue(queue))
 419                ib_free_cq(queue->ib_cq);
 420        else
 421                ib_cq_pool_put(queue->ib_cq, queue->cq_size);
 422}
 423
 424static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
 425{
 426        struct nvme_rdma_device *dev;
 427        struct ib_device *ibdev;
 428
 429        if (!test_and_clear_bit(NVME_RDMA_Q_TR_READY, &queue->flags))
 430                return;
 431
 432        dev = queue->device;
 433        ibdev = dev->dev;
 434
 435        if (queue->pi_support)
 436                ib_mr_pool_destroy(queue->qp, &queue->qp->sig_mrs);
 437        ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
 438
 439        /*
 440         * The cm_id object might have been destroyed during RDMA connection
 441         * establishment error flow to avoid getting other cma events, thus
 442         * the destruction of the QP shouldn't use rdma_cm API.
 443         */
 444        ib_destroy_qp(queue->qp);
 445        nvme_rdma_free_cq(queue);
 446
 447        nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
 448                        sizeof(struct nvme_completion), DMA_FROM_DEVICE);
 449
 450        nvme_rdma_dev_put(dev);
 451}
 452
 453static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support)
 454{
 455        u32 max_page_list_len;
 456
 457        if (pi_support)
 458                max_page_list_len = ibdev->attrs.max_pi_fast_reg_page_list_len;
 459        else
 460                max_page_list_len = ibdev->attrs.max_fast_reg_page_list_len;
 461
 462        return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1);
 463}
 464
 465static int nvme_rdma_create_cq(struct ib_device *ibdev,
 466                struct nvme_rdma_queue *queue)
 467{
 468        int ret, comp_vector, idx = nvme_rdma_queue_idx(queue);
 469        enum ib_poll_context poll_ctx;
 470
 471        /*
  472         * Spread I/O queue completion vectors according to their queue index.
 473         * Admin queues can always go on completion vector 0.
 474         */
 475        comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
 476
 477        /* Polling queues need direct cq polling context */
 478        if (nvme_rdma_poll_queue(queue)) {
 479                poll_ctx = IB_POLL_DIRECT;
 480                queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size,
 481                                           comp_vector, poll_ctx);
 482        } else {
 483                poll_ctx = IB_POLL_SOFTIRQ;
 484                queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size,
 485                                              comp_vector, poll_ctx);
 486        }
 487
 488        if (IS_ERR(queue->ib_cq)) {
 489                ret = PTR_ERR(queue->ib_cq);
 490                return ret;
 491        }
 492
 493        return 0;
 494}
 495
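/*
 * Set up the per-queue RDMA resources: completion queue, queue pair,
 * response (recv) ring and the fast-registration MR pool, plus a signature
 * MR pool when T10-PI is enabled for this queue.
 */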
 496static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
 497{
 498        struct ib_device *ibdev;
 499        const int send_wr_factor = 3;                   /* MR, SEND, INV */
 500        const int cq_factor = send_wr_factor + 1;       /* + RECV */
 501        int ret, pages_per_mr;
 502
 503        queue->device = nvme_rdma_find_get_device(queue->cm_id);
 504        if (!queue->device) {
 505                dev_err(queue->cm_id->device->dev.parent,
 506                        "no client data found!\n");
 507                return -ECONNREFUSED;
 508        }
 509        ibdev = queue->device->dev;
 510
 511        /* +1 for ib_stop_cq */
 512        queue->cq_size = cq_factor * queue->queue_size + 1;
 513
 514        ret = nvme_rdma_create_cq(ibdev, queue);
 515        if (ret)
 516                goto out_put_dev;
 517
 518        ret = nvme_rdma_create_qp(queue, send_wr_factor);
 519        if (ret)
 520                goto out_destroy_ib_cq;
 521
 522        queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size,
 523                        sizeof(struct nvme_completion), DMA_FROM_DEVICE);
 524        if (!queue->rsp_ring) {
 525                ret = -ENOMEM;
 526                goto out_destroy_qp;
 527        }
 528
 529        /*
  530         * Currently we don't use SG_GAPS MRs, so if the first entry is
 531         * misaligned we'll end up using two entries for a single data page,
 532         * so one additional entry is required.
 533         */
 534        pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev, queue->pi_support) + 1;
 535        ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
 536                              queue->queue_size,
 537                              IB_MR_TYPE_MEM_REG,
 538                              pages_per_mr, 0);
 539        if (ret) {
 540                dev_err(queue->ctrl->ctrl.device,
 541                        "failed to initialize MR pool sized %d for QID %d\n",
 542                        queue->queue_size, nvme_rdma_queue_idx(queue));
 543                goto out_destroy_ring;
 544        }
 545
 546        if (queue->pi_support) {
 547                ret = ib_mr_pool_init(queue->qp, &queue->qp->sig_mrs,
 548                                      queue->queue_size, IB_MR_TYPE_INTEGRITY,
 549                                      pages_per_mr, pages_per_mr);
 550                if (ret) {
 551                        dev_err(queue->ctrl->ctrl.device,
 552                                "failed to initialize PI MR pool sized %d for QID %d\n",
 553                                queue->queue_size, nvme_rdma_queue_idx(queue));
 554                        goto out_destroy_mr_pool;
 555                }
 556        }
 557
 558        set_bit(NVME_RDMA_Q_TR_READY, &queue->flags);
 559
 560        return 0;
 561
 562out_destroy_mr_pool:
 563        ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
 564out_destroy_ring:
 565        nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
 566                            sizeof(struct nvme_completion), DMA_FROM_DEVICE);
 567out_destroy_qp:
 568        rdma_destroy_qp(queue->cm_id);
 569out_destroy_ib_cq:
 570        nvme_rdma_free_cq(queue);
 571out_put_dev:
 572        nvme_rdma_dev_put(queue->device);
 573        return ret;
 574}
 575
 576static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
 577                int idx, size_t queue_size)
 578{
 579        struct nvme_rdma_queue *queue;
 580        struct sockaddr *src_addr = NULL;
 581        int ret;
 582
 583        queue = &ctrl->queues[idx];
 584        mutex_init(&queue->queue_lock);
 585        queue->ctrl = ctrl;
 586        if (idx && ctrl->ctrl.max_integrity_segments)
 587                queue->pi_support = true;
 588        else
 589                queue->pi_support = false;
 590        init_completion(&queue->cm_done);
 591
 592        if (idx > 0)
 593                queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
 594        else
 595                queue->cmnd_capsule_len = sizeof(struct nvme_command);
 596
 597        queue->queue_size = queue_size;
 598
 599        queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
 600                        RDMA_PS_TCP, IB_QPT_RC);
 601        if (IS_ERR(queue->cm_id)) {
 602                dev_info(ctrl->ctrl.device,
 603                        "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id));
 604                ret = PTR_ERR(queue->cm_id);
 605                goto out_destroy_mutex;
 606        }
 607
 608        if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
 609                src_addr = (struct sockaddr *)&ctrl->src_addr;
 610
 611        queue->cm_error = -ETIMEDOUT;
 612        ret = rdma_resolve_addr(queue->cm_id, src_addr,
 613                        (struct sockaddr *)&ctrl->addr,
 614                        NVME_RDMA_CONNECT_TIMEOUT_MS);
 615        if (ret) {
 616                dev_info(ctrl->ctrl.device,
 617                        "rdma_resolve_addr failed (%d).\n", ret);
 618                goto out_destroy_cm_id;
 619        }
 620
 621        ret = nvme_rdma_wait_for_cm(queue);
 622        if (ret) {
 623                dev_info(ctrl->ctrl.device,
 624                        "rdma connection establishment failed (%d)\n", ret);
 625                goto out_destroy_cm_id;
 626        }
 627
 628        set_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags);
 629
 630        return 0;
 631
 632out_destroy_cm_id:
 633        rdma_destroy_id(queue->cm_id);
 634        nvme_rdma_destroy_queue_ib(queue);
 635out_destroy_mutex:
 636        mutex_destroy(&queue->queue_lock);
 637        return ret;
 638}
 639
 640static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
 641{
 642        rdma_disconnect(queue->cm_id);
 643        ib_drain_qp(queue->qp);
 644}
 645
 646static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
 647{
 648        mutex_lock(&queue->queue_lock);
 649        if (test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
 650                __nvme_rdma_stop_queue(queue);
 651        mutex_unlock(&queue->queue_lock);
 652}
 653
 654static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
 655{
 656        if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
 657                return;
 658
 659        rdma_destroy_id(queue->cm_id);
 660        nvme_rdma_destroy_queue_ib(queue);
 661        mutex_destroy(&queue->queue_lock);
 662}
 663
 664static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
 665{
 666        int i;
 667
 668        for (i = 1; i < ctrl->ctrl.queue_count; i++)
 669                nvme_rdma_free_queue(&ctrl->queues[i]);
 670}
 671
 672static void nvme_rdma_stop_io_queues(struct nvme_rdma_ctrl *ctrl)
 673{
 674        int i;
 675
 676        for (i = 1; i < ctrl->ctrl.queue_count; i++)
 677                nvme_rdma_stop_queue(&ctrl->queues[i]);
 678}
 679
 680static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx)
 681{
 682        struct nvme_rdma_queue *queue = &ctrl->queues[idx];
 683        int ret;
 684
 685        if (idx)
 686                ret = nvmf_connect_io_queue(&ctrl->ctrl, idx);
 687        else
 688                ret = nvmf_connect_admin_queue(&ctrl->ctrl);
 689
 690        if (!ret) {
 691                set_bit(NVME_RDMA_Q_LIVE, &queue->flags);
 692        } else {
 693                if (test_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
 694                        __nvme_rdma_stop_queue(queue);
 695                dev_info(ctrl->ctrl.device,
 696                        "failed to connect queue: %d ret=%d\n", idx, ret);
 697        }
 698        return ret;
 699}
 700
 701static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl)
 702{
 703        int i, ret = 0;
 704
 705        for (i = 1; i < ctrl->ctrl.queue_count; i++) {
 706                ret = nvme_rdma_start_queue(ctrl, i);
 707                if (ret)
 708                        goto out_stop_queues;
 709        }
 710
 711        return 0;
 712
 713out_stop_queues:
 714        for (i--; i >= 1; i--)
 715                nvme_rdma_stop_queue(&ctrl->queues[i]);
 716        return ret;
 717}
 718
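/*
 * Size the default/read/poll queue sets from the controller options, capped
 * by the device's completion vectors and online CPUs where applicable, then
 * allocate an RDMA queue for each.
 */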
 719static int nvme_rdma_alloc_io_queues(struct nvme_rdma_ctrl *ctrl)
 720{
 721        struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
 722        struct ib_device *ibdev = ctrl->device->dev;
 723        unsigned int nr_io_queues, nr_default_queues;
 724        unsigned int nr_read_queues, nr_poll_queues;
 725        int i, ret;
 726
 727        nr_read_queues = min_t(unsigned int, ibdev->num_comp_vectors,
 728                                min(opts->nr_io_queues, num_online_cpus()));
 729        nr_default_queues =  min_t(unsigned int, ibdev->num_comp_vectors,
 730                                min(opts->nr_write_queues, num_online_cpus()));
 731        nr_poll_queues = min(opts->nr_poll_queues, num_online_cpus());
 732        nr_io_queues = nr_read_queues + nr_default_queues + nr_poll_queues;
 733
 734        ret = nvme_set_queue_count(&ctrl->ctrl, &nr_io_queues);
 735        if (ret)
 736                return ret;
 737
 738        if (nr_io_queues == 0) {
 739                dev_err(ctrl->ctrl.device,
 740                        "unable to set any I/O queues\n");
 741                return -ENOMEM;
 742        }
 743
 744        ctrl->ctrl.queue_count = nr_io_queues + 1;
 745        dev_info(ctrl->ctrl.device,
 746                "creating %d I/O queues.\n", nr_io_queues);
 747
 748        if (opts->nr_write_queues && nr_read_queues < nr_io_queues) {
 749                /*
 750                 * separate read/write queues
 751                 * hand out dedicated default queues only after we have
 752                 * sufficient read queues.
 753                 */
 754                ctrl->io_queues[HCTX_TYPE_READ] = nr_read_queues;
 755                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
 756                ctrl->io_queues[HCTX_TYPE_DEFAULT] =
 757                        min(nr_default_queues, nr_io_queues);
 758                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
 759        } else {
 760                /*
 761                 * shared read/write queues
 762                 * either no write queues were requested, or we don't have
 763                 * sufficient queue count to have dedicated default queues.
 764                 */
 765                ctrl->io_queues[HCTX_TYPE_DEFAULT] =
 766                        min(nr_read_queues, nr_io_queues);
 767                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
 768        }
 769
 770        if (opts->nr_poll_queues && nr_io_queues) {
 771                /* map dedicated poll queues only if we have queues left */
 772                ctrl->io_queues[HCTX_TYPE_POLL] =
 773                        min(nr_poll_queues, nr_io_queues);
 774        }
 775
 776        for (i = 1; i < ctrl->ctrl.queue_count; i++) {
 777                ret = nvme_rdma_alloc_queue(ctrl, i,
 778                                ctrl->ctrl.sqsize + 1);
 779                if (ret)
 780                        goto out_free_queues;
 781        }
 782
 783        return 0;
 784
 785out_free_queues:
 786        for (i--; i >= 1; i--)
 787                nvme_rdma_free_queue(&ctrl->queues[i]);
 788
 789        return ret;
 790}
 791
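/*
 * The block layer per-request payload (cmd_size) reserves space behind
 * struct nvme_rdma_request for the inline data SGL, and for PI-capable I/O
 * tag sets also for the metadata SGL laid out by nvme_rdma_init_request().
 */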
 792static struct blk_mq_tag_set *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
 793                bool admin)
 794{
 795        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
 796        struct blk_mq_tag_set *set;
 797        int ret;
 798
 799        if (admin) {
 800                set = &ctrl->admin_tag_set;
 801                memset(set, 0, sizeof(*set));
 802                set->ops = &nvme_rdma_admin_mq_ops;
 803                set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
 804                set->reserved_tags = NVMF_RESERVED_TAGS;
 805                set->numa_node = nctrl->numa_node;
 806                set->cmd_size = sizeof(struct nvme_rdma_request) +
 807                                NVME_RDMA_DATA_SGL_SIZE;
 808                set->driver_data = ctrl;
 809                set->nr_hw_queues = 1;
 810                set->timeout = NVME_ADMIN_TIMEOUT;
 811                set->flags = BLK_MQ_F_NO_SCHED;
 812        } else {
 813                set = &ctrl->tag_set;
 814                memset(set, 0, sizeof(*set));
 815                set->ops = &nvme_rdma_mq_ops;
 816                set->queue_depth = nctrl->sqsize + 1;
 817                set->reserved_tags = NVMF_RESERVED_TAGS;
 818                set->numa_node = nctrl->numa_node;
 819                set->flags = BLK_MQ_F_SHOULD_MERGE;
 820                set->cmd_size = sizeof(struct nvme_rdma_request) +
 821                                NVME_RDMA_DATA_SGL_SIZE;
 822                if (nctrl->max_integrity_segments)
 823                        set->cmd_size += sizeof(struct nvme_rdma_sgl) +
 824                                         NVME_RDMA_METADATA_SGL_SIZE;
 825                set->driver_data = ctrl;
 826                set->nr_hw_queues = nctrl->queue_count - 1;
 827                set->timeout = NVME_IO_TIMEOUT;
 828                set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
 829        }
 830
 831        ret = blk_mq_alloc_tag_set(set);
 832        if (ret)
 833                return ERR_PTR(ret);
 834
 835        return set;
 836}
 837
 838static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl,
 839                bool remove)
 840{
 841        if (remove) {
 842                blk_cleanup_queue(ctrl->ctrl.admin_q);
 843                blk_cleanup_queue(ctrl->ctrl.fabrics_q);
 844                blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
 845        }
 846        if (ctrl->async_event_sqe.data) {
 847                cancel_work_sync(&ctrl->ctrl.async_event_work);
 848                nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
 849                                sizeof(struct nvme_command), DMA_TO_DEVICE);
 850                ctrl->async_event_sqe.data = NULL;
 851        }
 852        nvme_rdma_free_queue(&ctrl->queues[0]);
 853}
 854
 855static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
 856                bool new)
 857{
 858        bool pi_capable = false;
 859        int error;
 860
 861        error = nvme_rdma_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
 862        if (error)
 863                return error;
 864
 865        ctrl->device = ctrl->queues[0].device;
 866        ctrl->ctrl.numa_node = ibdev_to_node(ctrl->device->dev);
 867
 868        /* T10-PI support */
 869        if (ctrl->device->dev->attrs.device_cap_flags &
 870            IB_DEVICE_INTEGRITY_HANDOVER)
 871                pi_capable = true;
 872
 873        ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev,
 874                                                        pi_capable);
 875
 876        /*
 877         * Bind the async event SQE DMA mapping to the admin queue lifetime.
  878         * It's safe, since any change in the underlying RDMA device will issue
 879         * error recovery and queue re-creation.
 880         */
 881        error = nvme_rdma_alloc_qe(ctrl->device->dev, &ctrl->async_event_sqe,
 882                        sizeof(struct nvme_command), DMA_TO_DEVICE);
 883        if (error)
 884                goto out_free_queue;
 885
 886        if (new) {
 887                ctrl->ctrl.admin_tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
 888                if (IS_ERR(ctrl->ctrl.admin_tagset)) {
 889                        error = PTR_ERR(ctrl->ctrl.admin_tagset);
 890                        goto out_free_async_qe;
 891                }
 892
 893                ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set);
 894                if (IS_ERR(ctrl->ctrl.fabrics_q)) {
 895                        error = PTR_ERR(ctrl->ctrl.fabrics_q);
 896                        goto out_free_tagset;
 897                }
 898
 899                ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
 900                if (IS_ERR(ctrl->ctrl.admin_q)) {
 901                        error = PTR_ERR(ctrl->ctrl.admin_q);
 902                        goto out_cleanup_fabrics_q;
 903                }
 904        }
 905
 906        error = nvme_rdma_start_queue(ctrl, 0);
 907        if (error)
 908                goto out_cleanup_queue;
 909
 910        error = nvme_enable_ctrl(&ctrl->ctrl);
 911        if (error)
 912                goto out_stop_queue;
 913
 914        ctrl->ctrl.max_segments = ctrl->max_fr_pages;
 915        ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) - 9);
 916        if (pi_capable)
 917                ctrl->ctrl.max_integrity_segments = ctrl->max_fr_pages;
 918        else
 919                ctrl->ctrl.max_integrity_segments = 0;
 920
 921        blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
 922
 923        error = nvme_init_ctrl_finish(&ctrl->ctrl);
 924        if (error)
 925                goto out_quiesce_queue;
 926
 927        return 0;
 928
 929out_quiesce_queue:
 930        blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
 931        blk_sync_queue(ctrl->ctrl.admin_q);
 932out_stop_queue:
 933        nvme_rdma_stop_queue(&ctrl->queues[0]);
 934        nvme_cancel_admin_tagset(&ctrl->ctrl);
 935out_cleanup_queue:
 936        if (new)
 937                blk_cleanup_queue(ctrl->ctrl.admin_q);
 938out_cleanup_fabrics_q:
 939        if (new)
 940                blk_cleanup_queue(ctrl->ctrl.fabrics_q);
 941out_free_tagset:
 942        if (new)
 943                blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
 944out_free_async_qe:
 945        if (ctrl->async_event_sqe.data) {
 946                nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
 947                        sizeof(struct nvme_command), DMA_TO_DEVICE);
 948                ctrl->async_event_sqe.data = NULL;
 949        }
 950out_free_queue:
 951        nvme_rdma_free_queue(&ctrl->queues[0]);
 952        return error;
 953}
 954
 955static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl,
 956                bool remove)
 957{
 958        if (remove) {
 959                blk_cleanup_queue(ctrl->ctrl.connect_q);
 960                blk_mq_free_tag_set(ctrl->ctrl.tagset);
 961        }
 962        nvme_rdma_free_io_queues(ctrl);
 963}
 964
 965static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
 966{
 967        int ret;
 968
 969        ret = nvme_rdma_alloc_io_queues(ctrl);
 970        if (ret)
 971                return ret;
 972
 973        if (new) {
 974                ctrl->ctrl.tagset = nvme_rdma_alloc_tagset(&ctrl->ctrl, false);
 975                if (IS_ERR(ctrl->ctrl.tagset)) {
 976                        ret = PTR_ERR(ctrl->ctrl.tagset);
 977                        goto out_free_io_queues;
 978                }
 979
 980                ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
 981                if (IS_ERR(ctrl->ctrl.connect_q)) {
 982                        ret = PTR_ERR(ctrl->ctrl.connect_q);
 983                        goto out_free_tag_set;
 984                }
 985        }
 986
 987        ret = nvme_rdma_start_io_queues(ctrl);
 988        if (ret)
 989                goto out_cleanup_connect_q;
 990
 991        if (!new) {
 992                nvme_start_queues(&ctrl->ctrl);
 993                if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
 994                        /*
 995                         * If we timed out waiting for freeze we are likely to
 996                         * be stuck.  Fail the controller initialization just
 997                         * to be safe.
 998                         */
 999                        ret = -ENODEV;
1000                        goto out_wait_freeze_timed_out;
1001                }
1002                blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
1003                        ctrl->ctrl.queue_count - 1);
1004                nvme_unfreeze(&ctrl->ctrl);
1005        }
1006
1007        return 0;
1008
1009out_wait_freeze_timed_out:
1010        nvme_stop_queues(&ctrl->ctrl);
1011        nvme_sync_io_queues(&ctrl->ctrl);
1012        nvme_rdma_stop_io_queues(ctrl);
1013out_cleanup_connect_q:
1014        nvme_cancel_tagset(&ctrl->ctrl);
1015        if (new)
1016                blk_cleanup_queue(ctrl->ctrl.connect_q);
1017out_free_tag_set:
1018        if (new)
1019                blk_mq_free_tag_set(ctrl->ctrl.tagset);
1020out_free_io_queues:
1021        nvme_rdma_free_io_queues(ctrl);
1022        return ret;
1023}
1024
1025static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
1026                bool remove)
1027{
1028        blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
1029        blk_sync_queue(ctrl->ctrl.admin_q);
1030        nvme_rdma_stop_queue(&ctrl->queues[0]);
1031        nvme_cancel_admin_tagset(&ctrl->ctrl);
1032        if (remove)
1033                blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
1034        nvme_rdma_destroy_admin_queue(ctrl, remove);
1035}
1036
1037static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
1038                bool remove)
1039{
1040        if (ctrl->ctrl.queue_count > 1) {
1041                nvme_start_freeze(&ctrl->ctrl);
1042                nvme_stop_queues(&ctrl->ctrl);
1043                nvme_sync_io_queues(&ctrl->ctrl);
1044                nvme_rdma_stop_io_queues(ctrl);
1045                nvme_cancel_tagset(&ctrl->ctrl);
1046                if (remove)
1047                        nvme_start_queues(&ctrl->ctrl);
1048                nvme_rdma_destroy_io_queues(ctrl, remove);
1049        }
1050}
1051
1052static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
1053{
1054        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
1055
1056        if (list_empty(&ctrl->list))
1057                goto free_ctrl;
1058
1059        mutex_lock(&nvme_rdma_ctrl_mutex);
1060        list_del(&ctrl->list);
1061        mutex_unlock(&nvme_rdma_ctrl_mutex);
1062
1063        nvmf_free_options(nctrl->opts);
1064free_ctrl:
1065        kfree(ctrl->queues);
1066        kfree(ctrl);
1067}
1068
1069static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl)
1070{
1071        /* If we are resetting/deleting then do nothing */
1072        if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) {
1073                WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW ||
1074                        ctrl->ctrl.state == NVME_CTRL_LIVE);
1075                return;
1076        }
1077
1078        if (nvmf_should_reconnect(&ctrl->ctrl)) {
1079                dev_info(ctrl->ctrl.device, "Reconnecting in %d seconds...\n",
1080                        ctrl->ctrl.opts->reconnect_delay);
1081                queue_delayed_work(nvme_wq, &ctrl->reconnect_work,
1082                                ctrl->ctrl.opts->reconnect_delay * HZ);
1083        } else {
1084                nvme_delete_ctrl(&ctrl->ctrl);
1085        }
1086}
1087
1088static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
1089{
1090        int ret;
1091        bool changed;
1092
1093        ret = nvme_rdma_configure_admin_queue(ctrl, new);
1094        if (ret)
1095                return ret;
1096
1097        if (ctrl->ctrl.icdoff) {
1098                dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
                     ret = -EOPNOTSUPP;
 1099                goto destroy_admin;
1100        }
1101
1102        if (!(ctrl->ctrl.sgls & (1 << 2))) {
1103                dev_err(ctrl->ctrl.device,
1104                        "Mandatory keyed sgls are not supported!\n");
                     ret = -EOPNOTSUPP;
 1105                goto destroy_admin;
1106        }
1107
1108        if (ctrl->ctrl.opts->queue_size > ctrl->ctrl.sqsize + 1) {
1109                dev_warn(ctrl->ctrl.device,
1110                        "queue_size %zu > ctrl sqsize %u, clamping down\n",
1111                        ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
1112        }
1113
1114        if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
1115                dev_warn(ctrl->ctrl.device,
1116                        "sqsize %u > ctrl maxcmd %u, clamping down\n",
1117                        ctrl->ctrl.sqsize + 1, ctrl->ctrl.maxcmd);
1118                ctrl->ctrl.sqsize = ctrl->ctrl.maxcmd - 1;
1119        }
1120
1121        if (ctrl->ctrl.sgls & (1 << 20))
1122                ctrl->use_inline_data = true;
1123
1124        if (ctrl->ctrl.queue_count > 1) {
1125                ret = nvme_rdma_configure_io_queues(ctrl, new);
1126                if (ret)
1127                        goto destroy_admin;
1128        }
1129
1130        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
1131        if (!changed) {
1132                /*
1133                 * state change failure is ok if we started ctrl delete,
 1134                 * unless we're in the middle of creating a new controller,
 1135                 * to avoid races with the teardown flow.
1136                 */
1137                WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
1138                             ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
1139                WARN_ON_ONCE(new);
1140                ret = -EINVAL;
1141                goto destroy_io;
1142        }
1143
1144        nvme_start_ctrl(&ctrl->ctrl);
1145        return 0;
1146
1147destroy_io:
1148        if (ctrl->ctrl.queue_count > 1) {
1149                nvme_stop_queues(&ctrl->ctrl);
1150                nvme_sync_io_queues(&ctrl->ctrl);
1151                nvme_rdma_stop_io_queues(ctrl);
1152                nvme_cancel_tagset(&ctrl->ctrl);
1153                nvme_rdma_destroy_io_queues(ctrl, new);
1154        }
1155destroy_admin:
1156        blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
1157        blk_sync_queue(ctrl->ctrl.admin_q);
1158        nvme_rdma_stop_queue(&ctrl->queues[0]);
1159        nvme_cancel_admin_tagset(&ctrl->ctrl);
1160        nvme_rdma_destroy_admin_queue(ctrl, new);
1161        return ret;
1162}
1163
1164static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
1165{
1166        struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
1167                        struct nvme_rdma_ctrl, reconnect_work);
1168
1169        ++ctrl->ctrl.nr_reconnects;
1170
1171        if (nvme_rdma_setup_ctrl(ctrl, false))
1172                goto requeue;
1173
1174        dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
1175                        ctrl->ctrl.nr_reconnects);
1176
1177        ctrl->ctrl.nr_reconnects = 0;
1178
1179        return;
1180
1181requeue:
1182        dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
1183                        ctrl->ctrl.nr_reconnects);
1184        nvme_rdma_reconnect_or_remove(ctrl);
1185}
1186
1187static void nvme_rdma_error_recovery_work(struct work_struct *work)
1188{
1189        struct nvme_rdma_ctrl *ctrl = container_of(work,
1190                        struct nvme_rdma_ctrl, err_work);
1191
1192        nvme_stop_keep_alive(&ctrl->ctrl);
1193        nvme_rdma_teardown_io_queues(ctrl, false);
1194        nvme_start_queues(&ctrl->ctrl);
1195        nvme_rdma_teardown_admin_queue(ctrl, false);
1196        blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
1197
1198        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
1199                /* state change failure is ok if we started ctrl delete */
1200                WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
1201                             ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
1202                return;
1203        }
1204
1205        nvme_rdma_reconnect_or_remove(ctrl);
1206}
1207
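/*
 * Kick off error recovery: move the controller to RESETTING and defer the
 * teardown and reconnect (or delete) to err_work.
 */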
1208static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
1209{
1210        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
1211                return;
1212
1213        dev_warn(ctrl->ctrl.device, "starting error recovery\n");
1214        queue_work(nvme_reset_wq, &ctrl->err_work);
1215}
1216
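/*
 * req->ref is set to 2 in nvme_rdma_map_data() (send and recv completions);
 * the request is completed only once both references have been dropped.
 */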
1217static void nvme_rdma_end_request(struct nvme_rdma_request *req)
1218{
1219        struct request *rq = blk_mq_rq_from_pdu(req);
1220
1221        if (!refcount_dec_and_test(&req->ref))
1222                return;
1223        if (!nvme_try_complete_req(rq, req->status, req->result))
1224                nvme_rdma_complete_rq(rq);
1225}
1226
1227static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
1228                const char *op)
1229{
1230        struct nvme_rdma_queue *queue = wc->qp->qp_context;
1231        struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1232
1233        if (ctrl->ctrl.state == NVME_CTRL_LIVE)
1234                dev_info(ctrl->ctrl.device,
1235                             "%s for CQE 0x%p failed with status %s (%d)\n",
1236                             op, wc->wr_cqe,
1237                             ib_wc_status_msg(wc->status), wc->status);
1238        nvme_rdma_error_recovery(ctrl);
1239}
1240
1241static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
1242{
1243        if (unlikely(wc->status != IB_WC_SUCCESS))
1244                nvme_rdma_wr_error(cq, wc, "MEMREG");
1245}
1246
1247static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
1248{
1249        struct nvme_rdma_request *req =
1250                container_of(wc->wr_cqe, struct nvme_rdma_request, reg_cqe);
1251
1252        if (unlikely(wc->status != IB_WC_SUCCESS))
1253                nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
1254        else
1255                nvme_rdma_end_request(req);
1256}
1257
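/*
 * Post a signaled LOCAL_INV work request to invalidate the rkey of the
 * registered MR; the completion is handled by nvme_rdma_inv_rkey_done().
 */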
1258static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
1259                struct nvme_rdma_request *req)
1260{
1261        struct ib_send_wr wr = {
1262                .opcode             = IB_WR_LOCAL_INV,
1263                .next               = NULL,
1264                .num_sge            = 0,
1265                .send_flags         = IB_SEND_SIGNALED,
1266                .ex.invalidate_rkey = req->mr->rkey,
1267        };
1268
1269        req->reg_cqe.done = nvme_rdma_inv_rkey_done;
1270        wr.wr_cqe = &req->reg_cqe;
1271
1272        return ib_post_send(queue->qp, &wr, NULL);
1273}
1274
1275static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
1276                struct request *rq)
1277{
1278        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1279        struct nvme_rdma_device *dev = queue->device;
1280        struct ib_device *ibdev = dev->dev;
1281        struct list_head *pool = &queue->qp->rdma_mrs;
1282
1283        if (!blk_rq_nr_phys_segments(rq))
1284                return;
1285
1286        if (blk_integrity_rq(rq)) {
1287                ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl,
1288                                req->metadata_sgl->nents, rq_dma_dir(rq));
1289                sg_free_table_chained(&req->metadata_sgl->sg_table,
1290                                      NVME_INLINE_METADATA_SG_CNT);
1291        }
1292
1293        if (req->use_sig_mr)
1294                pool = &queue->qp->sig_mrs;
1295
1296        if (req->mr) {
1297                ib_mr_pool_put(queue->qp, pool, req->mr);
1298                req->mr = NULL;
1299        }
1300
1301        ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents,
1302                        rq_dma_dir(rq));
1303        sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT);
1304}
1305
1306static int nvme_rdma_set_sg_null(struct nvme_command *c)
1307{
1308        struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
1309
1310        sg->addr = 0;
1311        put_unaligned_le24(0, sg->length);
1312        put_unaligned_le32(0, sg->key);
1313        sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
1314        return 0;
1315}
1316
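/*
 * Inline (in-capsule) data: add one send SGE per mapped data segment after
 * the command SGE and point the command's data SGL descriptor at the
 * in-capsule offset.
 */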
1317static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
1318                struct nvme_rdma_request *req, struct nvme_command *c,
1319                int count)
1320{
1321        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
1322        struct ib_sge *sge = &req->sge[1];
1323        struct scatterlist *sgl;
1324        u32 len = 0;
1325        int i;
1326
1327        for_each_sg(req->data_sgl.sg_table.sgl, sgl, count, i) {
1328                sge->addr = sg_dma_address(sgl);
1329                sge->length = sg_dma_len(sgl);
1330                sge->lkey = queue->device->pd->local_dma_lkey;
1331                len += sge->length;
1332                sge++;
1333        }
1334
1335        sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
1336        sg->length = cpu_to_le32(len);
1337        sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
1338
1339        req->num_sge += count;
1340        return 0;
1341}
1342
1343static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue,
1344                struct nvme_rdma_request *req, struct nvme_command *c)
1345{
1346        struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
1347
1348        sg->addr = cpu_to_le64(sg_dma_address(req->data_sgl.sg_table.sgl));
1349        put_unaligned_le24(sg_dma_len(req->data_sgl.sg_table.sgl), sg->length);
1350        put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key);
1351        sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
1352        return 0;
1353}
1354
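/*
 * Map the data scatterlist through a fast-registration MR from the queue's
 * pool and describe it with a keyed SGL descriptor that asks the target to
 * remotely invalidate the rkey.
 */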
1355static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
1356                struct nvme_rdma_request *req, struct nvme_command *c,
1357                int count)
1358{
1359        struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
1360        int nr;
1361
1362        req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs);
1363        if (WARN_ON_ONCE(!req->mr))
1364                return -EAGAIN;
1365
1366        /*
1367         * Align the MR to a 4K page size to match the ctrl page size and
1368         * the block virtual boundary.
1369         */
1370        nr = ib_map_mr_sg(req->mr, req->data_sgl.sg_table.sgl, count, NULL,
1371                          SZ_4K);
1372        if (unlikely(nr < count)) {
1373                ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
1374                req->mr = NULL;
1375                if (nr < 0)
1376                        return nr;
1377                return -EINVAL;
1378        }
1379
1380        ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
1381
1382        req->reg_cqe.done = nvme_rdma_memreg_done;
1383        memset(&req->reg_wr, 0, sizeof(req->reg_wr));
1384        req->reg_wr.wr.opcode = IB_WR_REG_MR;
1385        req->reg_wr.wr.wr_cqe = &req->reg_cqe;
1386        req->reg_wr.wr.num_sge = 0;
1387        req->reg_wr.mr = req->mr;
1388        req->reg_wr.key = req->mr->rkey;
1389        req->reg_wr.access = IB_ACCESS_LOCAL_WRITE |
1390                             IB_ACCESS_REMOTE_READ |
1391                             IB_ACCESS_REMOTE_WRITE;
1392
1393        sg->addr = cpu_to_le64(req->mr->iova);
1394        put_unaligned_le24(req->mr->length, sg->length);
1395        put_unaligned_le32(req->mr->rkey, sg->key);
1396        sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) |
1397                        NVME_SGL_FMT_INVALIDATE;
1398
1399        return 0;
1400}
1401
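/*
 * Fill an ib_sig_domain with T10-DIF parameters derived from the NVMe
 * read/write command: PI interval, reference/application tags and the
 * PI type 3 escape handling.
 */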
1402static void nvme_rdma_set_sig_domain(struct blk_integrity *bi,
1403                struct nvme_command *cmd, struct ib_sig_domain *domain,
1404                u16 control, u8 pi_type)
1405{
1406        domain->sig_type = IB_SIG_TYPE_T10_DIF;
1407        domain->sig.dif.bg_type = IB_T10DIF_CRC;
1408        domain->sig.dif.pi_interval = 1 << bi->interval_exp;
1409        domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag);
1410        if (control & NVME_RW_PRINFO_PRCHK_REF)
1411                domain->sig.dif.ref_remap = true;
1412
1413        domain->sig.dif.app_tag = le16_to_cpu(cmd->rw.apptag);
1414        domain->sig.dif.apptag_check_mask = le16_to_cpu(cmd->rw.appmask);
1415        domain->sig.dif.app_escape = true;
1416        if (pi_type == NVME_NS_DPS_PI_TYPE3)
1417                domain->sig.dif.ref_escape = true;
1418}
1419
1420static void nvme_rdma_set_sig_attrs(struct blk_integrity *bi,
1421                struct nvme_command *cmd, struct ib_sig_attrs *sig_attrs,
1422                u8 pi_type)
1423{
1424        u16 control = le16_to_cpu(cmd->rw.control);
1425
1426        memset(sig_attrs, 0, sizeof(*sig_attrs));
1427        if (control & NVME_RW_PRINFO_PRACT) {
1428                /* for WRITE_INSERT/READ_STRIP no memory domain */
1429                sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
1430                nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
1431                                         pi_type);
1432                /* Clear the PRACT bit since HCA will generate/verify the PI */
1433                control &= ~NVME_RW_PRINFO_PRACT;
1434                cmd->rw.control = cpu_to_le16(control);
1435        } else {
1436                /* for WRITE_PASS/READ_PASS both wire/memory domains exist */
1437                nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->wire, control,
1438                                         pi_type);
1439                nvme_rdma_set_sig_domain(bi, cmd, &sig_attrs->mem, control,
1440                                         pi_type);
1441        }
1442}
1443
1444static void nvme_rdma_set_prot_checks(struct nvme_command *cmd, u8 *mask)
1445{
1446        *mask = 0;
1447        if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_REF)
1448                *mask |= IB_SIG_CHECK_REFTAG;
1449        if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_GUARD)
1450                *mask |= IB_SIG_CHECK_GUARD;
1451}
1452
1453static void nvme_rdma_sig_done(struct ib_cq *cq, struct ib_wc *wc)
1454{
1455        if (unlikely(wc->status != IB_WC_SUCCESS))
1456                nvme_rdma_wr_error(cq, wc, "SIG");
1457}
1458
1459static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue,
1460                struct nvme_rdma_request *req, struct nvme_command *c,
1461                int count, int pi_count)
1462{
1463        struct nvme_rdma_sgl *sgl = &req->data_sgl;
1464        struct ib_reg_wr *wr = &req->reg_wr;
1465        struct request *rq = blk_mq_rq_from_pdu(req);
1466        struct nvme_ns *ns = rq->q->queuedata;
1467        struct bio *bio = rq->bio;
1468        struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
1469        int nr;
1470
1471        req->mr = ib_mr_pool_get(queue->qp, &queue->qp->sig_mrs);
1472        if (WARN_ON_ONCE(!req->mr))
1473                return -EAGAIN;
1474
1475        nr = ib_map_mr_sg_pi(req->mr, sgl->sg_table.sgl, count, NULL,
1476                             req->metadata_sgl->sg_table.sgl, pi_count, NULL,
1477                             SZ_4K);
1478        if (unlikely(nr))
1479                goto mr_put;
1480
1481        nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c,
1482                                req->mr->sig_attrs, ns->pi_type);
1483        nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask);
1484
1485        ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
1486
1487        req->reg_cqe.done = nvme_rdma_sig_done;
1488        memset(wr, 0, sizeof(*wr));
1489        wr->wr.opcode = IB_WR_REG_MR_INTEGRITY;
1490        wr->wr.wr_cqe = &req->reg_cqe;
1491        wr->wr.num_sge = 0;
1492        wr->wr.send_flags = 0;
1493        wr->mr = req->mr;
1494        wr->key = req->mr->rkey;
1495        wr->access = IB_ACCESS_LOCAL_WRITE |
1496                     IB_ACCESS_REMOTE_READ |
1497                     IB_ACCESS_REMOTE_WRITE;
1498
1499        sg->addr = cpu_to_le64(req->mr->iova);
1500        put_unaligned_le24(req->mr->length, sg->length);
1501        put_unaligned_le32(req->mr->rkey, sg->key);
1502        sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
1503
1504        return 0;
1505
1506mr_put:
1507        ib_mr_pool_put(queue->qp, &queue->qp->sig_mrs, req->mr);
1508        req->mr = NULL;
1509        if (nr < 0)
1510                return nr;
1511        return -EINVAL;
1512}
1513
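    /*
     * Build the data descriptor for a request.  Depending on the payload
     * this is either a NULL SGL (no data), inline data in the command
     * capsule (small writes on I/O queues), a single descriptor using the
     * PD's global rkey, a fast-registration MR, or a PI-enabled MR when
     * end-to-end protection information is in use.
     */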
1514static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
1515                struct request *rq, struct nvme_command *c)
1516{
1517        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1518        struct nvme_rdma_device *dev = queue->device;
1519        struct ib_device *ibdev = dev->dev;
1520        int pi_count = 0;
1521        int count, ret;
1522
1523        req->num_sge = 1;
1524        refcount_set(&req->ref, 2); /* send and recv completions */
1525
1526        c->common.flags |= NVME_CMD_SGL_METABUF;
1527
1528        if (!blk_rq_nr_phys_segments(rq))
1529                return nvme_rdma_set_sg_null(c);
1530
1531        req->data_sgl.sg_table.sgl = (struct scatterlist *)(req + 1);
1532        ret = sg_alloc_table_chained(&req->data_sgl.sg_table,
1533                        blk_rq_nr_phys_segments(rq), req->data_sgl.sg_table.sgl,
1534                        NVME_INLINE_SG_CNT);
1535        if (ret)
1536                return -ENOMEM;
1537
1538        req->data_sgl.nents = blk_rq_map_sg(rq->q, rq,
1539                                            req->data_sgl.sg_table.sgl);
1540
1541        count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl,
1542                              req->data_sgl.nents, rq_dma_dir(rq));
1543        if (unlikely(count <= 0)) {
1544                ret = -EIO;
1545                goto out_free_table;
1546        }
1547
1548        if (blk_integrity_rq(rq)) {
1549                req->metadata_sgl->sg_table.sgl =
1550                        (struct scatterlist *)(req->metadata_sgl + 1);
1551                ret = sg_alloc_table_chained(&req->metadata_sgl->sg_table,
1552                                blk_rq_count_integrity_sg(rq->q, rq->bio),
1553                                req->metadata_sgl->sg_table.sgl,
1554                                NVME_INLINE_METADATA_SG_CNT);
1555                if (unlikely(ret)) {
1556                        ret = -ENOMEM;
1557                        goto out_unmap_sg;
1558                }
1559
1560                req->metadata_sgl->nents = blk_rq_map_integrity_sg(rq->q,
1561                                rq->bio, req->metadata_sgl->sg_table.sgl);
1562                pi_count = ib_dma_map_sg(ibdev,
1563                                         req->metadata_sgl->sg_table.sgl,
1564                                         req->metadata_sgl->nents,
1565                                         rq_dma_dir(rq));
1566                if (unlikely(pi_count <= 0)) {
1567                        ret = -EIO;
1568                        goto out_free_pi_table;
1569                }
1570        }
1571
1572        if (req->use_sig_mr) {
1573                ret = nvme_rdma_map_sg_pi(queue, req, c, count, pi_count);
1574                goto out;
1575        }
1576
1577        if (count <= dev->num_inline_segments) {
1578                if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
1579                    queue->ctrl->use_inline_data &&
1580                    blk_rq_payload_bytes(rq) <=
1581                                nvme_rdma_inline_data_size(queue)) {
1582                        ret = nvme_rdma_map_sg_inline(queue, req, c, count);
1583                        goto out;
1584                }
1585
1586                if (count == 1 && dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY) {
1587                        ret = nvme_rdma_map_sg_single(queue, req, c);
1588                        goto out;
1589                }
1590        }
1591
1592        ret = nvme_rdma_map_sg_fr(queue, req, c, count);
1593out:
1594        if (unlikely(ret))
1595                goto out_unmap_pi_sg;
1596
1597        return 0;
1598
1599out_unmap_pi_sg:
1600        if (blk_integrity_rq(rq))
1601                ib_dma_unmap_sg(ibdev, req->metadata_sgl->sg_table.sgl,
1602                                req->metadata_sgl->nents, rq_dma_dir(rq));
1603out_free_pi_table:
1604        if (blk_integrity_rq(rq))
1605                sg_free_table_chained(&req->metadata_sgl->sg_table,
1606                                      NVME_INLINE_METADATA_SG_CNT);
1607out_unmap_sg:
1608        ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents,
1609                        rq_dma_dir(rq));
1610out_free_table:
1611        sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT);
1612        return ret;
1613}
1614
1615static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
1616{
1617        struct nvme_rdma_qe *qe =
1618                container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
1619        struct nvme_rdma_request *req =
1620                container_of(qe, struct nvme_rdma_request, sqe);
1621
1622        if (unlikely(wc->status != IB_WC_SUCCESS))
1623                nvme_rdma_wr_error(cq, wc, "SEND");
1624        else
1625                nvme_rdma_end_request(req);
1626}
1627
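    /*
     * Post the command capsule as an RDMA SEND.  When a memory registration
     * work request is passed in via @first, it is chained in front of the
     * SEND so that both are posted with a single ib_post_send() call.
     */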
1628static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
1629                struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
1630                struct ib_send_wr *first)
1631{
1632        struct ib_send_wr wr;
1633        int ret;
1634
1635        sge->addr   = qe->dma;
1636        sge->length = sizeof(struct nvme_command);
1637        sge->lkey   = queue->device->pd->local_dma_lkey;
1638
1639        wr.next       = NULL;
1640        wr.wr_cqe     = &qe->cqe;
1641        wr.sg_list    = sge;
1642        wr.num_sge    = num_sge;
1643        wr.opcode     = IB_WR_SEND;
1644        wr.send_flags = IB_SEND_SIGNALED;
1645
1646        if (first)
1647                first->next = &wr;
1648        else
1649                first = &wr;
1650
1651        ret = ib_post_send(queue->qp, first, NULL);
1652        if (unlikely(ret)) {
1653                dev_err(queue->ctrl->ctrl.device,
1654                             "%s failed with error code %d\n", __func__, ret);
1655        }
1656        return ret;
1657}
1658
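    /* (Re-)post a receive buffer sized for a single NVMe completion. */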
1659static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue,
1660                struct nvme_rdma_qe *qe)
1661{
1662        struct ib_recv_wr wr;
1663        struct ib_sge list;
1664        int ret;
1665
1666        list.addr   = qe->dma;
1667        list.length = sizeof(struct nvme_completion);
1668        list.lkey   = queue->device->pd->local_dma_lkey;
1669
1670        qe->cqe.done = nvme_rdma_recv_done;
1671
1672        wr.next     = NULL;
1673        wr.wr_cqe   = &qe->cqe;
1674        wr.sg_list  = &list;
1675        wr.num_sge  = 1;
1676
1677        ret = ib_post_recv(queue->qp, &wr, NULL);
1678        if (unlikely(ret)) {
1679                dev_err(queue->ctrl->ctrl.device,
1680                        "%s failed with error code %d\n", __func__, ret);
1681        }
1682        return ret;
1683}
1684
1685static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
1686{
1687        u32 queue_idx = nvme_rdma_queue_idx(queue);
1688
1689        if (queue_idx == 0)
1690                return queue->ctrl->admin_tag_set.tags[queue_idx];
1691        return queue->ctrl->tag_set.tags[queue_idx - 1];
1692}
1693
1694static void nvme_rdma_async_done(struct ib_cq *cq, struct ib_wc *wc)
1695{
1696        if (unlikely(wc->status != IB_WC_SUCCESS))
1697                nvme_rdma_wr_error(cq, wc, "ASYNC");
1698}
1699
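    /*
     * AEN commands do not have a struct request; they use the pre-allocated
     * async_event_sqe on the admin queue and the reserved command id
     * NVME_AQ_BLK_MQ_DEPTH, so the completion path can tell them apart from
     * regular admin commands.
     */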
1700static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg)
1701{
1702        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
1703        struct nvme_rdma_queue *queue = &ctrl->queues[0];
1704        struct ib_device *dev = queue->device->dev;
1705        struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe;
1706        struct nvme_command *cmd = sqe->data;
1707        struct ib_sge sge;
1708        int ret;
1709
1710        ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE);
1711
1712        memset(cmd, 0, sizeof(*cmd));
1713        cmd->common.opcode = nvme_admin_async_event;
1714        cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
1715        cmd->common.flags |= NVME_CMD_SGL_METABUF;
1716        nvme_rdma_set_sg_null(cmd);
1717
1718        sqe->cqe.done = nvme_rdma_async_done;
1719
1720        ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
1721                        DMA_TO_DEVICE);
1722
1723        ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL);
1724        WARN_ON_ONCE(ret);
1725}
1726
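    /*
     * Handle a response capsule: look up the request by command id and stash
     * its status/result.  If the completion did not carry a remote
     * invalidation of our rkey, a local invalidation WR is issued and its
     * completion, rather than this receive completion, ends the request.
     */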
1727static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1728                struct nvme_completion *cqe, struct ib_wc *wc)
1729{
1730        struct request *rq;
1731        struct nvme_rdma_request *req;
1732
1733        rq = nvme_find_rq(nvme_rdma_tagset(queue), cqe->command_id);
1734        if (!rq) {
1735                dev_err(queue->ctrl->ctrl.device,
1736                        "got bad command_id %#x on QP %#x\n",
1737                        cqe->command_id, queue->qp->qp_num);
1738                nvme_rdma_error_recovery(queue->ctrl);
1739                return;
1740        }
1741        req = blk_mq_rq_to_pdu(rq);
1742
1743        req->status = cqe->status;
1744        req->result = cqe->result;
1745
1746        if (wc->wc_flags & IB_WC_WITH_INVALIDATE) {
1747                if (unlikely(!req->mr ||
1748                             wc->ex.invalidate_rkey != req->mr->rkey)) {
1749                        dev_err(queue->ctrl->ctrl.device,
1750                                "Bogus remote invalidation for rkey %#x\n",
1751                                req->mr ? req->mr->rkey : 0);
1752                        nvme_rdma_error_recovery(queue->ctrl);
1753                }
1754        } else if (req->mr) {
1755                int ret;
1756
1757                ret = nvme_rdma_inv_rkey(queue, req);
1758                if (unlikely(ret < 0)) {
1759                        dev_err(queue->ctrl->ctrl.device,
1760                                "Queueing INV WR for rkey %#x failed (%d)\n",
1761                                req->mr->rkey, ret);
1762                        nvme_rdma_error_recovery(queue->ctrl);
1763                }
1764                /* the local invalidation completion will end the request */
1765                return;
1766        }
1767
1768        nvme_rdma_end_request(req);
1769}
1770
1771static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
1772{
1773        struct nvme_rdma_qe *qe =
1774                container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
1775        struct nvme_rdma_queue *queue = wc->qp->qp_context;
1776        struct ib_device *ibdev = queue->device->dev;
1777        struct nvme_completion *cqe = qe->data;
1778        const size_t len = sizeof(struct nvme_completion);
1779
1780        if (unlikely(wc->status != IB_WC_SUCCESS)) {
1781                nvme_rdma_wr_error(cq, wc, "RECV");
1782                return;
1783        }
1784
1785        /* sanity check on the received data length */
1786        if (unlikely(wc->byte_len < len)) {
1787                dev_err(queue->ctrl->ctrl.device,
1788                        "Unexpected nvme completion length (%d)\n", wc->byte_len);
1789                nvme_rdma_error_recovery(queue->ctrl);
1790                return;
1791        }
1792
1793        ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1794        /*
1795         * AEN requests are special as they don't time out and can
1796         * survive any kind of queue freeze and often don't respond to
1797         * aborts.  We don't even bother to allocate a struct request
1798         * for them but rather special case them here.
1799         */
1800        if (unlikely(nvme_is_aen_req(nvme_rdma_queue_idx(queue),
1801                                     cqe->command_id)))
1802                nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
1803                                &cqe->result);
1804        else
1805                nvme_rdma_process_nvme_rsp(queue, cqe, wc);
1806        ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1807
1808        nvme_rdma_post_recv(queue, qe);
1809}
1810
1811static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
1812{
1813        int ret, i;
1814
1815        for (i = 0; i < queue->queue_size; i++) {
1816                ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]);
1817                if (ret)
1818                        return ret;
1819        }
1820
1821        return 0;
1822}
1823
1824static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
1825                struct rdma_cm_event *ev)
1826{
1827        struct rdma_cm_id *cm_id = queue->cm_id;
1828        int status = ev->status;
1829        const char *rej_msg;
1830        const struct nvme_rdma_cm_rej *rej_data;
1831        u8 rej_data_len;
1832
1833        rej_msg = rdma_reject_msg(cm_id, status);
1834        rej_data = rdma_consumer_reject_data(cm_id, ev, &rej_data_len);
1835
1836        if (rej_data && rej_data_len >= sizeof(u16)) {
1837                u16 sts = le16_to_cpu(rej_data->sts);
1838
1839                dev_err(queue->ctrl->ctrl.device,
1840                      "Connect rejected: status %d (%s) nvme status %d (%s).\n",
1841                      status, rej_msg, sts, nvme_rdma_cm_msg(sts));
1842        } else {
1843                dev_err(queue->ctrl->ctrl.device,
1844                        "Connect rejected: status %d (%s).\n", status, rej_msg);
1845        }
1846
1847        return -ECONNRESET;
1848}
1849
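    /*
     * RDMA/CM address resolution completed: create the queue's IB resources
     * (CQ, QP and MR pools) and kick off route resolution, applying the
     * optional type-of-service setting to the CM id.
     */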
1850static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
1851{
1852        struct nvme_ctrl *ctrl = &queue->ctrl->ctrl;
1853        int ret;
1854
1855        ret = nvme_rdma_create_queue_ib(queue);
1856        if (ret)
1857                return ret;
1858
1859        if (ctrl->opts->tos >= 0)
1860                rdma_set_service_type(queue->cm_id, ctrl->opts->tos);
1861        ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
1862        if (ret) {
1863                dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n",
1864                        ret);
1865                goto out_destroy_queue;
1866        }
1867
1868        return 0;
1869
1870out_destroy_queue:
1871        nvme_rdma_destroy_queue_ib(queue);
1872        return ret;
1873}
1874
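    /*
     * Route resolution completed: issue the RDMA/CM connect.  The NVMe/RDMA
     * connect parameters (record format, queue id and the host send/receive
     * queue sizes) travel in the CM private data.  rdma_connect_locked() is
     * used because this runs from the CM event handler, where the cm_id
     * handler mutex is already held.
     */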
1875static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
1876{
1877        struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1878        struct rdma_conn_param param = { };
1879        struct nvme_rdma_cm_req priv = { };
1880        int ret;
1881
1882        param.qp_num = queue->qp->qp_num;
1883        param.flow_control = 1;
1884
1885        param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom;
1886        /* maximum retry count */
1887        param.retry_count = 7;
1888        param.rnr_retry_count = 7;
1889        param.private_data = &priv;
1890        param.private_data_len = sizeof(priv);
1891
1892        priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1893        priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue));
1894        /*
1895         * set the admin queue depth to the minimum size
1896         * specified by the Fabrics standard.
1897         */
1898        if (priv.qid == 0) {
1899                priv.hrqsize = cpu_to_le16(NVME_AQ_DEPTH);
1900                priv.hsqsize = cpu_to_le16(NVME_AQ_DEPTH - 1);
1901        } else {
1902                /*
1903                 * The current interpretation of the Fabrics spec is
1904                 * that hrqsize must be at least sqsize + 1, i.e. the
1905                 * 1's-based representation of sqsize.
1906                 */
1907                priv.hrqsize = cpu_to_le16(queue->queue_size);
1908                priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize);
1909        }
1910
1911        ret = rdma_connect_locked(queue->cm_id, &param);
1912        if (ret) {
1913                dev_err(ctrl->ctrl.device,
1914                        "rdma_connect_locked failed (%d).\n", ret);
1915                return ret;
1916        }
1917
1918        return 0;
1919}
1920
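    /*
     * Central RDMA/CM event dispatcher.  Address/route resolution and the
     * established event drive queue setup; fatal events record an error in
     * queue->cm_error and complete cm_done so the waiting queue-setup path
     * can bail out.  Disconnects trigger controller error recovery.
     */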
1921static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
1922                struct rdma_cm_event *ev)
1923{
1924        struct nvme_rdma_queue *queue = cm_id->context;
1925        int cm_error = 0;
1926
1927        dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n",
1928                rdma_event_msg(ev->event), ev->event,
1929                ev->status, cm_id);
1930
1931        switch (ev->event) {
1932        case RDMA_CM_EVENT_ADDR_RESOLVED:
1933                cm_error = nvme_rdma_addr_resolved(queue);
1934                break;
1935        case RDMA_CM_EVENT_ROUTE_RESOLVED:
1936                cm_error = nvme_rdma_route_resolved(queue);
1937                break;
1938        case RDMA_CM_EVENT_ESTABLISHED:
1939                queue->cm_error = nvme_rdma_conn_established(queue);
1940                /* complete cm_done regardless of success/failure */
1941                complete(&queue->cm_done);
1942                return 0;
1943        case RDMA_CM_EVENT_REJECTED:
1944                cm_error = nvme_rdma_conn_rejected(queue, ev);
1945                break;
1946        case RDMA_CM_EVENT_ROUTE_ERROR:
1947        case RDMA_CM_EVENT_CONNECT_ERROR:
1948        case RDMA_CM_EVENT_UNREACHABLE:
1949        case RDMA_CM_EVENT_ADDR_ERROR:
1950                dev_dbg(queue->ctrl->ctrl.device,
1951                        "CM error event %d\n", ev->event);
1952                cm_error = -ECONNRESET;
1953                break;
1954        case RDMA_CM_EVENT_DISCONNECTED:
1955        case RDMA_CM_EVENT_ADDR_CHANGE:
1956        case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1957                dev_dbg(queue->ctrl->ctrl.device,
1958                        "disconnect received - connection closed\n");
1959                nvme_rdma_error_recovery(queue->ctrl);
1960                break;
1961        case RDMA_CM_EVENT_DEVICE_REMOVAL:
1962                /* device removal is handled via the ib_client API */
1963                break;
1964        default:
1965                dev_err(queue->ctrl->ctrl.device,
1966                        "Unexpected RDMA CM event (%d)\n", ev->event);
1967                nvme_rdma_error_recovery(queue->ctrl);
1968                break;
1969        }
1970
1971        if (cm_error) {
1972                queue->cm_error = cm_error;
1973                complete(&queue->cm_done);
1974        }
1975
1976        return 0;
1977}
1978
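    /*
     * Force-complete a timed-out request.  The queue is stopped first
     * (disconnecting and draining the QP) so no RDMA completion can race
     * with the forced completion below.
     */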
1979static void nvme_rdma_complete_timed_out(struct request *rq)
1980{
1981        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1982        struct nvme_rdma_queue *queue = req->queue;
1983
1984        nvme_rdma_stop_queue(queue);
1985        if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) {
1986                nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
1987                blk_mq_complete_request(rq);
1988        }
1989}
1990
1991static enum blk_eh_timer_return
1992nvme_rdma_timeout(struct request *rq, bool reserved)
1993{
1994        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1995        struct nvme_rdma_queue *queue = req->queue;
1996        struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1997
1998        dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n",
1999                 rq->tag, nvme_rdma_queue_idx(queue));
2000
2001        if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
2002                /*
2003                 * If we are resetting, connecting or deleting we should
2004                 * complete immediately because we may block the controller
2005                 * teardown or setup sequence, e.g.:
2006                 * - ctrl disable/shutdown fabrics requests
2007                 * - connect requests
2008                 * - initialization admin requests
2009                 * - I/O requests that entered after unquiescing and
2010                 *   the controller stopped responding
2011                 *
2012                 * All other requests should be cancelled by the error
2013                 * recovery work, so it's fine that we fail this one here.
2014                 */
2015                nvme_rdma_complete_timed_out(rq);
2016                return BLK_EH_DONE;
2017        }
2018
2019        /*
2020         * LIVE state should trigger the normal error recovery which will
2021         * handle completing this request.
2022         */
2023        nvme_rdma_error_recovery(ctrl);
2024        return BLK_EH_RESET_TIMER;
2025}
2026
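    /*
     * blk-mq .queue_rq handler: map the command capsule for DMA, set up the
     * NVMe command and its data (optionally with protection information),
     * then post the SEND, chained behind an MR registration WR when one is
     * needed.  -EIO from the mapping path is reported as a host path error,
     * -ENOMEM/-EAGAIN as a resource shortage to be retried.
     */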
2027static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
2028                const struct blk_mq_queue_data *bd)
2029{
2030        struct nvme_ns *ns = hctx->queue->queuedata;
2031        struct nvme_rdma_queue *queue = hctx->driver_data;
2032        struct request *rq = bd->rq;
2033        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
2034        struct nvme_rdma_qe *sqe = &req->sqe;
2035        struct nvme_command *c = nvme_req(rq)->cmd;
2036        struct ib_device *dev;
2037        bool queue_ready = test_bit(NVME_RDMA_Q_LIVE, &queue->flags);
2038        blk_status_t ret;
2039        int err;
2040
2041        WARN_ON_ONCE(rq->tag < 0);
2042
2043        if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2044                return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
2045
2046        dev = queue->device->dev;
2047
2048        req->sqe.dma = ib_dma_map_single(dev, req->sqe.data,
2049                                         sizeof(struct nvme_command),
2050                                         DMA_TO_DEVICE);
2051        err = ib_dma_mapping_error(dev, req->sqe.dma);
2052        if (unlikely(err))
2053                return BLK_STS_RESOURCE;
2054
2055        ib_dma_sync_single_for_cpu(dev, sqe->dma,
2056                        sizeof(struct nvme_command), DMA_TO_DEVICE);
2057
2058        ret = nvme_setup_cmd(ns, rq);
2059        if (ret)
2060                goto unmap_qe;
2061
2062        blk_mq_start_request(rq);
2063
2064        if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
2065            queue->pi_support &&
2066            (c->common.opcode == nvme_cmd_write ||
2067             c->common.opcode == nvme_cmd_read) &&
2068            nvme_ns_has_pi(ns))
2069                req->use_sig_mr = true;
2070        else
2071                req->use_sig_mr = false;
2072
2073        err = nvme_rdma_map_data(queue, rq, c);
2074        if (unlikely(err < 0)) {
2075                dev_err(queue->ctrl->ctrl.device,
2076                             "Failed to map data (%d)\n", err);
2077                goto err;
2078        }
2079
2080        sqe->cqe.done = nvme_rdma_send_done;
2081
2082        ib_dma_sync_single_for_device(dev, sqe->dma,
2083                        sizeof(struct nvme_command), DMA_TO_DEVICE);
2084
2085        err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
2086                        req->mr ? &req->reg_wr.wr : NULL);
2087        if (unlikely(err))
2088                goto err_unmap;
2089
2090        return BLK_STS_OK;
2091
2092err_unmap:
2093        nvme_rdma_unmap_data(queue, rq);
2094err:
2095        if (err == -EIO)
2096                ret = nvme_host_path_error(rq);
2097        else if (err == -ENOMEM || err == -EAGAIN)
2098                ret = BLK_STS_RESOURCE;
2099        else
2100                ret = BLK_STS_IOERR;
2101        nvme_cleanup_cmd(rq);
2102unmap_qe:
2103        ib_dma_unmap_single(dev, req->sqe.dma, sizeof(struct nvme_command),
2104                            DMA_TO_DEVICE);
2105        return ret;
2106}
2107
2108static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx)
2109{
2110        struct nvme_rdma_queue *queue = hctx->driver_data;
2111
2112        return ib_process_cq_direct(queue->ib_cq, -1);
2113}
2114
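    /*
     * After a PI-protected transfer, query the MR's signature status and
     * translate guard/reftag/apptag errors into the corresponding NVMe
     * protection-information status codes.
     */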
2115static void nvme_rdma_check_pi_status(struct nvme_rdma_request *req)
2116{
2117        struct request *rq = blk_mq_rq_from_pdu(req);
2118        struct ib_mr_status mr_status;
2119        int ret;
2120
2121        ret = ib_check_mr_status(req->mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
2122        if (ret) {
2123                pr_err("ib_check_mr_status failed, ret %d\n", ret);
2124                nvme_req(rq)->status = NVME_SC_INVALID_PI;
2125                return;
2126        }
2127
2128        if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) {
2129                switch (mr_status.sig_err.err_type) {
2130                case IB_SIG_BAD_GUARD:
2131                        nvme_req(rq)->status = NVME_SC_GUARD_CHECK;
2132                        break;
2133                case IB_SIG_BAD_REFTAG:
2134                        nvme_req(rq)->status = NVME_SC_REFTAG_CHECK;
2135                        break;
2136                case IB_SIG_BAD_APPTAG:
2137                        nvme_req(rq)->status = NVME_SC_APPTAG_CHECK;
2138                        break;
2139                }
2140                pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n",
2141                       mr_status.sig_err.err_type, mr_status.sig_err.expected,
2142                       mr_status.sig_err.actual);
2143        }
2144}
2145
2146static void nvme_rdma_complete_rq(struct request *rq)
2147{
2148        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
2149        struct nvme_rdma_queue *queue = req->queue;
2150        struct ib_device *ibdev = queue->device->dev;
2151
2152        if (req->use_sig_mr)
2153                nvme_rdma_check_pi_status(req);
2154
2155        nvme_rdma_unmap_data(queue, rq);
2156        ib_dma_unmap_single(ibdev, req->sqe.dma, sizeof(struct nvme_command),
2157                            DMA_TO_DEVICE);
2158        nvme_complete_rq(rq);
2159}
2160
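    /*
     * Spread blk-mq hardware contexts across the default/read/poll queue
     * sets according to the io_queues[] split, letting
     * blk_mq_rdma_map_queues() use the RDMA device's completion vector
     * affinity where available.
     */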
2161static int nvme_rdma_map_queues(struct blk_mq_tag_set *set)
2162{
2163        struct nvme_rdma_ctrl *ctrl = set->driver_data;
2164        struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
2165
2166        if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
2167                /* separate read/write queues */
2168                set->map[HCTX_TYPE_DEFAULT].nr_queues =
2169                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2170                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2171                set->map[HCTX_TYPE_READ].nr_queues =
2172                        ctrl->io_queues[HCTX_TYPE_READ];
2173                set->map[HCTX_TYPE_READ].queue_offset =
2174                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2175        } else {
2176                /* shared read/write queues */
2177                set->map[HCTX_TYPE_DEFAULT].nr_queues =
2178                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2179                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2180                set->map[HCTX_TYPE_READ].nr_queues =
2181                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2182                set->map[HCTX_TYPE_READ].queue_offset = 0;
2183        }
2184        blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_DEFAULT],
2185                        ctrl->device->dev, 0);
2186        blk_mq_rdma_map_queues(&set->map[HCTX_TYPE_READ],
2187                        ctrl->device->dev, 0);
2188
2189        if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
2190                /* map dedicated poll queues only if we have queues left */
2191                set->map[HCTX_TYPE_POLL].nr_queues =
2192                                ctrl->io_queues[HCTX_TYPE_POLL];
2193                set->map[HCTX_TYPE_POLL].queue_offset =
2194                        ctrl->io_queues[HCTX_TYPE_DEFAULT] +
2195                        ctrl->io_queues[HCTX_TYPE_READ];
2196                blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
2197        }
2198
2199        dev_info(ctrl->ctrl.device,
2200                "mapped %d/%d/%d default/read/poll queues.\n",
2201                ctrl->io_queues[HCTX_TYPE_DEFAULT],
2202                ctrl->io_queues[HCTX_TYPE_READ],
2203                ctrl->io_queues[HCTX_TYPE_POLL]);
2204
2205        return 0;
2206}
2207
2208static const struct blk_mq_ops nvme_rdma_mq_ops = {
2209        .queue_rq       = nvme_rdma_queue_rq,
2210        .complete       = nvme_rdma_complete_rq,
2211        .init_request   = nvme_rdma_init_request,
2212        .exit_request   = nvme_rdma_exit_request,
2213        .init_hctx      = nvme_rdma_init_hctx,
2214        .timeout        = nvme_rdma_timeout,
2215        .map_queues     = nvme_rdma_map_queues,
2216        .poll           = nvme_rdma_poll,
2217};
2218
2219static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
2220        .queue_rq       = nvme_rdma_queue_rq,
2221        .complete       = nvme_rdma_complete_rq,
2222        .init_request   = nvme_rdma_init_request,
2223        .exit_request   = nvme_rdma_exit_request,
2224        .init_hctx      = nvme_rdma_init_admin_hctx,
2225        .timeout        = nvme_rdma_timeout,
2226};
2227
2228static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
2229{
2230        cancel_work_sync(&ctrl->err_work);
2231        cancel_delayed_work_sync(&ctrl->reconnect_work);
2232
2233        nvme_rdma_teardown_io_queues(ctrl, shutdown);
2234        blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
2235        if (shutdown)
2236                nvme_shutdown_ctrl(&ctrl->ctrl);
2237        else
2238                nvme_disable_ctrl(&ctrl->ctrl);
2239        nvme_rdma_teardown_admin_queue(ctrl, shutdown);
2240}
2241
2242static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
2243{
2244        nvme_rdma_shutdown_ctrl(to_rdma_ctrl(ctrl), true);
2245}
2246
2247static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
2248{
2249        struct nvme_rdma_ctrl *ctrl =
2250                container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
2251
2252        nvme_stop_ctrl(&ctrl->ctrl);
2253        nvme_rdma_shutdown_ctrl(ctrl, false);
2254
2255        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2256                /* state change failure should never happen */
2257                WARN_ON_ONCE(1);
2258                return;
2259        }
2260
2261        if (nvme_rdma_setup_ctrl(ctrl, false))
2262                goto out_fail;
2263
2264        return;
2265
2266out_fail:
2267        ++ctrl->ctrl.nr_reconnects;
2268        nvme_rdma_reconnect_or_remove(ctrl);
2269}
2270
2271static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
2272        .name                   = "rdma",
2273        .module                 = THIS_MODULE,
2274        .flags                  = NVME_F_FABRICS | NVME_F_METADATA_SUPPORTED,
2275        .reg_read32             = nvmf_reg_read32,
2276        .reg_read64             = nvmf_reg_read64,
2277        .reg_write32            = nvmf_reg_write32,
2278        .free_ctrl              = nvme_rdma_free_ctrl,
2279        .submit_async_event     = nvme_rdma_submit_async_event,
2280        .delete_ctrl            = nvme_rdma_delete_ctrl,
2281        .get_address            = nvmf_get_address,
2282};
2283
2284/*
2285 * Fails a connection request if it matches an existing controller
2286 * (association) with the same tuple:
2287 * <Host NQN, Host ID, local address, remote address, remote port, SUBSYS NQN>
2288 *
2289 * If a local address is not specified in the request, it will match an
2290 * existing controller that also has no local port address specified,
2291 * provided all the other parameters are the same.
2292 *
2293 * The ports don't need to be compared as they are intrinsically
2294 * already matched by the port pointers supplied.
2295 */
2296static bool
2297nvme_rdma_existing_controller(struct nvmf_ctrl_options *opts)
2298{
2299        struct nvme_rdma_ctrl *ctrl;
2300        bool found = false;
2301
2302        mutex_lock(&nvme_rdma_ctrl_mutex);
2303        list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
2304                found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2305                if (found)
2306                        break;
2307        }
2308        mutex_unlock(&nvme_rdma_ctrl_mutex);
2309
2310        return found;
2311}
2312
2313static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
2314                struct nvmf_ctrl_options *opts)
2315{
2316        struct nvme_rdma_ctrl *ctrl;
2317        int ret;
2318        bool changed;
2319
2320        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2321        if (!ctrl)
2322                return ERR_PTR(-ENOMEM);
2323        ctrl->ctrl.opts = opts;
2324        INIT_LIST_HEAD(&ctrl->list);
2325
2326        if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2327                opts->trsvcid =
2328                        kstrdup(__stringify(NVME_RDMA_IP_PORT), GFP_KERNEL);
2329                if (!opts->trsvcid) {
2330                        ret = -ENOMEM;
2331                        goto out_free_ctrl;
2332                }
2333                opts->mask |= NVMF_OPT_TRSVCID;
2334        }
2335
2336        ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2337                        opts->traddr, opts->trsvcid, &ctrl->addr);
2338        if (ret) {
2339                pr_err("malformed address passed: %s:%s\n",
2340                        opts->traddr, opts->trsvcid);
2341                goto out_free_ctrl;
2342        }
2343
2344        if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2345                ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2346                        opts->host_traddr, NULL, &ctrl->src_addr);
2347                if (ret) {
2348                        pr_err("malformed src address passed: %s\n",
2349                               opts->host_traddr);
2350                        goto out_free_ctrl;
2351                }
2352        }
2353
2354        if (!opts->duplicate_connect && nvme_rdma_existing_controller(opts)) {
2355                ret = -EALREADY;
2356                goto out_free_ctrl;
2357        }
2358
2359        INIT_DELAYED_WORK(&ctrl->reconnect_work,
2360                        nvme_rdma_reconnect_ctrl_work);
2361        INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
2362        INIT_WORK(&ctrl->ctrl.reset_work, nvme_rdma_reset_ctrl_work);
2363
2364        ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2365                                opts->nr_poll_queues + 1;
2366        ctrl->ctrl.sqsize = opts->queue_size - 1;
2367        ctrl->ctrl.kato = opts->kato;
2368
2369        ret = -ENOMEM;
2370        ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2371                                GFP_KERNEL);
2372        if (!ctrl->queues)
2373                goto out_free_ctrl;
2374
2375        ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
2376                                0 /* no quirks, we're perfect! */);
2377        if (ret)
2378                goto out_kfree_queues;
2379
2380        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING);
2381        WARN_ON_ONCE(!changed);
2382
2383        ret = nvme_rdma_setup_ctrl(ctrl, true);
2384        if (ret)
2385                goto out_uninit_ctrl;
2386
2387        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISpcs\n",
2388                ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2389
2390        mutex_lock(&nvme_rdma_ctrl_mutex);
2391        list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
2392        mutex_unlock(&nvme_rdma_ctrl_mutex);
2393
2394        return &ctrl->ctrl;
2395
2396out_uninit_ctrl:
2397        nvme_uninit_ctrl(&ctrl->ctrl);
2398        nvme_put_ctrl(&ctrl->ctrl);
2399        if (ret > 0)
2400                ret = -EIO;
2401        return ERR_PTR(ret);
2402out_kfree_queues:
2403        kfree(ctrl->queues);
2404out_free_ctrl:
2405        kfree(ctrl);
2406        return ERR_PTR(ret);
2407}
2408
2409static struct nvmf_transport_ops nvme_rdma_transport = {
2410        .name           = "rdma",
2411        .module         = THIS_MODULE,
2412        .required_opts  = NVMF_OPT_TRADDR,
2413        .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2414                          NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2415                          NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2416                          NVMF_OPT_TOS,
2417        .create_ctrl    = nvme_rdma_create_ctrl,
2418};
2419
2420static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
2421{
2422        struct nvme_rdma_ctrl *ctrl;
2423        struct nvme_rdma_device *ndev;
2424        bool found = false;
2425
2426        mutex_lock(&device_list_mutex);
2427        list_for_each_entry(ndev, &device_list, entry) {
2428                if (ndev->dev == ib_device) {
2429                        found = true;
2430                        break;
2431                }
2432        }
2433        mutex_unlock(&device_list_mutex);
2434
2435        if (!found)
2436                return;
2437
2438        /* Delete all controllers using this device */
2439        mutex_lock(&nvme_rdma_ctrl_mutex);
2440        list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
2441                if (ctrl->device->dev != ib_device)
2442                        continue;
2443                nvme_delete_ctrl(&ctrl->ctrl);
2444        }
2445        mutex_unlock(&nvme_rdma_ctrl_mutex);
2446
2447        flush_workqueue(nvme_delete_wq);
2448}
2449
2450static struct ib_client nvme_rdma_ib_client = {
2451        .name   = "nvme_rdma",
2452        .remove = nvme_rdma_remove_one
2453};
2454
2455static int __init nvme_rdma_init_module(void)
2456{
2457        int ret;
2458
2459        ret = ib_register_client(&nvme_rdma_ib_client);
2460        if (ret)
2461                return ret;
2462
2463        ret = nvmf_register_transport(&nvme_rdma_transport);
2464        if (ret)
2465                goto err_unreg_client;
2466
2467        return 0;
2468
2469err_unreg_client:
2470        ib_unregister_client(&nvme_rdma_ib_client);
2471        return ret;
2472}
2473
2474static void __exit nvme_rdma_cleanup_module(void)
2475{
2476        struct nvme_rdma_ctrl *ctrl;
2477
2478        nvmf_unregister_transport(&nvme_rdma_transport);
2479        ib_unregister_client(&nvme_rdma_ib_client);
2480
2481        mutex_lock(&nvme_rdma_ctrl_mutex);
2482        list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list)
2483                nvme_delete_ctrl(&ctrl->ctrl);
2484        mutex_unlock(&nvme_rdma_ctrl_mutex);
2485        flush_workqueue(nvme_delete_wq);
2486}
2487
2488module_init(nvme_rdma_init_module);
2489module_exit(nvme_rdma_cleanup_module);
2490
2491MODULE_LICENSE("GPL v2");
2492