linux/drivers/nvme/host/rdma.c
   1/*
   2 * NVMe over Fabrics RDMA host code.
   3 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
   4 *
   5 * This program is free software; you can redistribute it and/or modify it
   6 * under the terms and conditions of the GNU General Public License,
   7 * version 2, as published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope it will be useful, but WITHOUT
  10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
  12 * more details.
  13 */
  14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  15#include <linux/module.h>
  16#include <linux/init.h>
  17#include <linux/slab.h>
  18#include <linux/err.h>
  19#include <linux/string.h>
  20#include <linux/atomic.h>
  21#include <linux/blk-mq.h>
  22#include <linux/types.h>
  23#include <linux/list.h>
  24#include <linux/mutex.h>
  25#include <linux/scatterlist.h>
  26#include <linux/nvme.h>
  27#include <asm/unaligned.h>
  28
  29#include <rdma/ib_verbs.h>
  30#include <rdma/rdma_cm.h>
  31#include <rdma/ib_cm.h>
  32#include <linux/nvme-rdma.h>
  33
  34#include "nvme.h"
  35#include "fabrics.h"
  36
  37
  38#define NVME_RDMA_CONNECT_TIMEOUT_MS    1000            /* 1 second */
  39
  40#define NVME_RDMA_MAX_SEGMENT_SIZE      0xffffff        /* 24-bit SGL field */
  41
  42#define NVME_RDMA_MAX_SEGMENTS          256
  43
  44#define NVME_RDMA_MAX_INLINE_SEGMENTS   1
  45
  46/*
  47 * We handle AEN commands ourselves and don't even let the
  48 * block layer know about them.
  49 */
  50#define NVME_RDMA_NR_AEN_COMMANDS      1
  51#define NVME_RDMA_AQ_BLKMQ_DEPTH       \
  52        (NVMF_AQ_DEPTH - NVME_RDMA_NR_AEN_COMMANDS)
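     /*
      * A quick sizing sketch, assuming the fabrics default NVMF_AQ_DEPTH of
      * 32: the admin blk-mq tagset is created with 31 tags (0..30), and the
      * one slot held back here is driven by hand with command_id 31
      * (NVME_RDMA_AQ_BLKMQ_DEPTH); see nvme_rdma_submit_async_event() and
      * the command_id check in __nvme_rdma_recv_done() below.
      */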
  53
  54struct nvme_rdma_device {
  55        struct ib_device       *dev;
  56        struct ib_pd           *pd;
  57        struct kref             ref;
  58        struct list_head        entry;
  59};
  60
  61struct nvme_rdma_qe {
  62        struct ib_cqe           cqe;
  63        void                    *data;
  64        u64                     dma;
  65};
  66
  67struct nvme_rdma_queue;
  68struct nvme_rdma_request {
  69        struct ib_mr            *mr;
  70        struct nvme_rdma_qe     sqe;
  71        struct ib_sge           sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
  72        u32                     num_sge;
  73        int                     nents;
  74        bool                    inline_data;
  75        struct ib_reg_wr        reg_wr;
  76        struct ib_cqe           reg_cqe;
  77        struct nvme_rdma_queue  *queue;
  78        struct sg_table         sg_table;
  79        struct scatterlist      first_sgl[];
  80};
  81
  82enum nvme_rdma_queue_flags {
  83        NVME_RDMA_Q_CONNECTED = (1 << 0),
  84        NVME_RDMA_IB_QUEUE_ALLOCATED = (1 << 1),
  85        NVME_RDMA_Q_DELETING = (1 << 2),
  86        NVME_RDMA_Q_LIVE = (1 << 3),
  87};
  88
  89struct nvme_rdma_queue {
  90        struct nvme_rdma_qe     *rsp_ring;
  91        u8                      sig_count;
  92        int                     queue_size;
  93        size_t                  cmnd_capsule_len;
  94        struct nvme_rdma_ctrl   *ctrl;
  95        struct nvme_rdma_device *device;
  96        struct ib_cq            *ib_cq;
  97        struct ib_qp            *qp;
  98
  99        unsigned long           flags;
 100        struct rdma_cm_id       *cm_id;
 101        int                     cm_error;
 102        struct completion       cm_done;
 103};
 104
 105struct nvme_rdma_ctrl {
 106        /* read and written in the hot path */
 107        spinlock_t              lock;
 108
 109        /* read only in the hot path */
 110        struct nvme_rdma_queue  *queues;
 111        u32                     queue_count;
 112
 113        /* other member variables */
 114        struct blk_mq_tag_set   tag_set;
 115        struct work_struct      delete_work;
 116        struct work_struct      reset_work;
 117        struct work_struct      err_work;
 118
 119        struct nvme_rdma_qe     async_event_sqe;
 120
 121        int                     reconnect_delay;
 122        struct delayed_work     reconnect_work;
 123
 124        struct list_head        list;
 125
 126        struct blk_mq_tag_set   admin_tag_set;
 127        struct nvme_rdma_device *device;
 128
 129        u64                     cap;
 130        u32                     max_fr_pages;
 131
 132        union {
 133                struct sockaddr addr;
 134                struct sockaddr_in addr_in;
 135        };
 136
 137        struct nvme_ctrl        ctrl;
 138};
 139
 140static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct nvme_ctrl *ctrl)
 141{
 142        return container_of(ctrl, struct nvme_rdma_ctrl, ctrl);
 143}
 144
 145static LIST_HEAD(device_list);
 146static DEFINE_MUTEX(device_list_mutex);
 147
 148static LIST_HEAD(nvme_rdma_ctrl_list);
 149static DEFINE_MUTEX(nvme_rdma_ctrl_mutex);
 150
 151static struct workqueue_struct *nvme_rdma_wq;
 152
 153/*
  154 * Disabling this option makes small I/O go faster, but is fundamentally
 155 * unsafe.  With it turned off we will have to register a global rkey that
 156 * allows read and write access to all physical memory.
 157 */
 158static bool register_always = true;
 159module_param(register_always, bool, 0444);
 160MODULE_PARM_DESC(register_always,
 161         "Use memory registration even for contiguous memory regions");
 162
 163static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
 164                struct rdma_cm_event *event);
 165static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
 166
 167/* XXX: really should move to a generic header sooner or later.. */
 168static inline void put_unaligned_le24(u32 val, u8 *p)
 169{
 170        *p++ = val;
 171        *p++ = val >> 8;
 172        *p++ = val >> 16;
 173}
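     /*
      * Worked example: put_unaligned_le24(0x123456, p) stores p[0] = 0x56,
      * p[1] = 0x34 and p[2] = 0x12, i.e. the low 24 bits of val in
      * little-endian byte order; the top byte of val is discarded.
      */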
 174
 175static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
 176{
 177        return queue - queue->ctrl->queues;
 178}
 179
 180static inline size_t nvme_rdma_inline_data_size(struct nvme_rdma_queue *queue)
 181{
 182        return queue->cmnd_capsule_len - sizeof(struct nvme_command);
 183}
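     /*
      * Illustrative numbers: the command capsule always starts with the
      * 64-byte SQE, so an I/O queue negotiated with ioccsz = 8 (ioccsz is in
      * units of 16 bytes, see nvme_rdma_init_queue()) has a 128-byte capsule
      * and therefore 64 bytes of in-capsule (inline) data space.
      */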
 184
 185static void nvme_rdma_free_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
 186                size_t capsule_size, enum dma_data_direction dir)
 187{
 188        ib_dma_unmap_single(ibdev, qe->dma, capsule_size, dir);
 189        kfree(qe->data);
 190}
 191
 192static int nvme_rdma_alloc_qe(struct ib_device *ibdev, struct nvme_rdma_qe *qe,
 193                size_t capsule_size, enum dma_data_direction dir)
 194{
 195        qe->data = kzalloc(capsule_size, GFP_KERNEL);
 196        if (!qe->data)
 197                return -ENOMEM;
 198
 199        qe->dma = ib_dma_map_single(ibdev, qe->data, capsule_size, dir);
 200        if (ib_dma_mapping_error(ibdev, qe->dma)) {
 201                kfree(qe->data);
 202                return -ENOMEM;
 203        }
 204
 205        return 0;
 206}
 207
 208static void nvme_rdma_free_ring(struct ib_device *ibdev,
 209                struct nvme_rdma_qe *ring, size_t ib_queue_size,
 210                size_t capsule_size, enum dma_data_direction dir)
 211{
 212        int i;
 213
 214        for (i = 0; i < ib_queue_size; i++)
 215                nvme_rdma_free_qe(ibdev, &ring[i], capsule_size, dir);
 216        kfree(ring);
 217}
 218
 219static struct nvme_rdma_qe *nvme_rdma_alloc_ring(struct ib_device *ibdev,
 220                size_t ib_queue_size, size_t capsule_size,
 221                enum dma_data_direction dir)
 222{
 223        struct nvme_rdma_qe *ring;
 224        int i;
 225
 226        ring = kcalloc(ib_queue_size, sizeof(struct nvme_rdma_qe), GFP_KERNEL);
 227        if (!ring)
 228                return NULL;
 229
 230        for (i = 0; i < ib_queue_size; i++) {
 231                if (nvme_rdma_alloc_qe(ibdev, &ring[i], capsule_size, dir))
 232                        goto out_free_ring;
 233        }
 234
 235        return ring;
 236
 237out_free_ring:
 238        nvme_rdma_free_ring(ibdev, ring, i, capsule_size, dir);
 239        return NULL;
 240}
 241
 242static void nvme_rdma_qp_event(struct ib_event *event, void *context)
 243{
 244        pr_debug("QP event %d\n", event->event);
 245}
 246
 247static int nvme_rdma_wait_for_cm(struct nvme_rdma_queue *queue)
 248{
 249        wait_for_completion_interruptible_timeout(&queue->cm_done,
 250                        msecs_to_jiffies(NVME_RDMA_CONNECT_TIMEOUT_MS) + 1);
 251        return queue->cm_error;
 252}
 253
 254static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
 255{
 256        struct nvme_rdma_device *dev = queue->device;
 257        struct ib_qp_init_attr init_attr;
 258        int ret;
 259
 260        memset(&init_attr, 0, sizeof(init_attr));
 261        init_attr.event_handler = nvme_rdma_qp_event;
 262        /* +1 for drain */
 263        init_attr.cap.max_send_wr = factor * queue->queue_size + 1;
 264        /* +1 for drain */
 265        init_attr.cap.max_recv_wr = queue->queue_size + 1;
 266        init_attr.cap.max_recv_sge = 1;
 267        init_attr.cap.max_send_sge = 1 + NVME_RDMA_MAX_INLINE_SEGMENTS;
 268        init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
 269        init_attr.qp_type = IB_QPT_RC;
 270        init_attr.send_cq = queue->ib_cq;
 271        init_attr.recv_cq = queue->ib_cq;
 272
 273        ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
 274
 275        queue->qp = queue->cm_id->qp;
 276        return ret;
 277}
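     /*
      * Sizing sketch, assuming a hypothetical queue_size of 128: the caller
      * passes factor = 3 (one MR registration, one SEND and one LOCAL_INV
      * work request per command, see send_wr_factor below), so the QP gets
      * max_send_wr = 3 * 128 + 1 = 385 and max_recv_wr = 128 + 1 = 129,
      * with the extra slot in each direction reserved for the drain WR.
      */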
 278
 279static int nvme_rdma_reinit_request(void *data, struct request *rq)
 280{
 281        struct nvme_rdma_ctrl *ctrl = data;
 282        struct nvme_rdma_device *dev = ctrl->device;
 283        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 284        int ret = 0;
 285
 286        if (!req->mr->need_inval)
 287                goto out;
 288
 289        ib_dereg_mr(req->mr);
 290
 291        req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
 292                        ctrl->max_fr_pages);
 293        if (IS_ERR(req->mr)) {
 294                ret = PTR_ERR(req->mr);
 295                req->mr = NULL;
 296                goto out;
 297        }
 298
 299        req->mr->need_inval = false;
 300
 301out:
 302        return ret;
 303}
 304
 305static void __nvme_rdma_exit_request(struct nvme_rdma_ctrl *ctrl,
 306                struct request *rq, unsigned int queue_idx)
 307{
 308        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 309        struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
 310        struct nvme_rdma_device *dev = queue->device;
 311
 312        if (req->mr)
 313                ib_dereg_mr(req->mr);
 314
 315        nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
 316                        DMA_TO_DEVICE);
 317}
 318
 319static void nvme_rdma_exit_request(void *data, struct request *rq,
 320                                unsigned int hctx_idx, unsigned int rq_idx)
 321{
 322        return __nvme_rdma_exit_request(data, rq, hctx_idx + 1);
 323}
 324
 325static void nvme_rdma_exit_admin_request(void *data, struct request *rq,
 326                                unsigned int hctx_idx, unsigned int rq_idx)
 327{
 328        return __nvme_rdma_exit_request(data, rq, 0);
 329}
 330
 331static int __nvme_rdma_init_request(struct nvme_rdma_ctrl *ctrl,
 332                struct request *rq, unsigned int queue_idx)
 333{
 334        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 335        struct nvme_rdma_queue *queue = &ctrl->queues[queue_idx];
 336        struct nvme_rdma_device *dev = queue->device;
 337        struct ib_device *ibdev = dev->dev;
 338        int ret;
 339
 340        BUG_ON(queue_idx >= ctrl->queue_count);
 341
 342        ret = nvme_rdma_alloc_qe(ibdev, &req->sqe, sizeof(struct nvme_command),
 343                        DMA_TO_DEVICE);
 344        if (ret)
 345                return ret;
 346
 347        req->mr = ib_alloc_mr(dev->pd, IB_MR_TYPE_MEM_REG,
 348                        ctrl->max_fr_pages);
 349        if (IS_ERR(req->mr)) {
 350                ret = PTR_ERR(req->mr);
 351                goto out_free_qe;
 352        }
 353
 354        req->queue = queue;
 355
 356        return 0;
 357
 358out_free_qe:
 359        nvme_rdma_free_qe(dev->dev, &req->sqe, sizeof(struct nvme_command),
 360                        DMA_TO_DEVICE);
 361        return -ENOMEM;
 362}
 363
 364static int nvme_rdma_init_request(void *data, struct request *rq,
 365                                unsigned int hctx_idx, unsigned int rq_idx,
 366                                unsigned int numa_node)
 367{
 368        return __nvme_rdma_init_request(data, rq, hctx_idx + 1);
 369}
 370
 371static int nvme_rdma_init_admin_request(void *data, struct request *rq,
 372                                unsigned int hctx_idx, unsigned int rq_idx,
 373                                unsigned int numa_node)
 374{
 375        return __nvme_rdma_init_request(data, rq, 0);
 376}
 377
 378static int nvme_rdma_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 379                unsigned int hctx_idx)
 380{
 381        struct nvme_rdma_ctrl *ctrl = data;
 382        struct nvme_rdma_queue *queue = &ctrl->queues[hctx_idx + 1];
 383
 384        BUG_ON(hctx_idx >= ctrl->queue_count);
 385
 386        hctx->driver_data = queue;
 387        return 0;
 388}
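     /*
      * ctrl->queues[0] is the admin queue, so hardware context i of the I/O
      * tagset maps to ctrl->queues[i + 1]; the same +1 shift is used by
      * nvme_rdma_init_request() and nvme_rdma_exit_request() above.
      */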
 389
 390static int nvme_rdma_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 391                unsigned int hctx_idx)
 392{
 393        struct nvme_rdma_ctrl *ctrl = data;
 394        struct nvme_rdma_queue *queue = &ctrl->queues[0];
 395
 396        BUG_ON(hctx_idx != 0);
 397
 398        hctx->driver_data = queue;
 399        return 0;
 400}
 401
 402static void nvme_rdma_free_dev(struct kref *ref)
 403{
 404        struct nvme_rdma_device *ndev =
 405                container_of(ref, struct nvme_rdma_device, ref);
 406
 407        mutex_lock(&device_list_mutex);
 408        list_del(&ndev->entry);
 409        mutex_unlock(&device_list_mutex);
 410
 411        ib_dealloc_pd(ndev->pd);
 412        kfree(ndev);
 413}
 414
 415static void nvme_rdma_dev_put(struct nvme_rdma_device *dev)
 416{
 417        kref_put(&dev->ref, nvme_rdma_free_dev);
 418}
 419
 420static int nvme_rdma_dev_get(struct nvme_rdma_device *dev)
 421{
 422        return kref_get_unless_zero(&dev->ref);
 423}
 424
 425static struct nvme_rdma_device *
 426nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
 427{
 428        struct nvme_rdma_device *ndev;
 429
 430        mutex_lock(&device_list_mutex);
 431        list_for_each_entry(ndev, &device_list, entry) {
 432                if (ndev->dev->node_guid == cm_id->device->node_guid &&
 433                    nvme_rdma_dev_get(ndev))
 434                        goto out_unlock;
 435        }
 436
 437        ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
 438        if (!ndev)
 439                goto out_err;
 440
 441        ndev->dev = cm_id->device;
 442        kref_init(&ndev->ref);
 443
 444        ndev->pd = ib_alloc_pd(ndev->dev,
 445                register_always ? 0 : IB_PD_UNSAFE_GLOBAL_RKEY);
 446        if (IS_ERR(ndev->pd))
 447                goto out_free_dev;
 448
 449        if (!(ndev->dev->attrs.device_cap_flags &
 450              IB_DEVICE_MEM_MGT_EXTENSIONS)) {
 451                dev_err(&ndev->dev->dev,
 452                        "Memory registrations not supported.\n");
 453                goto out_free_pd;
 454        }
 455
 456        list_add(&ndev->entry, &device_list);
 457out_unlock:
 458        mutex_unlock(&device_list_mutex);
 459        return ndev;
 460
 461out_free_pd:
 462        ib_dealloc_pd(ndev->pd);
 463out_free_dev:
 464        kfree(ndev);
 465out_err:
 466        mutex_unlock(&device_list_mutex);
 467        return NULL;
 468}
 469
 470static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
 471{
 472        struct nvme_rdma_device *dev;
 473        struct ib_device *ibdev;
 474
 475        if (!test_and_clear_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags))
 476                return;
 477
 478        dev = queue->device;
 479        ibdev = dev->dev;
 480        rdma_destroy_qp(queue->cm_id);
 481        ib_free_cq(queue->ib_cq);
 482
 483        nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
 484                        sizeof(struct nvme_completion), DMA_FROM_DEVICE);
 485
 486        nvme_rdma_dev_put(dev);
 487}
 488
 489static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue,
 490                struct nvme_rdma_device *dev)
 491{
 492        struct ib_device *ibdev = dev->dev;
 493        const int send_wr_factor = 3;                   /* MR, SEND, INV */
 494        const int cq_factor = send_wr_factor + 1;       /* + RECV */
 495        int comp_vector, idx = nvme_rdma_queue_idx(queue);
 496
 497        int ret;
 498
 499        queue->device = dev;
 500
 501        /*
 502         * The admin queue is barely used once the controller is live, so don't
 503         * bother to spread it out.
 504         */
 505        if (idx == 0)
 506                comp_vector = 0;
 507        else
 508                comp_vector = idx % ibdev->num_comp_vectors;
 509
 510
 511        /* +1 for ib_stop_cq */
 512        queue->ib_cq = ib_alloc_cq(dev->dev, queue,
 513                                cq_factor * queue->queue_size + 1, comp_vector,
 514                                IB_POLL_SOFTIRQ);
 515        if (IS_ERR(queue->ib_cq)) {
 516                ret = PTR_ERR(queue->ib_cq);
 517                goto out;
 518        }
 519
 520        ret = nvme_rdma_create_qp(queue, send_wr_factor);
 521        if (ret)
 522                goto out_destroy_ib_cq;
 523
 524        queue->rsp_ring = nvme_rdma_alloc_ring(ibdev, queue->queue_size,
 525                        sizeof(struct nvme_completion), DMA_FROM_DEVICE);
 526        if (!queue->rsp_ring) {
 527                ret = -ENOMEM;
 528                goto out_destroy_qp;
 529        }
 530        set_bit(NVME_RDMA_IB_QUEUE_ALLOCATED, &queue->flags);
 531
 532        return 0;
 533
 534out_destroy_qp:
 535        ib_destroy_qp(queue->qp);
 536out_destroy_ib_cq:
 537        ib_free_cq(queue->ib_cq);
 538out:
 539        return ret;
 540}
 541
 542static int nvme_rdma_init_queue(struct nvme_rdma_ctrl *ctrl,
 543                int idx, size_t queue_size)
 544{
 545        struct nvme_rdma_queue *queue;
 546        int ret;
 547
 548        queue = &ctrl->queues[idx];
 549        queue->ctrl = ctrl;
 550        init_completion(&queue->cm_done);
 551
 552        if (idx > 0)
 553                queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16;
 554        else
 555                queue->cmnd_capsule_len = sizeof(struct nvme_command);
 556
 557        queue->queue_size = queue_size;
 558
 559        queue->cm_id = rdma_create_id(&init_net, nvme_rdma_cm_handler, queue,
 560                        RDMA_PS_TCP, IB_QPT_RC);
 561        if (IS_ERR(queue->cm_id)) {
 562                dev_info(ctrl->ctrl.device,
 563                        "failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id));
 564                return PTR_ERR(queue->cm_id);
 565        }
 566
 567        queue->cm_error = -ETIMEDOUT;
 568        ret = rdma_resolve_addr(queue->cm_id, NULL, &ctrl->addr,
 569                        NVME_RDMA_CONNECT_TIMEOUT_MS);
 570        if (ret) {
 571                dev_info(ctrl->ctrl.device,
 572                        "rdma_resolve_addr failed (%d).\n", ret);
 573                goto out_destroy_cm_id;
 574        }
 575
 576        ret = nvme_rdma_wait_for_cm(queue);
 577        if (ret) {
 578                dev_info(ctrl->ctrl.device,
 579                        "rdma_resolve_addr wait failed (%d).\n", ret);
 580                goto out_destroy_cm_id;
 581        }
 582
 583        clear_bit(NVME_RDMA_Q_DELETING, &queue->flags);
 584        set_bit(NVME_RDMA_Q_CONNECTED, &queue->flags);
 585
 586        return 0;
 587
 588out_destroy_cm_id:
 589        nvme_rdma_destroy_queue_ib(queue);
 590        rdma_destroy_id(queue->cm_id);
 591        return ret;
 592}
 593
 594static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
 595{
 596        rdma_disconnect(queue->cm_id);
 597        ib_drain_qp(queue->qp);
 598}
 599
 600static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
 601{
 602        nvme_rdma_destroy_queue_ib(queue);
 603        rdma_destroy_id(queue->cm_id);
 604}
 605
 606static void nvme_rdma_stop_and_free_queue(struct nvme_rdma_queue *queue)
 607{
 608        if (test_and_set_bit(NVME_RDMA_Q_DELETING, &queue->flags))
 609                return;
 610        nvme_rdma_stop_queue(queue);
 611        nvme_rdma_free_queue(queue);
 612}
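     /*
      * The test_and_set_bit() on NVME_RDMA_Q_DELETING makes this helper
      * idempotent: error recovery, reconnect and controller teardown may all
      * end up here for the same queue, but only the first caller actually
      * disconnects and frees it.
      */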
 613
 614static void nvme_rdma_free_io_queues(struct nvme_rdma_ctrl *ctrl)
 615{
 616        int i;
 617
 618        for (i = 1; i < ctrl->queue_count; i++)
 619                nvme_rdma_stop_and_free_queue(&ctrl->queues[i]);
 620}
 621
 622static int nvme_rdma_connect_io_queues(struct nvme_rdma_ctrl *ctrl)
 623{
 624        int i, ret = 0;
 625
 626        for (i = 1; i < ctrl->queue_count; i++) {
 627                ret = nvmf_connect_io_queue(&ctrl->ctrl, i);
 628                if (ret) {
 629                        dev_info(ctrl->ctrl.device,
 630                                "failed to connect i/o queue: %d\n", ret);
 631                        goto out_free_queues;
 632                }
 633                set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags);
 634        }
 635
 636        return 0;
 637
 638out_free_queues:
 639        nvme_rdma_free_io_queues(ctrl);
 640        return ret;
 641}
 642
 643static int nvme_rdma_init_io_queues(struct nvme_rdma_ctrl *ctrl)
 644{
 645        int i, ret;
 646
 647        for (i = 1; i < ctrl->queue_count; i++) {
 648                ret = nvme_rdma_init_queue(ctrl, i,
 649                                           ctrl->ctrl.opts->queue_size);
 650                if (ret) {
 651                        dev_info(ctrl->ctrl.device,
 652                                "failed to initialize i/o queue: %d\n", ret);
 653                        goto out_free_queues;
 654                }
 655        }
 656
 657        return 0;
 658
 659out_free_queues:
 660        for (i--; i >= 1; i--)
 661                nvme_rdma_stop_and_free_queue(&ctrl->queues[i]);
 662
 663        return ret;
 664}
 665
 666static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl)
 667{
 668        nvme_rdma_free_qe(ctrl->queues[0].device->dev, &ctrl->async_event_sqe,
 669                        sizeof(struct nvme_command), DMA_TO_DEVICE);
 670        nvme_rdma_stop_and_free_queue(&ctrl->queues[0]);
 671        blk_cleanup_queue(ctrl->ctrl.admin_q);
 672        blk_mq_free_tag_set(&ctrl->admin_tag_set);
 673        nvme_rdma_dev_put(ctrl->device);
 674}
 675
 676static void nvme_rdma_free_ctrl(struct nvme_ctrl *nctrl)
 677{
 678        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
 679
 680        if (list_empty(&ctrl->list))
 681                goto free_ctrl;
 682
 683        mutex_lock(&nvme_rdma_ctrl_mutex);
 684        list_del(&ctrl->list);
 685        mutex_unlock(&nvme_rdma_ctrl_mutex);
 686
 687        kfree(ctrl->queues);
 688        nvmf_free_options(nctrl->opts);
 689free_ctrl:
 690        kfree(ctrl);
 691}
 692
 693static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
 694{
 695        struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
 696                        struct nvme_rdma_ctrl, reconnect_work);
 697        bool changed;
 698        int ret;
 699
 700        if (ctrl->queue_count > 1) {
 701                nvme_rdma_free_io_queues(ctrl);
 702
 703                ret = blk_mq_reinit_tagset(&ctrl->tag_set);
 704                if (ret)
 705                        goto requeue;
 706        }
 707
 708        nvme_rdma_stop_and_free_queue(&ctrl->queues[0]);
 709
 710        ret = blk_mq_reinit_tagset(&ctrl->admin_tag_set);
 711        if (ret)
 712                goto requeue;
 713
 714        ret = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH);
 715        if (ret)
 716                goto requeue;
 717
 718        blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
 719
 720        ret = nvmf_connect_admin_queue(&ctrl->ctrl);
 721        if (ret)
 722                goto stop_admin_q;
 723
 724        set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);
 725
 726        ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
 727        if (ret)
 728                goto stop_admin_q;
 729
 730        nvme_start_keep_alive(&ctrl->ctrl);
 731
 732        if (ctrl->queue_count > 1) {
 733                ret = nvme_rdma_init_io_queues(ctrl);
 734                if (ret)
 735                        goto stop_admin_q;
 736
 737                ret = nvme_rdma_connect_io_queues(ctrl);
 738                if (ret)
 739                        goto stop_admin_q;
 740        }
 741
 742        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
 743        WARN_ON_ONCE(!changed);
 744
 745        if (ctrl->queue_count > 1) {
 746                nvme_start_queues(&ctrl->ctrl);
 747                nvme_queue_scan(&ctrl->ctrl);
 748                nvme_queue_async_events(&ctrl->ctrl);
 749        }
 750
 751        dev_info(ctrl->ctrl.device, "Successfully reconnected\n");
 752
 753        return;
 754
 755stop_admin_q:
 756        blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
 757requeue:
 758        /* Make sure we are not resetting/deleting */
 759        if (ctrl->ctrl.state == NVME_CTRL_RECONNECTING) {
 760                dev_info(ctrl->ctrl.device,
 761                        "Failed reconnect attempt, requeueing...\n");
 762                queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
 763                                        ctrl->reconnect_delay * HZ);
 764        }
 765}
 766
 767static void nvme_rdma_error_recovery_work(struct work_struct *work)
 768{
 769        struct nvme_rdma_ctrl *ctrl = container_of(work,
 770                        struct nvme_rdma_ctrl, err_work);
 771        int i;
 772
 773        nvme_stop_keep_alive(&ctrl->ctrl);
 774
 775        for (i = 0; i < ctrl->queue_count; i++) {
 776                clear_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[i].flags);
 777                clear_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[i].flags);
 778        }
 779
 780        if (ctrl->queue_count > 1)
 781                nvme_stop_queues(&ctrl->ctrl);
 782        blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
 783
  784        /* We must take care to fast-fail or requeue all of our inflight requests */
 785        if (ctrl->queue_count > 1)
 786                blk_mq_tagset_busy_iter(&ctrl->tag_set,
 787                                        nvme_cancel_request, &ctrl->ctrl);
 788        blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
 789                                nvme_cancel_request, &ctrl->ctrl);
 790
 791        dev_info(ctrl->ctrl.device, "reconnecting in %d seconds\n",
 792                ctrl->reconnect_delay);
 793
 794        queue_delayed_work(nvme_rdma_wq, &ctrl->reconnect_work,
 795                                ctrl->reconnect_delay * HZ);
 796}
 797
 798static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
 799{
 800        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING))
 801                return;
 802
 803        queue_work(nvme_rdma_wq, &ctrl->err_work);
 804}
 805
 806static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
 807                const char *op)
 808{
 809        struct nvme_rdma_queue *queue = cq->cq_context;
 810        struct nvme_rdma_ctrl *ctrl = queue->ctrl;
 811
 812        if (ctrl->ctrl.state == NVME_CTRL_LIVE)
 813                dev_info(ctrl->ctrl.device,
 814                             "%s for CQE 0x%p failed with status %s (%d)\n",
 815                             op, wc->wr_cqe,
 816                             ib_wc_status_msg(wc->status), wc->status);
 817        nvme_rdma_error_recovery(ctrl);
 818}
 819
 820static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
 821{
 822        if (unlikely(wc->status != IB_WC_SUCCESS))
 823                nvme_rdma_wr_error(cq, wc, "MEMREG");
 824}
 825
 826static void nvme_rdma_inv_rkey_done(struct ib_cq *cq, struct ib_wc *wc)
 827{
 828        if (unlikely(wc->status != IB_WC_SUCCESS))
 829                nvme_rdma_wr_error(cq, wc, "LOCAL_INV");
 830}
 831
 832static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue,
 833                struct nvme_rdma_request *req)
 834{
 835        struct ib_send_wr *bad_wr;
 836        struct ib_send_wr wr = {
 837                .opcode             = IB_WR_LOCAL_INV,
 838                .next               = NULL,
 839                .num_sge            = 0,
 840                .send_flags         = 0,
 841                .ex.invalidate_rkey = req->mr->rkey,
 842        };
 843
 844        req->reg_cqe.done = nvme_rdma_inv_rkey_done;
 845        wr.wr_cqe = &req->reg_cqe;
 846
 847        return ib_post_send(queue->qp, &wr, &bad_wr);
 848}
 849
 850static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue,
 851                struct request *rq)
 852{
 853        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 854        struct nvme_rdma_ctrl *ctrl = queue->ctrl;
 855        struct nvme_rdma_device *dev = queue->device;
 856        struct ib_device *ibdev = dev->dev;
 857        int res;
 858
 859        if (!blk_rq_bytes(rq))
 860                return;
 861
 862        if (req->mr->need_inval) {
 863                res = nvme_rdma_inv_rkey(queue, req);
 864                if (res < 0) {
 865                        dev_err(ctrl->ctrl.device,
 866                                "Queueing INV WR for rkey %#x failed (%d)\n",
 867                                req->mr->rkey, res);
 868                        nvme_rdma_error_recovery(queue->ctrl);
 869                }
 870        }
 871
 872        ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
 873                        req->nents, rq_data_dir(rq) ==
 874                                    WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 875
 876        nvme_cleanup_cmd(rq);
 877        sg_free_table_chained(&req->sg_table, true);
 878}
 879
 880static int nvme_rdma_set_sg_null(struct nvme_command *c)
 881{
 882        struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
 883
 884        sg->addr = 0;
 885        put_unaligned_le24(0, sg->length);
 886        put_unaligned_le32(0, sg->key);
 887        sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
 888        return 0;
 889}
 890
 891static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
 892                struct nvme_rdma_request *req, struct nvme_command *c)
 893{
 894        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
 895
 896        req->sge[1].addr = sg_dma_address(req->sg_table.sgl);
 897        req->sge[1].length = sg_dma_len(req->sg_table.sgl);
 898        req->sge[1].lkey = queue->device->pd->local_dma_lkey;
 899
 900        sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
 901        sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
 902        sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
 903
 904        req->inline_data = true;
 905        req->num_sge++;
 906        return 0;
 907}
 908
 909static int nvme_rdma_map_sg_single(struct nvme_rdma_queue *queue,
 910                struct nvme_rdma_request *req, struct nvme_command *c)
 911{
 912        struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
 913
 914        sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl));
 915        put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length);
 916        put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key);
 917        sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
 918        return 0;
 919}
 920
 921static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
 922                struct nvme_rdma_request *req, struct nvme_command *c,
 923                int count)
 924{
 925        struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
 926        int nr;
 927
 928        nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, PAGE_SIZE);
 929        if (nr < count) {
 930                if (nr < 0)
 931                        return nr;
 932                return -EINVAL;
 933        }
 934
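             /*
              * Refresh the low 8 "key" bits of the rkey for every new
              * registration so that a stale rkey from a previous use of
              * this MR no longer matches.
              */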
 935        ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
 936
 937        req->reg_cqe.done = nvme_rdma_memreg_done;
 938        memset(&req->reg_wr, 0, sizeof(req->reg_wr));
 939        req->reg_wr.wr.opcode = IB_WR_REG_MR;
 940        req->reg_wr.wr.wr_cqe = &req->reg_cqe;
 941        req->reg_wr.wr.num_sge = 0;
 942        req->reg_wr.mr = req->mr;
 943        req->reg_wr.key = req->mr->rkey;
 944        req->reg_wr.access = IB_ACCESS_LOCAL_WRITE |
 945                             IB_ACCESS_REMOTE_READ |
 946                             IB_ACCESS_REMOTE_WRITE;
 947
 948        req->mr->need_inval = true;
 949
 950        sg->addr = cpu_to_le64(req->mr->iova);
 951        put_unaligned_le24(req->mr->length, sg->length);
 952        put_unaligned_le32(req->mr->rkey, sg->key);
 953        sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) |
 954                        NVME_SGL_FMT_INVALIDATE;
 955
 956        return 0;
 957}
 958
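     /*
      * Map the request payload, in order of preference: a NULL SGL
      * descriptor for commands without data, in-capsule (inline) data for a
      * single-entry write that fits the inline size on an I/O queue, a
      * single keyed SGL using the unsafe global rkey when register_always
      * is off, and otherwise a memory registration via nvme_rdma_map_sg_fr().
      */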
 959static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
 960                struct request *rq, unsigned int map_len,
 961                struct nvme_command *c)
 962{
 963        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
 964        struct nvme_rdma_device *dev = queue->device;
 965        struct ib_device *ibdev = dev->dev;
 966        int nents, count;
 967        int ret;
 968
 969        req->num_sge = 1;
 970        req->inline_data = false;
 971        req->mr->need_inval = false;
 972
 973        c->common.flags |= NVME_CMD_SGL_METABUF;
 974
 975        if (!blk_rq_bytes(rq))
 976                return nvme_rdma_set_sg_null(c);
 977
 978        req->sg_table.sgl = req->first_sgl;
 979        ret = sg_alloc_table_chained(&req->sg_table, rq->nr_phys_segments,
 980                                req->sg_table.sgl);
 981        if (ret)
 982                return -ENOMEM;
 983
 984        nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
 985        BUG_ON(nents > rq->nr_phys_segments);
 986        req->nents = nents;
 987
 988        count = ib_dma_map_sg(ibdev, req->sg_table.sgl, nents,
 989                    rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 990        if (unlikely(count <= 0)) {
 991                sg_free_table_chained(&req->sg_table, true);
 992                return -EIO;
 993        }
 994
 995        if (count == 1) {
 996                if (rq_data_dir(rq) == WRITE &&
 997                    map_len <= nvme_rdma_inline_data_size(queue) &&
 998                    nvme_rdma_queue_idx(queue))
 999                        return nvme_rdma_map_sg_inline(queue, req, c);
1000
1001                if (dev->pd->flags & IB_PD_UNSAFE_GLOBAL_RKEY)
1002                        return nvme_rdma_map_sg_single(queue, req, c);
1003        }
1004
1005        return nvme_rdma_map_sg_fr(queue, req, c, count);
1006}
1007
1008static void nvme_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
1009{
1010        if (unlikely(wc->status != IB_WC_SUCCESS))
1011                nvme_rdma_wr_error(cq, wc, "SEND");
1012}
1013
1014static int nvme_rdma_post_send(struct nvme_rdma_queue *queue,
1015                struct nvme_rdma_qe *qe, struct ib_sge *sge, u32 num_sge,
1016                struct ib_send_wr *first, bool flush)
1017{
1018        struct ib_send_wr wr, *bad_wr;
1019        int ret;
1020
1021        sge->addr   = qe->dma;
 1022        sge->length = sizeof(struct nvme_command);
1023        sge->lkey   = queue->device->pd->local_dma_lkey;
1024
1025        qe->cqe.done = nvme_rdma_send_done;
1026
1027        wr.next       = NULL;
1028        wr.wr_cqe     = &qe->cqe;
1029        wr.sg_list    = sge;
1030        wr.num_sge    = num_sge;
1031        wr.opcode     = IB_WR_SEND;
1032        wr.send_flags = 0;
1033
1034        /*
 1035         * Unsignalled send completions are another giant disaster in the
1036         * IB Verbs spec:  If we don't regularly post signalled sends
1037         * the send queue will fill up and only a QP reset will rescue us.
 1038         * It would have been way too obvious to handle this in hardware or
 1039         * at least in the RDMA stack.
1040         *
 1041         * This messy and racy code snippet is copied and pasted from the iSER
1042         * initiator, and the magic '32' comes from there as well.
1043         *
1044         * Always signal the flushes. The magic request used for the flush
 1045         * sequencer is not allocated in our driver's tagset and is
 1046         * freed by blk_cleanup_queue(). So we need to
1047         * always mark it as signaled to ensure that the "wr_cqe", which is
 1048         * embedded in the request's payload, is not freed when __ib_process_cq()
1049         * calls wr_cqe->done().
1050         */
1051        if ((++queue->sig_count % 32) == 0 || flush)
1052                wr.send_flags |= IB_SEND_SIGNALED;
1053
1054        if (first)
1055                first->next = &wr;
1056        else
1057                first = &wr;
1058
1059        ret = ib_post_send(queue->qp, first, &bad_wr);
1060        if (ret) {
1061                dev_err(queue->ctrl->ctrl.device,
1062                             "%s failed with error code %d\n", __func__, ret);
1063        }
1064        return ret;
1065}
1066
1067static int nvme_rdma_post_recv(struct nvme_rdma_queue *queue,
1068                struct nvme_rdma_qe *qe)
1069{
1070        struct ib_recv_wr wr, *bad_wr;
1071        struct ib_sge list;
1072        int ret;
1073
1074        list.addr   = qe->dma;
1075        list.length = sizeof(struct nvme_completion);
1076        list.lkey   = queue->device->pd->local_dma_lkey;
1077
1078        qe->cqe.done = nvme_rdma_recv_done;
1079
1080        wr.next     = NULL;
1081        wr.wr_cqe   = &qe->cqe;
1082        wr.sg_list  = &list;
1083        wr.num_sge  = 1;
1084
1085        ret = ib_post_recv(queue->qp, &wr, &bad_wr);
1086        if (ret) {
1087                dev_err(queue->ctrl->ctrl.device,
1088                        "%s failed with error code %d\n", __func__, ret);
1089        }
1090        return ret;
1091}
1092
1093static struct blk_mq_tags *nvme_rdma_tagset(struct nvme_rdma_queue *queue)
1094{
1095        u32 queue_idx = nvme_rdma_queue_idx(queue);
1096
1097        if (queue_idx == 0)
1098                return queue->ctrl->admin_tag_set.tags[queue_idx];
1099        return queue->ctrl->tag_set.tags[queue_idx - 1];
1100}
1101
1102static void nvme_rdma_submit_async_event(struct nvme_ctrl *arg, int aer_idx)
1103{
1104        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(arg);
1105        struct nvme_rdma_queue *queue = &ctrl->queues[0];
1106        struct ib_device *dev = queue->device->dev;
1107        struct nvme_rdma_qe *sqe = &ctrl->async_event_sqe;
1108        struct nvme_command *cmd = sqe->data;
1109        struct ib_sge sge;
1110        int ret;
1111
1112        if (WARN_ON_ONCE(aer_idx != 0))
1113                return;
1114
1115        ib_dma_sync_single_for_cpu(dev, sqe->dma, sizeof(*cmd), DMA_TO_DEVICE);
1116
1117        memset(cmd, 0, sizeof(*cmd));
1118        cmd->common.opcode = nvme_admin_async_event;
1119        cmd->common.command_id = NVME_RDMA_AQ_BLKMQ_DEPTH;
1120        cmd->common.flags |= NVME_CMD_SGL_METABUF;
1121        nvme_rdma_set_sg_null(cmd);
1122
1123        ib_dma_sync_single_for_device(dev, sqe->dma, sizeof(*cmd),
1124                        DMA_TO_DEVICE);
1125
1126        ret = nvme_rdma_post_send(queue, sqe, &sge, 1, NULL, false);
1127        WARN_ON_ONCE(ret);
1128}
1129
1130static int nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
1131                struct nvme_completion *cqe, struct ib_wc *wc, int tag)
1132{
1133        u16 status = le16_to_cpu(cqe->status);
1134        struct request *rq;
1135        struct nvme_rdma_request *req;
1136        int ret = 0;
1137
1138        status >>= 1;
1139
1140        rq = blk_mq_tag_to_rq(nvme_rdma_tagset(queue), cqe->command_id);
1141        if (!rq) {
1142                dev_err(queue->ctrl->ctrl.device,
1143                        "tag 0x%x on QP %#x not found\n",
1144                        cqe->command_id, queue->qp->qp_num);
1145                nvme_rdma_error_recovery(queue->ctrl);
1146                return ret;
1147        }
1148        req = blk_mq_rq_to_pdu(rq);
1149
1150        if (rq->cmd_type == REQ_TYPE_DRV_PRIV && rq->special)
1151                memcpy(rq->special, cqe, sizeof(*cqe));
1152
1153        if (rq->tag == tag)
1154                ret = 1;
1155
1156        if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) &&
1157            wc->ex.invalidate_rkey == req->mr->rkey)
1158                req->mr->need_inval = false;
1159
1160        blk_mq_complete_request(rq, status);
1161
1162        return ret;
1163}
1164
1165static int __nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc, int tag)
1166{
1167        struct nvme_rdma_qe *qe =
1168                container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
1169        struct nvme_rdma_queue *queue = cq->cq_context;
1170        struct ib_device *ibdev = queue->device->dev;
1171        struct nvme_completion *cqe = qe->data;
1172        const size_t len = sizeof(struct nvme_completion);
1173        int ret = 0;
1174
1175        if (unlikely(wc->status != IB_WC_SUCCESS)) {
1176                nvme_rdma_wr_error(cq, wc, "RECV");
1177                return 0;
1178        }
1179
1180        ib_dma_sync_single_for_cpu(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1181        /*
1182         * AEN requests are special as they don't time out and can
1183         * survive any kind of queue freeze and often don't respond to
1184         * aborts.  We don't even bother to allocate a struct request
1185         * for them but rather special case them here.
1186         */
1187        if (unlikely(nvme_rdma_queue_idx(queue) == 0 &&
1188                        cqe->command_id >= NVME_RDMA_AQ_BLKMQ_DEPTH))
1189                nvme_complete_async_event(&queue->ctrl->ctrl, cqe);
1190        else
1191                ret = nvme_rdma_process_nvme_rsp(queue, cqe, wc, tag);
1192        ib_dma_sync_single_for_device(ibdev, qe->dma, len, DMA_FROM_DEVICE);
1193
1194        nvme_rdma_post_recv(queue, qe);
1195        return ret;
1196}
1197
1198static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
1199{
1200        __nvme_rdma_recv_done(cq, wc, -1);
1201}
1202
1203static int nvme_rdma_conn_established(struct nvme_rdma_queue *queue)
1204{
1205        int ret, i;
1206
1207        for (i = 0; i < queue->queue_size; i++) {
1208                ret = nvme_rdma_post_recv(queue, &queue->rsp_ring[i]);
1209                if (ret)
1210                        goto out_destroy_queue_ib;
1211        }
1212
1213        return 0;
1214
1215out_destroy_queue_ib:
1216        nvme_rdma_destroy_queue_ib(queue);
1217        return ret;
1218}
1219
1220static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue,
1221                struct rdma_cm_event *ev)
1222{
1223        if (ev->param.conn.private_data_len) {
1224                struct nvme_rdma_cm_rej *rej =
1225                        (struct nvme_rdma_cm_rej *)ev->param.conn.private_data;
1226
1227                dev_err(queue->ctrl->ctrl.device,
 1228                        "Connect rejected, status %d.\n", le16_to_cpu(rej->sts));
1229                /* XXX: Think of something clever to do here... */
1230        } else {
1231                dev_err(queue->ctrl->ctrl.device,
1232                        "Connect rejected, no private data.\n");
1233        }
1234
1235        return -ECONNRESET;
1236}
1237
1238static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue)
1239{
1240        struct nvme_rdma_device *dev;
1241        int ret;
1242
1243        dev = nvme_rdma_find_get_device(queue->cm_id);
1244        if (!dev) {
1245                dev_err(queue->cm_id->device->dma_device,
1246                        "no client data found!\n");
1247                return -ECONNREFUSED;
1248        }
1249
1250        ret = nvme_rdma_create_queue_ib(queue, dev);
1251        if (ret) {
1252                nvme_rdma_dev_put(dev);
1253                goto out;
1254        }
1255
1256        ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS);
1257        if (ret) {
1258                dev_err(queue->ctrl->ctrl.device,
1259                        "rdma_resolve_route failed (%d).\n",
1260                        queue->cm_error);
1261                goto out_destroy_queue;
1262        }
1263
1264        return 0;
1265
1266out_destroy_queue:
1267        nvme_rdma_destroy_queue_ib(queue);
1268out:
1269        return ret;
1270}
1271
1272static int nvme_rdma_route_resolved(struct nvme_rdma_queue *queue)
1273{
1274        struct nvme_rdma_ctrl *ctrl = queue->ctrl;
1275        struct rdma_conn_param param = { };
1276        struct nvme_rdma_cm_req priv = { };
1277        int ret;
1278
1279        param.qp_num = queue->qp->qp_num;
1280        param.flow_control = 1;
1281
1282        param.responder_resources = queue->device->dev->attrs.max_qp_rd_atom;
1283        /* maximum retry count */
1284        param.retry_count = 7;
1285        param.rnr_retry_count = 7;
1286        param.private_data = &priv;
1287        param.private_data_len = sizeof(priv);
1288
1289        priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1290        priv.qid = cpu_to_le16(nvme_rdma_queue_idx(queue));
1291        /*
1292         * set the admin queue depth to the minimum size
1293         * specified by the Fabrics standard.
1294         */
1295        if (priv.qid == 0) {
1296                priv.hrqsize = cpu_to_le16(NVMF_AQ_DEPTH);
1297                priv.hsqsize = cpu_to_le16(NVMF_AQ_DEPTH - 1);
1298        } else {
1299                /*
 1300                 * The current interpretation of the fabrics spec is that,
 1301                 * at a minimum, hrqsize is sqsize+1, i.e. the 1's-based
 1302                 * representation of sqsize.
1303                 */
1304                priv.hrqsize = cpu_to_le16(queue->queue_size);
1305                priv.hsqsize = cpu_to_le16(queue->ctrl->ctrl.sqsize);
1306        }
1307
1308        ret = rdma_connect(queue->cm_id, &param);
1309        if (ret) {
1310                dev_err(ctrl->ctrl.device,
1311                        "rdma_connect failed (%d).\n", ret);
1312                goto out_destroy_queue_ib;
1313        }
1314
1315        return 0;
1316
1317out_destroy_queue_ib:
1318        nvme_rdma_destroy_queue_ib(queue);
1319        return ret;
1320}
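     /*
      * Example private data, assuming the fabrics default NVMF_AQ_DEPTH of
      * 32 and an I/O queue_size of 128: the admin queue (qid 0) advertises
      * hrqsize = 32 and hsqsize = 31, while an I/O queue advertises
      * hrqsize = 128 and hsqsize = ctrl->ctrl.sqsize (a 0's-based value).
      */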
1321
1322static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
1323                struct rdma_cm_event *ev)
1324{
1325        struct nvme_rdma_queue *queue = cm_id->context;
1326        int cm_error = 0;
1327
1328        dev_dbg(queue->ctrl->ctrl.device, "%s (%d): status %d id %p\n",
1329                rdma_event_msg(ev->event), ev->event,
1330                ev->status, cm_id);
1331
1332        switch (ev->event) {
1333        case RDMA_CM_EVENT_ADDR_RESOLVED:
1334                cm_error = nvme_rdma_addr_resolved(queue);
1335                break;
1336        case RDMA_CM_EVENT_ROUTE_RESOLVED:
1337                cm_error = nvme_rdma_route_resolved(queue);
1338                break;
1339        case RDMA_CM_EVENT_ESTABLISHED:
1340                queue->cm_error = nvme_rdma_conn_established(queue);
1341                /* complete cm_done regardless of success/failure */
1342                complete(&queue->cm_done);
1343                return 0;
1344        case RDMA_CM_EVENT_REJECTED:
1345                cm_error = nvme_rdma_conn_rejected(queue, ev);
1346                break;
1347        case RDMA_CM_EVENT_ADDR_ERROR:
1348        case RDMA_CM_EVENT_ROUTE_ERROR:
1349        case RDMA_CM_EVENT_CONNECT_ERROR:
1350        case RDMA_CM_EVENT_UNREACHABLE:
1351                dev_dbg(queue->ctrl->ctrl.device,
1352                        "CM error event %d\n", ev->event);
1353                cm_error = -ECONNRESET;
1354                break;
1355        case RDMA_CM_EVENT_DISCONNECTED:
1356        case RDMA_CM_EVENT_ADDR_CHANGE:
1357        case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1358                dev_dbg(queue->ctrl->ctrl.device,
1359                        "disconnect received - connection closed\n");
1360                nvme_rdma_error_recovery(queue->ctrl);
1361                break;
1362        case RDMA_CM_EVENT_DEVICE_REMOVAL:
1363                /* device removal is handled via the ib_client API */
1364                break;
1365        default:
1366                dev_err(queue->ctrl->ctrl.device,
1367                        "Unexpected RDMA CM event (%d)\n", ev->event);
1368                nvme_rdma_error_recovery(queue->ctrl);
1369                break;
1370        }
1371
1372        if (cm_error) {
1373                queue->cm_error = cm_error;
1374                complete(&queue->cm_done);
1375        }
1376
1377        return 0;
1378}
1379
1380static enum blk_eh_timer_return
1381nvme_rdma_timeout(struct request *rq, bool reserved)
1382{
1383        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1384
1385        /* queue error recovery */
1386        nvme_rdma_error_recovery(req->queue->ctrl);
1387
1388        /* fail with DNR on cmd timeout */
1389        rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
1390
1391        return BLK_EH_HANDLED;
1392}
1393
1394/*
1395 * We cannot accept any other command until the Connect command has completed.
1396 */
1397static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
1398                struct request *rq)
1399{
1400        if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) {
1401                struct nvme_command *cmd = (struct nvme_command *)rq->cmd;
1402
1403                if (rq->cmd_type != REQ_TYPE_DRV_PRIV ||
1404                    cmd->common.opcode != nvme_fabrics_command ||
1405                    cmd->fabrics.fctype != nvme_fabrics_type_connect)
1406                        return false;
1407        }
1408
1409        return true;
1410}
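     /*
      * Returning false here makes nvme_rdma_queue_rq() return
      * BLK_MQ_RQ_QUEUE_BUSY, so the request is requeued and retried once
      * the queue has gone live.
      */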
1411
1412static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
1413                const struct blk_mq_queue_data *bd)
1414{
1415        struct nvme_ns *ns = hctx->queue->queuedata;
1416        struct nvme_rdma_queue *queue = hctx->driver_data;
1417        struct request *rq = bd->rq;
1418        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1419        struct nvme_rdma_qe *sqe = &req->sqe;
1420        struct nvme_command *c = sqe->data;
1421        bool flush = false;
1422        struct ib_device *dev;
1423        unsigned int map_len;
1424        int ret;
1425
1426        WARN_ON_ONCE(rq->tag < 0);
1427
1428        if (!nvme_rdma_queue_is_ready(queue, rq))
1429                return BLK_MQ_RQ_QUEUE_BUSY;
1430
1431        dev = queue->device->dev;
1432        ib_dma_sync_single_for_cpu(dev, sqe->dma,
1433                        sizeof(struct nvme_command), DMA_TO_DEVICE);
1434
1435        ret = nvme_setup_cmd(ns, rq, c);
1436        if (ret)
1437                return ret;
1438
1439        c->common.command_id = rq->tag;
1440        blk_mq_start_request(rq);
1441
1442        map_len = nvme_map_len(rq);
1443        ret = nvme_rdma_map_data(queue, rq, map_len, c);
1444        if (ret < 0) {
1445                dev_err(queue->ctrl->ctrl.device,
1446                             "Failed to map data (%d)\n", ret);
1447                nvme_cleanup_cmd(rq);
1448                goto err;
1449        }
1450
1451        ib_dma_sync_single_for_device(dev, sqe->dma,
1452                        sizeof(struct nvme_command), DMA_TO_DEVICE);
1453
1454        if (rq->cmd_type == REQ_TYPE_FS && req_op(rq) == REQ_OP_FLUSH)
1455                flush = true;
1456        ret = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
1457                        req->mr->need_inval ? &req->reg_wr.wr : NULL, flush);
1458        if (ret) {
1459                nvme_rdma_unmap_data(queue, rq);
1460                goto err;
1461        }
1462
1463        return BLK_MQ_RQ_QUEUE_OK;
1464err:
1465        return (ret == -ENOMEM || ret == -EAGAIN) ?
1466                BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR;
1467}
1468
1469static int nvme_rdma_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
1470{
1471        struct nvme_rdma_queue *queue = hctx->driver_data;
1472        struct ib_cq *cq = queue->ib_cq;
1473        struct ib_wc wc;
1474        int found = 0;
1475
1476        ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
1477        while (ib_poll_cq(cq, 1, &wc) > 0) {
1478                struct ib_cqe *cqe = wc.wr_cqe;
1479
1480                if (cqe) {
1481                        if (cqe->done == nvme_rdma_recv_done)
1482                                found |= __nvme_rdma_recv_done(cq, &wc, tag);
1483                        else
1484                                cqe->done(cq, &wc);
1485                }
1486        }
1487
1488        return found;
1489}
1490
1491static void nvme_rdma_complete_rq(struct request *rq)
1492{
1493        struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
1494        struct nvme_rdma_queue *queue = req->queue;
1495        int error = 0;
1496
1497        nvme_rdma_unmap_data(queue, rq);
1498
1499        if (unlikely(rq->errors)) {
1500                if (nvme_req_needs_retry(rq, rq->errors)) {
1501                        nvme_requeue_req(rq);
1502                        return;
1503                }
1504
1505                if (rq->cmd_type == REQ_TYPE_DRV_PRIV)
1506                        error = rq->errors;
1507                else
1508                        error = nvme_error_status(rq->errors);
1509        }
1510
1511        blk_mq_end_request(rq, error);
1512}
1513
1514static struct blk_mq_ops nvme_rdma_mq_ops = {
1515        .queue_rq       = nvme_rdma_queue_rq,
1516        .complete       = nvme_rdma_complete_rq,
1517        .init_request   = nvme_rdma_init_request,
1518        .exit_request   = nvme_rdma_exit_request,
1519        .reinit_request = nvme_rdma_reinit_request,
1520        .init_hctx      = nvme_rdma_init_hctx,
1521        .poll           = nvme_rdma_poll,
1522        .timeout        = nvme_rdma_timeout,
1523};
1524
1525static struct blk_mq_ops nvme_rdma_admin_mq_ops = {
1526        .queue_rq       = nvme_rdma_queue_rq,
1527        .complete       = nvme_rdma_complete_rq,
1528        .init_request   = nvme_rdma_init_admin_request,
1529        .exit_request   = nvme_rdma_exit_admin_request,
1530        .reinit_request = nvme_rdma_reinit_request,
1531        .init_hctx      = nvme_rdma_init_admin_hctx,
1532        .timeout        = nvme_rdma_timeout,
1533};
1534
1535static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl)
1536{
1537        int error;
1538
1539        error = nvme_rdma_init_queue(ctrl, 0, NVMF_AQ_DEPTH);
1540        if (error)
1541                return error;
1542
1543        ctrl->device = ctrl->queues[0].device;
1544
1545        /*
1546         * We need a reference on the device as long as the tag_set is alive,
1547         * as the MRs in the request structures need a valid ib_device.
1548         */
1549        error = -EINVAL;
1550        if (!nvme_rdma_dev_get(ctrl->device))
1551                goto out_free_queue;
1552
1553        ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS,
1554                ctrl->device->dev->attrs.max_fast_reg_page_list_len);
1555
1556        memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set));
1557        ctrl->admin_tag_set.ops = &nvme_rdma_admin_mq_ops;
1558        ctrl->admin_tag_set.queue_depth = NVME_RDMA_AQ_BLKMQ_DEPTH;
1559        ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */
1560        ctrl->admin_tag_set.numa_node = NUMA_NO_NODE;
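             /*
              * Each request PDU is a struct nvme_rdma_request followed by
              * SG_CHUNK_SIZE scatterlist entries backing req->first_sgl[],
              * the inline head of the chained sg_table built in
              * nvme_rdma_map_data().
              */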
1561        ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_rdma_request) +
1562                SG_CHUNK_SIZE * sizeof(struct scatterlist);
1563        ctrl->admin_tag_set.driver_data = ctrl;
1564        ctrl->admin_tag_set.nr_hw_queues = 1;
1565        ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT;
1566
1567        error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set);
1568        if (error)
1569                goto out_put_dev;
1570
1571        ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set);
1572        if (IS_ERR(ctrl->ctrl.admin_q)) {
1573                error = PTR_ERR(ctrl->ctrl.admin_q);
1574                goto out_free_tagset;
1575        }
1576
1577        error = nvmf_connect_admin_queue(&ctrl->ctrl);
1578        if (error)
1579                goto out_cleanup_queue;
1580
1581        set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);
1582
1583        error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap);
1584        if (error) {
1585                dev_err(ctrl->ctrl.device,
1586                        "prop_get NVME_REG_CAP failed\n");
1587                goto out_cleanup_queue;
1588        }
1589
1590        ctrl->ctrl.sqsize =
1591                min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize);
1592
1593        error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
1594        if (error)
1595                goto out_cleanup_queue;
1596
1597        ctrl->ctrl.max_hw_sectors =
1598                (ctrl->max_fr_pages - 1) << (PAGE_SHIFT - 9);
1599
1600        error = nvme_init_identify(&ctrl->ctrl);
1601        if (error)
1602                goto out_cleanup_queue;
1603
1604        error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev,
1605                        &ctrl->async_event_sqe, sizeof(struct nvme_command),
1606                        DMA_TO_DEVICE);
1607        if (error)
1608                goto out_cleanup_queue;
1609
1610        nvme_start_keep_alive(&ctrl->ctrl);
1611
1612        return 0;
1613
1614out_cleanup_queue:
1615        blk_cleanup_queue(ctrl->ctrl.admin_q);
1616out_free_tagset:
1617        /* disconnect and drain the queue before freeing the tagset */
1618        nvme_rdma_stop_queue(&ctrl->queues[0]);
1619        blk_mq_free_tag_set(&ctrl->admin_tag_set);
1620out_put_dev:
1621        nvme_rdma_dev_put(ctrl->device);
1622out_free_queue:
1623        nvme_rdma_free_queue(&ctrl->queues[0]);
1624        return error;
1625}
1626
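/*
 * Tear down all transport resources: stop keep-alive and the error/reconnect
 * work, cancel outstanding I/O and admin requests, shut down the controller
 * if the admin queue is still connected, and free the queues.
 */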
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl)
{
        nvme_stop_keep_alive(&ctrl->ctrl);
        cancel_work_sync(&ctrl->err_work);
        cancel_delayed_work_sync(&ctrl->reconnect_work);

        if (ctrl->queue_count > 1) {
                nvme_stop_queues(&ctrl->ctrl);
                blk_mq_tagset_busy_iter(&ctrl->tag_set,
                                        nvme_cancel_request, &ctrl->ctrl);
                nvme_rdma_free_io_queues(ctrl);
        }

        if (test_bit(NVME_RDMA_Q_CONNECTED, &ctrl->queues[0].flags))
                nvme_shutdown_ctrl(&ctrl->ctrl);

        blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
        blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
                                nvme_cancel_request, &ctrl->ctrl);
        nvme_rdma_destroy_admin_queue(ctrl);
}

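/*
 * Controller removal.  __nvme_rdma_remove_ctrl() undoes controller
 * initialization, optionally performing a full shutdown first, and drops the
 * final reference.  __nvme_rdma_del_ctrl() moves the controller to the
 * DELETING state and queues delete_work, which performs the actual teardown.
 */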
static void __nvme_rdma_remove_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{
        nvme_uninit_ctrl(&ctrl->ctrl);
        if (shutdown)
                nvme_rdma_shutdown_ctrl(ctrl);

        if (ctrl->ctrl.tagset) {
                blk_cleanup_queue(ctrl->ctrl.connect_q);
                blk_mq_free_tag_set(&ctrl->tag_set);
                nvme_rdma_dev_put(ctrl->device);
        }

        nvme_put_ctrl(&ctrl->ctrl);
}

static void nvme_rdma_del_ctrl_work(struct work_struct *work)
{
        struct nvme_rdma_ctrl *ctrl = container_of(work,
                                struct nvme_rdma_ctrl, delete_work);

        __nvme_rdma_remove_ctrl(ctrl, true);
}

static int __nvme_rdma_del_ctrl(struct nvme_rdma_ctrl *ctrl)
{
        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING))
                return -EBUSY;

        if (!queue_work(nvme_rdma_wq, &ctrl->delete_work))
                return -EBUSY;

        return 0;
}

static int nvme_rdma_del_ctrl(struct nvme_ctrl *nctrl)
{
        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);
        int ret = 0;

        /*
         * Keep a reference until all work is flushed, since
         * __nvme_rdma_del_ctrl can free the controller memory.
         */
        if (!kref_get_unless_zero(&ctrl->ctrl.kref))
                return -EBUSY;
        ret = __nvme_rdma_del_ctrl(ctrl);
        if (!ret)
                flush_work(&ctrl->delete_work);
        nvme_put_ctrl(&ctrl->ctrl);
        return ret;
}

static void nvme_rdma_remove_ctrl_work(struct work_struct *work)
{
        struct nvme_rdma_ctrl *ctrl = container_of(work,
                                struct nvme_rdma_ctrl, delete_work);

        __nvme_rdma_remove_ctrl(ctrl, false);
}

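/*
 * Controller reset: shut the controller down, then bring the admin queue
 * back up and, if I/O queues were configured, reinitialize the tag set and
 * reconnect the I/O queues.  If any step fails the controller is already
 * shut down, so it is simply removed without a second shutdown.
 */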
static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
{
        struct nvme_rdma_ctrl *ctrl = container_of(work,
                                        struct nvme_rdma_ctrl, reset_work);
        int ret;
        bool changed;

        nvme_rdma_shutdown_ctrl(ctrl);

        ret = nvme_rdma_configure_admin_queue(ctrl);
        if (ret) {
                /* ctrl is already shutdown, just remove the ctrl */
                INIT_WORK(&ctrl->delete_work, nvme_rdma_remove_ctrl_work);
                goto del_dead_ctrl;
        }

        if (ctrl->queue_count > 1) {
                ret = blk_mq_reinit_tagset(&ctrl->tag_set);
                if (ret)
                        goto del_dead_ctrl;

                ret = nvme_rdma_init_io_queues(ctrl);
                if (ret)
                        goto del_dead_ctrl;

                ret = nvme_rdma_connect_io_queues(ctrl);
                if (ret)
                        goto del_dead_ctrl;
        }

        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
        WARN_ON_ONCE(!changed);

        if (ctrl->queue_count > 1) {
                nvme_start_queues(&ctrl->ctrl);
                nvme_queue_scan(&ctrl->ctrl);
                nvme_queue_async_events(&ctrl->ctrl);
        }

        return;

del_dead_ctrl:
        /* Deleting this dead controller... */
        dev_warn(ctrl->ctrl.device, "Removing after reset failure\n");
        WARN_ON(!queue_work(nvme_rdma_wq, &ctrl->delete_work));
}

static int nvme_rdma_reset_ctrl(struct nvme_ctrl *nctrl)
{
        struct nvme_rdma_ctrl *ctrl = to_rdma_ctrl(nctrl);

        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
                return -EBUSY;

        if (!queue_work(nvme_rdma_wq, &ctrl->reset_work))
                return -EBUSY;

        flush_work(&ctrl->reset_work);

        return 0;
}

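/*
 * Core controller operations.  Register access goes through the fabrics
 * Property Get/Set helpers (nvmf_reg_read32/read64/write32) rather than
 * memory-mapped BAR registers, since there is no PCIe BAR on a fabrics
 * controller.
 */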
static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = {
        .name                   = "rdma",
        .module                 = THIS_MODULE,
        .is_fabrics             = true,
        .reg_read32             = nvmf_reg_read32,
        .reg_read64             = nvmf_reg_read64,
        .reg_write32            = nvmf_reg_write32,
        .reset_ctrl             = nvme_rdma_reset_ctrl,
        .free_ctrl              = nvme_rdma_free_ctrl,
        .submit_async_event     = nvme_rdma_submit_async_event,
        .delete_ctrl            = nvme_rdma_del_ctrl,
        .get_subsysnqn          = nvmf_get_subsysnqn,
        .get_address            = nvmf_get_address,
};

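/*
 * Create the I/O queues: negotiate the queue count with the controller, set
 * up the RDMA queues, allocate the shared tag set and connect_q, and issue a
 * fabrics Connect on every I/O queue.  As with the admin queue, a device
 * reference is held for as long as the tag set is alive.
 */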
static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl)
{
        struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
        int ret;

        ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues);
        if (ret)
                return ret;

        ctrl->queue_count = opts->nr_io_queues + 1;
        if (ctrl->queue_count < 2)
                return 0;

        dev_info(ctrl->ctrl.device,
                "creating %d I/O queues.\n", opts->nr_io_queues);

        ret = nvme_rdma_init_io_queues(ctrl);
        if (ret)
                return ret;

        /*
         * We need a reference on the device as long as the tag_set is alive,
         * as the MRs in the request structures need a valid ib_device.
         */
        ret = -EINVAL;
        if (!nvme_rdma_dev_get(ctrl->device))
                goto out_free_io_queues;

        memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set));
        ctrl->tag_set.ops = &nvme_rdma_mq_ops;
        ctrl->tag_set.queue_depth = ctrl->ctrl.opts->queue_size;
        ctrl->tag_set.reserved_tags = 1; /* fabric connect */
        ctrl->tag_set.numa_node = NUMA_NO_NODE;
        ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
        ctrl->tag_set.cmd_size = sizeof(struct nvme_rdma_request) +
                SG_CHUNK_SIZE * sizeof(struct scatterlist);
        ctrl->tag_set.driver_data = ctrl;
        ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1;
        ctrl->tag_set.timeout = NVME_IO_TIMEOUT;

        ret = blk_mq_alloc_tag_set(&ctrl->tag_set);
        if (ret)
                goto out_put_dev;
        ctrl->ctrl.tagset = &ctrl->tag_set;

        ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
        if (IS_ERR(ctrl->ctrl.connect_q)) {
                ret = PTR_ERR(ctrl->ctrl.connect_q);
                goto out_free_tag_set;
        }

        ret = nvme_rdma_connect_io_queues(ctrl);
        if (ret)
                goto out_cleanup_connect_q;

        return 0;

out_cleanup_connect_q:
        blk_cleanup_queue(ctrl->ctrl.connect_q);
out_free_tag_set:
        blk_mq_free_tag_set(&ctrl->tag_set);
out_put_dev:
        nvme_rdma_dev_put(ctrl->device);
out_free_io_queues:
        nvme_rdma_free_io_queues(ctrl);
        return ret;
}

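/*
 * Parse the IPv4 transport address supplied through the "traddr" option into
 * a struct sockaddr_in, e.g. a dotted-quad string such as "192.168.1.10"
 * (illustrative address).  IPv6 is not handled yet, see the XXX below.
 */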
static int nvme_rdma_parse_ipaddr(struct sockaddr_in *in_addr, char *p)
{
        u8 *addr = (u8 *)&in_addr->sin_addr.s_addr;
        size_t buflen = strlen(p);

        /* XXX: handle IPv6 addresses */

        if (buflen > INET_ADDRSTRLEN)
                return -EINVAL;
        if (in4_pton(p, buflen, addr, '\0', NULL) == 0)
                return -EINVAL;
        in_addr->sin_family = AF_INET;
        return 0;
}

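/*
 * Create and start a new controller for the options parsed by the fabrics
 * layer: resolve the transport address and port (defaulting to
 * NVME_RDMA_IP_PORT), initialize the core controller, bring up the admin
 * queue, sanity check the controller capabilities, create the I/O queues and
 * finally mark the controller live and kick off namespace scanning.
 */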
static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
                struct nvmf_ctrl_options *opts)
{
        struct nvme_rdma_ctrl *ctrl;
        int ret;
        bool changed;

        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
        if (!ctrl)
                return ERR_PTR(-ENOMEM);
        ctrl->ctrl.opts = opts;
        INIT_LIST_HEAD(&ctrl->list);

        ret = nvme_rdma_parse_ipaddr(&ctrl->addr_in, opts->traddr);
        if (ret) {
                pr_err("malformed IP address passed: %s\n", opts->traddr);
                goto out_free_ctrl;
        }

        if (opts->mask & NVMF_OPT_TRSVCID) {
                u16 port;

                ret = kstrtou16(opts->trsvcid, 0, &port);
                if (ret)
                        goto out_free_ctrl;

                ctrl->addr_in.sin_port = cpu_to_be16(port);
        } else {
                ctrl->addr_in.sin_port = cpu_to_be16(NVME_RDMA_IP_PORT);
        }

        ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_rdma_ctrl_ops,
                                0 /* no quirks, we're perfect! */);
        if (ret)
                goto out_free_ctrl;

        ctrl->reconnect_delay = opts->reconnect_delay;
        INIT_DELAYED_WORK(&ctrl->reconnect_work,
                        nvme_rdma_reconnect_ctrl_work);
        INIT_WORK(&ctrl->err_work, nvme_rdma_error_recovery_work);
        INIT_WORK(&ctrl->delete_work, nvme_rdma_del_ctrl_work);
        INIT_WORK(&ctrl->reset_work, nvme_rdma_reset_ctrl_work);
        spin_lock_init(&ctrl->lock);

        ctrl->queue_count = opts->nr_io_queues + 1; /* +1 for admin queue */
        ctrl->ctrl.sqsize = opts->queue_size - 1;
        ctrl->ctrl.kato = opts->kato;

        ret = -ENOMEM;
        ctrl->queues = kcalloc(ctrl->queue_count, sizeof(*ctrl->queues),
                                GFP_KERNEL);
        if (!ctrl->queues)
                goto out_uninit_ctrl;

        ret = nvme_rdma_configure_admin_queue(ctrl);
        if (ret)
                goto out_kfree_queues;

        /* sanity check icdoff */
        if (ctrl->ctrl.icdoff) {
                dev_err(ctrl->ctrl.device, "icdoff is not supported!\n");
                ret = -EINVAL;
                goto out_remove_admin_queue;
        }

        /* sanity check keyed sgls */
        if (!(ctrl->ctrl.sgls & (1 << 20))) {
                dev_err(ctrl->ctrl.device,
                        "Mandatory keyed sgls are not supported!\n");
                ret = -EINVAL;
                goto out_remove_admin_queue;
        }

        if (opts->queue_size > ctrl->ctrl.maxcmd) {
                /* warn if maxcmd is lower than queue_size */
                dev_warn(ctrl->ctrl.device,
                        "queue_size %zu > ctrl maxcmd %u, clamping down\n",
                        opts->queue_size, ctrl->ctrl.maxcmd);
                opts->queue_size = ctrl->ctrl.maxcmd;
        }

        if (opts->nr_io_queues) {
                ret = nvme_rdma_create_io_queues(ctrl);
                if (ret)
                        goto out_remove_admin_queue;
        }

        changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
        WARN_ON_ONCE(!changed);

        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
                ctrl->ctrl.opts->subsysnqn, &ctrl->addr);

        kref_get(&ctrl->ctrl.kref);

        mutex_lock(&nvme_rdma_ctrl_mutex);
        list_add_tail(&ctrl->list, &nvme_rdma_ctrl_list);
        mutex_unlock(&nvme_rdma_ctrl_mutex);

        if (opts->nr_io_queues) {
                nvme_queue_scan(&ctrl->ctrl);
                nvme_queue_async_events(&ctrl->ctrl);
        }

        return &ctrl->ctrl;

out_remove_admin_queue:
        nvme_stop_keep_alive(&ctrl->ctrl);
        nvme_rdma_destroy_admin_queue(ctrl);
out_kfree_queues:
        kfree(ctrl->queues);
out_uninit_ctrl:
        nvme_uninit_ctrl(&ctrl->ctrl);
        nvme_put_ctrl(&ctrl->ctrl);
        if (ret > 0)
                ret = -EIO;
        return ERR_PTR(ret);
out_free_ctrl:
        kfree(ctrl);
        return ERR_PTR(ret);
}

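/*
 * Registration with the fabrics layer.  A host typically instantiates a
 * controller by writing an option string to /dev/nvme-fabrics, for example
 * (illustrative values, assuming the usual fabrics option key names):
 *
 *   transport=rdma,traddr=192.168.1.10,trsvcid=4420,
 *   nqn=nqn.2014-08.org.example:nvme:target1
 *
 * which nvme-cli would generate from something like
 * "nvme connect -t rdma -a 192.168.1.10 -s 4420 -n <subsystem NQN>".
 * Only traddr is required by this transport; trsvcid and reconnect_delay
 * are optional.
 */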
static struct nvmf_transport_ops nvme_rdma_transport = {
        .name           = "rdma",
        .required_opts  = NVMF_OPT_TRADDR,
        .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY,
        .create_ctrl    = nvme_rdma_create_ctrl,
};

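/*
 * IB client callbacks.  Nothing needs to be done when a device is added;
 * RDMA devices are looked up when a controller creates its queues.  On
 * device removal every controller using that device is deleted and the
 * workqueue is flushed so that no queue still references the departing
 * ib_device.
 */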
static void nvme_rdma_add_one(struct ib_device *ib_device)
{
}

static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
        struct nvme_rdma_ctrl *ctrl;

        /* Delete all controllers using this device */
        mutex_lock(&nvme_rdma_ctrl_mutex);
        list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list) {
                if (ctrl->device->dev != ib_device)
                        continue;
                dev_info(ctrl->ctrl.device,
                        "Removing ctrl: NQN \"%s\", addr %pISp\n",
                        ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
                __nvme_rdma_del_ctrl(ctrl);
        }
        mutex_unlock(&nvme_rdma_ctrl_mutex);

        flush_workqueue(nvme_rdma_wq);
}

static struct ib_client nvme_rdma_ib_client = {
        .name   = "nvme_rdma",
        .add = nvme_rdma_add_one,
        .remove = nvme_rdma_remove_one
};

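/*
 * Module init/exit.  The workqueue must exist before the IB client is
 * registered, since the client's remove callback queues and flushes work on
 * it, and the transport is registered last so that no controller can be
 * created before the rest of the infrastructure is ready.  Teardown runs in
 * the reverse order.
 */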
static int __init nvme_rdma_init_module(void)
{
        int ret;

        nvme_rdma_wq = create_workqueue("nvme_rdma_wq");
        if (!nvme_rdma_wq)
                return -ENOMEM;

        ret = ib_register_client(&nvme_rdma_ib_client);
        if (ret) {
                destroy_workqueue(nvme_rdma_wq);
                return ret;
        }

        nvmf_register_transport(&nvme_rdma_transport);
        return 0;
}

static void __exit nvme_rdma_cleanup_module(void)
{
        nvmf_unregister_transport(&nvme_rdma_transport);
        ib_unregister_client(&nvme_rdma_ib_client);
        destroy_workqueue(nvme_rdma_wq);
}

module_init(nvme_rdma_init_module);
module_exit(nvme_rdma_cleanup_module);

MODULE_LICENSE("GPL v2");