linux/drivers/nvme/host/tcp.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * NVMe over Fabrics TCP host.
   4 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
   5 */
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7#include <linux/module.h>
   8#include <linux/init.h>
   9#include <linux/slab.h>
  10#include <linux/err.h>
  11#include <linux/nvme-tcp.h>
  12#include <net/sock.h>
  13#include <net/tcp.h>
  14#include <linux/blk-mq.h>
  15#include <crypto/hash.h>
  16#include <net/busy_poll.h>
  17
  18#include "nvme.h"
  19#include "fabrics.h"
  20
  21struct nvme_tcp_queue;
  22
   23/* Define the socket priority to use for connections where it is desirable
   24 * that the NIC consider performing optimized packet processing or filtering.
   25 * A non-zero value is sufficient to indicate general consideration of any
  26 * possible optimization.  Making it a module param allows for alternative
  27 * values that may be unique for some NIC implementations.
  28 */
  29static int so_priority;
  30module_param(so_priority, int, 0644);
  31MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
  32
  33enum nvme_tcp_send_state {
  34        NVME_TCP_SEND_CMD_PDU = 0,
  35        NVME_TCP_SEND_H2C_PDU,
  36        NVME_TCP_SEND_DATA,
  37        NVME_TCP_SEND_DDGST,
  38};
  39
  40struct nvme_tcp_request {
  41        struct nvme_request     req;
  42        void                    *pdu;
  43        struct nvme_tcp_queue   *queue;
  44        u32                     data_len;
  45        u32                     pdu_len;
  46        u32                     pdu_sent;
  47        u16                     ttag;
  48        struct list_head        entry;
  49        __le32                  ddgst;
  50
  51        struct bio              *curr_bio;
  52        struct iov_iter         iter;
  53
  54        /* send state */
  55        size_t                  offset;
  56        size_t                  data_sent;
  57        enum nvme_tcp_send_state state;
  58};
  59
  60enum nvme_tcp_queue_flags {
  61        NVME_TCP_Q_ALLOCATED    = 0,
  62        NVME_TCP_Q_LIVE         = 1,
  63        NVME_TCP_Q_POLLING      = 2,
  64};
  65
  66enum nvme_tcp_recv_state {
  67        NVME_TCP_RECV_PDU = 0,
  68        NVME_TCP_RECV_DATA,
  69        NVME_TCP_RECV_DDGST,
  70};
  71
  72struct nvme_tcp_ctrl;
  73struct nvme_tcp_queue {
  74        struct socket           *sock;
  75        struct work_struct      io_work;
  76        int                     io_cpu;
  77
  78        spinlock_t              lock;
  79        struct mutex            send_mutex;
  80        struct list_head        send_list;
  81
  82        /* recv state */
  83        void                    *pdu;
  84        int                     pdu_remaining;
  85        int                     pdu_offset;
  86        size_t                  data_remaining;
  87        size_t                  ddgst_remaining;
  88        unsigned int            nr_cqe;
  89
  90        /* send state */
  91        struct nvme_tcp_request *request;
  92
  93        int                     queue_size;
  94        size_t                  cmnd_capsule_len;
  95        struct nvme_tcp_ctrl    *ctrl;
  96        unsigned long           flags;
  97        bool                    rd_enabled;
  98
  99        bool                    hdr_digest;
 100        bool                    data_digest;
 101        struct ahash_request    *rcv_hash;
 102        struct ahash_request    *snd_hash;
 103        __le32                  exp_ddgst;
 104        __le32                  recv_ddgst;
 105
 106        struct page_frag_cache  pf_cache;
 107
 108        void (*state_change)(struct sock *);
 109        void (*data_ready)(struct sock *);
 110        void (*write_space)(struct sock *);
 111};
 112
 113struct nvme_tcp_ctrl {
 114        /* read only in the hot path */
 115        struct nvme_tcp_queue   *queues;
 116        struct blk_mq_tag_set   tag_set;
 117
 118        /* other member variables */
 119        struct list_head        list;
 120        struct blk_mq_tag_set   admin_tag_set;
 121        struct sockaddr_storage addr;
 122        struct sockaddr_storage src_addr;
 123        struct nvme_ctrl        ctrl;
 124
 125        struct work_struct      err_work;
 126        struct delayed_work     connect_work;
 127        struct nvme_tcp_request async_req;
 128        u32                     io_queues[HCTX_MAX_TYPES];
 129};
 130
 131static LIST_HEAD(nvme_tcp_ctrl_list);
 132static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
 133static struct workqueue_struct *nvme_tcp_wq;
 134static const struct blk_mq_ops nvme_tcp_mq_ops;
 135static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
 136static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
 137
 138static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
 139{
 140        return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
 141}
 142
 143static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
 144{
 145        return queue - queue->ctrl->queues;
 146}
 147
 148static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
 149{
 150        u32 queue_idx = nvme_tcp_queue_id(queue);
 151
 152        if (queue_idx == 0)
 153                return queue->ctrl->admin_tag_set.tags[queue_idx];
 154        return queue->ctrl->tag_set.tags[queue_idx - 1];
 155}
 156
 157static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
 158{
 159        return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 160}
 161
 162static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
 163{
 164        return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 165}
 166
 167static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
 168{
 169        return queue->cmnd_capsule_len - sizeof(struct nvme_command);
 170}
 171
 172static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
 173{
 174        return req == &req->queue->ctrl->async_req;
 175}
 176
 177static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
 178{
 179        struct request *rq;
 180
 181        if (unlikely(nvme_tcp_async_req(req)))
 182                return false; /* async events don't have a request */
 183
 184        rq = blk_mq_rq_from_pdu(req);
 185
 186        return rq_data_dir(rq) == WRITE && req->data_len &&
 187                req->data_len <= nvme_tcp_inline_data_size(req->queue);
 188}
 189
 190static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
 191{
 192        return req->iter.bvec->bv_page;
 193}
 194
 195static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
 196{
 197        return req->iter.bvec->bv_offset + req->iter.iov_offset;
 198}
 199
 200static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
 201{
 202        return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
 203                        req->pdu_len - req->pdu_sent);
 204}
 205
 206static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
 207{
 208        return req->iter.iov_offset;
 209}
 210
 211static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
 212{
 213        return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
 214                        req->pdu_len - req->pdu_sent : 0;
 215}
 216
 217static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
 218                int len)
 219{
 220        return nvme_tcp_pdu_data_left(req) <= len;
 221}
 222
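     /*
      * Build the request's bvec iterator: either over the single special
      * payload vector (RQF_SPECIAL_PAYLOAD) or over the segments of the
      * current bio, so send/receive can walk the data pages.
      */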
 223static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
 224                unsigned int dir)
 225{
 226        struct request *rq = blk_mq_rq_from_pdu(req);
 227        struct bio_vec *vec;
 228        unsigned int size;
 229        int nsegs;
 230        size_t offset;
 231
 232        if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
 233                vec = &rq->special_vec;
 234                nsegs = 1;
 235                size = blk_rq_payload_bytes(rq);
 236                offset = 0;
 237        } else {
 238                struct bio *bio = req->curr_bio;
 239
 240                vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
 241                nsegs = bio_segments(bio);
 242                size = bio->bi_iter.bi_size;
 243                offset = bio->bi_iter.bi_bvec_done;
 244        }
 245
 246        iov_iter_bvec(&req->iter, dir, vec, nsegs, size);
 247        req->iter.iov_offset = offset;
 248}
 249
 250static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
 251                int len)
 252{
 253        req->data_sent += len;
 254        req->pdu_sent += len;
 255        iov_iter_advance(&req->iter, len);
 256        if (!iov_iter_count(&req->iter) &&
 257            req->data_sent < req->data_len) {
 258                req->curr_bio = req->curr_bio->bi_next;
 259                nvme_tcp_init_iter(req, WRITE);
 260        }
 261}
 262
 263static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 264                bool sync)
 265{
 266        struct nvme_tcp_queue *queue = req->queue;
 267        bool empty;
 268
 269        spin_lock(&queue->lock);
 270        empty = list_empty(&queue->send_list) && !queue->request;
 271        list_add_tail(&req->entry, &queue->send_list);
 272        spin_unlock(&queue->lock);
 273
 274        /*
  275         * If we're first on the send_list and can grab the send_mutex, try
  276         * to send directly; otherwise queue io_work. Only send inline when
  277         * we are on the same cpu, so we don't introduce contention.
 278         */
 279        if (queue->io_cpu == smp_processor_id() &&
 280            sync && empty && mutex_trylock(&queue->send_mutex)) {
 281                nvme_tcp_try_send(queue);
 282                mutex_unlock(&queue->send_mutex);
 283        } else {
 284                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 285        }
 286}
 287
 288static inline struct nvme_tcp_request *
 289nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
 290{
 291        struct nvme_tcp_request *req;
 292
 293        spin_lock(&queue->lock);
 294        req = list_first_entry_or_null(&queue->send_list,
 295                        struct nvme_tcp_request, entry);
 296        if (req)
 297                list_del(&req->entry);
 298        spin_unlock(&queue->lock);
 299
 300        return req;
 301}
 302
 303static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
 304                __le32 *dgst)
 305{
 306        ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
 307        crypto_ahash_final(hash);
 308}
 309
 310static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
 311                struct page *page, off_t off, size_t len)
 312{
 313        struct scatterlist sg;
 314
 315        sg_init_marker(&sg, 1);
 316        sg_set_page(&sg, page, len, off);
 317        ahash_request_set_crypt(hash, &sg, NULL, len);
 318        crypto_ahash_update(hash);
 319}
 320
 321static inline void nvme_tcp_hdgst(struct ahash_request *hash,
 322                void *pdu, size_t len)
 323{
 324        struct scatterlist sg;
 325
 326        sg_init_one(&sg, pdu, len);
 327        ahash_request_set_crypt(hash, &sg, pdu + len, len);
 328        crypto_ahash_digest(hash);
 329}
 330
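     /*
      * Verify a received PDU's header digest: recompute the CRC32C over
      * the header and compare it with the digest sent after the header.
      */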
 331static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
 332                void *pdu, size_t pdu_len)
 333{
 334        struct nvme_tcp_hdr *hdr = pdu;
 335        __le32 recv_digest;
 336        __le32 exp_digest;
 337
 338        if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
 339                dev_err(queue->ctrl->ctrl.device,
 340                        "queue %d: header digest flag is cleared\n",
 341                        nvme_tcp_queue_id(queue));
 342                return -EPROTO;
 343        }
 344
 345        recv_digest = *(__le32 *)(pdu + hdr->hlen);
 346        nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
 347        exp_digest = *(__le32 *)(pdu + hdr->hlen);
 348        if (recv_digest != exp_digest) {
 349                dev_err(queue->ctrl->ctrl.device,
 350                        "header digest error: recv %#x expected %#x\n",
 351                        le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
 352                return -EIO;
 353        }
 354
 355        return 0;
 356}
 357
 358static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
 359{
 360        struct nvme_tcp_hdr *hdr = pdu;
 361        u8 digest_len = nvme_tcp_hdgst_len(queue);
 362        u32 len;
 363
 364        len = le32_to_cpu(hdr->plen) - hdr->hlen -
 365                ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
 366
 367        if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
 368                dev_err(queue->ctrl->ctrl.device,
 369                        "queue %d: data digest flag is cleared\n",
  370                        nvme_tcp_queue_id(queue));
 371                return -EPROTO;
 372        }
 373        crypto_ahash_init(queue->rcv_hash);
 374
 375        return 0;
 376}
 377
 378static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
 379                struct request *rq, unsigned int hctx_idx)
 380{
 381        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 382
 383        page_frag_free(req->pdu);
 384}
 385
 386static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 387                struct request *rq, unsigned int hctx_idx,
 388                unsigned int numa_node)
 389{
 390        struct nvme_tcp_ctrl *ctrl = set->driver_data;
 391        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 392        int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
 393        struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
 394        u8 hdgst = nvme_tcp_hdgst_len(queue);
 395
 396        req->pdu = page_frag_alloc(&queue->pf_cache,
 397                sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 398                GFP_KERNEL | __GFP_ZERO);
 399        if (!req->pdu)
 400                return -ENOMEM;
 401
 402        req->queue = queue;
 403        nvme_req(rq)->ctrl = &ctrl->ctrl;
 404
 405        return 0;
 406}
 407
 408static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 409                unsigned int hctx_idx)
 410{
 411        struct nvme_tcp_ctrl *ctrl = data;
 412        struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
 413
 414        hctx->driver_data = queue;
 415        return 0;
 416}
 417
 418static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 419                unsigned int hctx_idx)
 420{
 421        struct nvme_tcp_ctrl *ctrl = data;
 422        struct nvme_tcp_queue *queue = &ctrl->queues[0];
 423
 424        hctx->driver_data = queue;
 425        return 0;
 426}
 427
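     /*
      * Map the receive counters to a state: outstanding header bytes mean
      * PDU reception, an outstanding digest means DDGST reception, and
      * otherwise we are receiving C2HData payload.
      */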
 428static enum nvme_tcp_recv_state
 429nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
 430{
 431        return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
 432                (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
 433                NVME_TCP_RECV_DATA;
 434}
 435
 436static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
 437{
 438        queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
 439                                nvme_tcp_hdgst_len(queue);
 440        queue->pdu_offset = 0;
 441        queue->data_remaining = -1;
 442        queue->ddgst_remaining = 0;
 443}
 444
 445static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
 446{
 447        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 448                return;
 449
 450        queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
 451}
 452
 453static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 454                struct nvme_completion *cqe)
 455{
 456        struct request *rq;
 457
 458        rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
 459        if (!rq) {
 460                dev_err(queue->ctrl->ctrl.device,
 461                        "queue %d tag 0x%x not found\n",
 462                        nvme_tcp_queue_id(queue), cqe->command_id);
 463                nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 464                return -EINVAL;
 465        }
 466
 467        nvme_end_request(rq, cqe->status, cqe->result);
 468        queue->nr_cqe++;
 469
 470        return 0;
 471}
 472
 473static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
 474                struct nvme_tcp_data_pdu *pdu)
 475{
 476        struct request *rq;
 477
 478        rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 479        if (!rq) {
 480                dev_err(queue->ctrl->ctrl.device,
 481                        "queue %d tag %#x not found\n",
 482                        nvme_tcp_queue_id(queue), pdu->command_id);
 483                return -ENOENT;
 484        }
 485
 486        if (!blk_rq_payload_bytes(rq)) {
 487                dev_err(queue->ctrl->ctrl.device,
 488                        "queue %d tag %#x unexpected data\n",
 489                        nvme_tcp_queue_id(queue), rq->tag);
 490                return -EIO;
 491        }
 492
 493        queue->data_remaining = le32_to_cpu(pdu->data_length);
 494
 495        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
 496            unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
 497                dev_err(queue->ctrl->ctrl.device,
 498                        "queue %d tag %#x SUCCESS set but not last PDU\n",
 499                        nvme_tcp_queue_id(queue), rq->tag);
 500                nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 501                return -EPROTO;
 502        }
 503
 504        return 0;
 505}
 506
 507static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
 508                struct nvme_tcp_rsp_pdu *pdu)
 509{
 510        struct nvme_completion *cqe = &pdu->cqe;
 511        int ret = 0;
 512
 513        /*
 514         * AEN requests are special as they don't time out and can
 515         * survive any kind of queue freeze and often don't respond to
 516         * aborts.  We don't even bother to allocate a struct request
 517         * for them but rather special case them here.
 518         */
 519        if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
 520                                     cqe->command_id)))
 521                nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
 522                                &cqe->result);
 523        else
 524                ret = nvme_tcp_process_nvme_cqe(queue, cqe);
 525
 526        return ret;
 527}
 528
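     /*
      * Build an H2CData PDU in response to an R2T, after checking that the
      * solicited length and offset are consistent with the data still owed
      * for this request.
      */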
 529static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
 530                struct nvme_tcp_r2t_pdu *pdu)
 531{
 532        struct nvme_tcp_data_pdu *data = req->pdu;
 533        struct nvme_tcp_queue *queue = req->queue;
 534        struct request *rq = blk_mq_rq_from_pdu(req);
 535        u8 hdgst = nvme_tcp_hdgst_len(queue);
 536        u8 ddgst = nvme_tcp_ddgst_len(queue);
 537
 538        req->pdu_len = le32_to_cpu(pdu->r2t_length);
 539        req->pdu_sent = 0;
 540
 541        if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
 542                dev_err(queue->ctrl->ctrl.device,
 543                        "req %d r2t len %u exceeded data len %u (%zu sent)\n",
 544                        rq->tag, req->pdu_len, req->data_len,
 545                        req->data_sent);
 546                return -EPROTO;
 547        }
 548
 549        if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
 550                dev_err(queue->ctrl->ctrl.device,
 551                        "req %d unexpected r2t offset %u (expected %zu)\n",
 552                        rq->tag, le32_to_cpu(pdu->r2t_offset),
 553                        req->data_sent);
 554                return -EPROTO;
 555        }
 556
 557        memset(data, 0, sizeof(*data));
 558        data->hdr.type = nvme_tcp_h2c_data;
 559        data->hdr.flags = NVME_TCP_F_DATA_LAST;
 560        if (queue->hdr_digest)
 561                data->hdr.flags |= NVME_TCP_F_HDGST;
 562        if (queue->data_digest)
 563                data->hdr.flags |= NVME_TCP_F_DDGST;
 564        data->hdr.hlen = sizeof(*data);
 565        data->hdr.pdo = data->hdr.hlen + hdgst;
 566        data->hdr.plen =
 567                cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
 568        data->ttag = pdu->ttag;
 569        data->command_id = rq->tag;
 570        data->data_offset = cpu_to_le32(req->data_sent);
 571        data->data_length = cpu_to_le32(req->pdu_len);
 572        return 0;
 573}
 574
 575static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
 576                struct nvme_tcp_r2t_pdu *pdu)
 577{
 578        struct nvme_tcp_request *req;
 579        struct request *rq;
 580        int ret;
 581
 582        rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 583        if (!rq) {
 584                dev_err(queue->ctrl->ctrl.device,
 585                        "queue %d tag %#x not found\n",
 586                        nvme_tcp_queue_id(queue), pdu->command_id);
 587                return -ENOENT;
 588        }
 589        req = blk_mq_rq_to_pdu(rq);
 590
 591        ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
 592        if (unlikely(ret))
 593                return ret;
 594
 595        req->state = NVME_TCP_SEND_H2C_PDU;
 596        req->offset = 0;
 597
 598        nvme_tcp_queue_request(req, false);
 599
 600        return 0;
 601}
 602
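     /*
      * Accumulate PDU header bytes from the skb; once the header is
      * complete, verify the header digest and prime the data digest (when
      * negotiated), then dispatch on the PDU type.
      */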
 603static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 604                unsigned int *offset, size_t *len)
 605{
 606        struct nvme_tcp_hdr *hdr;
 607        char *pdu = queue->pdu;
 608        size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
 609        int ret;
 610
 611        ret = skb_copy_bits(skb, *offset,
 612                &pdu[queue->pdu_offset], rcv_len);
 613        if (unlikely(ret))
 614                return ret;
 615
 616        queue->pdu_remaining -= rcv_len;
 617        queue->pdu_offset += rcv_len;
 618        *offset += rcv_len;
 619        *len -= rcv_len;
 620        if (queue->pdu_remaining)
 621                return 0;
 622
 623        hdr = queue->pdu;
 624        if (queue->hdr_digest) {
 625                ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
 626                if (unlikely(ret))
 627                        return ret;
 628        }
 629
 630
 631        if (queue->data_digest) {
 632                ret = nvme_tcp_check_ddgst(queue, queue->pdu);
 633                if (unlikely(ret))
 634                        return ret;
 635        }
 636
 637        switch (hdr->type) {
 638        case nvme_tcp_c2h_data:
 639                return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
 640        case nvme_tcp_rsp:
 641                nvme_tcp_init_recv_ctx(queue);
 642                return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
 643        case nvme_tcp_r2t:
 644                nvme_tcp_init_recv_ctx(queue);
 645                return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
 646        default:
 647                dev_err(queue->ctrl->ctrl.device,
 648                        "unsupported pdu type (%d)\n", hdr->type);
 649                return -EINVAL;
 650        }
 651}
 652
 653static inline void nvme_tcp_end_request(struct request *rq, u16 status)
 654{
 655        union nvme_result res = {};
 656
 657        nvme_end_request(rq, cpu_to_le16(status << 1), res);
 658}
 659
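     /*
      * Copy C2HData payload from the skb into the request's bio pages,
      * moving to the next bio when the iterator runs out and folding the
      * bytes into the data digest when it is enabled.
      */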
 660static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 661                              unsigned int *offset, size_t *len)
 662{
 663        struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 664        struct nvme_tcp_request *req;
 665        struct request *rq;
 666
 667        rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 668        if (!rq) {
 669                dev_err(queue->ctrl->ctrl.device,
 670                        "queue %d tag %#x not found\n",
 671                        nvme_tcp_queue_id(queue), pdu->command_id);
 672                return -ENOENT;
 673        }
 674        req = blk_mq_rq_to_pdu(rq);
 675
 676        while (true) {
 677                int recv_len, ret;
 678
 679                recv_len = min_t(size_t, *len, queue->data_remaining);
 680                if (!recv_len)
 681                        break;
 682
 683                if (!iov_iter_count(&req->iter)) {
 684                        req->curr_bio = req->curr_bio->bi_next;
 685
 686                        /*
  687                         * If we don't have any bios it means that the controller
  688                         * sent more data than we requested, hence error
 689                         */
 690                        if (!req->curr_bio) {
 691                                dev_err(queue->ctrl->ctrl.device,
 692                                        "queue %d no space in request %#x",
 693                                        nvme_tcp_queue_id(queue), rq->tag);
 694                                nvme_tcp_init_recv_ctx(queue);
 695                                return -EIO;
 696                        }
 697                        nvme_tcp_init_iter(req, READ);
 698                }
 699
 700                /* we can read only from what is left in this bio */
 701                recv_len = min_t(size_t, recv_len,
 702                                iov_iter_count(&req->iter));
 703
 704                if (queue->data_digest)
 705                        ret = skb_copy_and_hash_datagram_iter(skb, *offset,
 706                                &req->iter, recv_len, queue->rcv_hash);
 707                else
 708                        ret = skb_copy_datagram_iter(skb, *offset,
 709                                        &req->iter, recv_len);
 710                if (ret) {
 711                        dev_err(queue->ctrl->ctrl.device,
 712                                "queue %d failed to copy request %#x data",
 713                                nvme_tcp_queue_id(queue), rq->tag);
 714                        return ret;
 715                }
 716
 717                *len -= recv_len;
 718                *offset += recv_len;
 719                queue->data_remaining -= recv_len;
 720        }
 721
 722        if (!queue->data_remaining) {
 723                if (queue->data_digest) {
 724                        nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
 725                        queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
 726                } else {
 727                        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 728                                nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
 729                                queue->nr_cqe++;
 730                        }
 731                        nvme_tcp_init_recv_ctx(queue);
 732                }
 733        }
 734
 735        return 0;
 736}
 737
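     /*
      * Receive the data digest trailing a C2HData PDU and compare it with
      * the digest computed over the payload that was just received.
      */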
 738static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
 739                struct sk_buff *skb, unsigned int *offset, size_t *len)
 740{
 741        struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 742        char *ddgst = (char *)&queue->recv_ddgst;
 743        size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
 744        off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
 745        int ret;
 746
 747        ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
 748        if (unlikely(ret))
 749                return ret;
 750
 751        queue->ddgst_remaining -= recv_len;
 752        *offset += recv_len;
 753        *len -= recv_len;
 754        if (queue->ddgst_remaining)
 755                return 0;
 756
 757        if (queue->recv_ddgst != queue->exp_ddgst) {
 758                dev_err(queue->ctrl->ctrl.device,
 759                        "data digest error: recv %#x expected %#x\n",
 760                        le32_to_cpu(queue->recv_ddgst),
 761                        le32_to_cpu(queue->exp_ddgst));
 762                return -EIO;
 763        }
 764
 765        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 766                struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue),
 767                                                pdu->command_id);
 768
 769                nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
 770                queue->nr_cqe++;
 771        }
 772
 773        nvme_tcp_init_recv_ctx(queue);
 774        return 0;
 775}
 776
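     /*
      * read_sock() callback: run the receive state machine over the skb
      * until it is consumed or an error disables further reception.
      */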
 777static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
 778                             unsigned int offset, size_t len)
 779{
 780        struct nvme_tcp_queue *queue = desc->arg.data;
 781        size_t consumed = len;
 782        int result;
 783
 784        while (len) {
 785                switch (nvme_tcp_recv_state(queue)) {
 786                case NVME_TCP_RECV_PDU:
 787                        result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
 788                        break;
 789                case NVME_TCP_RECV_DATA:
 790                        result = nvme_tcp_recv_data(queue, skb, &offset, &len);
 791                        break;
 792                case NVME_TCP_RECV_DDGST:
 793                        result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
 794                        break;
 795                default:
 796                        result = -EFAULT;
 797                }
 798                if (result) {
 799                        dev_err(queue->ctrl->ctrl.device,
  800                                "receive failed: %d\n", result);
 801                        queue->rd_enabled = false;
 802                        nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 803                        return result;
 804                }
 805        }
 806
 807        return consumed;
 808}
 809
 810static void nvme_tcp_data_ready(struct sock *sk)
 811{
 812        struct nvme_tcp_queue *queue;
 813
 814        read_lock_bh(&sk->sk_callback_lock);
 815        queue = sk->sk_user_data;
 816        if (likely(queue && queue->rd_enabled) &&
 817            !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
 818                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 819        read_unlock_bh(&sk->sk_callback_lock);
 820}
 821
 822static void nvme_tcp_write_space(struct sock *sk)
 823{
 824        struct nvme_tcp_queue *queue;
 825
 826        read_lock_bh(&sk->sk_callback_lock);
 827        queue = sk->sk_user_data;
 828        if (likely(queue && sk_stream_is_writeable(sk))) {
 829                clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 830                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 831        }
 832        read_unlock_bh(&sk->sk_callback_lock);
 833}
 834
 835static void nvme_tcp_state_change(struct sock *sk)
 836{
 837        struct nvme_tcp_queue *queue;
 838
 839        read_lock(&sk->sk_callback_lock);
 840        queue = sk->sk_user_data;
 841        if (!queue)
 842                goto done;
 843
 844        switch (sk->sk_state) {
 845        case TCP_CLOSE:
 846        case TCP_CLOSE_WAIT:
 847        case TCP_LAST_ACK:
 848        case TCP_FIN_WAIT1:
 849        case TCP_FIN_WAIT2:
  850                /* the connection is being torn down, start error recovery */
 851                nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 852                break;
 853        default:
 854                dev_info(queue->ctrl->ctrl.device,
 855                        "queue %d socket state %d\n",
 856                        nvme_tcp_queue_id(queue), sk->sk_state);
 857        }
 858
 859        queue->state_change(sk);
 860done:
 861        read_unlock(&sk->sk_callback_lock);
 862}
 863
 864static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
 865{
 866        queue->request = NULL;
 867}
 868
 869static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
 870{
 871        nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
 872}
 873
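     /*
      * Send the request payload page by page with kernel_sendpage()
      * (sock_no_sendpage() for slab pages), updating the data digest as
      * pages go out; after the last byte the request either moves on to
      * sending the digest or completes its send side.
      */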
 874static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 875{
 876        struct nvme_tcp_queue *queue = req->queue;
 877
 878        while (true) {
 879                struct page *page = nvme_tcp_req_cur_page(req);
 880                size_t offset = nvme_tcp_req_cur_offset(req);
 881                size_t len = nvme_tcp_req_cur_length(req);
 882                bool last = nvme_tcp_pdu_last_send(req, len);
 883                int ret, flags = MSG_DONTWAIT;
 884
 885                if (last && !queue->data_digest)
 886                        flags |= MSG_EOR;
 887                else
 888                        flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
 889
 890                /* can't zcopy slab pages */
 891                if (unlikely(PageSlab(page))) {
 892                        ret = sock_no_sendpage(queue->sock, page, offset, len,
 893                                        flags);
 894                } else {
 895                        ret = kernel_sendpage(queue->sock, page, offset, len,
 896                                        flags);
 897                }
 898                if (ret <= 0)
 899                        return ret;
 900
 901                nvme_tcp_advance_req(req, ret);
 902                if (queue->data_digest)
 903                        nvme_tcp_ddgst_update(queue->snd_hash, page,
 904                                        offset, ret);
 905
  906                /* fully successful last write */
 907                if (last && ret == len) {
 908                        if (queue->data_digest) {
 909                                nvme_tcp_ddgst_final(queue->snd_hash,
 910                                        &req->ddgst);
 911                                req->state = NVME_TCP_SEND_DDGST;
 912                                req->offset = 0;
 913                        } else {
 914                                nvme_tcp_done_send_req(queue);
 915                        }
 916                        return 1;
 917                }
 918        }
 919        return -EAGAIN;
 920}
 921
 922static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 923{
 924        struct nvme_tcp_queue *queue = req->queue;
 925        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
 926        bool inline_data = nvme_tcp_has_inline_data(req);
 927        u8 hdgst = nvme_tcp_hdgst_len(queue);
 928        int len = sizeof(*pdu) + hdgst - req->offset;
 929        int flags = MSG_DONTWAIT;
 930        int ret;
 931
 932        if (inline_data)
 933                flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
 934        else
 935                flags |= MSG_EOR;
 936
 937        if (queue->hdr_digest && !req->offset)
 938                nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 939
 940        ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
  941                        offset_in_page(pdu) + req->offset, len, flags);
 942        if (unlikely(ret <= 0))
 943                return ret;
 944
 945        len -= ret;
 946        if (!len) {
 947                if (inline_data) {
 948                        req->state = NVME_TCP_SEND_DATA;
 949                        if (queue->data_digest)
 950                                crypto_ahash_init(queue->snd_hash);
 951                        nvme_tcp_init_iter(req, WRITE);
 952                } else {
 953                        nvme_tcp_done_send_req(queue);
 954                }
 955                return 1;
 956        }
 957        req->offset += ret;
 958
 959        return -EAGAIN;
 960}
 961
 962static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 963{
 964        struct nvme_tcp_queue *queue = req->queue;
 965        struct nvme_tcp_data_pdu *pdu = req->pdu;
 966        u8 hdgst = nvme_tcp_hdgst_len(queue);
 967        int len = sizeof(*pdu) - req->offset + hdgst;
 968        int ret;
 969
 970        if (queue->hdr_digest && !req->offset)
 971                nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 972
 973        ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
 974                        offset_in_page(pdu) + req->offset, len,
 975                        MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
 976        if (unlikely(ret <= 0))
 977                return ret;
 978
 979        len -= ret;
 980        if (!len) {
 981                req->state = NVME_TCP_SEND_DATA;
 982                if (queue->data_digest)
 983                        crypto_ahash_init(queue->snd_hash);
 984                if (!req->data_sent)
 985                        nvme_tcp_init_iter(req, WRITE);
 986                return 1;
 987        }
 988        req->offset += ret;
 989
 990        return -EAGAIN;
 991}
 992
 993static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
 994{
 995        struct nvme_tcp_queue *queue = req->queue;
 996        int ret;
 997        struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
 998        struct kvec iov = {
 999                .iov_base = &req->ddgst + req->offset,
1000                .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
1001        };
1002
1003        ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1004        if (unlikely(ret <= 0))
1005                return ret;
1006
1007        if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
1008                nvme_tcp_done_send_req(queue);
1009                return 1;
1010        }
1011
1012        req->offset += ret;
1013        return -EAGAIN;
1014}
1015
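     /*
      * Send state machine for the current request: command PDU, optional
      * H2CData PDU, payload data and finally the data digest. -EAGAIN
      * (socket backpressure) is reported as 0 so the caller retries later.
      */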
1016static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
1017{
1018        struct nvme_tcp_request *req;
1019        int ret = 1;
1020
1021        if (!queue->request) {
1022                queue->request = nvme_tcp_fetch_request(queue);
1023                if (!queue->request)
1024                        return 0;
1025        }
1026        req = queue->request;
1027
1028        if (req->state == NVME_TCP_SEND_CMD_PDU) {
1029                ret = nvme_tcp_try_send_cmd_pdu(req);
1030                if (ret <= 0)
1031                        goto done;
1032                if (!nvme_tcp_has_inline_data(req))
1033                        return ret;
1034        }
1035
1036        if (req->state == NVME_TCP_SEND_H2C_PDU) {
1037                ret = nvme_tcp_try_send_data_pdu(req);
1038                if (ret <= 0)
1039                        goto done;
1040        }
1041
1042        if (req->state == NVME_TCP_SEND_DATA) {
1043                ret = nvme_tcp_try_send_data(req);
1044                if (ret <= 0)
1045                        goto done;
1046        }
1047
1048        if (req->state == NVME_TCP_SEND_DDGST)
1049                ret = nvme_tcp_try_send_ddgst(req);
1050done:
1051        if (ret == -EAGAIN) {
1052                ret = 0;
1053        } else if (ret < 0) {
1054                dev_err(queue->ctrl->ctrl.device,
1055                        "failed to send request %d\n", ret);
1056                if (ret != -EPIPE && ret != -ECONNRESET)
1057                        nvme_tcp_fail_request(queue->request);
1058                nvme_tcp_done_send_req(queue);
1059        }
1060        return ret;
1061}
1062
1063static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1064{
1065        struct socket *sock = queue->sock;
1066        struct sock *sk = sock->sk;
1067        read_descriptor_t rd_desc;
1068        int consumed;
1069
1070        rd_desc.arg.data = queue;
1071        rd_desc.count = 1;
1072        lock_sock(sk);
1073        queue->nr_cqe = 0;
1074        consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1075        release_sock(sk);
1076        return consumed;
1077}
1078
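     /*
      * Per-queue I/O worker: alternate between sending and receiving for
      * roughly one millisecond, then requeue itself if work is still
      * pending when the quota runs out.
      */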
1079static void nvme_tcp_io_work(struct work_struct *w)
1080{
1081        struct nvme_tcp_queue *queue =
1082                container_of(w, struct nvme_tcp_queue, io_work);
1083        unsigned long deadline = jiffies + msecs_to_jiffies(1);
1084
1085        do {
1086                bool pending = false;
1087                int result;
1088
1089                if (mutex_trylock(&queue->send_mutex)) {
1090                        result = nvme_tcp_try_send(queue);
1091                        mutex_unlock(&queue->send_mutex);
1092                        if (result > 0)
1093                                pending = true;
1094                        else if (unlikely(result < 0))
1095                                break;
1096                }
1097
1098                result = nvme_tcp_try_recv(queue);
1099                if (result > 0)
1100                        pending = true;
1101                else if (unlikely(result < 0))
1102                        return;
1103
1104                if (!pending)
1105                        return;
1106
1107        } while (!time_after(jiffies, deadline)); /* quota is exhausted */
1108
1109        queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1110}
1111
1112static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1113{
1114        struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1115
1116        ahash_request_free(queue->rcv_hash);
1117        ahash_request_free(queue->snd_hash);
1118        crypto_free_ahash(tfm);
1119}
1120
1121static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1122{
1123        struct crypto_ahash *tfm;
1124
1125        tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1126        if (IS_ERR(tfm))
1127                return PTR_ERR(tfm);
1128
1129        queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1130        if (!queue->snd_hash)
1131                goto free_tfm;
1132        ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1133
1134        queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1135        if (!queue->rcv_hash)
1136                goto free_snd_hash;
1137        ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1138
1139        return 0;
1140free_snd_hash:
1141        ahash_request_free(queue->snd_hash);
1142free_tfm:
1143        crypto_free_ahash(tfm);
1144        return -ENOMEM;
1145}
1146
1147static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1148{
1149        struct nvme_tcp_request *async = &ctrl->async_req;
1150
1151        page_frag_free(async->pdu);
1152}
1153
1154static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1155{
1156        struct nvme_tcp_queue *queue = &ctrl->queues[0];
1157        struct nvme_tcp_request *async = &ctrl->async_req;
1158        u8 hdgst = nvme_tcp_hdgst_len(queue);
1159
1160        async->pdu = page_frag_alloc(&queue->pf_cache,
1161                sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1162                GFP_KERNEL | __GFP_ZERO);
1163        if (!async->pdu)
1164                return -ENOMEM;
1165
1166        async->queue = &ctrl->queues[0];
1167        return 0;
1168}
1169
1170static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1171{
1172        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1173        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1174
1175        if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1176                return;
1177
1178        if (queue->hdr_digest || queue->data_digest)
1179                nvme_tcp_free_crypto(queue);
1180
1181        sock_release(queue->sock);
1182        kfree(queue->pdu);
1183}
1184
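     /*
      * NVMe/TCP initialize connection exchange: send an ICReq and validate
      * the controller's ICResp (type, length, PFV, digest negotiation and
      * CPDA) before the queue carries fabrics traffic.
      */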
1185static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1186{
1187        struct nvme_tcp_icreq_pdu *icreq;
1188        struct nvme_tcp_icresp_pdu *icresp;
1189        struct msghdr msg = {};
1190        struct kvec iov;
1191        bool ctrl_hdgst, ctrl_ddgst;
1192        int ret;
1193
1194        icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1195        if (!icreq)
1196                return -ENOMEM;
1197
1198        icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1199        if (!icresp) {
1200                ret = -ENOMEM;
1201                goto free_icreq;
1202        }
1203
1204        icreq->hdr.type = nvme_tcp_icreq;
1205        icreq->hdr.hlen = sizeof(*icreq);
1206        icreq->hdr.pdo = 0;
1207        icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1208        icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1209        icreq->maxr2t = 0; /* single inflight r2t supported */
1210        icreq->hpda = 0; /* no alignment constraint */
1211        if (queue->hdr_digest)
1212                icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1213        if (queue->data_digest)
1214                icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1215
1216        iov.iov_base = icreq;
1217        iov.iov_len = sizeof(*icreq);
1218        ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1219        if (ret < 0)
1220                goto free_icresp;
1221
1222        memset(&msg, 0, sizeof(msg));
1223        iov.iov_base = icresp;
1224        iov.iov_len = sizeof(*icresp);
1225        ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1226                        iov.iov_len, msg.msg_flags);
1227        if (ret < 0)
1228                goto free_icresp;
1229
1230        ret = -EINVAL;
1231        if (icresp->hdr.type != nvme_tcp_icresp) {
1232                pr_err("queue %d: bad type returned %d\n",
1233                        nvme_tcp_queue_id(queue), icresp->hdr.type);
1234                goto free_icresp;
1235        }
1236
1237        if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1238                pr_err("queue %d: bad pdu length returned %d\n",
1239                        nvme_tcp_queue_id(queue), icresp->hdr.plen);
1240                goto free_icresp;
1241        }
1242
1243        if (icresp->pfv != NVME_TCP_PFV_1_0) {
1244                pr_err("queue %d: bad pfv returned %d\n",
1245                        nvme_tcp_queue_id(queue), icresp->pfv);
1246                goto free_icresp;
1247        }
1248
1249        ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1250        if ((queue->data_digest && !ctrl_ddgst) ||
1251            (!queue->data_digest && ctrl_ddgst)) {
1252                pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1253                        nvme_tcp_queue_id(queue),
1254                        queue->data_digest ? "enabled" : "disabled",
1255                        ctrl_ddgst ? "enabled" : "disabled");
1256                goto free_icresp;
1257        }
1258
1259        ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1260        if ((queue->hdr_digest && !ctrl_hdgst) ||
1261            (!queue->hdr_digest && ctrl_hdgst)) {
1262                pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1263                        nvme_tcp_queue_id(queue),
1264                        queue->hdr_digest ? "enabled" : "disabled",
1265                        ctrl_hdgst ? "enabled" : "disabled");
1266                goto free_icresp;
1267        }
1268
1269        if (icresp->cpda != 0) {
1270                pr_err("queue %d: unsupported cpda returned %d\n",
1271                        nvme_tcp_queue_id(queue), icresp->cpda);
1272                goto free_icresp;
1273        }
1274
1275        ret = 0;
1276free_icresp:
1277        kfree(icresp);
1278free_icreq:
1279        kfree(icreq);
1280        return ret;
1281}
1282
1283static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
1284{
1285        return nvme_tcp_queue_id(queue) == 0;
1286}
1287
1288static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
1289{
1290        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1291        int qid = nvme_tcp_queue_id(queue);
1292
1293        return !nvme_tcp_admin_queue(queue) &&
1294                qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
1295}
1296
1297static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
1298{
1299        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1300        int qid = nvme_tcp_queue_id(queue);
1301
1302        return !nvme_tcp_admin_queue(queue) &&
1303                !nvme_tcp_default_queue(queue) &&
1304                qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1305                          ctrl->io_queues[HCTX_TYPE_READ];
1306}
1307
1308static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
1309{
1310        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1311        int qid = nvme_tcp_queue_id(queue);
1312
1313        return !nvme_tcp_admin_queue(queue) &&
1314                !nvme_tcp_default_queue(queue) &&
1315                !nvme_tcp_read_queue(queue) &&
1316                qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1317                          ctrl->io_queues[HCTX_TYPE_READ] +
1318                          ctrl->io_queues[HCTX_TYPE_POLL];
1319}
1320
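     /*
      * Choose the CPU that io_work runs on, spreading queues over the
      * online CPUs according to their position within the default, read
      * or poll queue groups.
      */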
1321static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
1322{
1323        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1324        int qid = nvme_tcp_queue_id(queue);
1325        int n = 0;
1326
1327        if (nvme_tcp_default_queue(queue))
1328                n = qid - 1;
1329        else if (nvme_tcp_read_queue(queue))
1330                n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
1331        else if (nvme_tcp_poll_queue(queue))
1332                n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
1333                                ctrl->io_queues[HCTX_TYPE_READ] - 1;
1334        queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1335}
1336
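     /*
      * Allocate and connect one queue: create and tune the socket,
      * optionally bind to the host traddr, set up digest crypto and the
      * receive PDU buffer, connect, run the ICReq/ICResp exchange and
      * install the socket callbacks.
      */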
1337static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
1338                int qid, size_t queue_size)
1339{
1340        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1341        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1342        int ret, rcv_pdu_size;
1343
1344        queue->ctrl = ctrl;
1345        INIT_LIST_HEAD(&queue->send_list);
1346        spin_lock_init(&queue->lock);
1347        mutex_init(&queue->send_mutex);
1348        INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1349        queue->queue_size = queue_size;
1350
1351        if (qid > 0)
1352                queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1353        else
1354                queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1355                                                NVME_TCP_ADMIN_CCSZ;
1356
1357        ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1358                        IPPROTO_TCP, &queue->sock);
1359        if (ret) {
1360                dev_err(nctrl->device,
1361                        "failed to create socket: %d\n", ret);
1362                return ret;
1363        }
1364
1365        /* Single syn retry */
1366        tcp_sock_set_syncnt(queue->sock->sk, 1);
1367
1368        /* Set TCP no delay */
1369        tcp_sock_set_nodelay(queue->sock->sk);
1370
1371        /*
1372         * Cleanup whatever is sitting in the TCP transmit queue on socket
1373         * close. This is done to prevent stale data from being sent should
1374         * the network connection be restored before TCP times out.
1375         */
1376        sock_no_linger(queue->sock->sk);
1377
1378        if (so_priority > 0)
1379                sock_set_priority(queue->sock->sk, so_priority);
1380
1381        /* Set socket type of service */
1382        if (nctrl->opts->tos >= 0)
1383                ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
1384
1385        /* Set 10 seconds timeout for icresp recvmsg */
1386        queue->sock->sk->sk_rcvtimeo = 10 * HZ;
1387
1388        queue->sock->sk->sk_allocation = GFP_ATOMIC;
1389        nvme_tcp_set_queue_io_cpu(queue);
1390        queue->request = NULL;
1391        queue->data_remaining = 0;
1392        queue->ddgst_remaining = 0;
1393        queue->pdu_remaining = 0;
1394        queue->pdu_offset = 0;
1395        sk_set_memalloc(queue->sock->sk);
1396
1397        if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1398                ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1399                        sizeof(ctrl->src_addr));
1400                if (ret) {
1401                        dev_err(nctrl->device,
1402                                "failed to bind queue %d socket %d\n",
1403                                qid, ret);
1404                        goto err_sock;
1405                }
1406        }
1407
1408        queue->hdr_digest = nctrl->opts->hdr_digest;
1409        queue->data_digest = nctrl->opts->data_digest;
1410        if (queue->hdr_digest || queue->data_digest) {
1411                ret = nvme_tcp_alloc_crypto(queue);
1412                if (ret) {
1413                        dev_err(nctrl->device,
1414                                "failed to allocate queue %d crypto\n", qid);
1415                        goto err_sock;
1416                }
1417        }
1418
1419        rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1420                        nvme_tcp_hdgst_len(queue);
1421        queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1422        if (!queue->pdu) {
1423                ret = -ENOMEM;
1424                goto err_crypto;
1425        }
1426
1427        dev_dbg(nctrl->device, "connecting queue %d\n",
1428                        nvme_tcp_queue_id(queue));
1429
1430        ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1431                sizeof(ctrl->addr), 0);
1432        if (ret) {
1433                dev_err(nctrl->device,
1434                        "failed to connect socket: %d\n", ret);
1435                goto err_rcv_pdu;
1436        }
1437
1438        ret = nvme_tcp_init_connection(queue);
1439        if (ret)
1440                goto err_init_connect;
1441
1442        queue->rd_enabled = true;
1443        set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1444        nvme_tcp_init_recv_ctx(queue);
1445
1446        write_lock_bh(&queue->sock->sk->sk_callback_lock);
1447        queue->sock->sk->sk_user_data = queue;
1448        queue->state_change = queue->sock->sk->sk_state_change;
1449        queue->data_ready = queue->sock->sk->sk_data_ready;
1450        queue->write_space = queue->sock->sk->sk_write_space;
1451        queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1452        queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1453        queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1454#ifdef CONFIG_NET_RX_BUSY_POLL
1455        queue->sock->sk->sk_ll_usec = 1;
1456#endif
1457        write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1458
1459        return 0;
1460
1461err_init_connect:
1462        kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1463err_rcv_pdu:
1464        kfree(queue->pdu);
1465err_crypto:
1466        if (queue->hdr_digest || queue->data_digest)
1467                nvme_tcp_free_crypto(queue);
1468err_sock:
1469        sock_release(queue->sock);
1470        queue->sock = NULL;
1471        return ret;
1472}
1473
1474static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1475{
1476        struct socket *sock = queue->sock;
1477
1478        write_lock_bh(&sock->sk->sk_callback_lock);
1479        sock->sk->sk_user_data  = NULL;
1480        sock->sk->sk_data_ready = queue->data_ready;
1481        sock->sk->sk_state_change = queue->state_change;
1482        sock->sk->sk_write_space  = queue->write_space;
1483        write_unlock_bh(&sock->sk->sk_callback_lock);
1484}
1485
1486static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1487{
1488        kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1489        nvme_tcp_restore_sock_calls(queue);
1490        cancel_work_sync(&queue->io_work);
1491}
1492
1493static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1494{
1495        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1496        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1497
1498        if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1499                return;
1500
1501        __nvme_tcp_stop_queue(queue);
1502}
1503
1504static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1505{
1506        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1507        int ret;
1508
1509        if (idx)
1510                ret = nvmf_connect_io_queue(nctrl, idx, false);
1511        else
1512                ret = nvmf_connect_admin_queue(nctrl);
1513
1514        if (!ret) {
1515                set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1516        } else {
1517                if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
1518                        __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1519                dev_err(nctrl->device,
1520                        "failed to connect queue: %d ret=%d\n", idx, ret);
1521        }
1522        return ret;
1523}
1524
1525static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
1526                bool admin)
1527{
1528        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1529        struct blk_mq_tag_set *set;
1530        int ret;
1531
1532        if (admin) {
1533                set = &ctrl->admin_tag_set;
1534                memset(set, 0, sizeof(*set));
1535                set->ops = &nvme_tcp_admin_mq_ops;
1536                set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1537                set->reserved_tags = 2; /* connect + keep-alive */
1538                set->numa_node = nctrl->numa_node;
1539                set->flags = BLK_MQ_F_BLOCKING;
1540                set->cmd_size = sizeof(struct nvme_tcp_request);
1541                set->driver_data = ctrl;
1542                set->nr_hw_queues = 1;
1543                set->timeout = ADMIN_TIMEOUT;
1544        } else {
1545                set = &ctrl->tag_set;
1546                memset(set, 0, sizeof(*set));
1547                set->ops = &nvme_tcp_mq_ops;
1548                set->queue_depth = nctrl->sqsize + 1;
1549                set->reserved_tags = 1; /* fabric connect */
1550                set->numa_node = nctrl->numa_node;
1551                set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
1552                set->cmd_size = sizeof(struct nvme_tcp_request);
1553                set->driver_data = ctrl;
1554                set->nr_hw_queues = nctrl->queue_count - 1;
1555                set->timeout = NVME_IO_TIMEOUT;
1556                set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
1557        }
1558
1559        ret = blk_mq_alloc_tag_set(set);
1560        if (ret)
1561                return ERR_PTR(ret);
1562
1563        return set;
1564}
1565
1566static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1567{
1568        if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1569                nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1570                to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1571        }
1572
1573        nvme_tcp_free_queue(ctrl, 0);
1574}
1575
1576static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1577{
1578        int i;
1579
1580        for (i = 1; i < ctrl->queue_count; i++)
1581                nvme_tcp_free_queue(ctrl, i);
1582}
1583
1584static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1585{
1586        int i;
1587
1588        for (i = 1; i < ctrl->queue_count; i++)
1589                nvme_tcp_stop_queue(ctrl, i);
1590}
1591
1592static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
1593{
1594        int i, ret = 0;
1595
1596        for (i = 1; i < ctrl->queue_count; i++) {
1597                ret = nvme_tcp_start_queue(ctrl, i);
1598                if (ret)
1599                        goto out_stop_queues;
1600        }
1601
1602        return 0;
1603
1604out_stop_queues:
1605        for (i--; i >= 1; i--)
1606                nvme_tcp_stop_queue(ctrl, i);
1607        return ret;
1608}
1609
1610static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1611{
1612        int ret;
1613
1614        ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
1615        if (ret)
1616                return ret;
1617
1618        ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1619        if (ret)
1620                goto out_free_queue;
1621
1622        return 0;
1623
1624out_free_queue:
1625        nvme_tcp_free_queue(ctrl, 0);
1626        return ret;
1627}
1628
1629static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1630{
1631        int i, ret;
1632
1633        for (i = 1; i < ctrl->queue_count; i++) {
1634                ret = nvme_tcp_alloc_queue(ctrl, i,
1635                                ctrl->sqsize + 1);
1636                if (ret)
1637                        goto out_free_queues;
1638        }
1639
1640        return 0;
1641
1642out_free_queues:
1643        for (i--; i >= 1; i--)
1644                nvme_tcp_free_queue(ctrl, i);
1645
1646        return ret;
1647}
1648
1649static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1650{
1651        unsigned int nr_io_queues;
1652
1653        nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
1654        nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
1655        nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
1656
1657        return nr_io_queues;
1658}
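
    /*
     * Editor's note (illustrative, not part of the original source): each
     * queue class is capped at the number of online CPUs before the classes
     * are summed.  With 8 online CPUs and connect options nr_io_queues=16,
     * nr_write_queues=8, nr_poll_queues=2 the host asks for
     * min(16,8) + min(8,8) + min(2,8) = 18 queues; nvme_set_queue_count()
     * in nvme_tcp_alloc_io_queues() below may still grant fewer.
     */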
1659
1660static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
1661                unsigned int nr_io_queues)
1662{
1663        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1664        struct nvmf_ctrl_options *opts = nctrl->opts;
1665
1666        if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
1667                /*
1668                 * separate read/write queues
1669                 * hand out dedicated default queues only after we have
1670                 * sufficient read queues.
1671                 */
1672                ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
1673                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
1674                ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1675                        min(opts->nr_write_queues, nr_io_queues);
1676                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1677        } else {
1678                /*
1679                 * shared read/write queues
1680                 * either no write queues were requested, or we don't have
1681                 * sufficient queue count to have dedicated default queues.
1682                 */
1683                ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1684                        min(opts->nr_io_queues, nr_io_queues);
1685                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1686        }
1687
1688        if (opts->nr_poll_queues && nr_io_queues) {
1689                /* map dedicated poll queues only if we have queues left */
1690                ctrl->io_queues[HCTX_TYPE_POLL] =
1691                        min(opts->nr_poll_queues, nr_io_queues);
1692        }
1693}
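
    /*
     * Illustrative sketch added by the editor (not part of the upstream
     * driver; the function name is made up): the split policy of
     * nvme_tcp_set_io_queues() above, restated over plain integers so the
     * arithmetic is easy to trace.  With nr_io_queues=10 granted and
     * connect options io=4, write=4, poll=2 it yields read=4, default=4,
     * poll=2.
     */
    static void __maybe_unused nvme_tcp_queue_split_example(
                    unsigned int nr_io_queues, unsigned int want_io,
                    unsigned int want_write, unsigned int want_poll,
                    unsigned int out[HCTX_MAX_TYPES])
    {
            out[HCTX_TYPE_READ] = 0;
            out[HCTX_TYPE_POLL] = 0;

            if (want_write && want_io < nr_io_queues) {
                    /* dedicated read queues first, default (write) queues next */
                    out[HCTX_TYPE_READ] = want_io;
                    nr_io_queues -= out[HCTX_TYPE_READ];
                    out[HCTX_TYPE_DEFAULT] = min(want_write, nr_io_queues);
                    nr_io_queues -= out[HCTX_TYPE_DEFAULT];
            } else {
                    /* shared read/write queues */
                    out[HCTX_TYPE_DEFAULT] = min(want_io, nr_io_queues);
                    nr_io_queues -= out[HCTX_TYPE_DEFAULT];
            }

            /* dedicated poll queues only if something is left over */
            if (want_poll && nr_io_queues)
                    out[HCTX_TYPE_POLL] = min(want_poll, nr_io_queues);
    }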
1694
1695static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1696{
1697        unsigned int nr_io_queues;
1698        int ret;
1699
1700        nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1701        ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1702        if (ret)
1703                return ret;
1704
1705        ctrl->queue_count = nr_io_queues + 1;
1706        if (ctrl->queue_count < 2)
1707                return 0;
1708
1709        dev_info(ctrl->device,
1710                "creating %d I/O queues.\n", nr_io_queues);
1711
1712        nvme_tcp_set_io_queues(ctrl, nr_io_queues);
1713
1714        return __nvme_tcp_alloc_io_queues(ctrl);
1715}
1716
1717static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1718{
1719        nvme_tcp_stop_io_queues(ctrl);
1720        if (remove) {
1721                blk_cleanup_queue(ctrl->connect_q);
1722                blk_mq_free_tag_set(ctrl->tagset);
1723        }
1724        nvme_tcp_free_io_queues(ctrl);
1725}
1726
1727static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1728{
1729        int ret;
1730
1731        ret = nvme_tcp_alloc_io_queues(ctrl);
1732        if (ret)
1733                return ret;
1734
1735        if (new) {
1736                ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
1737                if (IS_ERR(ctrl->tagset)) {
1738                        ret = PTR_ERR(ctrl->tagset);
1739                        goto out_free_io_queues;
1740                }
1741
1742                ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
1743                if (IS_ERR(ctrl->connect_q)) {
1744                        ret = PTR_ERR(ctrl->connect_q);
1745                        goto out_free_tag_set;
1746                }
1747        } else {
1748                blk_mq_update_nr_hw_queues(ctrl->tagset,
1749                        ctrl->queue_count - 1);
1750        }
1751
1752        ret = nvme_tcp_start_io_queues(ctrl);
1753        if (ret)
1754                goto out_cleanup_connect_q;
1755
1756        return 0;
1757
1758out_cleanup_connect_q:
1759        if (new)
1760                blk_cleanup_queue(ctrl->connect_q);
1761out_free_tag_set:
1762        if (new)
1763                blk_mq_free_tag_set(ctrl->tagset);
1764out_free_io_queues:
1765        nvme_tcp_free_io_queues(ctrl);
1766        return ret;
1767}
1768
1769static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1770{
1771        nvme_tcp_stop_queue(ctrl, 0);
1772        if (remove) {
1773                blk_cleanup_queue(ctrl->admin_q);
1774                blk_cleanup_queue(ctrl->fabrics_q);
1775                blk_mq_free_tag_set(ctrl->admin_tagset);
1776        }
1777        nvme_tcp_free_admin_queue(ctrl);
1778}
1779
1780static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1781{
1782        int error;
1783
1784        error = nvme_tcp_alloc_admin_queue(ctrl);
1785        if (error)
1786                return error;
1787
1788        if (new) {
1789                ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
1790                if (IS_ERR(ctrl->admin_tagset)) {
1791                        error = PTR_ERR(ctrl->admin_tagset);
1792                        goto out_free_queue;
1793                }
1794
1795                ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
1796                if (IS_ERR(ctrl->fabrics_q)) {
1797                        error = PTR_ERR(ctrl->fabrics_q);
1798                        goto out_free_tagset;
1799                }
1800
1801                ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
1802                if (IS_ERR(ctrl->admin_q)) {
1803                        error = PTR_ERR(ctrl->admin_q);
1804                        goto out_cleanup_fabrics_q;
1805                }
1806        }
1807
1808        error = nvme_tcp_start_queue(ctrl, 0);
1809        if (error)
1810                goto out_cleanup_queue;
1811
1812        error = nvme_enable_ctrl(ctrl);
1813        if (error)
1814                goto out_stop_queue;
1815
1816        blk_mq_unquiesce_queue(ctrl->admin_q);
1817
1818        error = nvme_init_identify(ctrl);
1819        if (error)
1820                goto out_stop_queue;
1821
1822        return 0;
1823
1824out_stop_queue:
1825        nvme_tcp_stop_queue(ctrl, 0);
1826out_cleanup_queue:
1827        if (new)
1828                blk_cleanup_queue(ctrl->admin_q);
1829out_cleanup_fabrics_q:
1830        if (new)
1831                blk_cleanup_queue(ctrl->fabrics_q);
1832out_free_tagset:
1833        if (new)
1834                blk_mq_free_tag_set(ctrl->admin_tagset);
1835out_free_queue:
1836        nvme_tcp_free_admin_queue(ctrl);
1837        return error;
1838}
1839
1840static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1841                bool remove)
1842{
1843        blk_mq_quiesce_queue(ctrl->admin_q);
1844        nvme_tcp_stop_queue(ctrl, 0);
1845        if (ctrl->admin_tagset) {
1846                blk_mq_tagset_busy_iter(ctrl->admin_tagset,
1847                        nvme_cancel_request, ctrl);
1848                blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
1849        }
1850        if (remove)
1851                blk_mq_unquiesce_queue(ctrl->admin_q);
1852        nvme_tcp_destroy_admin_queue(ctrl, remove);
1853}
1854
1855static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1856                bool remove)
1857{
1858        if (ctrl->queue_count <= 1)
1859                return;
1860        nvme_stop_queues(ctrl);
1861        nvme_tcp_stop_io_queues(ctrl);
1862        if (ctrl->tagset) {
1863                blk_mq_tagset_busy_iter(ctrl->tagset,
1864                        nvme_cancel_request, ctrl);
1865                blk_mq_tagset_wait_completed_request(ctrl->tagset);
1866        }
1867        if (remove)
1868                nvme_start_queues(ctrl);
1869        nvme_tcp_destroy_io_queues(ctrl, remove);
1870}
1871
1872static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
1873{
1874        /* If we are resetting/deleting then do nothing */
1875        if (ctrl->state != NVME_CTRL_CONNECTING) {
1876                WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
1877                        ctrl->state == NVME_CTRL_LIVE);
1878                return;
1879        }
1880
1881        if (nvmf_should_reconnect(ctrl)) {
1882                dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
1883                        ctrl->opts->reconnect_delay);
1884                queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
1885                                ctrl->opts->reconnect_delay * HZ);
1886        } else {
1887                dev_info(ctrl->device, "Removing controller...\n");
1888                nvme_delete_ctrl(ctrl);
1889        }
1890}
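
    /*
     * Editor's note (illustrative): opts->reconnect_delay is in seconds and
     * is converted to jiffies for the delayed work, e.g. a 10 second delay
     * with HZ=250 schedules connect_work 2500 jiffies out.  Once
     * nvmf_should_reconnect() decides the ctrl_loss_tmo budget is spent,
     * the controller is deleted instead of retried.
     */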
1891
1892static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
1893{
1894        struct nvmf_ctrl_options *opts = ctrl->opts;
1895        int ret;
1896
1897        ret = nvme_tcp_configure_admin_queue(ctrl, new);
1898        if (ret)
1899                return ret;
1900
1901        if (ctrl->icdoff) {
1902                dev_err(ctrl->device, "icdoff is not supported!\n");
                    ret = -EOPNOTSUPP;
1903                goto destroy_admin;
1904        }
1905
1906        if (opts->queue_size > ctrl->sqsize + 1)
1907                dev_warn(ctrl->device,
1908                        "queue_size %zu > ctrl sqsize %u, clamping down\n",
1909                        opts->queue_size, ctrl->sqsize + 1);
1910
1911        if (ctrl->sqsize + 1 > ctrl->maxcmd) {
1912                dev_warn(ctrl->device,
1913                        "sqsize %u > ctrl maxcmd %u, clamping down\n",
1914                        ctrl->sqsize + 1, ctrl->maxcmd);
1915                ctrl->sqsize = ctrl->maxcmd - 1;
1916        }
1917
1918        if (ctrl->queue_count > 1) {
1919                ret = nvme_tcp_configure_io_queues(ctrl, new);
1920                if (ret)
1921                        goto destroy_admin;
1922        }
1923
1924        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
1925                /*
1926                 * state change failure is ok if we're in DELETING state,
1927                 * unless we're in the middle of creating a new controller,
1928                 * to avoid races with the teardown flow.
1929                 */
1930                WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1931                WARN_ON_ONCE(new);
1932                ret = -EINVAL;
1933                goto destroy_io;
1934        }
1935
1936        nvme_start_ctrl(ctrl);
1937        return 0;
1938
1939destroy_io:
1940        if (ctrl->queue_count > 1)
1941                nvme_tcp_destroy_io_queues(ctrl, new);
1942destroy_admin:
1943        nvme_tcp_stop_queue(ctrl, 0);
1944        nvme_tcp_destroy_admin_queue(ctrl, new);
1945        return ret;
1946}
1947
1948static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
1949{
1950        struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
1951                        struct nvme_tcp_ctrl, connect_work);
1952        struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
1953
1954        ++ctrl->nr_reconnects;
1955
1956        if (nvme_tcp_setup_ctrl(ctrl, false))
1957                goto requeue;
1958
1959        dev_info(ctrl->device, "Successfully reconnected (attempt %d)\n",
1960                        ctrl->nr_reconnects);
1961
1962        ctrl->nr_reconnects = 0;
1963
1964        return;
1965
1966requeue:
1967        dev_info(ctrl->device, "Failed reconnect attempt %d\n",
1968                        ctrl->nr_reconnects);
1969        nvme_tcp_reconnect_or_remove(ctrl);
1970}
1971
1972static void nvme_tcp_error_recovery_work(struct work_struct *work)
1973{
1974        struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
1975                                struct nvme_tcp_ctrl, err_work);
1976        struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
1977
1978        nvme_stop_keep_alive(ctrl);
1979        nvme_tcp_teardown_io_queues(ctrl, false);
1980        /* unquiesce to fast-fail pending requests */
1981        nvme_start_queues(ctrl);
1982        nvme_tcp_teardown_admin_queue(ctrl, false);
1983        blk_mq_unquiesce_queue(ctrl->admin_q);
1984
1985        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
1986                /* state change failure is ok if we're in DELETING state */
1987                WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
1988                return;
1989        }
1990
1991        nvme_tcp_reconnect_or_remove(ctrl);
1992}
1993
1994static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
1995{
1996        cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
1997        cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
1998
1999        nvme_tcp_teardown_io_queues(ctrl, shutdown);
2000        blk_mq_quiesce_queue(ctrl->admin_q);
2001        if (shutdown)
2002                nvme_shutdown_ctrl(ctrl);
2003        else
2004                nvme_disable_ctrl(ctrl);
2005        nvme_tcp_teardown_admin_queue(ctrl, shutdown);
2006}
2007
2008static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
2009{
2010        nvme_tcp_teardown_ctrl(ctrl, true);
2011}
2012
2013static void nvme_reset_ctrl_work(struct work_struct *work)
2014{
2015        struct nvme_ctrl *ctrl =
2016                container_of(work, struct nvme_ctrl, reset_work);
2017
2018        nvme_stop_ctrl(ctrl);
2019        nvme_tcp_teardown_ctrl(ctrl, false);
2020
2021        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2022                /* state change failure is ok if we're in DELETING state */
2023                WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
2024                return;
2025        }
2026
2027        if (nvme_tcp_setup_ctrl(ctrl, false))
2028                goto out_fail;
2029
2030        return;
2031
2032out_fail:
2033        ++ctrl->nr_reconnects;
2034        nvme_tcp_reconnect_or_remove(ctrl);
2035}
2036
2037static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
2038{
2039        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
2040
2041        if (list_empty(&ctrl->list))
2042                goto free_ctrl;
2043
2044        mutex_lock(&nvme_tcp_ctrl_mutex);
2045        list_del(&ctrl->list);
2046        mutex_unlock(&nvme_tcp_ctrl_mutex);
2047
2048        nvmf_free_options(nctrl->opts);
2049free_ctrl:
2050        kfree(ctrl->queues);
2051        kfree(ctrl);
2052}
2053
2054static void nvme_tcp_set_sg_null(struct nvme_command *c)
2055{
2056        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2057
2058        sg->addr = 0;
2059        sg->length = 0;
2060        sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2061                        NVME_SGL_FMT_TRANSPORT_A;
2062}
2063
2064static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
2065                struct nvme_command *c, u32 data_len)
2066{
2067        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2068
2069        sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
2070        sg->length = cpu_to_le32(data_len);
2071        sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
2072}
2073
2074static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
2075                u32 data_len)
2076{
2077        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2078
2079        sg->addr = 0;
2080        sg->length = cpu_to_le32(data_len);
2081        sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2082                        NVME_SGL_FMT_TRANSPORT_A;
2083}
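
    /*
     * Editor's note (illustrative): the three helpers above cover the
     * transport's data layouts: nvme_tcp_set_sg_null() for commands that
     * carry no data, nvme_tcp_set_sg_inline() for writes small enough to
     * ride in the command capsule (addr holds the controller's icdoff), and
     * nvme_tcp_set_sg_host_data() for everything else, where the payload
     * moves in separate H2C/C2H data PDUs.
     */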
2084
2085static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
2086{
2087        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
2088        struct nvme_tcp_queue *queue = &ctrl->queues[0];
2089        struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
2090        struct nvme_command *cmd = &pdu->cmd;
2091        u8 hdgst = nvme_tcp_hdgst_len(queue);
2092
2093        memset(pdu, 0, sizeof(*pdu));
2094        pdu->hdr.type = nvme_tcp_cmd;
2095        if (queue->hdr_digest)
2096                pdu->hdr.flags |= NVME_TCP_F_HDGST;
2097        pdu->hdr.hlen = sizeof(*pdu);
2098        pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
2099
2100        cmd->common.opcode = nvme_admin_async_event;
2101        cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
2102        cmd->common.flags |= NVME_CMD_SGL_METABUF;
2103        nvme_tcp_set_sg_null(cmd);
2104
2105        ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
2106        ctrl->async_req.offset = 0;
2107        ctrl->async_req.curr_bio = NULL;
2108        ctrl->async_req.data_len = 0;
2109
2110        nvme_tcp_queue_request(&ctrl->async_req, true);
2111}
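
    /*
     * Editor's note (illustrative): the async event command is not a blk-mq
     * request, so it uses the reserved command id NVME_AQ_BLK_MQ_DEPTH,
     * which lies outside the admin tag space; the receive path keys off
     * that id to hand the completion to nvme_complete_async_event() rather
     * than to a request.
     */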
2112
2113static enum blk_eh_timer_return
2114nvme_tcp_timeout(struct request *rq, bool reserved)
2115{
2116        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2117        struct nvme_tcp_ctrl *ctrl = req->queue->ctrl;
2118        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2119
2120        /*
2121         * Restart the timer if a controller reset is already scheduled. Any
2122         * timed out commands would be handled before entering the connecting
2123         * state.
2124         */
2125        if (ctrl->ctrl.state == NVME_CTRL_RESETTING)
2126                return BLK_EH_RESET_TIMER;
2127
2128        dev_warn(ctrl->ctrl.device,
2129                "queue %d: timeout request %#x type %d\n",
2130                nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
2131
2132        if (ctrl->ctrl.state != NVME_CTRL_LIVE) {
2133                /*
2134                 * Teardown immediately if the controller times out while starting
2135                 * or if error recovery has already started.  All outstanding
2136                 * requests are completed on shutdown, so we return BLK_EH_DONE.
2137                 */
2138                flush_work(&ctrl->err_work);
2139                nvme_tcp_teardown_io_queues(&ctrl->ctrl, false);
2140                nvme_tcp_teardown_admin_queue(&ctrl->ctrl, false);
2141                return BLK_EH_DONE;
2142        }
2143
2144        dev_warn(ctrl->ctrl.device, "starting error recovery\n");
2145        nvme_tcp_error_recovery(&ctrl->ctrl);
2146
2147        return BLK_EH_RESET_TIMER;
2148}
2149
2150static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
2151                        struct request *rq)
2152{
2153        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2154        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2155        struct nvme_command *c = &pdu->cmd;
2156
2157        c->common.flags |= NVME_CMD_SGL_METABUF;
2158
2159        if (!blk_rq_nr_phys_segments(rq))
2160                nvme_tcp_set_sg_null(c);
2161        else if (rq_data_dir(rq) == WRITE &&
2162            req->data_len <= nvme_tcp_inline_data_size(queue))
2163                nvme_tcp_set_sg_inline(queue, c, req->data_len);
2164        else
2165                nvme_tcp_set_sg_host_data(c, req->data_len);
2166
2167        return BLK_STS_OK;
2168}
2169
2170static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
2171                struct request *rq)
2172{
2173        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2174        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2175        struct nvme_tcp_queue *queue = req->queue;
2176        u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
2177        blk_status_t ret;
2178
2179        ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
2180        if (ret)
2181                return ret;
2182
2183        req->state = NVME_TCP_SEND_CMD_PDU;
2184        req->offset = 0;
2185        req->data_sent = 0;
2186        req->pdu_len = 0;
2187        req->pdu_sent = 0;
2188        req->data_len = blk_rq_nr_phys_segments(rq) ?
2189                                blk_rq_payload_bytes(rq) : 0;
2190        req->curr_bio = rq->bio;
2191
2192        if (rq_data_dir(rq) == WRITE &&
2193            req->data_len <= nvme_tcp_inline_data_size(queue))
2194                req->pdu_len = req->data_len;
2195        else if (req->curr_bio)
2196                nvme_tcp_init_iter(req, READ);
2197
2198        pdu->hdr.type = nvme_tcp_cmd;
2199        pdu->hdr.flags = 0;
2200        if (queue->hdr_digest)
2201                pdu->hdr.flags |= NVME_TCP_F_HDGST;
2202        if (queue->data_digest && req->pdu_len) {
2203                pdu->hdr.flags |= NVME_TCP_F_DDGST;
2204                ddgst = nvme_tcp_ddgst_len(queue);
2205        }
2206        pdu->hdr.hlen = sizeof(*pdu);
2207        pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2208        pdu->hdr.plen =
2209                cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2210
2211        ret = nvme_tcp_map_data(queue, rq);
2212        if (unlikely(ret)) {
2213                nvme_cleanup_cmd(rq);
2214                dev_err(queue->ctrl->ctrl.device,
2215                        "Failed to map data (%d)\n", ret);
2216                return ret;
2217        }
2218
2219        return 0;
2220}
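
    /*
     * Editor's worked example (assumes crc32c digests, i.e. 4-byte HDGST and
     * DDGST, and sizeof(struct nvme_tcp_cmd_pdu) == 72): for a 4096-byte
     * write that fits inline with both digests enabled,
     *
     *      hlen = 72, hdgst = 4, pdu_len = 4096, ddgst = 4
     *      pdo  = hlen + hdgst = 76
     *      plen = hlen + hdgst + pdu_len + ddgst = 4176
     *
     * A read, or a write larger than the inline data size, keeps
     * pdu_len == 0, so pdo == 0 and plen == hlen + hdgst.
     */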
2221
2222static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2223                const struct blk_mq_queue_data *bd)
2224{
2225        struct nvme_ns *ns = hctx->queue->queuedata;
2226        struct nvme_tcp_queue *queue = hctx->driver_data;
2227        struct request *rq = bd->rq;
2228        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2229        bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2230        blk_status_t ret;
2231
2232        if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2233                return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
2234
2235        ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2236        if (unlikely(ret))
2237                return ret;
2238
2239        blk_mq_start_request(rq);
2240
2241        nvme_tcp_queue_request(req, true);
2242
2243        return BLK_STS_OK;
2244}
2245
2246static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2247{
2248        struct nvme_tcp_ctrl *ctrl = set->driver_data;
2249        struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
2250
2251        if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
2252                /* separate read/write queues */
2253                set->map[HCTX_TYPE_DEFAULT].nr_queues =
2254                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2255                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2256                set->map[HCTX_TYPE_READ].nr_queues =
2257                        ctrl->io_queues[HCTX_TYPE_READ];
2258                set->map[HCTX_TYPE_READ].queue_offset =
2259                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2260        } else {
2261                /* shared read/write queues */
2262                set->map[HCTX_TYPE_DEFAULT].nr_queues =
2263                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2264                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2265                set->map[HCTX_TYPE_READ].nr_queues =
2266                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2267                set->map[HCTX_TYPE_READ].queue_offset = 0;
2268        }
2269        blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
2270        blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
2271
2272        if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
2273                /* map dedicated poll queues only if we have queues left */
2274                set->map[HCTX_TYPE_POLL].nr_queues =
2275                                ctrl->io_queues[HCTX_TYPE_POLL];
2276                set->map[HCTX_TYPE_POLL].queue_offset =
2277                        ctrl->io_queues[HCTX_TYPE_DEFAULT] +
2278                        ctrl->io_queues[HCTX_TYPE_READ];
2279                blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
2280        }
2281
2282        dev_info(ctrl->ctrl.device,
2283                "mapped %d/%d/%d default/read/poll queues.\n",
2284                ctrl->io_queues[HCTX_TYPE_DEFAULT],
2285                ctrl->io_queues[HCTX_TYPE_READ],
2286                ctrl->io_queues[HCTX_TYPE_POLL]);
2287
2288        return 0;
2289}
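
    /*
     * Illustrative sketch added by the editor (not part of the upstream
     * driver; the function name is made up): the queue_offset arithmetic of
     * nvme_tcp_map_queues() above over plain integers, where "read" is the
     * number of dedicated read queues and is 0 in the shared case.  With
     * default=4, read=4, poll=2 the offsets come out as 0, 4 and 8.
     */
    static void __maybe_unused nvme_tcp_map_offsets_example(
                    unsigned int dflt, unsigned int read, unsigned int poll,
                    unsigned int nr[HCTX_MAX_TYPES],
                    unsigned int off[HCTX_MAX_TYPES])
    {
            nr[HCTX_TYPE_DEFAULT] = dflt;
            off[HCTX_TYPE_DEFAULT] = 0;

            /* dedicated read queues sit right after the default ones */
            nr[HCTX_TYPE_READ] = read ? read : dflt;
            off[HCTX_TYPE_READ] = read ? dflt : 0;

            /* poll queues, if any, occupy the tail of the hw queue array */
            nr[HCTX_TYPE_POLL] = poll;
            off[HCTX_TYPE_POLL] = dflt + read;
    }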
2290
2291static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
2292{
2293        struct nvme_tcp_queue *queue = hctx->driver_data;
2294        struct sock *sk = queue->sock->sk;
2295
2296        if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2297                return 0;
2298
2299        set_bit(NVME_TCP_Q_POLLING, &queue->flags);
2300        if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
2301                sk_busy_loop(sk, true);
2302        nvme_tcp_try_recv(queue);
2303        clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
2304        return queue->nr_cqe;
2305}
2306
2307static const struct blk_mq_ops nvme_tcp_mq_ops = {
2308        .queue_rq       = nvme_tcp_queue_rq,
2309        .complete       = nvme_complete_rq,
2310        .init_request   = nvme_tcp_init_request,
2311        .exit_request   = nvme_tcp_exit_request,
2312        .init_hctx      = nvme_tcp_init_hctx,
2313        .timeout        = nvme_tcp_timeout,
2314        .map_queues     = nvme_tcp_map_queues,
2315        .poll           = nvme_tcp_poll,
2316};
2317
2318static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2319        .queue_rq       = nvme_tcp_queue_rq,
2320        .complete       = nvme_complete_rq,
2321        .init_request   = nvme_tcp_init_request,
2322        .exit_request   = nvme_tcp_exit_request,
2323        .init_hctx      = nvme_tcp_init_admin_hctx,
2324        .timeout        = nvme_tcp_timeout,
2325};
2326
2327static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2328        .name                   = "tcp",
2329        .module                 = THIS_MODULE,
2330        .flags                  = NVME_F_FABRICS,
2331        .reg_read32             = nvmf_reg_read32,
2332        .reg_read64             = nvmf_reg_read64,
2333        .reg_write32            = nvmf_reg_write32,
2334        .free_ctrl              = nvme_tcp_free_ctrl,
2335        .submit_async_event     = nvme_tcp_submit_async_event,
2336        .delete_ctrl            = nvme_tcp_delete_ctrl,
2337        .get_address            = nvmf_get_address,
2338};
2339
2340static bool
2341nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2342{
2343        struct nvme_tcp_ctrl *ctrl;
2344        bool found = false;
2345
2346        mutex_lock(&nvme_tcp_ctrl_mutex);
2347        list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2348                found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2349                if (found)
2350                        break;
2351        }
2352        mutex_unlock(&nvme_tcp_ctrl_mutex);
2353
2354        return found;
2355}
2356
2357static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2358                struct nvmf_ctrl_options *opts)
2359{
2360        struct nvme_tcp_ctrl *ctrl;
2361        int ret;
2362
2363        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2364        if (!ctrl)
2365                return ERR_PTR(-ENOMEM);
2366
2367        INIT_LIST_HEAD(&ctrl->list);
2368        ctrl->ctrl.opts = opts;
2369        ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2370                                opts->nr_poll_queues + 1;
2371        ctrl->ctrl.sqsize = opts->queue_size - 1;
2372        ctrl->ctrl.kato = opts->kato;
2373
2374        INIT_DELAYED_WORK(&ctrl->connect_work,
2375                        nvme_tcp_reconnect_ctrl_work);
2376        INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2377        INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2378
2379        if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2380                opts->trsvcid =
2381                        kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2382                if (!opts->trsvcid) {
2383                        ret = -ENOMEM;
2384                        goto out_free_ctrl;
2385                }
2386                opts->mask |= NVMF_OPT_TRSVCID;
2387        }
2388
2389        ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2390                        opts->traddr, opts->trsvcid, &ctrl->addr);
2391        if (ret) {
2392                pr_err("malformed address passed: %s:%s\n",
2393                        opts->traddr, opts->trsvcid);
2394                goto out_free_ctrl;
2395        }
2396
2397        if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2398                ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2399                        opts->host_traddr, NULL, &ctrl->src_addr);
2400                if (ret) {
2401                        pr_err("malformed src address passed: %s\n",
2402                               opts->host_traddr);
2403                        goto out_free_ctrl;
2404                }
2405        }
2406
2407        if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2408                ret = -EALREADY;
2409                goto out_free_ctrl;
2410        }
2411
2412        ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2413                                GFP_KERNEL);
2414        if (!ctrl->queues) {
2415                ret = -ENOMEM;
2416                goto out_free_ctrl;
2417        }
2418
2419        ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2420        if (ret)
2421                goto out_kfree_queues;
2422
2423        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2424                WARN_ON_ONCE(1);
2425                ret = -EINTR;
2426                goto out_uninit_ctrl;
2427        }
2428
2429        ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2430        if (ret)
2431                goto out_uninit_ctrl;
2432
2433        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2434                ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2435
2436        mutex_lock(&nvme_tcp_ctrl_mutex);
2437        list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2438        mutex_unlock(&nvme_tcp_ctrl_mutex);
2439
2440        return &ctrl->ctrl;
2441
2442out_uninit_ctrl:
2443        nvme_uninit_ctrl(&ctrl->ctrl);
2444        nvme_put_ctrl(&ctrl->ctrl);
2445        if (ret > 0)
2446                ret = -EIO;
2447        return ERR_PTR(ret);
2448out_kfree_queues:
2449        kfree(ctrl->queues);
2450out_free_ctrl:
2451        kfree(ctrl);
2452        return ERR_PTR(ret);
2453}
2454
2455static struct nvmf_transport_ops nvme_tcp_transport = {
2456        .name           = "tcp",
2457        .module         = THIS_MODULE,
2458        .required_opts  = NVMF_OPT_TRADDR,
2459        .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2460                          NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2461                          NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2462                          NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2463                          NVMF_OPT_TOS,
2464        .create_ctrl    = nvme_tcp_create_ctrl,
2465};
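
    /*
     * Editor's note (illustrative usage; address and NQN are placeholders):
     * once this transport is registered, a connection is typically opened
     * through the fabrics device, e.g. with nvme-cli:
     *
     *      nvme connect -t tcp -a 192.168.1.10 -s 4420 \
     *              -n nqn.2014-08.org.nvmexpress:example-subsys
     *
     * When no trsvcid is passed down, nvme_tcp_create_ctrl() above falls
     * back to the NVMe discovery port.
     */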
2466
2467static int __init nvme_tcp_init_module(void)
2468{
            int ret;

2469        nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
2470                        WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2471        if (!nvme_tcp_wq)
2472                return -ENOMEM;
2473
2474        ret = nvmf_register_transport(&nvme_tcp_transport);
            if (ret)
                    destroy_workqueue(nvme_tcp_wq);
2475        return ret;
2476}
2477
2478static void __exit nvme_tcp_cleanup_module(void)
2479{
2480        struct nvme_tcp_ctrl *ctrl;
2481
2482        nvmf_unregister_transport(&nvme_tcp_transport);
2483
2484        mutex_lock(&nvme_tcp_ctrl_mutex);
2485        list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2486                nvme_delete_ctrl(&ctrl->ctrl);
2487        mutex_unlock(&nvme_tcp_ctrl_mutex);
2488        flush_workqueue(nvme_delete_wq);
2489
2490        destroy_workqueue(nvme_tcp_wq);
2491}
2492
2493module_init(nvme_tcp_init_module);
2494module_exit(nvme_tcp_cleanup_module);
2495
2496MODULE_LICENSE("GPL v2");
2497