linux/drivers/nvme/host/tcp.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * NVMe over Fabrics TCP host.
   4 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
   5 */
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7#include <linux/module.h>
   8#include <linux/init.h>
   9#include <linux/slab.h>
  10#include <linux/err.h>
  11#include <linux/nvme-tcp.h>
  12#include <net/sock.h>
  13#include <net/tcp.h>
  14#include <linux/blk-mq.h>
  15#include <crypto/hash.h>
  16#include <net/busy_poll.h>
  17
  18#include "nvme.h"
  19#include "fabrics.h"
  20
  21struct nvme_tcp_queue;
  22
   23/* Define the socket priority to use for connections where it is desirable
   24 * that the NIC consider performing optimized packet processing or filtering.
   25 * A non-zero value is sufficient to indicate general consideration of any
   26 * possible optimization.  Making it a module param allows for alternative
   27 * values that may be unique for some NIC implementations.
   28 */
  29static int so_priority;
  30module_param(so_priority, int, 0644);
  31MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
  32
  33enum nvme_tcp_send_state {
  34        NVME_TCP_SEND_CMD_PDU = 0,
  35        NVME_TCP_SEND_H2C_PDU,
  36        NVME_TCP_SEND_DATA,
  37        NVME_TCP_SEND_DDGST,
  38};
  39
  40struct nvme_tcp_request {
  41        struct nvme_request     req;
  42        void                    *pdu;
  43        struct nvme_tcp_queue   *queue;
  44        u32                     data_len;
  45        u32                     pdu_len;
  46        u32                     pdu_sent;
  47        u16                     ttag;
  48        struct list_head        entry;
  49        struct llist_node       lentry;
  50        __le32                  ddgst;
  51
  52        struct bio              *curr_bio;
  53        struct iov_iter         iter;
  54
  55        /* send state */
  56        size_t                  offset;
  57        size_t                  data_sent;
  58        enum nvme_tcp_send_state state;
  59};
  60
  61enum nvme_tcp_queue_flags {
  62        NVME_TCP_Q_ALLOCATED    = 0,
  63        NVME_TCP_Q_LIVE         = 1,
  64        NVME_TCP_Q_POLLING      = 2,
  65};
  66
  67enum nvme_tcp_recv_state {
  68        NVME_TCP_RECV_PDU = 0,
  69        NVME_TCP_RECV_DATA,
  70        NVME_TCP_RECV_DDGST,
  71};
  72
  73struct nvme_tcp_ctrl;
  74struct nvme_tcp_queue {
  75        struct socket           *sock;
  76        struct work_struct      io_work;
  77        int                     io_cpu;
  78
  79        struct mutex            send_mutex;
  80        struct llist_head       req_list;
  81        struct list_head        send_list;
  82        bool                    more_requests;
  83
  84        /* recv state */
  85        void                    *pdu;
  86        int                     pdu_remaining;
  87        int                     pdu_offset;
  88        size_t                  data_remaining;
  89        size_t                  ddgst_remaining;
  90        unsigned int            nr_cqe;
  91
  92        /* send state */
  93        struct nvme_tcp_request *request;
  94
  95        int                     queue_size;
  96        size_t                  cmnd_capsule_len;
  97        struct nvme_tcp_ctrl    *ctrl;
  98        unsigned long           flags;
  99        bool                    rd_enabled;
 100
 101        bool                    hdr_digest;
 102        bool                    data_digest;
 103        struct ahash_request    *rcv_hash;
 104        struct ahash_request    *snd_hash;
 105        __le32                  exp_ddgst;
 106        __le32                  recv_ddgst;
 107
 108        struct page_frag_cache  pf_cache;
 109
 110        void (*state_change)(struct sock *);
 111        void (*data_ready)(struct sock *);
 112        void (*write_space)(struct sock *);
 113};
 114
 115struct nvme_tcp_ctrl {
 116        /* read only in the hot path */
 117        struct nvme_tcp_queue   *queues;
 118        struct blk_mq_tag_set   tag_set;
 119
 120        /* other member variables */
 121        struct list_head        list;
 122        struct blk_mq_tag_set   admin_tag_set;
 123        struct sockaddr_storage addr;
 124        struct sockaddr_storage src_addr;
 125        struct nvme_ctrl        ctrl;
 126
 127        struct work_struct      err_work;
 128        struct delayed_work     connect_work;
 129        struct nvme_tcp_request async_req;
 130        u32                     io_queues[HCTX_MAX_TYPES];
 131};
 132
 133static LIST_HEAD(nvme_tcp_ctrl_list);
 134static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
 135static struct workqueue_struct *nvme_tcp_wq;
 136static const struct blk_mq_ops nvme_tcp_mq_ops;
 137static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
 138static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
 139
 140static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
 141{
 142        return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
 143}
 144
 145static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
 146{
 147        return queue - queue->ctrl->queues;
 148}
 149
 150static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
 151{
 152        u32 queue_idx = nvme_tcp_queue_id(queue);
 153
 154        if (queue_idx == 0)
 155                return queue->ctrl->admin_tag_set.tags[queue_idx];
 156        return queue->ctrl->tag_set.tags[queue_idx - 1];
 157}
 158
 159static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
 160{
 161        return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 162}
 163
 164static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
 165{
 166        return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 167}
 168
 169static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
 170{
 171        return queue->cmnd_capsule_len - sizeof(struct nvme_command);
 172}
 173
 174static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
 175{
 176        return req == &req->queue->ctrl->async_req;
 177}
 178
 179static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
 180{
 181        struct request *rq;
 182
 183        if (unlikely(nvme_tcp_async_req(req)))
 184                return false; /* async events don't have a request */
 185
 186        rq = blk_mq_rq_from_pdu(req);
 187
 188        return rq_data_dir(rq) == WRITE && req->data_len &&
 189                req->data_len <= nvme_tcp_inline_data_size(req->queue);
 190}
 191
 192static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
 193{
 194        return req->iter.bvec->bv_page;
 195}
 196
 197static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
 198{
 199        return req->iter.bvec->bv_offset + req->iter.iov_offset;
 200}
 201
 202static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
 203{
 204        return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset,
 205                        req->pdu_len - req->pdu_sent);
 206}
 207
 208static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req)
 209{
 210        return req->iter.iov_offset;
 211}
 212
 213static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
 214{
 215        return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
 216                        req->pdu_len - req->pdu_sent : 0;
 217}
 218
 219static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
 220                int len)
 221{
 222        return nvme_tcp_pdu_data_left(req) <= len;
 223}
 224
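/*
 * Point req->iter at the request's data: either the single special
 * payload vector or the bio_vecs of the current bio, starting at the
 * bio's current completion offset.
 */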
 225static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
 226                unsigned int dir)
 227{
 228        struct request *rq = blk_mq_rq_from_pdu(req);
 229        struct bio_vec *vec;
 230        unsigned int size;
 231        int nsegs;
 232        size_t offset;
 233
 234        if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
 235                vec = &rq->special_vec;
 236                nsegs = 1;
 237                size = blk_rq_payload_bytes(rq);
 238                offset = 0;
 239        } else {
 240                struct bio *bio = req->curr_bio;
 241
 242                vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
 243                nsegs = bio_segments(bio);
 244                size = bio->bi_iter.bi_size;
 245                offset = bio->bi_iter.bi_bvec_done;
 246        }
 247
 248        iov_iter_bvec(&req->iter, dir, vec, nsegs, size);
 249        req->iter.iov_offset = offset;
 250}
 251
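/*
 * Account @len bytes as sent and advance the iterator; when the current
 * bio is exhausted but more data remains, move on to the next bio.
 */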
 252static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
 253                int len)
 254{
 255        req->data_sent += len;
 256        req->pdu_sent += len;
 257        iov_iter_advance(&req->iter, len);
 258        if (!iov_iter_count(&req->iter) &&
 259            req->data_sent < req->data_len) {
 260                req->curr_bio = req->curr_bio->bi_next;
 261                nvme_tcp_init_iter(req, WRITE);
 262        }
 263}
 264
 265static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 266                bool sync, bool last)
 267{
 268        struct nvme_tcp_queue *queue = req->queue;
 269        bool empty;
 270
 271        empty = llist_add(&req->lentry, &queue->req_list) &&
 272                list_empty(&queue->send_list) && !queue->request;
 273
  274        /*
  275         * If we are the first on the send_list, try to send directly;
  276         * otherwise queue io_work. Also, only do that if we are on the
  277         * same cpu, so we don't introduce contention.
  278         */
 279        if (queue->io_cpu == smp_processor_id() &&
 280            sync && empty && mutex_trylock(&queue->send_mutex)) {
 281                queue->more_requests = !last;
 282                nvme_tcp_try_send(queue);
 283                queue->more_requests = false;
 284                mutex_unlock(&queue->send_mutex);
 285        } else if (last) {
 286                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 287        }
 288}
 289
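/*
 * Drain the lockless req_list into send_list so that the sending
 * context can consume requests with ordinary list operations.
 */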
 290static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
 291{
 292        struct nvme_tcp_request *req;
 293        struct llist_node *node;
 294
 295        for (node = llist_del_all(&queue->req_list); node; node = node->next) {
 296                req = llist_entry(node, struct nvme_tcp_request, lentry);
 297                list_add(&req->entry, &queue->send_list);
 298        }
 299}
 300
 301static inline struct nvme_tcp_request *
 302nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
 303{
 304        struct nvme_tcp_request *req;
 305
 306        req = list_first_entry_or_null(&queue->send_list,
 307                        struct nvme_tcp_request, entry);
 308        if (!req) {
 309                nvme_tcp_process_req_list(queue);
 310                req = list_first_entry_or_null(&queue->send_list,
 311                                struct nvme_tcp_request, entry);
 312                if (unlikely(!req))
 313                        return NULL;
 314        }
 315
 316        list_del(&req->entry);
 317        return req;
 318}
 319
 320static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
 321                __le32 *dgst)
 322{
 323        ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
 324        crypto_ahash_final(hash);
 325}
 326
 327static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
 328                struct page *page, off_t off, size_t len)
 329{
 330        struct scatterlist sg;
 331
 332        sg_init_marker(&sg, 1);
 333        sg_set_page(&sg, page, len, off);
 334        ahash_request_set_crypt(hash, &sg, NULL, len);
 335        crypto_ahash_update(hash);
 336}
 337
 338static inline void nvme_tcp_hdgst(struct ahash_request *hash,
 339                void *pdu, size_t len)
 340{
 341        struct scatterlist sg;
 342
 343        sg_init_one(&sg, pdu, len);
 344        ahash_request_set_crypt(hash, &sg, pdu + len, len);
 345        crypto_ahash_digest(hash);
 346}
 347
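/*
 * Verify that the HDGST flag is set and that the received header digest
 * matches the one we compute over the PDU header.
 */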
 348static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
 349                void *pdu, size_t pdu_len)
 350{
 351        struct nvme_tcp_hdr *hdr = pdu;
 352        __le32 recv_digest;
 353        __le32 exp_digest;
 354
 355        if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
 356                dev_err(queue->ctrl->ctrl.device,
 357                        "queue %d: header digest flag is cleared\n",
 358                        nvme_tcp_queue_id(queue));
 359                return -EPROTO;
 360        }
 361
 362        recv_digest = *(__le32 *)(pdu + hdr->hlen);
 363        nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
 364        exp_digest = *(__le32 *)(pdu + hdr->hlen);
 365        if (recv_digest != exp_digest) {
 366                dev_err(queue->ctrl->ctrl.device,
 367                        "header digest error: recv %#x expected %#x\n",
 368                        le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
 369                return -EIO;
 370        }
 371
 372        return 0;
 373}
 374
 375static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
 376{
 377        struct nvme_tcp_hdr *hdr = pdu;
 378        u8 digest_len = nvme_tcp_hdgst_len(queue);
 379        u32 len;
 380
 381        len = le32_to_cpu(hdr->plen) - hdr->hlen -
 382                ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
 383
 384        if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
 385                dev_err(queue->ctrl->ctrl.device,
 386                        "queue %d: data digest flag is cleared\n",
  387                        nvme_tcp_queue_id(queue));
 388                return -EPROTO;
 389        }
 390        crypto_ahash_init(queue->rcv_hash);
 391
 392        return 0;
 393}
 394
 395static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
 396                struct request *rq, unsigned int hctx_idx)
 397{
 398        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 399
 400        page_frag_free(req->pdu);
 401}
 402
 403static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 404                struct request *rq, unsigned int hctx_idx,
 405                unsigned int numa_node)
 406{
 407        struct nvme_tcp_ctrl *ctrl = set->driver_data;
 408        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 409        int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
 410        struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
 411        u8 hdgst = nvme_tcp_hdgst_len(queue);
 412
 413        req->pdu = page_frag_alloc(&queue->pf_cache,
 414                sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 415                GFP_KERNEL | __GFP_ZERO);
 416        if (!req->pdu)
 417                return -ENOMEM;
 418
 419        req->queue = queue;
 420        nvme_req(rq)->ctrl = &ctrl->ctrl;
 421
 422        return 0;
 423}
 424
 425static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 426                unsigned int hctx_idx)
 427{
 428        struct nvme_tcp_ctrl *ctrl = data;
 429        struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
 430
 431        hctx->driver_data = queue;
 432        return 0;
 433}
 434
 435static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 436                unsigned int hctx_idx)
 437{
 438        struct nvme_tcp_ctrl *ctrl = data;
 439        struct nvme_tcp_queue *queue = &ctrl->queues[0];
 440
 441        hctx->driver_data = queue;
 442        return 0;
 443}
 444
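/*
 * Derive the receive state from what is still outstanding: PDU header
 * bytes first, then the data digest trailer, otherwise data.
 */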
 445static enum nvme_tcp_recv_state
 446nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
 447{
 448        return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
 449                (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
 450                NVME_TCP_RECV_DATA;
 451}
 452
 453static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
 454{
 455        queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
 456                                nvme_tcp_hdgst_len(queue);
 457        queue->pdu_offset = 0;
 458        queue->data_remaining = -1;
 459        queue->ddgst_remaining = 0;
 460}
 461
 462static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
 463{
 464        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 465                return;
 466
 467        dev_warn(ctrl->device, "starting error recovery\n");
 468        queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
 469}
 470
 471static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 472                struct nvme_completion *cqe)
 473{
 474        struct request *rq;
 475
 476        rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
 477        if (!rq) {
 478                dev_err(queue->ctrl->ctrl.device,
 479                        "queue %d tag 0x%x not found\n",
 480                        nvme_tcp_queue_id(queue), cqe->command_id);
 481                nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 482                return -EINVAL;
 483        }
 484
 485        if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
 486                nvme_complete_rq(rq);
 487        queue->nr_cqe++;
 488
 489        return 0;
 490}
 491
 492static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
 493                struct nvme_tcp_data_pdu *pdu)
 494{
 495        struct request *rq;
 496
 497        rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 498        if (!rq) {
 499                dev_err(queue->ctrl->ctrl.device,
 500                        "queue %d tag %#x not found\n",
 501                        nvme_tcp_queue_id(queue), pdu->command_id);
 502                return -ENOENT;
 503        }
 504
 505        if (!blk_rq_payload_bytes(rq)) {
 506                dev_err(queue->ctrl->ctrl.device,
 507                        "queue %d tag %#x unexpected data\n",
 508                        nvme_tcp_queue_id(queue), rq->tag);
 509                return -EIO;
 510        }
 511
 512        queue->data_remaining = le32_to_cpu(pdu->data_length);
 513
 514        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
 515            unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
 516                dev_err(queue->ctrl->ctrl.device,
 517                        "queue %d tag %#x SUCCESS set but not last PDU\n",
 518                        nvme_tcp_queue_id(queue), rq->tag);
 519                nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 520                return -EPROTO;
 521        }
 522
 523        return 0;
 524}
 525
 526static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
 527                struct nvme_tcp_rsp_pdu *pdu)
 528{
 529        struct nvme_completion *cqe = &pdu->cqe;
 530        int ret = 0;
 531
 532        /*
 533         * AEN requests are special as they don't time out and can
 534         * survive any kind of queue freeze and often don't respond to
 535         * aborts.  We don't even bother to allocate a struct request
 536         * for them but rather special case them here.
 537         */
 538        if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
 539                                     cqe->command_id)))
 540                nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
 541                                &cqe->result);
 542        else
 543                ret = nvme_tcp_process_nvme_cqe(queue, cqe);
 544
 545        return ret;
 546}
 547
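/*
 * Prepare an H2CData PDU header in response to an R2T, after checking
 * that the controller's requested range stays within the command's data
 * and does not go backwards relative to what was already sent.
 */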
 548static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
 549                struct nvme_tcp_r2t_pdu *pdu)
 550{
 551        struct nvme_tcp_data_pdu *data = req->pdu;
 552        struct nvme_tcp_queue *queue = req->queue;
 553        struct request *rq = blk_mq_rq_from_pdu(req);
 554        u8 hdgst = nvme_tcp_hdgst_len(queue);
 555        u8 ddgst = nvme_tcp_ddgst_len(queue);
 556
 557        req->pdu_len = le32_to_cpu(pdu->r2t_length);
 558        req->pdu_sent = 0;
 559
 560        if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
 561                dev_err(queue->ctrl->ctrl.device,
 562                        "req %d r2t len %u exceeded data len %u (%zu sent)\n",
 563                        rq->tag, req->pdu_len, req->data_len,
 564                        req->data_sent);
 565                return -EPROTO;
 566        }
 567
 568        if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
 569                dev_err(queue->ctrl->ctrl.device,
 570                        "req %d unexpected r2t offset %u (expected %zu)\n",
 571                        rq->tag, le32_to_cpu(pdu->r2t_offset),
 572                        req->data_sent);
 573                return -EPROTO;
 574        }
 575
 576        memset(data, 0, sizeof(*data));
 577        data->hdr.type = nvme_tcp_h2c_data;
 578        data->hdr.flags = NVME_TCP_F_DATA_LAST;
 579        if (queue->hdr_digest)
 580                data->hdr.flags |= NVME_TCP_F_HDGST;
 581        if (queue->data_digest)
 582                data->hdr.flags |= NVME_TCP_F_DDGST;
 583        data->hdr.hlen = sizeof(*data);
 584        data->hdr.pdo = data->hdr.hlen + hdgst;
 585        data->hdr.plen =
 586                cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
 587        data->ttag = pdu->ttag;
 588        data->command_id = rq->tag;
 589        data->data_offset = cpu_to_le32(req->data_sent);
 590        data->data_length = cpu_to_le32(req->pdu_len);
 591        return 0;
 592}
 593
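/*
 * Look up the request addressed by the R2T, build the matching H2CData
 * PDU and queue the request for transmission.
 */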
 594static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
 595                struct nvme_tcp_r2t_pdu *pdu)
 596{
 597        struct nvme_tcp_request *req;
 598        struct request *rq;
 599        int ret;
 600
 601        rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 602        if (!rq) {
 603                dev_err(queue->ctrl->ctrl.device,
 604                        "queue %d tag %#x not found\n",
 605                        nvme_tcp_queue_id(queue), pdu->command_id);
 606                return -ENOENT;
 607        }
 608        req = blk_mq_rq_to_pdu(rq);
 609
 610        ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
 611        if (unlikely(ret))
 612                return ret;
 613
 614        req->state = NVME_TCP_SEND_H2C_PDU;
 615        req->offset = 0;
 616
 617        nvme_tcp_queue_request(req, false, true);
 618
 619        return 0;
 620}
 621
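/*
 * Reassemble a PDU header that may arrive in pieces, verify the header
 * digest and set up data digest calculation if negotiated, and dispatch
 * on the PDU type.
 */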
 622static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 623                unsigned int *offset, size_t *len)
 624{
 625        struct nvme_tcp_hdr *hdr;
 626        char *pdu = queue->pdu;
 627        size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
 628        int ret;
 629
 630        ret = skb_copy_bits(skb, *offset,
 631                &pdu[queue->pdu_offset], rcv_len);
 632        if (unlikely(ret))
 633                return ret;
 634
 635        queue->pdu_remaining -= rcv_len;
 636        queue->pdu_offset += rcv_len;
 637        *offset += rcv_len;
 638        *len -= rcv_len;
 639        if (queue->pdu_remaining)
 640                return 0;
 641
 642        hdr = queue->pdu;
 643        if (queue->hdr_digest) {
 644                ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
 645                if (unlikely(ret))
 646                        return ret;
 647        }
  648
 650        if (queue->data_digest) {
 651                ret = nvme_tcp_check_ddgst(queue, queue->pdu);
 652                if (unlikely(ret))
 653                        return ret;
 654        }
 655
 656        switch (hdr->type) {
 657        case nvme_tcp_c2h_data:
 658                return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
 659        case nvme_tcp_rsp:
 660                nvme_tcp_init_recv_ctx(queue);
 661                return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
 662        case nvme_tcp_r2t:
 663                nvme_tcp_init_recv_ctx(queue);
 664                return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
 665        default:
 666                dev_err(queue->ctrl->ctrl.device,
 667                        "unsupported pdu type (%d)\n", hdr->type);
 668                return -EINVAL;
 669        }
 670}
 671
 672static inline void nvme_tcp_end_request(struct request *rq, u16 status)
 673{
 674        union nvme_result res = {};
 675
 676        if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
 677                nvme_complete_rq(rq);
 678}
 679
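/*
 * Copy C2HData payload from the skb into the request's bios, feeding
 * the data digest when enabled, and complete the request here if the
 * controller indicated SUCCESS with no digest to follow.
 */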
 680static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 681                              unsigned int *offset, size_t *len)
 682{
 683        struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 684        struct nvme_tcp_request *req;
 685        struct request *rq;
 686
 687        rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 688        if (!rq) {
 689                dev_err(queue->ctrl->ctrl.device,
 690                        "queue %d tag %#x not found\n",
 691                        nvme_tcp_queue_id(queue), pdu->command_id);
 692                return -ENOENT;
 693        }
 694        req = blk_mq_rq_to_pdu(rq);
 695
 696        while (true) {
 697                int recv_len, ret;
 698
 699                recv_len = min_t(size_t, *len, queue->data_remaining);
 700                if (!recv_len)
 701                        break;
 702
 703                if (!iov_iter_count(&req->iter)) {
 704                        req->curr_bio = req->curr_bio->bi_next;
 705
  706                        /*
  707                         * If we don't have any bios it means the controller
  708                         * sent more data than we requested, hence error
  709                         */
 710                        if (!req->curr_bio) {
 711                                dev_err(queue->ctrl->ctrl.device,
  712                                        "queue %d no space in request %#x\n",
 713                                        nvme_tcp_queue_id(queue), rq->tag);
 714                                nvme_tcp_init_recv_ctx(queue);
 715                                return -EIO;
 716                        }
 717                        nvme_tcp_init_iter(req, READ);
 718                }
 719
 720                /* we can read only from what is left in this bio */
 721                recv_len = min_t(size_t, recv_len,
 722                                iov_iter_count(&req->iter));
 723
 724                if (queue->data_digest)
 725                        ret = skb_copy_and_hash_datagram_iter(skb, *offset,
 726                                &req->iter, recv_len, queue->rcv_hash);
 727                else
 728                        ret = skb_copy_datagram_iter(skb, *offset,
 729                                        &req->iter, recv_len);
 730                if (ret) {
 731                        dev_err(queue->ctrl->ctrl.device,
  732                                "queue %d failed to copy request %#x data\n",
 733                                nvme_tcp_queue_id(queue), rq->tag);
 734                        return ret;
 735                }
 736
 737                *len -= recv_len;
 738                *offset += recv_len;
 739                queue->data_remaining -= recv_len;
 740        }
 741
 742        if (!queue->data_remaining) {
 743                if (queue->data_digest) {
 744                        nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
 745                        queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
 746                } else {
 747                        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 748                                nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
 749                                queue->nr_cqe++;
 750                        }
 751                        nvme_tcp_init_recv_ctx(queue);
 752                }
 753        }
 754
 755        return 0;
 756}
 757
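/*
 * Receive the 4-byte data digest trailer and compare it against the
 * digest computed while the data was copied in; complete the request
 * here if the controller signalled SUCCESS in the data PDU.
 */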
 758static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
 759                struct sk_buff *skb, unsigned int *offset, size_t *len)
 760{
 761        struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 762        char *ddgst = (char *)&queue->recv_ddgst;
 763        size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
 764        off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
 765        int ret;
 766
 767        ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
 768        if (unlikely(ret))
 769                return ret;
 770
 771        queue->ddgst_remaining -= recv_len;
 772        *offset += recv_len;
 773        *len -= recv_len;
 774        if (queue->ddgst_remaining)
 775                return 0;
 776
 777        if (queue->recv_ddgst != queue->exp_ddgst) {
 778                dev_err(queue->ctrl->ctrl.device,
 779                        "data digest error: recv %#x expected %#x\n",
 780                        le32_to_cpu(queue->recv_ddgst),
 781                        le32_to_cpu(queue->exp_ddgst));
 782                return -EIO;
 783        }
 784
 785        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 786                struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue),
 787                                                pdu->command_id);
 788
 789                nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
 790                queue->nr_cqe++;
 791        }
 792
 793        nvme_tcp_init_recv_ctx(queue);
 794        return 0;
 795}
 796
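/*
 * read_sock() callback: run the receive state machine over the skb and
 * report how many bytes were consumed.
 */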
 797static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
 798                             unsigned int offset, size_t len)
 799{
 800        struct nvme_tcp_queue *queue = desc->arg.data;
 801        size_t consumed = len;
 802        int result;
 803
 804        while (len) {
 805                switch (nvme_tcp_recv_state(queue)) {
 806                case NVME_TCP_RECV_PDU:
 807                        result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
 808                        break;
 809                case NVME_TCP_RECV_DATA:
 810                        result = nvme_tcp_recv_data(queue, skb, &offset, &len);
 811                        break;
 812                case NVME_TCP_RECV_DDGST:
 813                        result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
 814                        break;
 815                default:
 816                        result = -EFAULT;
 817                }
 818                if (result) {
 819                        dev_err(queue->ctrl->ctrl.device,
  820                                "receive failed: %d\n", result);
 821                        queue->rd_enabled = false;
 822                        nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 823                        return result;
 824                }
 825        }
 826
 827        return consumed;
 828}
 829
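/*
 * Socket data_ready callback: kick io_work unless a polling context
 * will reap the completions itself.
 */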
 830static void nvme_tcp_data_ready(struct sock *sk)
 831{
 832        struct nvme_tcp_queue *queue;
 833
 834        read_lock_bh(&sk->sk_callback_lock);
 835        queue = sk->sk_user_data;
 836        if (likely(queue && queue->rd_enabled) &&
 837            !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
 838                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 839        read_unlock_bh(&sk->sk_callback_lock);
 840}
 841
 842static void nvme_tcp_write_space(struct sock *sk)
 843{
 844        struct nvme_tcp_queue *queue;
 845
 846        read_lock_bh(&sk->sk_callback_lock);
 847        queue = sk->sk_user_data;
 848        if (likely(queue && sk_stream_is_writeable(sk))) {
 849                clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 850                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 851        }
 852        read_unlock_bh(&sk->sk_callback_lock);
 853}
 854
 855static void nvme_tcp_state_change(struct sock *sk)
 856{
 857        struct nvme_tcp_queue *queue;
 858
 859        read_lock(&sk->sk_callback_lock);
 860        queue = sk->sk_user_data;
 861        if (!queue)
 862                goto done;
 863
 864        switch (sk->sk_state) {
 865        case TCP_CLOSE:
 866        case TCP_CLOSE_WAIT:
 867        case TCP_LAST_ACK:
 868        case TCP_FIN_WAIT1:
 869        case TCP_FIN_WAIT2:
 870                nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 871                break;
 872        default:
 873                dev_info(queue->ctrl->ctrl.device,
 874                        "queue %d socket state %d\n",
 875                        nvme_tcp_queue_id(queue), sk->sk_state);
 876        }
 877
 878        queue->state_change(sk);
 879done:
 880        read_unlock(&sk->sk_callback_lock);
 881}
 882
 883static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
 884{
 885        return !list_empty(&queue->send_list) ||
 886                !llist_empty(&queue->req_list) || queue->more_requests;
 887}
 888
 889static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
 890{
 891        queue->request = NULL;
 892}
 893
 894static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
 895{
 896        nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
 897}
 898
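/*
 * Transmit the request's payload pages, preferring kernel_sendpage()
 * and falling back to sock_no_sendpage() for pages that are not
 * suitable for sendpage (see sendpage_ok()), updating the data digest
 * as data goes out.
 */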
 899static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 900{
 901        struct nvme_tcp_queue *queue = req->queue;
 902
 903        while (true) {
 904                struct page *page = nvme_tcp_req_cur_page(req);
 905                size_t offset = nvme_tcp_req_cur_offset(req);
 906                size_t len = nvme_tcp_req_cur_length(req);
 907                bool last = nvme_tcp_pdu_last_send(req, len);
 908                int ret, flags = MSG_DONTWAIT;
 909
 910                if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
 911                        flags |= MSG_EOR;
 912                else
 913                        flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
 914
 915                if (sendpage_ok(page)) {
 916                        ret = kernel_sendpage(queue->sock, page, offset, len,
 917                                        flags);
 918                } else {
 919                        ret = sock_no_sendpage(queue->sock, page, offset, len,
 920                                        flags);
 921                }
 922                if (ret <= 0)
 923                        return ret;
 924
 925                nvme_tcp_advance_req(req, ret);
 926                if (queue->data_digest)
 927                        nvme_tcp_ddgst_update(queue->snd_hash, page,
 928                                        offset, ret);
 929
  930                /* fully successful last write */
 931                if (last && ret == len) {
 932                        if (queue->data_digest) {
 933                                nvme_tcp_ddgst_final(queue->snd_hash,
 934                                        &req->ddgst);
 935                                req->state = NVME_TCP_SEND_DDGST;
 936                                req->offset = 0;
 937                        } else {
 938                                nvme_tcp_done_send_req(queue);
 939                        }
 940                        return 1;
 941                }
 942        }
 943        return -EAGAIN;
 944}
 945
 946static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 947{
 948        struct nvme_tcp_queue *queue = req->queue;
 949        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
 950        bool inline_data = nvme_tcp_has_inline_data(req);
 951        u8 hdgst = nvme_tcp_hdgst_len(queue);
 952        int len = sizeof(*pdu) + hdgst - req->offset;
 953        int flags = MSG_DONTWAIT;
 954        int ret;
 955
 956        if (inline_data || nvme_tcp_queue_more(queue))
 957                flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
 958        else
 959                flags |= MSG_EOR;
 960
 961        if (queue->hdr_digest && !req->offset)
 962                nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 963
 964        ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
 965                        offset_in_page(pdu) + req->offset, len,  flags);
 966        if (unlikely(ret <= 0))
 967                return ret;
 968
 969        len -= ret;
 970        if (!len) {
 971                if (inline_data) {
 972                        req->state = NVME_TCP_SEND_DATA;
 973                        if (queue->data_digest)
 974                                crypto_ahash_init(queue->snd_hash);
 975                        nvme_tcp_init_iter(req, WRITE);
 976                } else {
 977                        nvme_tcp_done_send_req(queue);
 978                }
 979                return 1;
 980        }
 981        req->offset += ret;
 982
 983        return -EAGAIN;
 984}
 985
 986static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 987{
 988        struct nvme_tcp_queue *queue = req->queue;
 989        struct nvme_tcp_data_pdu *pdu = req->pdu;
 990        u8 hdgst = nvme_tcp_hdgst_len(queue);
 991        int len = sizeof(*pdu) - req->offset + hdgst;
 992        int ret;
 993
 994        if (queue->hdr_digest && !req->offset)
 995                nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 996
 997        ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
 998                        offset_in_page(pdu) + req->offset, len,
 999                        MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
1000        if (unlikely(ret <= 0))
1001                return ret;
1002
1003        len -= ret;
1004        if (!len) {
1005                req->state = NVME_TCP_SEND_DATA;
1006                if (queue->data_digest)
1007                        crypto_ahash_init(queue->snd_hash);
1008                if (!req->data_sent)
1009                        nvme_tcp_init_iter(req, WRITE);
1010                return 1;
1011        }
1012        req->offset += ret;
1013
1014        return -EAGAIN;
1015}
1016
1017static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
1018{
1019        struct nvme_tcp_queue *queue = req->queue;
1020        int ret;
1021        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1022        struct kvec iov = {
1023                .iov_base = &req->ddgst + req->offset,
1024                .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
1025        };
1026
1027        if (nvme_tcp_queue_more(queue))
1028                msg.msg_flags |= MSG_MORE;
1029        else
1030                msg.msg_flags |= MSG_EOR;
1031
1032        ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1033        if (unlikely(ret <= 0))
1034                return ret;
1035
1036        if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
1037                nvme_tcp_done_send_req(queue);
1038                return 1;
1039        }
1040
1041        req->offset += ret;
1042        return -EAGAIN;
1043}
1044
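/*
 * Advance the current request through its send states (command PDU,
 * H2C data PDU, data, data digest).  Returns >0 when progress was made,
 * 0 when there is nothing to send or the socket is full, and a negative
 * errno on a fatal send error.
 */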
1045static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
1046{
1047        struct nvme_tcp_request *req;
1048        int ret = 1;
1049
1050        if (!queue->request) {
1051                queue->request = nvme_tcp_fetch_request(queue);
1052                if (!queue->request)
1053                        return 0;
1054        }
1055        req = queue->request;
1056
1057        if (req->state == NVME_TCP_SEND_CMD_PDU) {
1058                ret = nvme_tcp_try_send_cmd_pdu(req);
1059                if (ret <= 0)
1060                        goto done;
1061                if (!nvme_tcp_has_inline_data(req))
1062                        return ret;
1063        }
1064
1065        if (req->state == NVME_TCP_SEND_H2C_PDU) {
1066                ret = nvme_tcp_try_send_data_pdu(req);
1067                if (ret <= 0)
1068                        goto done;
1069        }
1070
1071        if (req->state == NVME_TCP_SEND_DATA) {
1072                ret = nvme_tcp_try_send_data(req);
1073                if (ret <= 0)
1074                        goto done;
1075        }
1076
1077        if (req->state == NVME_TCP_SEND_DDGST)
1078                ret = nvme_tcp_try_send_ddgst(req);
1079done:
1080        if (ret == -EAGAIN) {
1081                ret = 0;
1082        } else if (ret < 0) {
1083                dev_err(queue->ctrl->ctrl.device,
1084                        "failed to send request %d\n", ret);
1085                if (ret != -EPIPE && ret != -ECONNRESET)
1086                        nvme_tcp_fail_request(queue->request);
1087                nvme_tcp_done_send_req(queue);
1088        }
1089        return ret;
1090}
1091
1092static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1093{
1094        struct socket *sock = queue->sock;
1095        struct sock *sk = sock->sk;
1096        read_descriptor_t rd_desc;
1097        int consumed;
1098
1099        rd_desc.arg.data = queue;
1100        rd_desc.count = 1;
1101        lock_sock(sk);
1102        queue->nr_cqe = 0;
1103        consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1104        release_sock(sk);
1105        return consumed;
1106}
1107
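/*
 * Per-queue I/O worker: alternate between sending and receiving for up
 * to about one millisecond, then re-arm itself if it ran out of budget
 * while work was still pending.
 */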
1108static void nvme_tcp_io_work(struct work_struct *w)
1109{
1110        struct nvme_tcp_queue *queue =
1111                container_of(w, struct nvme_tcp_queue, io_work);
1112        unsigned long deadline = jiffies + msecs_to_jiffies(1);
1113
1114        do {
1115                bool pending = false;
1116                int result;
1117
1118                if (mutex_trylock(&queue->send_mutex)) {
1119                        result = nvme_tcp_try_send(queue);
1120                        mutex_unlock(&queue->send_mutex);
1121                        if (result > 0)
1122                                pending = true;
1123                        else if (unlikely(result < 0))
1124                                break;
1125                }
1126
1127                result = nvme_tcp_try_recv(queue);
1128                if (result > 0)
1129                        pending = true;
1130                else if (unlikely(result < 0))
1131                        return;
1132
1133                if (!pending)
1134                        return;
1135
1136        } while (!time_after(jiffies, deadline)); /* quota is exhausted */
1137
1138        queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1139}
1140
1141static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1142{
1143        struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1144
1145        ahash_request_free(queue->rcv_hash);
1146        ahash_request_free(queue->snd_hash);
1147        crypto_free_ahash(tfm);
1148}
1149
1150static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1151{
1152        struct crypto_ahash *tfm;
1153
1154        tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1155        if (IS_ERR(tfm))
1156                return PTR_ERR(tfm);
1157
1158        queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1159        if (!queue->snd_hash)
1160                goto free_tfm;
1161        ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1162
1163        queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1164        if (!queue->rcv_hash)
1165                goto free_snd_hash;
1166        ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1167
1168        return 0;
1169free_snd_hash:
1170        ahash_request_free(queue->snd_hash);
1171free_tfm:
1172        crypto_free_ahash(tfm);
1173        return -ENOMEM;
1174}
1175
1176static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1177{
1178        struct nvme_tcp_request *async = &ctrl->async_req;
1179
1180        page_frag_free(async->pdu);
1181}
1182
1183static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1184{
1185        struct nvme_tcp_queue *queue = &ctrl->queues[0];
1186        struct nvme_tcp_request *async = &ctrl->async_req;
1187        u8 hdgst = nvme_tcp_hdgst_len(queue);
1188
1189        async->pdu = page_frag_alloc(&queue->pf_cache,
1190                sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1191                GFP_KERNEL | __GFP_ZERO);
1192        if (!async->pdu)
1193                return -ENOMEM;
1194
1195        async->queue = &ctrl->queues[0];
1196        return 0;
1197}
1198
1199static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1200{
1201        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1202        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1203
1204        if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1205                return;
1206
1207        if (queue->hdr_digest || queue->data_digest)
1208                nvme_tcp_free_crypto(queue);
1209
1210        sock_release(queue->sock);
1211        kfree(queue->pdu);
1212}
1213
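/*
 * Perform the NVMe/TCP connection initialization handshake: send an
 * ICReq PDU and validate the controller's ICResp (type, length, PFV,
 * digest settings and CPDA) against what the host requested.
 */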
1214static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1215{
1216        struct nvme_tcp_icreq_pdu *icreq;
1217        struct nvme_tcp_icresp_pdu *icresp;
1218        struct msghdr msg = {};
1219        struct kvec iov;
1220        bool ctrl_hdgst, ctrl_ddgst;
1221        int ret;
1222
1223        icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1224        if (!icreq)
1225                return -ENOMEM;
1226
1227        icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1228        if (!icresp) {
1229                ret = -ENOMEM;
1230                goto free_icreq;
1231        }
1232
1233        icreq->hdr.type = nvme_tcp_icreq;
1234        icreq->hdr.hlen = sizeof(*icreq);
1235        icreq->hdr.pdo = 0;
1236        icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1237        icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1238        icreq->maxr2t = 0; /* single inflight r2t supported */
1239        icreq->hpda = 0; /* no alignment constraint */
1240        if (queue->hdr_digest)
1241                icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1242        if (queue->data_digest)
1243                icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1244
1245        iov.iov_base = icreq;
1246        iov.iov_len = sizeof(*icreq);
1247        ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1248        if (ret < 0)
1249                goto free_icresp;
1250
1251        memset(&msg, 0, sizeof(msg));
1252        iov.iov_base = icresp;
1253        iov.iov_len = sizeof(*icresp);
1254        ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1255                        iov.iov_len, msg.msg_flags);
1256        if (ret < 0)
1257                goto free_icresp;
1258
1259        ret = -EINVAL;
1260        if (icresp->hdr.type != nvme_tcp_icresp) {
1261                pr_err("queue %d: bad type returned %d\n",
1262                        nvme_tcp_queue_id(queue), icresp->hdr.type);
1263                goto free_icresp;
1264        }
1265
1266        if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1267                pr_err("queue %d: bad pdu length returned %d\n",
1268                        nvme_tcp_queue_id(queue), icresp->hdr.plen);
1269                goto free_icresp;
1270        }
1271
1272        if (icresp->pfv != NVME_TCP_PFV_1_0) {
1273                pr_err("queue %d: bad pfv returned %d\n",
1274                        nvme_tcp_queue_id(queue), icresp->pfv);
1275                goto free_icresp;
1276        }
1277
1278        ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1279        if ((queue->data_digest && !ctrl_ddgst) ||
1280            (!queue->data_digest && ctrl_ddgst)) {
1281                pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1282                        nvme_tcp_queue_id(queue),
1283                        queue->data_digest ? "enabled" : "disabled",
1284                        ctrl_ddgst ? "enabled" : "disabled");
1285                goto free_icresp;
1286        }
1287
1288        ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1289        if ((queue->hdr_digest && !ctrl_hdgst) ||
1290            (!queue->hdr_digest && ctrl_hdgst)) {
1291                pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1292                        nvme_tcp_queue_id(queue),
1293                        queue->hdr_digest ? "enabled" : "disabled",
1294                        ctrl_hdgst ? "enabled" : "disabled");
1295                goto free_icresp;
1296        }
1297
1298        if (icresp->cpda != 0) {
1299                pr_err("queue %d: unsupported cpda returned %d\n",
1300                        nvme_tcp_queue_id(queue), icresp->cpda);
1301                goto free_icresp;
1302        }
1303
1304        ret = 0;
1305free_icresp:
1306        kfree(icresp);
1307free_icreq:
1308        kfree(icreq);
1309        return ret;
1310}
1311
1312static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
1313{
1314        return nvme_tcp_queue_id(queue) == 0;
1315}
1316
1317static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
1318{
1319        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1320        int qid = nvme_tcp_queue_id(queue);
1321
1322        return !nvme_tcp_admin_queue(queue) &&
1323                qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
1324}
1325
1326static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
1327{
1328        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1329        int qid = nvme_tcp_queue_id(queue);
1330
1331        return !nvme_tcp_admin_queue(queue) &&
1332                !nvme_tcp_default_queue(queue) &&
1333                qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1334                          ctrl->io_queues[HCTX_TYPE_READ];
1335}
1336
1337static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
1338{
1339        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1340        int qid = nvme_tcp_queue_id(queue);
1341
1342        return !nvme_tcp_admin_queue(queue) &&
1343                !nvme_tcp_default_queue(queue) &&
1344                !nvme_tcp_read_queue(queue) &&
1345                qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1346                          ctrl->io_queues[HCTX_TYPE_READ] +
1347                          ctrl->io_queues[HCTX_TYPE_POLL];
1348}
1349
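/*
 * Pick the CPU that will run this queue's io_work, spreading the queues
 * of each type (default, read, poll) across the online CPUs.
 */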
1350static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
1351{
1352        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1353        int qid = nvme_tcp_queue_id(queue);
1354        int n = 0;
1355
1356        if (nvme_tcp_default_queue(queue))
1357                n = qid - 1;
1358        else if (nvme_tcp_read_queue(queue))
1359                n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
1360        else if (nvme_tcp_poll_queue(queue))
1361                n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
1362                                ctrl->io_queues[HCTX_TYPE_READ] - 1;
1363        queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1364}
1365
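/*
 * Allocate and connect a queue's TCP socket: apply the socket options
 * (single SYN retry, TCP_NODELAY, no lingering, priority, TOS),
 * optionally bind to the host traddr, connect to the target and run the
 * ICReq/ICResp handshake, then install our socket callbacks.
 */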
1366static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
1367                int qid, size_t queue_size)
1368{
1369        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1370        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1371        int ret, rcv_pdu_size;
1372
1373        queue->ctrl = ctrl;
1374        init_llist_head(&queue->req_list);
1375        INIT_LIST_HEAD(&queue->send_list);
1376        mutex_init(&queue->send_mutex);
1377        INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1378        queue->queue_size = queue_size;
1379
1380        if (qid > 0)
1381                queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1382        else
1383                queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1384                                                NVME_TCP_ADMIN_CCSZ;
1385
1386        ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1387                        IPPROTO_TCP, &queue->sock);
1388        if (ret) {
1389                dev_err(nctrl->device,
1390                        "failed to create socket: %d\n", ret);
1391                return ret;
1392        }
1393
1394        /* Single syn retry */
1395        tcp_sock_set_syncnt(queue->sock->sk, 1);
1396
1397        /* Set TCP no delay */
1398        tcp_sock_set_nodelay(queue->sock->sk);
1399
1400        /*
 1401         * Clean up whatever is sitting in the TCP transmit queue on socket
1402         * close. This is done to prevent stale data from being sent should
1403         * the network connection be restored before TCP times out.
1404         */
1405        sock_no_linger(queue->sock->sk);
1406
1407        if (so_priority > 0)
1408                sock_set_priority(queue->sock->sk, so_priority);
1409
1410        /* Set socket type of service */
1411        if (nctrl->opts->tos >= 0)
1412                ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
1413
1414        /* Set 10 seconds timeout for icresp recvmsg */
1415        queue->sock->sk->sk_rcvtimeo = 10 * HZ;
1416
1417        queue->sock->sk->sk_allocation = GFP_ATOMIC;
1418        nvme_tcp_set_queue_io_cpu(queue);
1419        queue->request = NULL;
1420        queue->data_remaining = 0;
1421        queue->ddgst_remaining = 0;
1422        queue->pdu_remaining = 0;
1423        queue->pdu_offset = 0;
1424        sk_set_memalloc(queue->sock->sk);
1425
1426        if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1427                ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1428                        sizeof(ctrl->src_addr));
1429                if (ret) {
1430                        dev_err(nctrl->device,
1431                                "failed to bind queue %d socket %d\n",
1432                                qid, ret);
1433                        goto err_sock;
1434                }
1435        }
1436
1437        queue->hdr_digest = nctrl->opts->hdr_digest;
1438        queue->data_digest = nctrl->opts->data_digest;
1439        if (queue->hdr_digest || queue->data_digest) {
1440                ret = nvme_tcp_alloc_crypto(queue);
1441                if (ret) {
1442                        dev_err(nctrl->device,
1443                                "failed to allocate queue %d crypto\n", qid);
1444                        goto err_sock;
1445                }
1446        }
1447
1448        rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1449                        nvme_tcp_hdgst_len(queue);
1450        queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1451        if (!queue->pdu) {
1452                ret = -ENOMEM;
1453                goto err_crypto;
1454        }
1455
1456        dev_dbg(nctrl->device, "connecting queue %d\n",
1457                        nvme_tcp_queue_id(queue));
1458
1459        ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1460                sizeof(ctrl->addr), 0);
1461        if (ret) {
1462                dev_err(nctrl->device,
1463                        "failed to connect socket: %d\n", ret);
1464                goto err_rcv_pdu;
1465        }
1466
1467        ret = nvme_tcp_init_connection(queue);
1468        if (ret)
1469                goto err_init_connect;
1470
1471        queue->rd_enabled = true;
1472        set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1473        nvme_tcp_init_recv_ctx(queue);
1474
1475        write_lock_bh(&queue->sock->sk->sk_callback_lock);
1476        queue->sock->sk->sk_user_data = queue;
1477        queue->state_change = queue->sock->sk->sk_state_change;
1478        queue->data_ready = queue->sock->sk->sk_data_ready;
1479        queue->write_space = queue->sock->sk->sk_write_space;
1480        queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1481        queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1482        queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1483#ifdef CONFIG_NET_RX_BUSY_POLL
1484        queue->sock->sk->sk_ll_usec = 1;
1485#endif
1486        write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1487
1488        return 0;
1489
1490err_init_connect:
1491        kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1492err_rcv_pdu:
1493        kfree(queue->pdu);
1494err_crypto:
1495        if (queue->hdr_digest || queue->data_digest)
1496                nvme_tcp_free_crypto(queue);
1497err_sock:
1498        sock_release(queue->sock);
1499        queue->sock = NULL;
1500        return ret;
1501}
1502
1503static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1504{
1505        struct socket *sock = queue->sock;
1506
1507        write_lock_bh(&sock->sk->sk_callback_lock);
1508        sock->sk->sk_user_data  = NULL;
1509        sock->sk->sk_data_ready = queue->data_ready;
1510        sock->sk->sk_state_change = queue->state_change;
1511        sock->sk->sk_write_space  = queue->write_space;
1512        write_unlock_bh(&sock->sk->sk_callback_lock);
1513}
1514
1515static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1516{
1517        kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1518        nvme_tcp_restore_sock_calls(queue);
1519        cancel_work_sync(&queue->io_work);
1520}
1521
1522static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1523{
1524        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1525        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1526
1527        if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1528                return;
1529        __nvme_tcp_stop_queue(queue);
1530}
1531
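/*
 * Issue the fabrics Connect command for the admin queue (idx 0) or an I/O
 * queue and mark the queue live.  If the connect fails, a queue that was
 * already allocated is stopped again before returning the error.
 */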
1532static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1533{
1534        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1535        int ret;
1536
1537        if (idx)
1538                ret = nvmf_connect_io_queue(nctrl, idx, false);
1539        else
1540                ret = nvmf_connect_admin_queue(nctrl);
1541
1542        if (!ret) {
1543                set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1544        } else {
1545                if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
1546                        __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1547                dev_err(nctrl->device,
1548                        "failed to connect queue: %d ret=%d\n", idx, ret);
1549        }
1550        return ret;
1551}
1552
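/*
 * Set up and allocate the blk-mq tag set: a single-hw-queue admin set with
 * reserved tags for connect and keep-alive, or the I/O set with one hw queue
 * per I/O queue, a reserved fabrics-connect tag, and extra queue maps when
 * poll queues were requested.
 */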
1553static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
1554                bool admin)
1555{
1556        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1557        struct blk_mq_tag_set *set;
1558        int ret;
1559
1560        if (admin) {
1561                set = &ctrl->admin_tag_set;
1562                memset(set, 0, sizeof(*set));
1563                set->ops = &nvme_tcp_admin_mq_ops;
1564                set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1565                set->reserved_tags = 2; /* connect + keep-alive */
1566                set->numa_node = nctrl->numa_node;
1567                set->flags = BLK_MQ_F_BLOCKING;
1568                set->cmd_size = sizeof(struct nvme_tcp_request);
1569                set->driver_data = ctrl;
1570                set->nr_hw_queues = 1;
1571                set->timeout = ADMIN_TIMEOUT;
1572        } else {
1573                set = &ctrl->tag_set;
1574                memset(set, 0, sizeof(*set));
1575                set->ops = &nvme_tcp_mq_ops;
1576                set->queue_depth = nctrl->sqsize + 1;
1577                set->reserved_tags = 1; /* fabric connect */
1578                set->numa_node = nctrl->numa_node;
1579                set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
1580                set->cmd_size = sizeof(struct nvme_tcp_request);
1581                set->driver_data = ctrl;
1582                set->nr_hw_queues = nctrl->queue_count - 1;
1583                set->timeout = NVME_IO_TIMEOUT;
1584                set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
1585        }
1586
1587        ret = blk_mq_alloc_tag_set(set);
1588        if (ret)
1589                return ERR_PTR(ret);
1590
1591        return set;
1592}
1593
1594static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1595{
1596        if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1597                cancel_work_sync(&ctrl->async_event_work);
1598                nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1599                to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1600        }
1601
1602        nvme_tcp_free_queue(ctrl, 0);
1603}
1604
1605static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1606{
1607        int i;
1608
1609        for (i = 1; i < ctrl->queue_count; i++)
1610                nvme_tcp_free_queue(ctrl, i);
1611}
1612
1613static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1614{
1615        int i;
1616
1617        for (i = 1; i < ctrl->queue_count; i++)
1618                nvme_tcp_stop_queue(ctrl, i);
1619}
1620
1621static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
1622{
1623        int i, ret = 0;
1624
1625        for (i = 1; i < ctrl->queue_count; i++) {
1626                ret = nvme_tcp_start_queue(ctrl, i);
1627                if (ret)
1628                        goto out_stop_queues;
1629        }
1630
1631        return 0;
1632
1633out_stop_queues:
1634        for (i--; i >= 1; i--)
1635                nvme_tcp_stop_queue(ctrl, i);
1636        return ret;
1637}
1638
1639static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1640{
1641        int ret;
1642
1643        ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
1644        if (ret)
1645                return ret;
1646
1647        ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1648        if (ret)
1649                goto out_free_queue;
1650
1651        return 0;
1652
1653out_free_queue:
1654        nvme_tcp_free_queue(ctrl, 0);
1655        return ret;
1656}
1657
1658static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1659{
1660        int i, ret;
1661
1662        for (i = 1; i < ctrl->queue_count; i++) {
1663                ret = nvme_tcp_alloc_queue(ctrl, i,
1664                                ctrl->sqsize + 1);
1665                if (ret)
1666                        goto out_free_queues;
1667        }
1668
1669        return 0;
1670
1671out_free_queues:
1672        for (i--; i >= 1; i--)
1673                nvme_tcp_free_queue(ctrl, i);
1674
1675        return ret;
1676}
1677
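/*
 * The desired number of I/O queues is the sum of the requested default,
 * write and poll queues, with each class individually capped at the number
 * of online CPUs.
 */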
1678static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1679{
1680        unsigned int nr_io_queues;
1681
1682        nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
1683        nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
1684        nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
1685
1686        return nr_io_queues;
1687}
1688
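/*
 * Split the queue count granted by the controller between the
 * HCTX_TYPE_DEFAULT, HCTX_TYPE_READ and HCTX_TYPE_POLL classes, falling
 * back to shared read/write queues when there are not enough queues for a
 * dedicated default set.
 */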
1689static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
1690                unsigned int nr_io_queues)
1691{
1692        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1693        struct nvmf_ctrl_options *opts = nctrl->opts;
1694
1695        if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
1696                /*
1697                 * separate read/write queues:
1698                 * hand out dedicated default queues only after we have
1699                 * sufficient read queues.
1700                 */
1701                ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
1702                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
1703                ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1704                        min(opts->nr_write_queues, nr_io_queues);
1705                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1706        } else {
1707                /*
1708                 * shared read/write queues
1709                 * either no write queues were requested, or we don't have
1710                 * sufficient queue count to have dedicated default queues.
1711                 */
1712                ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1713                        min(opts->nr_io_queues, nr_io_queues);
1714                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1715        }
1716
1717        if (opts->nr_poll_queues && nr_io_queues) {
1718                /* map dedicated poll queues only if we have queues left */
1719                ctrl->io_queues[HCTX_TYPE_POLL] =
1720                        min(opts->nr_poll_queues, nr_io_queues);
1721        }
1722}
1723
1724static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1725{
1726        unsigned int nr_io_queues;
1727        int ret;
1728
1729        nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1730        ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1731        if (ret)
1732                return ret;
1733
1734        ctrl->queue_count = nr_io_queues + 1;
1735        if (ctrl->queue_count < 2)
1736                return 0;
1737
1738        dev_info(ctrl->device,
1739                "creating %d I/O queues.\n", nr_io_queues);
1740
1741        nvme_tcp_set_io_queues(ctrl, nr_io_queues);
1742
1743        return __nvme_tcp_alloc_io_queues(ctrl);
1744}
1745
1746static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1747{
1748        nvme_tcp_stop_io_queues(ctrl);
1749        if (remove) {
1750                blk_cleanup_queue(ctrl->connect_q);
1751                blk_mq_free_tag_set(ctrl->tagset);
1752        }
1753        nvme_tcp_free_io_queues(ctrl);
1754}
1755
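/*
 * Allocate and start the I/O queues.  For a new controller this also
 * allocates the I/O tag set and the connect_q; on a reset/reconnect the
 * existing tag set is reused, the queues are unfrozen, and nr_hw_queues is
 * updated in case the granted queue count changed.
 */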
1756static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1757{
1758        int ret;
1759
1760        ret = nvme_tcp_alloc_io_queues(ctrl);
1761        if (ret)
1762                return ret;
1763
1764        if (new) {
1765                ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
1766                if (IS_ERR(ctrl->tagset)) {
1767                        ret = PTR_ERR(ctrl->tagset);
1768                        goto out_free_io_queues;
1769                }
1770
1771                ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
1772                if (IS_ERR(ctrl->connect_q)) {
1773                        ret = PTR_ERR(ctrl->connect_q);
1774                        goto out_free_tag_set;
1775                }
1776        }
1777
1778        ret = nvme_tcp_start_io_queues(ctrl);
1779        if (ret)
1780                goto out_cleanup_connect_q;
1781
1782        if (!new) {
1783                nvme_start_queues(ctrl);
1784                if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
1785                        /*
1786                         * If we timed out waiting for freeze we are likely to
1787                         * be stuck.  Fail the controller initialization just
1788                         * to be safe.
1789                         */
1790                        ret = -ENODEV;
1791                        goto out_wait_freeze_timed_out;
1792                }
1793                blk_mq_update_nr_hw_queues(ctrl->tagset,
1794                        ctrl->queue_count - 1);
1795                nvme_unfreeze(ctrl);
1796        }
1797
1798        return 0;
1799
1800out_wait_freeze_timed_out:
1801        nvme_stop_queues(ctrl);
1802        nvme_tcp_stop_io_queues(ctrl);
1803out_cleanup_connect_q:
1804        if (new)
1805                blk_cleanup_queue(ctrl->connect_q);
1806out_free_tag_set:
1807        if (new)
1808                blk_mq_free_tag_set(ctrl->tagset);
1809out_free_io_queues:
1810        nvme_tcp_free_io_queues(ctrl);
1811        return ret;
1812}
1813
1814static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1815{
1816        nvme_tcp_stop_queue(ctrl, 0);
1817        if (remove) {
1818                blk_cleanup_queue(ctrl->admin_q);
1819                blk_cleanup_queue(ctrl->fabrics_q);
1820                blk_mq_free_tag_set(ctrl->admin_tagset);
1821        }
1822        nvme_tcp_free_admin_queue(ctrl);
1823}
1824
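/*
 * Bring up the admin queue: allocate it, optionally create the admin tag
 * set plus the fabrics_q and admin_q request queues (new controller only),
 * connect it, enable the controller and run controller identification.
 */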
1825static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1826{
1827        int error;
1828
1829        error = nvme_tcp_alloc_admin_queue(ctrl);
1830        if (error)
1831                return error;
1832
1833        if (new) {
1834                ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
1835                if (IS_ERR(ctrl->admin_tagset)) {
1836                        error = PTR_ERR(ctrl->admin_tagset);
1837                        goto out_free_queue;
1838                }
1839
1840                ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
1841                if (IS_ERR(ctrl->fabrics_q)) {
1842                        error = PTR_ERR(ctrl->fabrics_q);
1843                        goto out_free_tagset;
1844                }
1845
1846                ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
1847                if (IS_ERR(ctrl->admin_q)) {
1848                        error = PTR_ERR(ctrl->admin_q);
1849                        goto out_cleanup_fabrics_q;
1850                }
1851        }
1852
1853        error = nvme_tcp_start_queue(ctrl, 0);
1854        if (error)
1855                goto out_cleanup_queue;
1856
1857        error = nvme_enable_ctrl(ctrl);
1858        if (error)
1859                goto out_stop_queue;
1860
1861        blk_mq_unquiesce_queue(ctrl->admin_q);
1862
1863        error = nvme_init_identify(ctrl);
1864        if (error)
1865                goto out_stop_queue;
1866
1867        return 0;
1868
1869out_stop_queue:
1870        nvme_tcp_stop_queue(ctrl, 0);
1871out_cleanup_queue:
1872        if (new)
1873                blk_cleanup_queue(ctrl->admin_q);
1874out_cleanup_fabrics_q:
1875        if (new)
1876                blk_cleanup_queue(ctrl->fabrics_q);
1877out_free_tagset:
1878        if (new)
1879                blk_mq_free_tag_set(ctrl->admin_tagset);
1880out_free_queue:
1881        nvme_tcp_free_admin_queue(ctrl);
1882        return error;
1883}
1884
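/*
 * Tear down the admin queue: quiesce and sync admin_q, stop the queue,
 * cancel any outstanding admin requests, and free the queue (and, on
 * removal, the admin tag set and request queues as well).
 */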
1885static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1886                bool remove)
1887{
1888        blk_mq_quiesce_queue(ctrl->admin_q);
1889        blk_sync_queue(ctrl->admin_q);
1890        nvme_tcp_stop_queue(ctrl, 0);
1891        if (ctrl->admin_tagset) {
1892                blk_mq_tagset_busy_iter(ctrl->admin_tagset,
1893                        nvme_cancel_request, ctrl);
1894                blk_mq_tagset_wait_completed_request(ctrl->admin_tagset);
1895        }
1896        if (remove)
1897                blk_mq_unquiesce_queue(ctrl->admin_q);
1898        nvme_tcp_destroy_admin_queue(ctrl, remove);
1899}
1900
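/*
 * Tear down the I/O queues: freeze and quiesce them, stop the TCP queues,
 * cancel all outstanding I/O, and free the queues (plus connect_q and the
 * I/O tag set when the controller is being removed).
 */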
1901static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1902                bool remove)
1903{
1904        if (ctrl->queue_count <= 1)
1905                return;
1906        blk_mq_quiesce_queue(ctrl->admin_q);
1907        nvme_start_freeze(ctrl);
1908        nvme_stop_queues(ctrl);
1909        nvme_sync_io_queues(ctrl);
1910        nvme_tcp_stop_io_queues(ctrl);
1911        if (ctrl->tagset) {
1912                blk_mq_tagset_busy_iter(ctrl->tagset,
1913                        nvme_cancel_request, ctrl);
1914                blk_mq_tagset_wait_completed_request(ctrl->tagset);
1915        }
1916        if (remove)
1917                nvme_start_queues(ctrl);
1918        nvme_tcp_destroy_io_queues(ctrl, remove);
1919}
1920
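/*
 * Decide what to do after losing the association: if the controller is no
 * longer in the CONNECTING state (reset/delete in progress) do nothing;
 * otherwise schedule another connect attempt after reconnect_delay if the
 * reconnect policy allows it, or delete the controller.
 */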
1921static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
1922{
1923        /* If we are resetting/deleting then do nothing */
1924        if (ctrl->state != NVME_CTRL_CONNECTING) {
1925                WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
1926                        ctrl->state == NVME_CTRL_LIVE);
1927                return;
1928        }
1929
1930        if (nvmf_should_reconnect(ctrl)) {
1931                dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
1932                        ctrl->opts->reconnect_delay);
1933                queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
1934                                ctrl->opts->reconnect_delay * HZ);
1935        } else {
1936                dev_info(ctrl->device, "Removing controller...\n");
1937                nvme_delete_ctrl(ctrl);
1938        }
1939}
1940
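/*
 * Establish (or re-establish) the association: configure the admin queue,
 * validate controller capabilities (icdoff, queue sizes), configure the I/O
 * queues if any were granted, and move the controller to LIVE.
 */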
1941static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
1942{
1943        struct nvmf_ctrl_options *opts = ctrl->opts;
1944        int ret;
1945
1946        ret = nvme_tcp_configure_admin_queue(ctrl, new);
1947        if (ret)
1948                return ret;
1949
1950        if (ctrl->icdoff) {
1951                dev_err(ctrl->device, "icdoff is not supported!\n");
1952                goto destroy_admin;
1953        }
1954
1955        if (opts->queue_size > ctrl->sqsize + 1)
1956                dev_warn(ctrl->device,
1957                        "queue_size %zu > ctrl sqsize %u, clamping down\n",
1958                        opts->queue_size, ctrl->sqsize + 1);
1959
1960        if (ctrl->sqsize + 1 > ctrl->maxcmd) {
1961                dev_warn(ctrl->device,
1962                        "sqsize %u > ctrl maxcmd %u, clamping down\n",
1963                        ctrl->sqsize + 1, ctrl->maxcmd);
1964                ctrl->sqsize = ctrl->maxcmd - 1;
1965        }
1966
1967        if (ctrl->queue_count > 1) {
1968                ret = nvme_tcp_configure_io_queues(ctrl, new);
1969                if (ret)
1970                        goto destroy_admin;
1971        }
1972
1973        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
1974                /*
1975                 * state change failure is ok if we started ctrl delete,
1976                 * unless we are in the middle of creating a new controller,
1977                 * in order to avoid races with the teardown flow.
1978                 */
1979                WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
1980                             ctrl->state != NVME_CTRL_DELETING_NOIO);
1981                WARN_ON_ONCE(new);
1982                ret = -EINVAL;
1983                goto destroy_io;
1984        }
1985
1986        nvme_start_ctrl(ctrl);
1987        return 0;
1988
1989destroy_io:
1990        if (ctrl->queue_count > 1)
1991                nvme_tcp_destroy_io_queues(ctrl, new);
1992destroy_admin:
1993        nvme_tcp_stop_queue(ctrl, 0);
1994        nvme_tcp_destroy_admin_queue(ctrl, new);
1995        return ret;
1996}
1997
1998static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
1999{
2000        struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
2001                        struct nvme_tcp_ctrl, connect_work);
2002        struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2003
2004        ++ctrl->nr_reconnects;
2005
2006        if (nvme_tcp_setup_ctrl(ctrl, false))
2007                goto requeue;
2008
2009        dev_info(ctrl->device, "Successfully reconnected (attempt %d)\n",
2010                        ctrl->nr_reconnects);
2011
2012        ctrl->nr_reconnects = 0;
2013
2014        return;
2015
2016requeue:
2017        dev_info(ctrl->device, "Failed reconnect attempt %d\n",
2018                        ctrl->nr_reconnects);
2019        nvme_tcp_reconnect_or_remove(ctrl);
2020}
2021
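/*
 * Error recovery: stop keep-alive, tear down the I/O and admin queues while
 * fast-failing any pending requests, then transition to CONNECTING and
 * either schedule a reconnect or remove the controller.
 */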
2022static void nvme_tcp_error_recovery_work(struct work_struct *work)
2023{
2024        struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
2025                                struct nvme_tcp_ctrl, err_work);
2026        struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2027
2028        nvme_stop_keep_alive(ctrl);
2029        nvme_tcp_teardown_io_queues(ctrl, false);
2030        /* unquiesce to fast-fail pending requests */
2031        nvme_start_queues(ctrl);
2032        nvme_tcp_teardown_admin_queue(ctrl, false);
2033        blk_mq_unquiesce_queue(ctrl->admin_q);
2034
2035        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2036                /* state change failure is ok if we started ctrl delete */
2037                WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2038                             ctrl->state != NVME_CTRL_DELETING_NOIO);
2039                return;
2040        }
2041
2042        nvme_tcp_reconnect_or_remove(ctrl);
2043}
2044
2045static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2046{
2047        cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
2048        cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
2049
2050        nvme_tcp_teardown_io_queues(ctrl, shutdown);
2051        blk_mq_quiesce_queue(ctrl->admin_q);
2052        if (shutdown)
2053                nvme_shutdown_ctrl(ctrl);
2054        else
2055                nvme_disable_ctrl(ctrl);
2056        nvme_tcp_teardown_admin_queue(ctrl, shutdown);
2057}
2058
2059static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
2060{
2061        nvme_tcp_teardown_ctrl(ctrl, true);
2062}
2063
2064static void nvme_reset_ctrl_work(struct work_struct *work)
2065{
2066        struct nvme_ctrl *ctrl =
2067                container_of(work, struct nvme_ctrl, reset_work);
2068
2069        nvme_stop_ctrl(ctrl);
2070        nvme_tcp_teardown_ctrl(ctrl, false);
2071
2072        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2073                /* state change failure is ok if we started ctrl delete */
2074                WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2075                             ctrl->state != NVME_CTRL_DELETING_NOIO);
2076                return;
2077        }
2078
2079        if (nvme_tcp_setup_ctrl(ctrl, false))
2080                goto out_fail;
2081
2082        return;
2083
2084out_fail:
2085        ++ctrl->nr_reconnects;
2086        nvme_tcp_reconnect_or_remove(ctrl);
2087}
2088
2089static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
2090{
2091        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
2092
2093        if (list_empty(&ctrl->list))
2094                goto free_ctrl;
2095
2096        mutex_lock(&nvme_tcp_ctrl_mutex);
2097        list_del(&ctrl->list);
2098        mutex_unlock(&nvme_tcp_ctrl_mutex);
2099
2100        nvmf_free_options(nctrl->opts);
2101free_ctrl:
2102        kfree(ctrl->queues);
2103        kfree(ctrl);
2104}
2105
2106static void nvme_tcp_set_sg_null(struct nvme_command *c)
2107{
2108        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2109
2110        sg->addr = 0;
2111        sg->length = 0;
2112        sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2113                        NVME_SGL_FMT_TRANSPORT_A;
2114}
2115
2116static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
2117                struct nvme_command *c, u32 data_len)
2118{
2119        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2120
2121        sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
2122        sg->length = cpu_to_le32(data_len);
2123        sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
2124}
2125
2126static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
2127                u32 data_len)
2128{
2129        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2130
2131        sg->addr = 0;
2132        sg->length = cpu_to_le32(data_len);
2133        sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2134                        NVME_SGL_FMT_TRANSPORT_A;
2135}
2136
2137static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
2138{
2139        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
2140        struct nvme_tcp_queue *queue = &ctrl->queues[0];
2141        struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
2142        struct nvme_command *cmd = &pdu->cmd;
2143        u8 hdgst = nvme_tcp_hdgst_len(queue);
2144
2145        memset(pdu, 0, sizeof(*pdu));
2146        pdu->hdr.type = nvme_tcp_cmd;
2147        if (queue->hdr_digest)
2148                pdu->hdr.flags |= NVME_TCP_F_HDGST;
2149        pdu->hdr.hlen = sizeof(*pdu);
2150        pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
2151
2152        cmd->common.opcode = nvme_admin_async_event;
2153        cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
2154        cmd->common.flags |= NVME_CMD_SGL_METABUF;
2155        nvme_tcp_set_sg_null(cmd);
2156
2157        ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
2158        ctrl->async_req.offset = 0;
2159        ctrl->async_req.curr_bio = NULL;
2160        ctrl->async_req.data_len = 0;
2161
2162        nvme_tcp_queue_request(&ctrl->async_req, true, true);
2163}
2164
2165static void nvme_tcp_complete_timed_out(struct request *rq)
2166{
2167        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2168        struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2169
2170        nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
2171        if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) {
2172                nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
2173                blk_mq_complete_request(rq);
2174        }
2175}
2176
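/*
 * blk-mq timeout handler.  When the controller is not LIVE the request is
 * completed immediately (NVME_SC_HOST_ABORTED_CMD) so it cannot block the
 * teardown or setup sequence; when it is LIVE, error recovery is triggered
 * and the block layer timer is reset.
 */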
2177static enum blk_eh_timer_return
2178nvme_tcp_timeout(struct request *rq, bool reserved)
2179{
2180        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2181        struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2182        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2183
2184        dev_warn(ctrl->device,
2185                "queue %d: timeout request %#x type %d\n",
2186                nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
2187
2188        if (ctrl->state != NVME_CTRL_LIVE) {
2189                /*
2190                 * If we are resetting, connecting or deleting, we should
2191                 * complete immediately because we may block the controller
2192                 * teardown or setup sequence:
2193                 * - ctrl disable/shutdown fabrics requests
2194                 * - connect requests
2195                 * - initialization admin requests
2196                 * - I/O requests that entered after unquiescing and
2197                 *   the controller stopped responding
2198                 *
2199                 * All other requests should be cancelled by the error
2200                 * recovery work, so it's fine that we fail it here.
2201                 */
2202                nvme_tcp_complete_timed_out(rq);
2203                return BLK_EH_DONE;
2204        }
2205
2206        /*
2207         * LIVE state should trigger the normal error recovery which will
2208         * handle completing this request.
2209         */
2210        nvme_tcp_error_recovery(ctrl);
2211        return BLK_EH_RESET_TIMER;
2212}
2213
2214static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
2215                        struct request *rq)
2216{
2217        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2218        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2219        struct nvme_command *c = &pdu->cmd;
2220
2221        c->common.flags |= NVME_CMD_SGL_METABUF;
2222
2223        if (!blk_rq_nr_phys_segments(rq))
2224                nvme_tcp_set_sg_null(c);
2225        else if (rq_data_dir(rq) == WRITE &&
2226            req->data_len <= nvme_tcp_inline_data_size(queue))
2227                nvme_tcp_set_sg_inline(queue, c, req->data_len);
2228        else
2229                nvme_tcp_set_sg_host_data(c, req->data_len);
2230
2231        return 0;
2232}
2233
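/*
 * Build the command capsule PDU for a request: initialize the per-request
 * send state, choose between in-capsule (inline) data for small writes and
 * a separate data transfer, and fill in the PDU header (lengths, digest
 * flags) before mapping the data SGL.
 */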
2234static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
2235                struct request *rq)
2236{
2237        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2238        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2239        struct nvme_tcp_queue *queue = req->queue;
2240        u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
2241        blk_status_t ret;
2242
2243        ret = nvme_setup_cmd(ns, rq, &pdu->cmd);
2244        if (ret)
2245                return ret;
2246
2247        req->state = NVME_TCP_SEND_CMD_PDU;
2248        req->offset = 0;
2249        req->data_sent = 0;
2250        req->pdu_len = 0;
2251        req->pdu_sent = 0;
2252        req->data_len = blk_rq_nr_phys_segments(rq) ?
2253                                blk_rq_payload_bytes(rq) : 0;
2254        req->curr_bio = rq->bio;
2255
2256        if (rq_data_dir(rq) == WRITE &&
2257            req->data_len <= nvme_tcp_inline_data_size(queue))
2258                req->pdu_len = req->data_len;
2259        else if (req->curr_bio)
2260                nvme_tcp_init_iter(req, READ);
2261
2262        pdu->hdr.type = nvme_tcp_cmd;
2263        pdu->hdr.flags = 0;
2264        if (queue->hdr_digest)
2265                pdu->hdr.flags |= NVME_TCP_F_HDGST;
2266        if (queue->data_digest && req->pdu_len) {
2267                pdu->hdr.flags |= NVME_TCP_F_DDGST;
2268                ddgst = nvme_tcp_ddgst_len(queue);
2269        }
2270        pdu->hdr.hlen = sizeof(*pdu);
2271        pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2272        pdu->hdr.plen =
2273                cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2274
2275        ret = nvme_tcp_map_data(queue, rq);
2276        if (unlikely(ret)) {
2277                nvme_cleanup_cmd(rq);
2278                dev_err(queue->ctrl->ctrl.device,
2279                        "Failed to map data (%d)\n", ret);
2280                return ret;
2281        }
2282
2283        return 0;
2284}
2285
2286static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
2287{
2288        struct nvme_tcp_queue *queue = hctx->driver_data;
2289
2290        if (!llist_empty(&queue->req_list))
2291                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
2292}
2293
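/*
 * blk-mq ->queue_rq handler: fail or requeue commands while the queue is
 * not live, otherwise build the command PDU, start the request and queue
 * it on the send path (bd->last controls batching).
 */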
2294static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2295                const struct blk_mq_queue_data *bd)
2296{
2297        struct nvme_ns *ns = hctx->queue->queuedata;
2298        struct nvme_tcp_queue *queue = hctx->driver_data;
2299        struct request *rq = bd->rq;
2300        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2301        bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2302        blk_status_t ret;
2303
2304        if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2305                return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq);
2306
2307        ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2308        if (unlikely(ret))
2309                return ret;
2310
2311        blk_mq_start_request(rq);
2312
2313        nvme_tcp_queue_request(req, true, bd->last);
2314
2315        return BLK_STS_OK;
2316}
2317
2318static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2319{
2320        struct nvme_tcp_ctrl *ctrl = set->driver_data;
2321        struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
2322
2323        if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
2324                /* separate read/write queues */
2325                set->map[HCTX_TYPE_DEFAULT].nr_queues =
2326                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2327                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2328                set->map[HCTX_TYPE_READ].nr_queues =
2329                        ctrl->io_queues[HCTX_TYPE_READ];
2330                set->map[HCTX_TYPE_READ].queue_offset =
2331                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2332        } else {
2333                /* shared read/write queues */
2334                set->map[HCTX_TYPE_DEFAULT].nr_queues =
2335                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2336                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2337                set->map[HCTX_TYPE_READ].nr_queues =
2338                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2339                set->map[HCTX_TYPE_READ].queue_offset = 0;
2340        }
2341        blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
2342        blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
2343
2344        if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
2345                /* map dedicated poll queues only if we have queues left */
2346                set->map[HCTX_TYPE_POLL].nr_queues =
2347                                ctrl->io_queues[HCTX_TYPE_POLL];
2348                set->map[HCTX_TYPE_POLL].queue_offset =
2349                        ctrl->io_queues[HCTX_TYPE_DEFAULT] +
2350                        ctrl->io_queues[HCTX_TYPE_READ];
2351                blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
2352        }
2353
2354        dev_info(ctrl->ctrl.device,
2355                "mapped %d/%d/%d default/read/poll queues.\n",
2356                ctrl->io_queues[HCTX_TYPE_DEFAULT],
2357                ctrl->io_queues[HCTX_TYPE_READ],
2358                ctrl->io_queues[HCTX_TYPE_POLL]);
2359
2360        return 0;
2361}
2362
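/*
 * blk-mq ->poll handler for poll queues: busy-poll the socket when
 * possible, receive any pending PDUs directly, and return the number of
 * completions reaped.
 */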
2363static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
2364{
2365        struct nvme_tcp_queue *queue = hctx->driver_data;
2366        struct sock *sk = queue->sock->sk;
2367
2368        if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2369                return 0;
2370
2371        set_bit(NVME_TCP_Q_POLLING, &queue->flags);
2372        if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
2373                sk_busy_loop(sk, true);
2374        nvme_tcp_try_recv(queue);
2375        clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
2376        return queue->nr_cqe;
2377}
2378
2379static const struct blk_mq_ops nvme_tcp_mq_ops = {
2380        .queue_rq       = nvme_tcp_queue_rq,
2381        .commit_rqs     = nvme_tcp_commit_rqs,
2382        .complete       = nvme_complete_rq,
2383        .init_request   = nvme_tcp_init_request,
2384        .exit_request   = nvme_tcp_exit_request,
2385        .init_hctx      = nvme_tcp_init_hctx,
2386        .timeout        = nvme_tcp_timeout,
2387        .map_queues     = nvme_tcp_map_queues,
2388        .poll           = nvme_tcp_poll,
2389};
2390
2391static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2392        .queue_rq       = nvme_tcp_queue_rq,
2393        .complete       = nvme_complete_rq,
2394        .init_request   = nvme_tcp_init_request,
2395        .exit_request   = nvme_tcp_exit_request,
2396        .init_hctx      = nvme_tcp_init_admin_hctx,
2397        .timeout        = nvme_tcp_timeout,
2398};
2399
2400static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2401        .name                   = "tcp",
2402        .module                 = THIS_MODULE,
2403        .flags                  = NVME_F_FABRICS,
2404        .reg_read32             = nvmf_reg_read32,
2405        .reg_read64             = nvmf_reg_read64,
2406        .reg_write32            = nvmf_reg_write32,
2407        .free_ctrl              = nvme_tcp_free_ctrl,
2408        .submit_async_event     = nvme_tcp_submit_async_event,
2409        .delete_ctrl            = nvme_tcp_delete_ctrl,
2410        .get_address            = nvmf_get_address,
2411};
2412
2413static bool
2414nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2415{
2416        struct nvme_tcp_ctrl *ctrl;
2417        bool found = false;
2418
2419        mutex_lock(&nvme_tcp_ctrl_mutex);
2420        list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2421                found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2422                if (found)
2423                        break;
2424        }
2425        mutex_unlock(&nvme_tcp_ctrl_mutex);
2426
2427        return found;
2428}
2429
2430static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2431                struct nvmf_ctrl_options *opts)
2432{
2433        struct nvme_tcp_ctrl *ctrl;
2434        int ret;
2435
2436        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2437        if (!ctrl)
2438                return ERR_PTR(-ENOMEM);
2439
2440        INIT_LIST_HEAD(&ctrl->list);
2441        ctrl->ctrl.opts = opts;
2442        ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2443                                opts->nr_poll_queues + 1;
2444        ctrl->ctrl.sqsize = opts->queue_size - 1;
2445        ctrl->ctrl.kato = opts->kato;
2446
2447        INIT_DELAYED_WORK(&ctrl->connect_work,
2448                        nvme_tcp_reconnect_ctrl_work);
2449        INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2450        INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2451
2452        if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2453                opts->trsvcid =
2454                        kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2455                if (!opts->trsvcid) {
2456                        ret = -ENOMEM;
2457                        goto out_free_ctrl;
2458                }
2459                opts->mask |= NVMF_OPT_TRSVCID;
2460        }
2461
2462        ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2463                        opts->traddr, opts->trsvcid, &ctrl->addr);
2464        if (ret) {
2465                pr_err("malformed address passed: %s:%s\n",
2466                        opts->traddr, opts->trsvcid);
2467                goto out_free_ctrl;
2468        }
2469
2470        if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2471                ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2472                        opts->host_traddr, NULL, &ctrl->src_addr);
2473                if (ret) {
2474                        pr_err("malformed src address passed: %s\n",
2475                               opts->host_traddr);
2476                        goto out_free_ctrl;
2477                }
2478        }
2479
2480        if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2481                ret = -EALREADY;
2482                goto out_free_ctrl;
2483        }
2484
2485        ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2486                                GFP_KERNEL);
2487        if (!ctrl->queues) {
2488                ret = -ENOMEM;
2489                goto out_free_ctrl;
2490        }
2491
2492        ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2493        if (ret)
2494                goto out_kfree_queues;
2495
2496        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2497                WARN_ON_ONCE(1);
2498                ret = -EINTR;
2499                goto out_uninit_ctrl;
2500        }
2501
2502        ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2503        if (ret)
2504                goto out_uninit_ctrl;
2505
2506        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2507                ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2508
2509        mutex_lock(&nvme_tcp_ctrl_mutex);
2510        list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2511        mutex_unlock(&nvme_tcp_ctrl_mutex);
2512
2513        return &ctrl->ctrl;
2514
2515out_uninit_ctrl:
2516        nvme_uninit_ctrl(&ctrl->ctrl);
2517        nvme_put_ctrl(&ctrl->ctrl);
2518        if (ret > 0)
2519                ret = -EIO;
2520        return ERR_PTR(ret);
2521out_kfree_queues:
2522        kfree(ctrl->queues);
2523out_free_ctrl:
2524        kfree(ctrl);
2525        return ERR_PTR(ret);
2526}
2527
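/*
 * Transport registration.  Once this module is loaded, a controller can
 * typically be connected from userspace with nvme-cli, for example
 * (illustrative address and NQN, not taken from this file):
 *
 *   nvme connect -t tcp -a 192.168.1.10 -s 4420 -n nqn.2018-01.example:subsys1
 */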
2528static struct nvmf_transport_ops nvme_tcp_transport = {
2529        .name           = "tcp",
2530        .module         = THIS_MODULE,
2531        .required_opts  = NVMF_OPT_TRADDR,
2532        .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2533                          NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2534                          NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2535                          NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2536                          NVMF_OPT_TOS,
2537        .create_ctrl    = nvme_tcp_create_ctrl,
2538};
2539
2540static int __init nvme_tcp_init_module(void)
2541{
2542        nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
2543                        WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2544        if (!nvme_tcp_wq)
2545                return -ENOMEM;
2546
2547        nvmf_register_transport(&nvme_tcp_transport);
2548        return 0;
2549}
2550
2551static void __exit nvme_tcp_cleanup_module(void)
2552{
2553        struct nvme_tcp_ctrl *ctrl;
2554
2555        nvmf_unregister_transport(&nvme_tcp_transport);
2556
2557        mutex_lock(&nvme_tcp_ctrl_mutex);
2558        list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2559                nvme_delete_ctrl(&ctrl->ctrl);
2560        mutex_unlock(&nvme_tcp_ctrl_mutex);
2561        flush_workqueue(nvme_delete_wq);
2562
2563        destroy_workqueue(nvme_tcp_wq);
2564}
2565
2566module_init(nvme_tcp_init_module);
2567module_exit(nvme_tcp_cleanup_module);
2568
2569MODULE_LICENSE("GPL v2");
2570