linux/drivers/nvme/host/tcp.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * NVMe over Fabrics TCP host.
   4 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
   5 */
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7#include <linux/module.h>
   8#include <linux/init.h>
   9#include <linux/slab.h>
  10#include <linux/err.h>
  11#include <linux/nvme-tcp.h>
  12#include <net/sock.h>
  13#include <net/tcp.h>
  14#include <linux/blk-mq.h>
  15#include <crypto/hash.h>
  16#include <net/busy_poll.h>
  17
  18#include "nvme.h"
  19#include "fabrics.h"
  20
  21struct nvme_tcp_queue;
  22
   23/* Define the socket priority to use for connections where it is desirable
   24 * that the NIC consider performing optimized packet processing or filtering.
   25 * A non-zero value is sufficient to indicate general consideration of any
   26 * possible optimization.  Making it a module param allows for alternative
   27 * values that may be unique to some NIC implementations.
   28 */
  29static int so_priority;
  30module_param(so_priority, int, 0644);
  31MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
  32
  33enum nvme_tcp_send_state {
  34        NVME_TCP_SEND_CMD_PDU = 0,
  35        NVME_TCP_SEND_H2C_PDU,
  36        NVME_TCP_SEND_DATA,
  37        NVME_TCP_SEND_DDGST,
  38};
  39
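/*
 * Per-command context.  For normal I/O it lives in the blk-mq request PDU
 * (blk_mq_rq_to_pdu()); it tracks the command PDU buffer, the payload
 * iterator and how far the send state machine has progressed.
 */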
  40struct nvme_tcp_request {
  41        struct nvme_request     req;
  42        void                    *pdu;
  43        struct nvme_tcp_queue   *queue;
  44        u32                     data_len;
  45        u32                     pdu_len;
  46        u32                     pdu_sent;
  47        u16                     ttag;
  48        __le16                  status;
  49        struct list_head        entry;
  50        struct llist_node       lentry;
  51        __le32                  ddgst;
  52
  53        struct bio              *curr_bio;
  54        struct iov_iter         iter;
  55
  56        /* send state */
  57        size_t                  offset;
  58        size_t                  data_sent;
  59        enum nvme_tcp_send_state state;
  60};
  61
  62enum nvme_tcp_queue_flags {
  63        NVME_TCP_Q_ALLOCATED    = 0,
  64        NVME_TCP_Q_LIVE         = 1,
  65        NVME_TCP_Q_POLLING      = 2,
  66};
  67
  68enum nvme_tcp_recv_state {
  69        NVME_TCP_RECV_PDU = 0,
  70        NVME_TCP_RECV_DATA,
  71        NVME_TCP_RECV_DDGST,
  72};
  73
  74struct nvme_tcp_ctrl;
  75struct nvme_tcp_queue {
  76        struct socket           *sock;
  77        struct work_struct      io_work;
  78        int                     io_cpu;
  79
  80        struct mutex            queue_lock;
  81        struct mutex            send_mutex;
  82        struct llist_head       req_list;
  83        struct list_head        send_list;
  84        bool                    more_requests;
  85
  86        /* recv state */
  87        void                    *pdu;
  88        int                     pdu_remaining;
  89        int                     pdu_offset;
  90        size_t                  data_remaining;
  91        size_t                  ddgst_remaining;
  92        unsigned int            nr_cqe;
  93
  94        /* send state */
  95        struct nvme_tcp_request *request;
  96
  97        int                     queue_size;
  98        size_t                  cmnd_capsule_len;
  99        struct nvme_tcp_ctrl    *ctrl;
 100        unsigned long           flags;
 101        bool                    rd_enabled;
 102
 103        bool                    hdr_digest;
 104        bool                    data_digest;
 105        struct ahash_request    *rcv_hash;
 106        struct ahash_request    *snd_hash;
 107        __le32                  exp_ddgst;
 108        __le32                  recv_ddgst;
 109
 110        struct page_frag_cache  pf_cache;
 111
 112        void (*state_change)(struct sock *);
 113        void (*data_ready)(struct sock *);
 114        void (*write_space)(struct sock *);
 115};
 116
 117struct nvme_tcp_ctrl {
 118        /* read only in the hot path */
 119        struct nvme_tcp_queue   *queues;
 120        struct blk_mq_tag_set   tag_set;
 121
 122        /* other member variables */
 123        struct list_head        list;
 124        struct blk_mq_tag_set   admin_tag_set;
 125        struct sockaddr_storage addr;
 126        struct sockaddr_storage src_addr;
 127        struct nvme_ctrl        ctrl;
 128
 129        struct work_struct      err_work;
 130        struct delayed_work     connect_work;
 131        struct nvme_tcp_request async_req;
 132        u32                     io_queues[HCTX_MAX_TYPES];
 133};
 134
 135static LIST_HEAD(nvme_tcp_ctrl_list);
 136static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
 137static struct workqueue_struct *nvme_tcp_wq;
 138static const struct blk_mq_ops nvme_tcp_mq_ops;
 139static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
 140static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
 141
 142static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
 143{
 144        return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
 145}
 146
 147static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
 148{
 149        return queue - queue->ctrl->queues;
 150}
 151
 152static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
 153{
 154        u32 queue_idx = nvme_tcp_queue_id(queue);
 155
 156        if (queue_idx == 0)
 157                return queue->ctrl->admin_tag_set.tags[queue_idx];
 158        return queue->ctrl->tag_set.tags[queue_idx - 1];
 159}
 160
 161static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
 162{
 163        return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 164}
 165
 166static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
 167{
 168        return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 169}
 170
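/*
 * In-capsule (inline) data size: whatever room the command capsule leaves
 * after the 64-byte SQE itself.
 */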
 171static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
 172{
 173        return queue->cmnd_capsule_len - sizeof(struct nvme_command);
 174}
 175
 176static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
 177{
 178        return req == &req->queue->ctrl->async_req;
 179}
 180
 181static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
 182{
 183        struct request *rq;
 184
 185        if (unlikely(nvme_tcp_async_req(req)))
 186                return false; /* async events don't have a request */
 187
 188        rq = blk_mq_rq_from_pdu(req);
 189
 190        return rq_data_dir(rq) == WRITE && req->data_len &&
 191                req->data_len <= nvme_tcp_inline_data_size(req->queue);
 192}
 193
 194static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
 195{
 196        return req->iter.bvec->bv_page;
 197}
 198
 199static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
 200{
 201        return req->iter.bvec->bv_offset + req->iter.iov_offset;
 202}
 203
 204static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
 205{
 206        return min_t(size_t, iov_iter_single_seg_count(&req->iter),
 207                        req->pdu_len - req->pdu_sent);
 208}
 209
 210static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
 211{
 212        return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
 213                        req->pdu_len - req->pdu_sent : 0;
 214}
 215
 216static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
 217                int len)
 218{
 219        return nvme_tcp_pdu_data_left(req) <= len;
 220}
 221
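/*
 * Point req->iter at the request payload: either the single special-payload
 * bvec or the bvecs of the current bio, honoring any offset already consumed
 * within the first bvec.
 */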
 222static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
 223                unsigned int dir)
 224{
 225        struct request *rq = blk_mq_rq_from_pdu(req);
 226        struct bio_vec *vec;
 227        unsigned int size;
 228        int nr_bvec;
 229        size_t offset;
 230
 231        if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
 232                vec = &rq->special_vec;
 233                nr_bvec = 1;
 234                size = blk_rq_payload_bytes(rq);
 235                offset = 0;
 236        } else {
 237                struct bio *bio = req->curr_bio;
 238                struct bvec_iter bi;
 239                struct bio_vec bv;
 240
 241                vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
 242                nr_bvec = 0;
 243                bio_for_each_bvec(bv, bio, bi) {
 244                        nr_bvec++;
 245                }
 246                size = bio->bi_iter.bi_size;
 247                offset = bio->bi_iter.bi_bvec_done;
 248        }
 249
 250        iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size);
 251        req->iter.iov_offset = offset;
 252}
 253
 254static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
 255                int len)
 256{
 257        req->data_sent += len;
 258        req->pdu_sent += len;
 259        iov_iter_advance(&req->iter, len);
 260        if (!iov_iter_count(&req->iter) &&
 261            req->data_sent < req->data_len) {
 262                req->curr_bio = req->curr_bio->bi_next;
 263                nvme_tcp_init_iter(req, WRITE);
 264        }
 265}
 266
 267static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
 268{
 269        int ret;
 270
 271        /* drain the send queue as much as we can... */
 272        do {
 273                ret = nvme_tcp_try_send(queue);
 274        } while (ret > 0);
 275}
 276
 277static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
 278{
 279        return !list_empty(&queue->send_list) ||
 280                !llist_empty(&queue->req_list) || queue->more_requests;
 281}
 282
 283static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 284                bool sync, bool last)
 285{
 286        struct nvme_tcp_queue *queue = req->queue;
 287        bool empty;
 288
 289        empty = llist_add(&req->lentry, &queue->req_list) &&
 290                list_empty(&queue->send_list) && !queue->request;
 291
  292        /*
  293         * If we're the first on the send_list and we can grab the send_mutex,
  294         * try to send directly; otherwise queue io_work. Only do that if we
  295         * are on the same cpu, so we don't introduce contention.
  296         */
 297        if (queue->io_cpu == raw_smp_processor_id() &&
 298            sync && empty && mutex_trylock(&queue->send_mutex)) {
 299                queue->more_requests = !last;
 300                nvme_tcp_send_all(queue);
 301                queue->more_requests = false;
 302                mutex_unlock(&queue->send_mutex);
 303        }
 304
 305        if (last && nvme_tcp_queue_more(queue))
 306                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 307}
 308
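/*
 * Submitters push requests onto the lockless req_list (llist); the sending
 * context splices them onto send_list, from which requests are fetched one
 * at a time for transmission.
 */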
 309static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
 310{
 311        struct nvme_tcp_request *req;
 312        struct llist_node *node;
 313
 314        for (node = llist_del_all(&queue->req_list); node; node = node->next) {
 315                req = llist_entry(node, struct nvme_tcp_request, lentry);
 316                list_add(&req->entry, &queue->send_list);
 317        }
 318}
 319
 320static inline struct nvme_tcp_request *
 321nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
 322{
 323        struct nvme_tcp_request *req;
 324
 325        req = list_first_entry_or_null(&queue->send_list,
 326                        struct nvme_tcp_request, entry);
 327        if (!req) {
 328                nvme_tcp_process_req_list(queue);
 329                req = list_first_entry_or_null(&queue->send_list,
 330                                struct nvme_tcp_request, entry);
 331                if (unlikely(!req))
 332                        return NULL;
 333        }
 334
 335        list_del(&req->entry);
 336        return req;
 337}
 338
 339static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
 340                __le32 *dgst)
 341{
 342        ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
 343        crypto_ahash_final(hash);
 344}
 345
 346static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
 347                struct page *page, off_t off, size_t len)
 348{
 349        struct scatterlist sg;
 350
 351        sg_init_marker(&sg, 1);
 352        sg_set_page(&sg, page, len, off);
 353        ahash_request_set_crypt(hash, &sg, NULL, len);
 354        crypto_ahash_update(hash);
 355}
 356
 357static inline void nvme_tcp_hdgst(struct ahash_request *hash,
 358                void *pdu, size_t len)
 359{
 360        struct scatterlist sg;
 361
 362        sg_init_one(&sg, pdu, len);
 363        ahash_request_set_crypt(hash, &sg, pdu + len, len);
 364        crypto_ahash_digest(hash);
 365}
 366
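/*
 * Recompute the header digest over the received PDU header and compare it
 * with the digest the controller appended after the header.
 */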
 367static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
 368                void *pdu, size_t pdu_len)
 369{
 370        struct nvme_tcp_hdr *hdr = pdu;
 371        __le32 recv_digest;
 372        __le32 exp_digest;
 373
 374        if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
 375                dev_err(queue->ctrl->ctrl.device,
 376                        "queue %d: header digest flag is cleared\n",
 377                        nvme_tcp_queue_id(queue));
 378                return -EPROTO;
 379        }
 380
 381        recv_digest = *(__le32 *)(pdu + hdr->hlen);
 382        nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
 383        exp_digest = *(__le32 *)(pdu + hdr->hlen);
 384        if (recv_digest != exp_digest) {
 385                dev_err(queue->ctrl->ctrl.device,
 386                        "header digest error: recv %#x expected %#x\n",
 387                        le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
 388                return -EIO;
 389        }
 390
 391        return 0;
 392}
 393
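/*
 * With data digests negotiated, any PDU that carries data must have the
 * DDGST flag set; also (re)initialize the receive hash for the payload.
 */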
 394static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
 395{
 396        struct nvme_tcp_hdr *hdr = pdu;
 397        u8 digest_len = nvme_tcp_hdgst_len(queue);
 398        u32 len;
 399
 400        len = le32_to_cpu(hdr->plen) - hdr->hlen -
 401                ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
 402
 403        if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
 404                dev_err(queue->ctrl->ctrl.device,
 405                        "queue %d: data digest flag is cleared\n",
 406                nvme_tcp_queue_id(queue));
 407                return -EPROTO;
 408        }
 409        crypto_ahash_init(queue->rcv_hash);
 410
 411        return 0;
 412}
 413
 414static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
 415                struct request *rq, unsigned int hctx_idx)
 416{
 417        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 418
 419        page_frag_free(req->pdu);
 420}
 421
 422static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 423                struct request *rq, unsigned int hctx_idx,
 424                unsigned int numa_node)
 425{
 426        struct nvme_tcp_ctrl *ctrl = set->driver_data;
 427        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 428        struct nvme_tcp_cmd_pdu *pdu;
 429        int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
 430        struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
 431        u8 hdgst = nvme_tcp_hdgst_len(queue);
 432
 433        req->pdu = page_frag_alloc(&queue->pf_cache,
 434                sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 435                GFP_KERNEL | __GFP_ZERO);
 436        if (!req->pdu)
 437                return -ENOMEM;
 438
 439        pdu = req->pdu;
 440        req->queue = queue;
 441        nvme_req(rq)->ctrl = &ctrl->ctrl;
 442        nvme_req(rq)->cmd = &pdu->cmd;
 443
 444        return 0;
 445}
 446
 447static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 448                unsigned int hctx_idx)
 449{
 450        struct nvme_tcp_ctrl *ctrl = data;
 451        struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
 452
 453        hctx->driver_data = queue;
 454        return 0;
 455}
 456
 457static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 458                unsigned int hctx_idx)
 459{
 460        struct nvme_tcp_ctrl *ctrl = data;
 461        struct nvme_tcp_queue *queue = &ctrl->queues[0];
 462
 463        hctx->driver_data = queue;
 464        return 0;
 465}
 466
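/*
 * Receive state is derived from what is still outstanding: PDU header bytes
 * first, then payload data, then the trailing data digest.
 */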
 467static enum nvme_tcp_recv_state
 468nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
 469{
 470        return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
 471                (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
 472                NVME_TCP_RECV_DATA;
 473}
 474
 475static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
 476{
 477        queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
 478                                nvme_tcp_hdgst_len(queue);
 479        queue->pdu_offset = 0;
 480        queue->data_remaining = -1;
 481        queue->ddgst_remaining = 0;
 482}
 483
 484static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
 485{
 486        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 487                return;
 488
 489        dev_warn(ctrl->device, "starting error recovery\n");
 490        queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
 491}
 492
 493static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 494                struct nvme_completion *cqe)
 495{
 496        struct nvme_tcp_request *req;
 497        struct request *rq;
 498
 499        rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id);
 500        if (!rq) {
 501                dev_err(queue->ctrl->ctrl.device,
 502                        "got bad cqe.command_id %#x on queue %d\n",
 503                        cqe->command_id, nvme_tcp_queue_id(queue));
 504                nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 505                return -EINVAL;
 506        }
 507
 508        req = blk_mq_rq_to_pdu(rq);
 509        if (req->status == cpu_to_le16(NVME_SC_SUCCESS))
 510                req->status = cqe->status;
 511
 512        if (!nvme_try_complete_req(rq, req->status, cqe->result))
 513                nvme_complete_rq(rq);
 514        queue->nr_cqe++;
 515
 516        return 0;
 517}
 518
 519static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
 520                struct nvme_tcp_data_pdu *pdu)
 521{
 522        struct request *rq;
 523
 524        rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
 525        if (!rq) {
 526                dev_err(queue->ctrl->ctrl.device,
 527                        "got bad c2hdata.command_id %#x on queue %d\n",
 528                        pdu->command_id, nvme_tcp_queue_id(queue));
 529                return -ENOENT;
 530        }
 531
 532        if (!blk_rq_payload_bytes(rq)) {
 533                dev_err(queue->ctrl->ctrl.device,
 534                        "queue %d tag %#x unexpected data\n",
 535                        nvme_tcp_queue_id(queue), rq->tag);
 536                return -EIO;
 537        }
 538
 539        queue->data_remaining = le32_to_cpu(pdu->data_length);
 540
 541        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
 542            unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
 543                dev_err(queue->ctrl->ctrl.device,
 544                        "queue %d tag %#x SUCCESS set but not last PDU\n",
 545                        nvme_tcp_queue_id(queue), rq->tag);
 546                nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 547                return -EPROTO;
 548        }
 549
 550        return 0;
 551}
 552
 553static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
 554                struct nvme_tcp_rsp_pdu *pdu)
 555{
 556        struct nvme_completion *cqe = &pdu->cqe;
 557        int ret = 0;
 558
 559        /*
 560         * AEN requests are special as they don't time out and can
 561         * survive any kind of queue freeze and often don't respond to
 562         * aborts.  We don't even bother to allocate a struct request
 563         * for them but rather special case them here.
 564         */
 565        if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
 566                                     cqe->command_id)))
 567                nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
 568                                &cqe->result);
 569        else
 570                ret = nvme_tcp_process_nvme_cqe(queue, cqe);
 571
 572        return ret;
 573}
 574
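/*
 * Build the H2CData PDU header for an R2T the controller sent us, after
 * sanity checking the requested transfer length and offset against what has
 * already been sent.
 */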
 575static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
 576                struct nvme_tcp_r2t_pdu *pdu)
 577{
 578        struct nvme_tcp_data_pdu *data = req->pdu;
 579        struct nvme_tcp_queue *queue = req->queue;
 580        struct request *rq = blk_mq_rq_from_pdu(req);
 581        u8 hdgst = nvme_tcp_hdgst_len(queue);
 582        u8 ddgst = nvme_tcp_ddgst_len(queue);
 583
 584        req->pdu_len = le32_to_cpu(pdu->r2t_length);
 585        req->pdu_sent = 0;
 586
 587        if (unlikely(!req->pdu_len)) {
 588                dev_err(queue->ctrl->ctrl.device,
 589                        "req %d r2t len is %u, probably a bug...\n",
 590                        rq->tag, req->pdu_len);
 591                return -EPROTO;
 592        }
 593
 594        if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
 595                dev_err(queue->ctrl->ctrl.device,
 596                        "req %d r2t len %u exceeded data len %u (%zu sent)\n",
 597                        rq->tag, req->pdu_len, req->data_len,
 598                        req->data_sent);
 599                return -EPROTO;
 600        }
 601
 602        if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
 603                dev_err(queue->ctrl->ctrl.device,
 604                        "req %d unexpected r2t offset %u (expected %zu)\n",
 605                        rq->tag, le32_to_cpu(pdu->r2t_offset),
 606                        req->data_sent);
 607                return -EPROTO;
 608        }
 609
 610        memset(data, 0, sizeof(*data));
 611        data->hdr.type = nvme_tcp_h2c_data;
 612        data->hdr.flags = NVME_TCP_F_DATA_LAST;
 613        if (queue->hdr_digest)
 614                data->hdr.flags |= NVME_TCP_F_HDGST;
 615        if (queue->data_digest)
 616                data->hdr.flags |= NVME_TCP_F_DDGST;
 617        data->hdr.hlen = sizeof(*data);
 618        data->hdr.pdo = data->hdr.hlen + hdgst;
 619        data->hdr.plen =
 620                cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
 621        data->ttag = pdu->ttag;
 622        data->command_id = nvme_cid(rq);
 623        data->data_offset = pdu->r2t_offset;
 624        data->data_length = cpu_to_le32(req->pdu_len);
 625        return 0;
 626}
 627
 628static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
 629                struct nvme_tcp_r2t_pdu *pdu)
 630{
 631        struct nvme_tcp_request *req;
 632        struct request *rq;
 633        int ret;
 634
 635        rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
 636        if (!rq) {
 637                dev_err(queue->ctrl->ctrl.device,
 638                        "got bad r2t.command_id %#x on queue %d\n",
 639                        pdu->command_id, nvme_tcp_queue_id(queue));
 640                return -ENOENT;
 641        }
 642        req = blk_mq_rq_to_pdu(rq);
 643
 644        ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
 645        if (unlikely(ret))
 646                return ret;
 647
 648        req->state = NVME_TCP_SEND_H2C_PDU;
 649        req->offset = 0;
 650
 651        nvme_tcp_queue_request(req, false, true);
 652
 653        return 0;
 654}
 655
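/*
 * Accumulate PDU header bytes from the skb; once the header is complete,
 * verify the digests (if negotiated) and dispatch on the PDU type.
 */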
 656static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 657                unsigned int *offset, size_t *len)
 658{
 659        struct nvme_tcp_hdr *hdr;
 660        char *pdu = queue->pdu;
 661        size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
 662        int ret;
 663
 664        ret = skb_copy_bits(skb, *offset,
 665                &pdu[queue->pdu_offset], rcv_len);
 666        if (unlikely(ret))
 667                return ret;
 668
 669        queue->pdu_remaining -= rcv_len;
 670        queue->pdu_offset += rcv_len;
 671        *offset += rcv_len;
 672        *len -= rcv_len;
 673        if (queue->pdu_remaining)
 674                return 0;
 675
 676        hdr = queue->pdu;
 677        if (queue->hdr_digest) {
 678                ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
 679                if (unlikely(ret))
 680                        return ret;
 681        }
 682
 683
 684        if (queue->data_digest) {
 685                ret = nvme_tcp_check_ddgst(queue, queue->pdu);
 686                if (unlikely(ret))
 687                        return ret;
 688        }
 689
 690        switch (hdr->type) {
 691        case nvme_tcp_c2h_data:
 692                return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
 693        case nvme_tcp_rsp:
 694                nvme_tcp_init_recv_ctx(queue);
 695                return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
 696        case nvme_tcp_r2t:
 697                nvme_tcp_init_recv_ctx(queue);
 698                return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
 699        default:
 700                dev_err(queue->ctrl->ctrl.device,
 701                        "unsupported pdu type (%d)\n", hdr->type);
 702                return -EINVAL;
 703        }
 704}
 705
 706static inline void nvme_tcp_end_request(struct request *rq, u16 status)
 707{
 708        union nvme_result res = {};
 709
 710        if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
 711                nvme_complete_rq(rq);
 712}
 713
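/*
 * Copy C2HData payload from the skb into the request's bio pages, feeding
 * the receive hash if data digests are enabled, and complete the request
 * here if the controller signalled SUCCESS and no data digest follows.
 */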
 714static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 715                              unsigned int *offset, size_t *len)
 716{
 717        struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 718        struct request *rq =
 719                nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 720        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 721
 722        while (true) {
 723                int recv_len, ret;
 724
 725                recv_len = min_t(size_t, *len, queue->data_remaining);
 726                if (!recv_len)
 727                        break;
 728
 729                if (!iov_iter_count(&req->iter)) {
 730                        req->curr_bio = req->curr_bio->bi_next;
 731
 732                        /*
  733                         * If we don't have any bios it means that the controller
  734                         * sent more data than we requested, hence error.
 735                         */
 736                        if (!req->curr_bio) {
 737                                dev_err(queue->ctrl->ctrl.device,
 738                                        "queue %d no space in request %#x",
 739                                        nvme_tcp_queue_id(queue), rq->tag);
 740                                nvme_tcp_init_recv_ctx(queue);
 741                                return -EIO;
 742                        }
 743                        nvme_tcp_init_iter(req, READ);
 744                }
 745
 746                /* we can read only from what is left in this bio */
 747                recv_len = min_t(size_t, recv_len,
 748                                iov_iter_count(&req->iter));
 749
 750                if (queue->data_digest)
 751                        ret = skb_copy_and_hash_datagram_iter(skb, *offset,
 752                                &req->iter, recv_len, queue->rcv_hash);
 753                else
 754                        ret = skb_copy_datagram_iter(skb, *offset,
 755                                        &req->iter, recv_len);
 756                if (ret) {
 757                        dev_err(queue->ctrl->ctrl.device,
 758                                "queue %d failed to copy request %#x data",
 759                                nvme_tcp_queue_id(queue), rq->tag);
 760                        return ret;
 761                }
 762
 763                *len -= recv_len;
 764                *offset += recv_len;
 765                queue->data_remaining -= recv_len;
 766        }
 767
 768        if (!queue->data_remaining) {
 769                if (queue->data_digest) {
 770                        nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
 771                        queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
 772                } else {
 773                        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 774                                nvme_tcp_end_request(rq,
 775                                                le16_to_cpu(req->status));
 776                                queue->nr_cqe++;
 777                        }
 778                        nvme_tcp_init_recv_ctx(queue);
 779                }
 780        }
 781
 782        return 0;
 783}
 784
 785static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
 786                struct sk_buff *skb, unsigned int *offset, size_t *len)
 787{
 788        struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 789        char *ddgst = (char *)&queue->recv_ddgst;
 790        size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
 791        off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
 792        int ret;
 793
 794        ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
 795        if (unlikely(ret))
 796                return ret;
 797
 798        queue->ddgst_remaining -= recv_len;
 799        *offset += recv_len;
 800        *len -= recv_len;
 801        if (queue->ddgst_remaining)
 802                return 0;
 803
 804        if (queue->recv_ddgst != queue->exp_ddgst) {
 805                struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
 806                                        pdu->command_id);
 807                struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 808
 809                req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR);
 810
 811                dev_err(queue->ctrl->ctrl.device,
 812                        "data digest error: recv %#x expected %#x\n",
 813                        le32_to_cpu(queue->recv_ddgst),
 814                        le32_to_cpu(queue->exp_ddgst));
 815        }
 816
 817        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 818                struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
 819                                        pdu->command_id);
 820                struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 821
 822                nvme_tcp_end_request(rq, le16_to_cpu(req->status));
 823                queue->nr_cqe++;
 824        }
 825
 826        nvme_tcp_init_recv_ctx(queue);
 827        return 0;
 828}
 829
 830static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
 831                             unsigned int offset, size_t len)
 832{
 833        struct nvme_tcp_queue *queue = desc->arg.data;
 834        size_t consumed = len;
 835        int result;
 836
 837        while (len) {
 838                switch (nvme_tcp_recv_state(queue)) {
 839                case NVME_TCP_RECV_PDU:
 840                        result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
 841                        break;
 842                case NVME_TCP_RECV_DATA:
 843                        result = nvme_tcp_recv_data(queue, skb, &offset, &len);
 844                        break;
 845                case NVME_TCP_RECV_DDGST:
 846                        result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
 847                        break;
 848                default:
 849                        result = -EFAULT;
 850                }
 851                if (result) {
 852                        dev_err(queue->ctrl->ctrl.device,
 853                                "receive failed:  %d\n", result);
 854                        queue->rd_enabled = false;
 855                        nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 856                        return result;
 857                }
 858        }
 859
 860        return consumed;
 861}
 862
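/*
 * Socket callbacks, installed on the queue's socket at setup time: they kick
 * io_work on the queue's CPU when data arrives or write space frees up, and
 * start error recovery on unexpected TCP state changes.
 */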
 863static void nvme_tcp_data_ready(struct sock *sk)
 864{
 865        struct nvme_tcp_queue *queue;
 866
 867        read_lock_bh(&sk->sk_callback_lock);
 868        queue = sk->sk_user_data;
 869        if (likely(queue && queue->rd_enabled) &&
 870            !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
 871                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 872        read_unlock_bh(&sk->sk_callback_lock);
 873}
 874
 875static void nvme_tcp_write_space(struct sock *sk)
 876{
 877        struct nvme_tcp_queue *queue;
 878
 879        read_lock_bh(&sk->sk_callback_lock);
 880        queue = sk->sk_user_data;
 881        if (likely(queue && sk_stream_is_writeable(sk))) {
 882                clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 883                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 884        }
 885        read_unlock_bh(&sk->sk_callback_lock);
 886}
 887
 888static void nvme_tcp_state_change(struct sock *sk)
 889{
 890        struct nvme_tcp_queue *queue;
 891
 892        read_lock_bh(&sk->sk_callback_lock);
 893        queue = sk->sk_user_data;
 894        if (!queue)
 895                goto done;
 896
 897        switch (sk->sk_state) {
 898        case TCP_CLOSE:
 899        case TCP_CLOSE_WAIT:
 900        case TCP_LAST_ACK:
 901        case TCP_FIN_WAIT1:
 902        case TCP_FIN_WAIT2:
 903                nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 904                break;
 905        default:
 906                dev_info(queue->ctrl->ctrl.device,
 907                        "queue %d socket state %d\n",
 908                        nvme_tcp_queue_id(queue), sk->sk_state);
 909        }
 910
 911        queue->state_change(sk);
 912done:
 913        read_unlock_bh(&sk->sk_callback_lock);
 914}
 915
 916static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
 917{
 918        queue->request = NULL;
 919}
 920
 921static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
 922{
 923        nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
 924}
 925
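/*
 * Transmit the data portion of the current PDU page by page with
 * kernel_sendpage(), falling back to sock_no_sendpage() for pages that are
 * not suitable for zero-copy, and update the send-side data digest as we go.
 */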
 926static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 927{
 928        struct nvme_tcp_queue *queue = req->queue;
 929        int req_data_len = req->data_len;
 930
 931        while (true) {
 932                struct page *page = nvme_tcp_req_cur_page(req);
 933                size_t offset = nvme_tcp_req_cur_offset(req);
 934                size_t len = nvme_tcp_req_cur_length(req);
 935                bool last = nvme_tcp_pdu_last_send(req, len);
 936                int req_data_sent = req->data_sent;
 937                int ret, flags = MSG_DONTWAIT;
 938
 939                if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
 940                        flags |= MSG_EOR;
 941                else
 942                        flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
 943
 944                if (sendpage_ok(page)) {
 945                        ret = kernel_sendpage(queue->sock, page, offset, len,
 946                                        flags);
 947                } else {
 948                        ret = sock_no_sendpage(queue->sock, page, offset, len,
 949                                        flags);
 950                }
 951                if (ret <= 0)
 952                        return ret;
 953
 954                if (queue->data_digest)
 955                        nvme_tcp_ddgst_update(queue->snd_hash, page,
 956                                        offset, ret);
 957
 958                /*
  959                 * Update the request iterator except for the last payload send
  960                 * in the request, where we don't want to modify it as we may
  961                 * compete with the RX path completing the request.
 962                 */
 963                if (req_data_sent + ret < req_data_len)
 964                        nvme_tcp_advance_req(req, ret);
 965
 966                /* fully successful last send in current PDU */
 967                if (last && ret == len) {
 968                        if (queue->data_digest) {
 969                                nvme_tcp_ddgst_final(queue->snd_hash,
 970                                        &req->ddgst);
 971                                req->state = NVME_TCP_SEND_DDGST;
 972                                req->offset = 0;
 973                        } else {
 974                                nvme_tcp_done_send_req(queue);
 975                        }
 976                        return 1;
 977                }
 978        }
 979        return -EAGAIN;
 980}
 981
 982static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 983{
 984        struct nvme_tcp_queue *queue = req->queue;
 985        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
 986        bool inline_data = nvme_tcp_has_inline_data(req);
 987        u8 hdgst = nvme_tcp_hdgst_len(queue);
 988        int len = sizeof(*pdu) + hdgst - req->offset;
 989        int flags = MSG_DONTWAIT;
 990        int ret;
 991
 992        if (inline_data || nvme_tcp_queue_more(queue))
 993                flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
 994        else
 995                flags |= MSG_EOR;
 996
 997        if (queue->hdr_digest && !req->offset)
 998                nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 999
1000        ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
1001                        offset_in_page(pdu) + req->offset, len,  flags);
1002        if (unlikely(ret <= 0))
1003                return ret;
1004
1005        len -= ret;
1006        if (!len) {
1007                if (inline_data) {
1008                        req->state = NVME_TCP_SEND_DATA;
1009                        if (queue->data_digest)
1010                                crypto_ahash_init(queue->snd_hash);
1011                } else {
1012                        nvme_tcp_done_send_req(queue);
1013                }
1014                return 1;
1015        }
1016        req->offset += ret;
1017
1018        return -EAGAIN;
1019}
1020
1021static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
1022{
1023        struct nvme_tcp_queue *queue = req->queue;
1024        struct nvme_tcp_data_pdu *pdu = req->pdu;
1025        u8 hdgst = nvme_tcp_hdgst_len(queue);
1026        int len = sizeof(*pdu) - req->offset + hdgst;
1027        int ret;
1028
1029        if (queue->hdr_digest && !req->offset)
1030                nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
1031
1032        ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
1033                        offset_in_page(pdu) + req->offset, len,
1034                        MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
1035        if (unlikely(ret <= 0))
1036                return ret;
1037
1038        len -= ret;
1039        if (!len) {
1040                req->state = NVME_TCP_SEND_DATA;
1041                if (queue->data_digest)
1042                        crypto_ahash_init(queue->snd_hash);
1043                return 1;
1044        }
1045        req->offset += ret;
1046
1047        return -EAGAIN;
1048}
1049
1050static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
1051{
1052        struct nvme_tcp_queue *queue = req->queue;
1053        size_t offset = req->offset;
1054        int ret;
1055        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1056        struct kvec iov = {
1057                .iov_base = (u8 *)&req->ddgst + req->offset,
1058                .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
1059        };
1060
1061        if (nvme_tcp_queue_more(queue))
1062                msg.msg_flags |= MSG_MORE;
1063        else
1064                msg.msg_flags |= MSG_EOR;
1065
1066        ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1067        if (unlikely(ret <= 0))
1068                return ret;
1069
1070        if (offset + ret == NVME_TCP_DIGEST_LENGTH) {
1071                nvme_tcp_done_send_req(queue);
1072                return 1;
1073        }
1074
1075        req->offset += ret;
1076        return -EAGAIN;
1077}
1078
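/*
 * Per-request send state machine: command PDU, then either inline data or an
 * H2CData PDU followed by data, then the data digest.  Returns a positive
 * value when a send step completed, 0 when there is nothing (more) to send
 * right now, and a negative error otherwise.
 */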
1079static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
1080{
1081        struct nvme_tcp_request *req;
1082        int ret = 1;
1083
1084        if (!queue->request) {
1085                queue->request = nvme_tcp_fetch_request(queue);
1086                if (!queue->request)
1087                        return 0;
1088        }
1089        req = queue->request;
1090
1091        if (req->state == NVME_TCP_SEND_CMD_PDU) {
1092                ret = nvme_tcp_try_send_cmd_pdu(req);
1093                if (ret <= 0)
1094                        goto done;
1095                if (!nvme_tcp_has_inline_data(req))
1096                        return ret;
1097        }
1098
1099        if (req->state == NVME_TCP_SEND_H2C_PDU) {
1100                ret = nvme_tcp_try_send_data_pdu(req);
1101                if (ret <= 0)
1102                        goto done;
1103        }
1104
1105        if (req->state == NVME_TCP_SEND_DATA) {
1106                ret = nvme_tcp_try_send_data(req);
1107                if (ret <= 0)
1108                        goto done;
1109        }
1110
1111        if (req->state == NVME_TCP_SEND_DDGST)
1112                ret = nvme_tcp_try_send_ddgst(req);
1113done:
1114        if (ret == -EAGAIN) {
1115                ret = 0;
1116        } else if (ret < 0) {
1117                dev_err(queue->ctrl->ctrl.device,
1118                        "failed to send request %d\n", ret);
1119                if (ret != -EPIPE && ret != -ECONNRESET)
1120                        nvme_tcp_fail_request(queue->request);
1121                nvme_tcp_done_send_req(queue);
1122        }
1123        return ret;
1124}
1125
1126static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1127{
1128        struct socket *sock = queue->sock;
1129        struct sock *sk = sock->sk;
1130        read_descriptor_t rd_desc;
1131        int consumed;
1132
1133        rd_desc.arg.data = queue;
1134        rd_desc.count = 1;
1135        lock_sock(sk);
1136        queue->nr_cqe = 0;
1137        consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1138        release_sock(sk);
1139        return consumed;
1140}
1141
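/*
 * Per-queue I/O worker: alternate between sending and receiving for up to
 * about a millisecond, then requeue itself once its quota is used up so
 * other queues sharing this CPU get a chance to run.
 */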
1142static void nvme_tcp_io_work(struct work_struct *w)
1143{
1144        struct nvme_tcp_queue *queue =
1145                container_of(w, struct nvme_tcp_queue, io_work);
1146        unsigned long deadline = jiffies + msecs_to_jiffies(1);
1147
1148        do {
1149                bool pending = false;
1150                int result;
1151
1152                if (mutex_trylock(&queue->send_mutex)) {
1153                        result = nvme_tcp_try_send(queue);
1154                        mutex_unlock(&queue->send_mutex);
1155                        if (result > 0)
1156                                pending = true;
1157                        else if (unlikely(result < 0))
1158                                break;
1159                }
1160
1161                result = nvme_tcp_try_recv(queue);
1162                if (result > 0)
1163                        pending = true;
1164                else if (unlikely(result < 0))
1165                        return;
1166
1167                if (!pending)
1168                        return;
1169
1170        } while (!time_after(jiffies, deadline)); /* quota is exhausted */
1171
1172        queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1173}
1174
1175static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1176{
1177        struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1178
1179        ahash_request_free(queue->rcv_hash);
1180        ahash_request_free(queue->snd_hash);
1181        crypto_free_ahash(tfm);
1182}
1183
1184static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1185{
1186        struct crypto_ahash *tfm;
1187
1188        tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1189        if (IS_ERR(tfm))
1190                return PTR_ERR(tfm);
1191
1192        queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1193        if (!queue->snd_hash)
1194                goto free_tfm;
1195        ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1196
1197        queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1198        if (!queue->rcv_hash)
1199                goto free_snd_hash;
1200        ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1201
1202        return 0;
1203free_snd_hash:
1204        ahash_request_free(queue->snd_hash);
1205free_tfm:
1206        crypto_free_ahash(tfm);
1207        return -ENOMEM;
1208}
1209
1210static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1211{
1212        struct nvme_tcp_request *async = &ctrl->async_req;
1213
1214        page_frag_free(async->pdu);
1215}
1216
1217static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1218{
1219        struct nvme_tcp_queue *queue = &ctrl->queues[0];
1220        struct nvme_tcp_request *async = &ctrl->async_req;
1221        u8 hdgst = nvme_tcp_hdgst_len(queue);
1222
1223        async->pdu = page_frag_alloc(&queue->pf_cache,
1224                sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1225                GFP_KERNEL | __GFP_ZERO);
1226        if (!async->pdu)
1227                return -ENOMEM;
1228
1229        async->queue = &ctrl->queues[0];
1230        return 0;
1231}
1232
1233static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1234{
1235        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1236        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1237
1238        if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1239                return;
1240
1241        if (queue->hdr_digest || queue->data_digest)
1242                nvme_tcp_free_crypto(queue);
1243
1244        sock_release(queue->sock);
1245        kfree(queue->pdu);
1246        mutex_destroy(&queue->send_mutex);
1247        mutex_destroy(&queue->queue_lock);
1248}
1249
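/*
 * NVMe/TCP connection establishment: send an ICReq PDU and validate the
 * controller's ICResp (type, length, PFV, digest settings, CPDA) against
 * what this host requested.
 */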
1250static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1251{
1252        struct nvme_tcp_icreq_pdu *icreq;
1253        struct nvme_tcp_icresp_pdu *icresp;
1254        struct msghdr msg = {};
1255        struct kvec iov;
1256        bool ctrl_hdgst, ctrl_ddgst;
1257        int ret;
1258
1259        icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1260        if (!icreq)
1261                return -ENOMEM;
1262
1263        icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1264        if (!icresp) {
1265                ret = -ENOMEM;
1266                goto free_icreq;
1267        }
1268
1269        icreq->hdr.type = nvme_tcp_icreq;
1270        icreq->hdr.hlen = sizeof(*icreq);
1271        icreq->hdr.pdo = 0;
1272        icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1273        icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1274        icreq->maxr2t = 0; /* single inflight r2t supported */
1275        icreq->hpda = 0; /* no alignment constraint */
1276        if (queue->hdr_digest)
1277                icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1278        if (queue->data_digest)
1279                icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1280
1281        iov.iov_base = icreq;
1282        iov.iov_len = sizeof(*icreq);
1283        ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1284        if (ret < 0)
1285                goto free_icresp;
1286
1287        memset(&msg, 0, sizeof(msg));
1288        iov.iov_base = icresp;
1289        iov.iov_len = sizeof(*icresp);
1290        ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1291                        iov.iov_len, msg.msg_flags);
1292        if (ret < 0)
1293                goto free_icresp;
1294
1295        ret = -EINVAL;
1296        if (icresp->hdr.type != nvme_tcp_icresp) {
1297                pr_err("queue %d: bad type returned %d\n",
1298                        nvme_tcp_queue_id(queue), icresp->hdr.type);
1299                goto free_icresp;
1300        }
1301
1302        if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1303                pr_err("queue %d: bad pdu length returned %d\n",
1304                        nvme_tcp_queue_id(queue), icresp->hdr.plen);
1305                goto free_icresp;
1306        }
1307
1308        if (icresp->pfv != NVME_TCP_PFV_1_0) {
1309                pr_err("queue %d: bad pfv returned %d\n",
1310                        nvme_tcp_queue_id(queue), icresp->pfv);
1311                goto free_icresp;
1312        }
1313
1314        ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1315        if ((queue->data_digest && !ctrl_ddgst) ||
1316            (!queue->data_digest && ctrl_ddgst)) {
1317                pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1318                        nvme_tcp_queue_id(queue),
1319                        queue->data_digest ? "enabled" : "disabled",
1320                        ctrl_ddgst ? "enabled" : "disabled");
1321                goto free_icresp;
1322        }
1323
1324        ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1325        if ((queue->hdr_digest && !ctrl_hdgst) ||
1326            (!queue->hdr_digest && ctrl_hdgst)) {
1327                pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1328                        nvme_tcp_queue_id(queue),
1329                        queue->hdr_digest ? "enabled" : "disabled",
1330                        ctrl_hdgst ? "enabled" : "disabled");
1331                goto free_icresp;
1332        }
1333
1334        if (icresp->cpda != 0) {
1335                pr_err("queue %d: unsupported cpda returned %d\n",
1336                        nvme_tcp_queue_id(queue), icresp->cpda);
1337                goto free_icresp;
1338        }
1339
1340        ret = 0;
1341free_icresp:
1342        kfree(icresp);
1343free_icreq:
1344        kfree(icreq);
1345        return ret;
1346}
1347
1348static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
1349{
1350        return nvme_tcp_queue_id(queue) == 0;
1351}
1352
1353static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
1354{
1355        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1356        int qid = nvme_tcp_queue_id(queue);
1357
1358        return !nvme_tcp_admin_queue(queue) &&
1359                qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
1360}
1361
1362static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
1363{
1364        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1365        int qid = nvme_tcp_queue_id(queue);
1366
1367        return !nvme_tcp_admin_queue(queue) &&
1368                !nvme_tcp_default_queue(queue) &&
1369                qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1370                          ctrl->io_queues[HCTX_TYPE_READ];
1371}
1372
1373static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
1374{
1375        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1376        int qid = nvme_tcp_queue_id(queue);
1377
1378        return !nvme_tcp_admin_queue(queue) &&
1379                !nvme_tcp_default_queue(queue) &&
1380                !nvme_tcp_read_queue(queue) &&
1381                qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1382                          ctrl->io_queues[HCTX_TYPE_READ] +
1383                          ctrl->io_queues[HCTX_TYPE_POLL];
1384}
1385
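/*
 * Pick the CPU that io_work runs on: queues are numbered within their HCTX
 * type group (default/read/poll) and spread round-robin over the online
 * CPUs.
 */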
1386static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
1387{
1388        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1389        int qid = nvme_tcp_queue_id(queue);
1390        int n = 0;
1391
1392        if (nvme_tcp_default_queue(queue))
1393                n = qid - 1;
1394        else if (nvme_tcp_read_queue(queue))
1395                n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
1396        else if (nvme_tcp_poll_queue(queue))
1397                n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
1398                                ctrl->io_queues[HCTX_TYPE_READ] - 1;
1399        queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1400}
1401
1402static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
1403                int qid, size_t queue_size)
1404{
1405        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1406        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1407        int ret, rcv_pdu_size;
1408
1409        mutex_init(&queue->queue_lock);
1410        queue->ctrl = ctrl;
1411        init_llist_head(&queue->req_list);
1412        INIT_LIST_HEAD(&queue->send_list);
1413        mutex_init(&queue->send_mutex);
1414        INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1415        queue->queue_size = queue_size;
1416
1417        if (qid > 0)
1418                queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1419        else
1420                queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1421                                                NVME_TCP_ADMIN_CCSZ;
1422
1423        ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1424                        IPPROTO_TCP, &queue->sock);
1425        if (ret) {
1426                dev_err(nctrl->device,
1427                        "failed to create socket: %d\n", ret);
1428                goto err_destroy_mutex;
1429        }
1430
1431        /* Single syn retry */
1432        tcp_sock_set_syncnt(queue->sock->sk, 1);
1433
1434        /* Set TCP no delay */
1435        tcp_sock_set_nodelay(queue->sock->sk);
1436
1437        /*
1438         * Cleanup whatever is sitting in the TCP transmit queue on socket
1439         * close. This is done to prevent stale data from being sent should
1440         * the network connection be restored before TCP times out.
1441         */
1442        sock_no_linger(queue->sock->sk);
1443
1444        if (so_priority > 0)
1445                sock_set_priority(queue->sock->sk, so_priority);
1446
1447        /* Set socket type of service */
1448        if (nctrl->opts->tos >= 0)
1449                ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
1450
1451        /* Set 10 seconds timeout for icresp recvmsg */
1452        queue->sock->sk->sk_rcvtimeo = 10 * HZ;
1453
1454        queue->sock->sk->sk_allocation = GFP_ATOMIC;
1455        nvme_tcp_set_queue_io_cpu(queue);
1456        queue->request = NULL;
1457        queue->data_remaining = 0;
1458        queue->ddgst_remaining = 0;
1459        queue->pdu_remaining = 0;
1460        queue->pdu_offset = 0;
1461        sk_set_memalloc(queue->sock->sk);
1462
1463        if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1464                ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1465                        sizeof(ctrl->src_addr));
1466                if (ret) {
1467                        dev_err(nctrl->device,
1468                                "failed to bind queue %d socket %d\n",
1469                                qid, ret);
1470                        goto err_sock;
1471                }
1472        }
1473
1474        if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) {
1475                char *iface = nctrl->opts->host_iface;
1476                sockptr_t optval = KERNEL_SOCKPTR(iface);
1477
1478                ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE,
1479                                      optval, strlen(iface));
1480                if (ret) {
1481                        dev_err(nctrl->device,
1482                          "failed to bind to interface %s queue %d err %d\n",
1483                          iface, qid, ret);
1484                        goto err_sock;
1485                }
1486        }
1487
1488        queue->hdr_digest = nctrl->opts->hdr_digest;
1489        queue->data_digest = nctrl->opts->data_digest;
1490        if (queue->hdr_digest || queue->data_digest) {
1491                ret = nvme_tcp_alloc_crypto(queue);
1492                if (ret) {
1493                        dev_err(nctrl->device,
1494                                "failed to allocate queue %d crypto\n", qid);
1495                        goto err_sock;
1496                }
1497        }
1498
1499        rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1500                        nvme_tcp_hdgst_len(queue);
1501        queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1502        if (!queue->pdu) {
1503                ret = -ENOMEM;
1504                goto err_crypto;
1505        }
1506
1507        dev_dbg(nctrl->device, "connecting queue %d\n",
1508                        nvme_tcp_queue_id(queue));
1509
1510        ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1511                sizeof(ctrl->addr), 0);
1512        if (ret) {
1513                dev_err(nctrl->device,
1514                        "failed to connect socket: %d\n", ret);
1515                goto err_rcv_pdu;
1516        }
1517
1518        ret = nvme_tcp_init_connection(queue);
1519        if (ret)
1520                goto err_init_connect;
1521
1522        queue->rd_enabled = true;
1523        set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1524        nvme_tcp_init_recv_ctx(queue);
1525
1526        write_lock_bh(&queue->sock->sk->sk_callback_lock);
1527        queue->sock->sk->sk_user_data = queue;
1528        queue->state_change = queue->sock->sk->sk_state_change;
1529        queue->data_ready = queue->sock->sk->sk_data_ready;
1530        queue->write_space = queue->sock->sk->sk_write_space;
1531        queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1532        queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1533        queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1534#ifdef CONFIG_NET_RX_BUSY_POLL
1535        queue->sock->sk->sk_ll_usec = 1;
1536#endif
1537        write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1538
1539        return 0;
1540
1541err_init_connect:
1542        kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1543err_rcv_pdu:
1544        kfree(queue->pdu);
1545err_crypto:
1546        if (queue->hdr_digest || queue->data_digest)
1547                nvme_tcp_free_crypto(queue);
1548err_sock:
1549        sock_release(queue->sock);
1550        queue->sock = NULL;
1551err_destroy_mutex:
1552        mutex_destroy(&queue->send_mutex);
1553        mutex_destroy(&queue->queue_lock);
1554        return ret;
1555}
1556
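/*
 * Detach the queue from its socket: clear sk_user_data and restore the
 * data_ready/state_change/write_space callbacks that were saved when the
 * queue was allocated, so no further socket events reach this queue.
 */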
1557static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1558{
1559        struct socket *sock = queue->sock;
1560
1561        write_lock_bh(&sock->sk->sk_callback_lock);
1562        sock->sk->sk_user_data  = NULL;
1563        sock->sk->sk_data_ready = queue->data_ready;
1564        sock->sk->sk_state_change = queue->state_change;
1565        sock->sk->sk_write_space  = queue->write_space;
1566        write_unlock_bh(&sock->sk->sk_callback_lock);
1567}
1568
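/*
 * Quiesce a queue: shut the socket down in both directions, restore the
 * original socket callbacks and wait for any in-flight io_work to finish.
 */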
1569static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1570{
1571        kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1572        nvme_tcp_restore_sock_calls(queue);
1573        cancel_work_sync(&queue->io_work);
1574}
1575
1576static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1577{
1578        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1579        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1580
1581        mutex_lock(&queue->queue_lock);
1582        if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1583                __nvme_tcp_stop_queue(queue);
1584        mutex_unlock(&queue->queue_lock);
1585}
1586
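/*
 * Send the fabrics connect command for the admin queue (idx == 0) or an
 * I/O queue and mark the queue LIVE on success; on failure, stop the
 * queue again if it had been allocated.
 */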
1587static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1588{
1589        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1590        int ret;
1591
1592        if (idx)
1593                ret = nvmf_connect_io_queue(nctrl, idx);
1594        else
1595                ret = nvmf_connect_admin_queue(nctrl);
1596
1597        if (!ret) {
1598                set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1599        } else {
1600                if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
1601                        __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1602                dev_err(nctrl->device,
1603                        "failed to connect queue: %d ret=%d\n", idx, ret);
1604        }
1605        return ret;
1606}
1607
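/*
 * Set up the blk-mq tag set: the admin set uses a single hardware queue
 * with NVME_AQ_MQ_TAG_DEPTH tags and the admin timeout, while the I/O
 * set uses sqsize + 1 tags per queue, queue_count - 1 hardware queues,
 * the I/O timeout and extra queue maps when poll queues were requested.
 */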
1608static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
1609                bool admin)
1610{
1611        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1612        struct blk_mq_tag_set *set;
1613        int ret;
1614
1615        if (admin) {
1616                set = &ctrl->admin_tag_set;
1617                memset(set, 0, sizeof(*set));
1618                set->ops = &nvme_tcp_admin_mq_ops;
1619                set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1620                set->reserved_tags = NVMF_RESERVED_TAGS;
1621                set->numa_node = nctrl->numa_node;
1622                set->flags = BLK_MQ_F_BLOCKING;
1623                set->cmd_size = sizeof(struct nvme_tcp_request);
1624                set->driver_data = ctrl;
1625                set->nr_hw_queues = 1;
1626                set->timeout = NVME_ADMIN_TIMEOUT;
1627        } else {
1628                set = &ctrl->tag_set;
1629                memset(set, 0, sizeof(*set));
1630                set->ops = &nvme_tcp_mq_ops;
1631                set->queue_depth = nctrl->sqsize + 1;
1632                set->reserved_tags = NVMF_RESERVED_TAGS;
1633                set->numa_node = nctrl->numa_node;
1634                set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
1635                set->cmd_size = sizeof(struct nvme_tcp_request);
1636                set->driver_data = ctrl;
1637                set->nr_hw_queues = nctrl->queue_count - 1;
1638                set->timeout = NVME_IO_TIMEOUT;
1639                set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
1640        }
1641
1642        ret = blk_mq_alloc_tag_set(set);
1643        if (ret)
1644                return ERR_PTR(ret);
1645
1646        return set;
1647}
1648
1649static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1650{
1651        if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1652                cancel_work_sync(&ctrl->async_event_work);
1653                nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1654                to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1655        }
1656
1657        nvme_tcp_free_queue(ctrl, 0);
1658}
1659
1660static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1661{
1662        int i;
1663
1664        for (i = 1; i < ctrl->queue_count; i++)
1665                nvme_tcp_free_queue(ctrl, i);
1666}
1667
1668static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1669{
1670        int i;
1671
1672        for (i = 1; i < ctrl->queue_count; i++)
1673                nvme_tcp_stop_queue(ctrl, i);
1674}
1675
1676static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
1677{
1678        int i, ret = 0;
1679
1680        for (i = 1; i < ctrl->queue_count; i++) {
1681                ret = nvme_tcp_start_queue(ctrl, i);
1682                if (ret)
1683                        goto out_stop_queues;
1684        }
1685
1686        return 0;
1687
1688out_stop_queues:
1689        for (i--; i >= 1; i--)
1690                nvme_tcp_stop_queue(ctrl, i);
1691        return ret;
1692}
1693
1694static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1695{
1696        int ret;
1697
1698        ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
1699        if (ret)
1700                return ret;
1701
1702        ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1703        if (ret)
1704                goto out_free_queue;
1705
1706        return 0;
1707
1708out_free_queue:
1709        nvme_tcp_free_queue(ctrl, 0);
1710        return ret;
1711}
1712
1713static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1714{
1715        int i, ret;
1716
1717        for (i = 1; i < ctrl->queue_count; i++) {
1718                ret = nvme_tcp_alloc_queue(ctrl, i,
1719                                ctrl->sqsize + 1);
1720                if (ret)
1721                        goto out_free_queues;
1722        }
1723
1724        return 0;
1725
1726out_free_queues:
1727        for (i--; i >= 1; i--)
1728                nvme_tcp_free_queue(ctrl, i);
1729
1730        return ret;
1731}
1732
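/*
 * Upper bound on the number of I/O queues to request from the
 * controller: the default, write and poll queue counts are each capped
 * at the number of online CPUs.
 */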
1733static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1734{
1735        unsigned int nr_io_queues;
1736
1737        nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
1738        nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
1739        nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
1740
1741        return nr_io_queues;
1742}
1743
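/*
 * Distribute the granted I/O queues between the DEFAULT, READ and POLL
 * hctx types, honoring the requested write/poll queue counts when enough
 * queues are available and otherwise falling back to queues shared
 * between reads and writes.
 */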
1744static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
1745                unsigned int nr_io_queues)
1746{
1747        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1748        struct nvmf_ctrl_options *opts = nctrl->opts;
1749
1750        if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
1751                /*
1752                 * Separate read/write queues: hand out dedicated
1753                 * default queues only after we have sufficient read
1754                 * queues.
1755                 */
1756                ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
1757                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
1758                ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1759                        min(opts->nr_write_queues, nr_io_queues);
1760                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1761        } else {
1762                /*
1763                 * Shared read/write queues: either no write queues were
1764                 * requested, or we don't have a sufficient queue count
1765                 * for dedicated default queues.
1766                 */
1767                ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1768                        min(opts->nr_io_queues, nr_io_queues);
1769                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1770        }
1771
1772        if (opts->nr_poll_queues && nr_io_queues) {
1773                /* map dedicated poll queues only if we have queues left */
1774                ctrl->io_queues[HCTX_TYPE_POLL] =
1775                        min(opts->nr_poll_queues, nr_io_queues);
1776        }
1777}
1778
1779static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1780{
1781        unsigned int nr_io_queues;
1782        int ret;
1783
1784        nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1785        ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1786        if (ret)
1787                return ret;
1788
1789        if (nr_io_queues == 0) {
1790                dev_err(ctrl->device,
1791                        "unable to set any I/O queues\n");
1792                return -ENOMEM;
1793        }
1794
1795        ctrl->queue_count = nr_io_queues + 1;
1796        dev_info(ctrl->device,
1797                "creating %d I/O queues.\n", nr_io_queues);
1798
1799        nvme_tcp_set_io_queues(ctrl, nr_io_queues);
1800
1801        return __nvme_tcp_alloc_io_queues(ctrl);
1802}
1803
1804static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1805{
1806        nvme_tcp_stop_io_queues(ctrl);
1807        if (remove) {
1808                blk_cleanup_queue(ctrl->connect_q);
1809                blk_mq_free_tag_set(ctrl->tagset);
1810        }
1811        nvme_tcp_free_io_queues(ctrl);
1812}
1813
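/*
 * Allocate and start the I/O queues.  For a new controller this also
 * allocates the I/O tag set and the connect_q request queue; on
 * reconnect it instead unfreezes the existing queues and updates the
 * number of hardware queues.
 */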
1814static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1815{
1816        int ret;
1817
1818        ret = nvme_tcp_alloc_io_queues(ctrl);
1819        if (ret)
1820                return ret;
1821
1822        if (new) {
1823                ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
1824                if (IS_ERR(ctrl->tagset)) {
1825                        ret = PTR_ERR(ctrl->tagset);
1826                        goto out_free_io_queues;
1827                }
1828
1829                ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
1830                if (IS_ERR(ctrl->connect_q)) {
1831                        ret = PTR_ERR(ctrl->connect_q);
1832                        goto out_free_tag_set;
1833                }
1834        }
1835
1836        ret = nvme_tcp_start_io_queues(ctrl);
1837        if (ret)
1838                goto out_cleanup_connect_q;
1839
1840        if (!new) {
1841                nvme_start_queues(ctrl);
1842                if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
1843                        /*
1844                         * If we timed out waiting for freeze, we are likely to
1845                         * be stuck.  Fail the controller initialization just
1846                         * to be safe.
1847                         */
1848                        ret = -ENODEV;
1849                        goto out_wait_freeze_timed_out;
1850                }
1851                blk_mq_update_nr_hw_queues(ctrl->tagset,
1852                        ctrl->queue_count - 1);
1853                nvme_unfreeze(ctrl);
1854        }
1855
1856        return 0;
1857
1858out_wait_freeze_timed_out:
1859        nvme_stop_queues(ctrl);
1860        nvme_sync_io_queues(ctrl);
1861        nvme_tcp_stop_io_queues(ctrl);
1862out_cleanup_connect_q:
1863        nvme_cancel_tagset(ctrl);
1864        if (new)
1865                blk_cleanup_queue(ctrl->connect_q);
1866out_free_tag_set:
1867        if (new)
1868                blk_mq_free_tag_set(ctrl->tagset);
1869out_free_io_queues:
1870        nvme_tcp_free_io_queues(ctrl);
1871        return ret;
1872}
1873
1874static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1875{
1876        nvme_tcp_stop_queue(ctrl, 0);
1877        if (remove) {
1878                blk_cleanup_queue(ctrl->admin_q);
1879                blk_cleanup_queue(ctrl->fabrics_q);
1880                blk_mq_free_tag_set(ctrl->admin_tagset);
1881        }
1882        nvme_tcp_free_admin_queue(ctrl);
1883}
1884
1885static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1886{
1887        int error;
1888
1889        error = nvme_tcp_alloc_admin_queue(ctrl);
1890        if (error)
1891                return error;
1892
1893        if (new) {
1894                ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
1895                if (IS_ERR(ctrl->admin_tagset)) {
1896                        error = PTR_ERR(ctrl->admin_tagset);
1897                        goto out_free_queue;
1898                }
1899
1900                ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
1901                if (IS_ERR(ctrl->fabrics_q)) {
1902                        error = PTR_ERR(ctrl->fabrics_q);
1903                        goto out_free_tagset;
1904                }
1905
1906                ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
1907                if (IS_ERR(ctrl->admin_q)) {
1908                        error = PTR_ERR(ctrl->admin_q);
1909                        goto out_cleanup_fabrics_q;
1910                }
1911        }
1912
1913        error = nvme_tcp_start_queue(ctrl, 0);
1914        if (error)
1915                goto out_cleanup_queue;
1916
1917        error = nvme_enable_ctrl(ctrl);
1918        if (error)
1919                goto out_stop_queue;
1920
1921        blk_mq_unquiesce_queue(ctrl->admin_q);
1922
1923        error = nvme_init_ctrl_finish(ctrl);
1924        if (error)
1925                goto out_quiesce_queue;
1926
1927        return 0;
1928
1929out_quiesce_queue:
1930        blk_mq_quiesce_queue(ctrl->admin_q);
1931        blk_sync_queue(ctrl->admin_q);
1932out_stop_queue:
1933        nvme_tcp_stop_queue(ctrl, 0);
1934        nvme_cancel_admin_tagset(ctrl);
1935out_cleanup_queue:
1936        if (new)
1937                blk_cleanup_queue(ctrl->admin_q);
1938out_cleanup_fabrics_q:
1939        if (new)
1940                blk_cleanup_queue(ctrl->fabrics_q);
1941out_free_tagset:
1942        if (new)
1943                blk_mq_free_tag_set(ctrl->admin_tagset);
1944out_free_queue:
1945        nvme_tcp_free_admin_queue(ctrl);
1946        return error;
1947}
1948
1949static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1950                bool remove)
1951{
1952        blk_mq_quiesce_queue(ctrl->admin_q);
1953        blk_sync_queue(ctrl->admin_q);
1954        nvme_tcp_stop_queue(ctrl, 0);
1955        nvme_cancel_admin_tagset(ctrl);
1956        if (remove)
1957                blk_mq_unquiesce_queue(ctrl->admin_q);
1958        nvme_tcp_destroy_admin_queue(ctrl, remove);
1959}
1960
1961static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1962                bool remove)
1963{
1964        if (ctrl->queue_count <= 1)
1965                return;
1966        blk_mq_quiesce_queue(ctrl->admin_q);
1967        nvme_start_freeze(ctrl);
1968        nvme_stop_queues(ctrl);
1969        nvme_sync_io_queues(ctrl);
1970        nvme_tcp_stop_io_queues(ctrl);
1971        nvme_cancel_tagset(ctrl);
1972        if (remove)
1973                nvme_start_queues(ctrl);
1974        nvme_tcp_destroy_io_queues(ctrl, remove);
1975}
1976
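/*
 * After a failed connect attempt, either schedule another reconnect
 * after reconnect_delay (while the controller is still CONNECTING and
 * nvmf_should_reconnect() allows it) or delete the controller.
 */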
1977static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
1978{
1979        /* If we are resetting/deleting then do nothing */
1980        if (ctrl->state != NVME_CTRL_CONNECTING) {
1981                WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
1982                        ctrl->state == NVME_CTRL_LIVE);
1983                return;
1984        }
1985
1986        if (nvmf_should_reconnect(ctrl)) {
1987                dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
1988                        ctrl->opts->reconnect_delay);
1989                queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
1990                                ctrl->opts->reconnect_delay * HZ);
1991        } else {
1992                dev_info(ctrl->device, "Removing controller...\n");
1993                nvme_delete_ctrl(ctrl);
1994        }
1995}
1996
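/*
 * Establish (or re-establish) the association: configure the admin
 * queue, validate transport constraints (icdoff must be zero, SGLs must
 * be supported, queue sizes are clamped), configure the I/O queues and
 * move the controller to the LIVE state.
 */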
1997static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
1998{
1999        struct nvmf_ctrl_options *opts = ctrl->opts;
2000        int ret;
2001
2002        ret = nvme_tcp_configure_admin_queue(ctrl, new);
2003        if (ret)
2004                return ret;
2005
2006        if (ctrl->icdoff) {
2007                ret = -EOPNOTSUPP;
2008                dev_err(ctrl->device, "icdoff is not supported!\n");
2009                goto destroy_admin;
2010        }
2011
2012        if (!nvme_ctrl_sgl_supported(ctrl)) {
2013                ret = -EOPNOTSUPP;
2014                dev_err(ctrl->device, "Mandatory sgls are not supported!\n");
2015                goto destroy_admin;
2016        }
2017
2018        if (opts->queue_size > ctrl->sqsize + 1)
2019                dev_warn(ctrl->device,
2020                        "queue_size %zu > ctrl sqsize %u, clamping down\n",
2021                        opts->queue_size, ctrl->sqsize + 1);
2022
2023        if (ctrl->sqsize + 1 > ctrl->maxcmd) {
2024                dev_warn(ctrl->device,
2025                        "sqsize %u > ctrl maxcmd %u, clamping down\n",
2026                        ctrl->sqsize + 1, ctrl->maxcmd);
2027                ctrl->sqsize = ctrl->maxcmd - 1;
2028        }
2029
2030        if (ctrl->queue_count > 1) {
2031                ret = nvme_tcp_configure_io_queues(ctrl, new);
2032                if (ret)
2033                        goto destroy_admin;
2034        }
2035
2036        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
2037                /*
2038                 * A state change failure is ok if we started ctrl delete,
2039                 * unless we're in the middle of creating a new controller,
2040                 * in order to avoid races with the teardown flow.
2041                 */
2042                WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2043                             ctrl->state != NVME_CTRL_DELETING_NOIO);
2044                WARN_ON_ONCE(new);
2045                ret = -EINVAL;
2046                goto destroy_io;
2047        }
2048
2049        nvme_start_ctrl(ctrl);
2050        return 0;
2051
2052destroy_io:
2053        if (ctrl->queue_count > 1) {
2054                nvme_stop_queues(ctrl);
2055                nvme_sync_io_queues(ctrl);
2056                nvme_tcp_stop_io_queues(ctrl);
2057                nvme_cancel_tagset(ctrl);
2058                nvme_tcp_destroy_io_queues(ctrl, new);
2059        }
2060destroy_admin:
2061        blk_mq_quiesce_queue(ctrl->admin_q);
2062        blk_sync_queue(ctrl->admin_q);
2063        nvme_tcp_stop_queue(ctrl, 0);
2064        nvme_cancel_admin_tagset(ctrl);
2065        nvme_tcp_destroy_admin_queue(ctrl, new);
2066        return ret;
2067}
2068
2069static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
2070{
2071        struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
2072                        struct nvme_tcp_ctrl, connect_work);
2073        struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2074
2075        ++ctrl->nr_reconnects;
2076
2077        if (nvme_tcp_setup_ctrl(ctrl, false))
2078                goto requeue;
2079
2080        dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
2081                        ctrl->nr_reconnects);
2082
2083        ctrl->nr_reconnects = 0;
2084
2085        return;
2086
2087requeue:
2088        dev_info(ctrl->device, "Failed reconnect attempt %d\n",
2089                        ctrl->nr_reconnects);
2090        nvme_tcp_reconnect_or_remove(ctrl);
2091}
2092
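/*
 * Error recovery: tear down the I/O and admin queues, unquiesce them so
 * pending requests fail fast, and move to CONNECTING so the reconnect
 * logic can try to re-establish the association.
 */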
2093static void nvme_tcp_error_recovery_work(struct work_struct *work)
2094{
2095        struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
2096                                struct nvme_tcp_ctrl, err_work);
2097        struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2098
2099        nvme_stop_keep_alive(ctrl);
2100        nvme_tcp_teardown_io_queues(ctrl, false);
2101        /* unquiesce to fast-fail pending requests */
2102        nvme_start_queues(ctrl);
2103        nvme_tcp_teardown_admin_queue(ctrl, false);
2104        blk_mq_unquiesce_queue(ctrl->admin_q);
2105
2106        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2107                /* state change failure is ok if we started ctrl delete */
2108                WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2109                             ctrl->state != NVME_CTRL_DELETING_NOIO);
2110                return;
2111        }
2112
2113        nvme_tcp_reconnect_or_remove(ctrl);
2114}
2115
2116static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2117{
2118        cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
2119        cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
2120
2121        nvme_tcp_teardown_io_queues(ctrl, shutdown);
2122        blk_mq_quiesce_queue(ctrl->admin_q);
2123        if (shutdown)
2124                nvme_shutdown_ctrl(ctrl);
2125        else
2126                nvme_disable_ctrl(ctrl);
2127        nvme_tcp_teardown_admin_queue(ctrl, shutdown);
2128}
2129
2130static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
2131{
2132        nvme_tcp_teardown_ctrl(ctrl, true);
2133}
2134
2135static void nvme_reset_ctrl_work(struct work_struct *work)
2136{
2137        struct nvme_ctrl *ctrl =
2138                container_of(work, struct nvme_ctrl, reset_work);
2139
2140        nvme_stop_ctrl(ctrl);
2141        nvme_tcp_teardown_ctrl(ctrl, false);
2142
2143        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2144                /* state change failure is ok if we started ctrl delete */
2145                WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2146                             ctrl->state != NVME_CTRL_DELETING_NOIO);
2147                return;
2148        }
2149
2150        if (nvme_tcp_setup_ctrl(ctrl, false))
2151                goto out_fail;
2152
2153        return;
2154
2155out_fail:
2156        ++ctrl->nr_reconnects;
2157        nvme_tcp_reconnect_or_remove(ctrl);
2158}
2159
2160static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
2161{
2162        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
2163
2164        if (list_empty(&ctrl->list))
2165                goto free_ctrl;
2166
2167        mutex_lock(&nvme_tcp_ctrl_mutex);
2168        list_del(&ctrl->list);
2169        mutex_unlock(&nvme_tcp_ctrl_mutex);
2170
2171        nvmf_free_options(nctrl->opts);
2172free_ctrl:
2173        kfree(ctrl->queues);
2174        kfree(ctrl);
2175}
2176
2177static void nvme_tcp_set_sg_null(struct nvme_command *c)
2178{
2179        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2180
2181        sg->addr = 0;
2182        sg->length = 0;
2183        sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2184                        NVME_SGL_FMT_TRANSPORT_A;
2185}
2186
2187static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
2188                struct nvme_command *c, u32 data_len)
2189{
2190        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2191
2192        sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
2193        sg->length = cpu_to_le32(data_len);
2194        sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
2195}
2196
2197static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
2198                u32 data_len)
2199{
2200        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2201
2202        sg->addr = 0;
2203        sg->length = cpu_to_le32(data_len);
2204        sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2205                        NVME_SGL_FMT_TRANSPORT_A;
2206}
2207
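/*
 * Submit the Asynchronous Event Request on the admin queue.  The command
 * is built in the pre-allocated async_req PDU (it carries no data) and
 * queued directly rather than going through blk-mq.
 */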
2208static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
2209{
2210        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
2211        struct nvme_tcp_queue *queue = &ctrl->queues[0];
2212        struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
2213        struct nvme_command *cmd = &pdu->cmd;
2214        u8 hdgst = nvme_tcp_hdgst_len(queue);
2215
2216        memset(pdu, 0, sizeof(*pdu));
2217        pdu->hdr.type = nvme_tcp_cmd;
2218        if (queue->hdr_digest)
2219                pdu->hdr.flags |= NVME_TCP_F_HDGST;
2220        pdu->hdr.hlen = sizeof(*pdu);
2221        pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
2222
2223        cmd->common.opcode = nvme_admin_async_event;
2224        cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
2225        cmd->common.flags |= NVME_CMD_SGL_METABUF;
2226        nvme_tcp_set_sg_null(cmd);
2227
2228        ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
2229        ctrl->async_req.offset = 0;
2230        ctrl->async_req.curr_bio = NULL;
2231        ctrl->async_req.data_len = 0;
2232
2233        nvme_tcp_queue_request(&ctrl->async_req, true, true);
2234}
2235
2236static void nvme_tcp_complete_timed_out(struct request *rq)
2237{
2238        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2239        struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2240
2241        nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
2242        if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) {
2243                nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
2244                blk_mq_complete_request(rq);
2245        }
2246}
2247
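/*
 * blk-mq timeout handler.  Outside of the LIVE state the request is
 * completed immediately with NVME_SC_HOST_ABORTED_CMD so it cannot block
 * controller setup or teardown; in the LIVE state error recovery is
 * kicked off and the timer is reset so recovery can cancel the request.
 */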
2248static enum blk_eh_timer_return
2249nvme_tcp_timeout(struct request *rq, bool reserved)
2250{
2251        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2252        struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2253        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2254
2255        dev_warn(ctrl->device,
2256                "queue %d: timeout request %#x type %d\n",
2257                nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
2258
2259        if (ctrl->state != NVME_CTRL_LIVE) {
2260                 * If we are resetting, connecting or deleting, we should
2261                 * complete immediately because we may block the controller
2262                 * teardown or setup sequence:
2263                 * teardown or setup sequence
2264                 * - ctrl disable/shutdown fabrics requests
2265                 * - connect requests
2266                 * - initialization admin requests
2267                 * - I/O requests that entered after unquiescing and
2268                 *   the controller stopped responding
2269                 *
2270                 * All other requests should be cancelled by the error
2271                 * recovery work, so it's fine that we fail it here.
2272                 */
2273                nvme_tcp_complete_timed_out(rq);
2274                return BLK_EH_DONE;
2275        }
2276
2277        /*
2278         * LIVE state should trigger the normal error recovery which will
2279         * handle completing this request.
2280         */
2281        nvme_tcp_error_recovery(ctrl);
2282        return BLK_EH_RESET_TIMER;
2283}
2284
2285static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
2286                        struct request *rq)
2287{
2288        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2289        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2290        struct nvme_command *c = &pdu->cmd;
2291
2292        c->common.flags |= NVME_CMD_SGL_METABUF;
2293
2294        if (!blk_rq_nr_phys_segments(rq))
2295                nvme_tcp_set_sg_null(c);
2296        else if (rq_data_dir(rq) == WRITE &&
2297            req->data_len <= nvme_tcp_inline_data_size(queue))
2298                nvme_tcp_set_sg_inline(queue, c, req->data_len);
2299        else
2300                nvme_tcp_set_sg_host_data(c, req->data_len);
2301
2302        return 0;
2303}
2304
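/*
 * Turn a block layer request into an NVMe/TCP command capsule: reset the
 * per-request send state, set up the data iterator, decide whether the
 * data is sent inline, and fill in the PDU header fields (digest flags,
 * hlen, pdo and plen) before mapping the data descriptor.
 */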
2305static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
2306                struct request *rq)
2307{
2308        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2309        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2310        struct nvme_tcp_queue *queue = req->queue;
2311        u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
2312        blk_status_t ret;
2313
2314        ret = nvme_setup_cmd(ns, rq);
2315        if (ret)
2316                return ret;
2317
2318        req->state = NVME_TCP_SEND_CMD_PDU;
2319        req->status = cpu_to_le16(NVME_SC_SUCCESS);
2320        req->offset = 0;
2321        req->data_sent = 0;
2322        req->pdu_len = 0;
2323        req->pdu_sent = 0;
2324        req->data_len = blk_rq_nr_phys_segments(rq) ?
2325                                blk_rq_payload_bytes(rq) : 0;
2326        req->curr_bio = rq->bio;
2327        if (req->curr_bio && req->data_len)
2328                nvme_tcp_init_iter(req, rq_data_dir(rq));
2329
2330        if (rq_data_dir(rq) == WRITE &&
2331            req->data_len <= nvme_tcp_inline_data_size(queue))
2332                req->pdu_len = req->data_len;
2333
2334        pdu->hdr.type = nvme_tcp_cmd;
2335        pdu->hdr.flags = 0;
2336        if (queue->hdr_digest)
2337                pdu->hdr.flags |= NVME_TCP_F_HDGST;
2338        if (queue->data_digest && req->pdu_len) {
2339                pdu->hdr.flags |= NVME_TCP_F_DDGST;
2340                ddgst = nvme_tcp_ddgst_len(queue);
2341        }
2342        pdu->hdr.hlen = sizeof(*pdu);
2343        pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2344        pdu->hdr.plen =
2345                cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2346
2347        ret = nvme_tcp_map_data(queue, rq);
2348        if (unlikely(ret)) {
2349                nvme_cleanup_cmd(rq);
2350                dev_err(queue->ctrl->ctrl.device,
2351                        "Failed to map data (%d)\n", ret);
2352                return ret;
2353        }
2354
2355        return 0;
2356}
2357
2358static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
2359{
2360        struct nvme_tcp_queue *queue = hctx->driver_data;
2361
2362        if (!llist_empty(&queue->req_list))
2363                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
2364}
2365
2366static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2367                const struct blk_mq_queue_data *bd)
2368{
2369        struct nvme_ns *ns = hctx->queue->queuedata;
2370        struct nvme_tcp_queue *queue = hctx->driver_data;
2371        struct request *rq = bd->rq;
2372        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2373        bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2374        blk_status_t ret;
2375
2376        if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2377                return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
2378
2379        ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2380        if (unlikely(ret))
2381                return ret;
2382
2383        blk_mq_start_request(rq);
2384
2385        nvme_tcp_queue_request(req, true, bd->last);
2386
2387        return BLK_STS_OK;
2388}
2389
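/*
 * Map blk-mq hardware contexts onto the controller's TCP queues.  With
 * dedicated write queues, the DEFAULT and READ types get distinct queue
 * ranges; otherwise both share the same queues.  Poll queues, when
 * configured, are mapped after the default and read ranges.
 */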
2390static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2391{
2392        struct nvme_tcp_ctrl *ctrl = set->driver_data;
2393        struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
2394
2395        if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
2396                /* separate read/write queues */
2397                set->map[HCTX_TYPE_DEFAULT].nr_queues =
2398                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2399                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2400                set->map[HCTX_TYPE_READ].nr_queues =
2401                        ctrl->io_queues[HCTX_TYPE_READ];
2402                set->map[HCTX_TYPE_READ].queue_offset =
2403                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2404        } else {
2405                /* shared read/write queues */
2406                set->map[HCTX_TYPE_DEFAULT].nr_queues =
2407                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2408                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2409                set->map[HCTX_TYPE_READ].nr_queues =
2410                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2411                set->map[HCTX_TYPE_READ].queue_offset = 0;
2412        }
2413        blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
2414        blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
2415
2416        if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
2417                /* map dedicated poll queues only if we have queues left */
2418                set->map[HCTX_TYPE_POLL].nr_queues =
2419                                ctrl->io_queues[HCTX_TYPE_POLL];
2420                set->map[HCTX_TYPE_POLL].queue_offset =
2421                        ctrl->io_queues[HCTX_TYPE_DEFAULT] +
2422                        ctrl->io_queues[HCTX_TYPE_READ];
2423                blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
2424        }
2425
2426        dev_info(ctrl->ctrl.device,
2427                "mapped %d/%d/%d default/read/poll queues.\n",
2428                ctrl->io_queues[HCTX_TYPE_DEFAULT],
2429                ctrl->io_queues[HCTX_TYPE_READ],
2430                ctrl->io_queues[HCTX_TYPE_POLL]);
2431
2432        return 0;
2433}
2434
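/*
 * blk-mq poll callback: busy-poll the socket when possible, then reap
 * received completions directly from the polling context and return the
 * number of completions processed.
 */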
2435static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
2436{
2437        struct nvme_tcp_queue *queue = hctx->driver_data;
2438        struct sock *sk = queue->sock->sk;
2439
2440        if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2441                return 0;
2442
2443        set_bit(NVME_TCP_Q_POLLING, &queue->flags);
2444        if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
2445                sk_busy_loop(sk, true);
2446        nvme_tcp_try_recv(queue);
2447        clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
2448        return queue->nr_cqe;
2449}
2450
2451static const struct blk_mq_ops nvme_tcp_mq_ops = {
2452        .queue_rq       = nvme_tcp_queue_rq,
2453        .commit_rqs     = nvme_tcp_commit_rqs,
2454        .complete       = nvme_complete_rq,
2455        .init_request   = nvme_tcp_init_request,
2456        .exit_request   = nvme_tcp_exit_request,
2457        .init_hctx      = nvme_tcp_init_hctx,
2458        .timeout        = nvme_tcp_timeout,
2459        .map_queues     = nvme_tcp_map_queues,
2460        .poll           = nvme_tcp_poll,
2461};
2462
2463static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2464        .queue_rq       = nvme_tcp_queue_rq,
2465        .complete       = nvme_complete_rq,
2466        .init_request   = nvme_tcp_init_request,
2467        .exit_request   = nvme_tcp_exit_request,
2468        .init_hctx      = nvme_tcp_init_admin_hctx,
2469        .timeout        = nvme_tcp_timeout,
2470};
2471
2472static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2473        .name                   = "tcp",
2474        .module                 = THIS_MODULE,
2475        .flags                  = NVME_F_FABRICS,
2476        .reg_read32             = nvmf_reg_read32,
2477        .reg_read64             = nvmf_reg_read64,
2478        .reg_write32            = nvmf_reg_write32,
2479        .free_ctrl              = nvme_tcp_free_ctrl,
2480        .submit_async_event     = nvme_tcp_submit_async_event,
2481        .delete_ctrl            = nvme_tcp_delete_ctrl,
2482        .get_address            = nvmf_get_address,
2483};
2484
2485static bool
2486nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2487{
2488        struct nvme_tcp_ctrl *ctrl;
2489        bool found = false;
2490
2491        mutex_lock(&nvme_tcp_ctrl_mutex);
2492        list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2493                found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2494                if (found)
2495                        break;
2496        }
2497        mutex_unlock(&nvme_tcp_ctrl_mutex);
2498
2499        return found;
2500}
2501
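/*
 * Create a new TCP controller: parse and validate the target and host
 * addresses (and optional interface), reject duplicate connections
 * unless explicitly allowed, allocate the queue array, register with
 * the core and perform the initial association setup.
 */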
2502static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2503                struct nvmf_ctrl_options *opts)
2504{
2505        struct nvme_tcp_ctrl *ctrl;
2506        int ret;
2507
2508        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2509        if (!ctrl)
2510                return ERR_PTR(-ENOMEM);
2511
2512        INIT_LIST_HEAD(&ctrl->list);
2513        ctrl->ctrl.opts = opts;
2514        ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2515                                opts->nr_poll_queues + 1;
2516        ctrl->ctrl.sqsize = opts->queue_size - 1;
2517        ctrl->ctrl.kato = opts->kato;
2518
2519        INIT_DELAYED_WORK(&ctrl->connect_work,
2520                        nvme_tcp_reconnect_ctrl_work);
2521        INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2522        INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2523
2524        if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2525                opts->trsvcid =
2526                        kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2527                if (!opts->trsvcid) {
2528                        ret = -ENOMEM;
2529                        goto out_free_ctrl;
2530                }
2531                opts->mask |= NVMF_OPT_TRSVCID;
2532        }
2533
2534        ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2535                        opts->traddr, opts->trsvcid, &ctrl->addr);
2536        if (ret) {
2537                pr_err("malformed address passed: %s:%s\n",
2538                        opts->traddr, opts->trsvcid);
2539                goto out_free_ctrl;
2540        }
2541
2542        if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2543                ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2544                        opts->host_traddr, NULL, &ctrl->src_addr);
2545                if (ret) {
2546                        pr_err("malformed src address passed: %s\n",
2547                               opts->host_traddr);
2548                        goto out_free_ctrl;
2549                }
2550        }
2551
2552        if (opts->mask & NVMF_OPT_HOST_IFACE) {
2553                if (!__dev_get_by_name(&init_net, opts->host_iface)) {
2554                        pr_err("invalid interface passed: %s\n",
2555                               opts->host_iface);
2556                        ret = -ENODEV;
2557                        goto out_free_ctrl;
2558                }
2559        }
2560
2561        if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2562                ret = -EALREADY;
2563                goto out_free_ctrl;
2564        }
2565
2566        ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2567                                GFP_KERNEL);
2568        if (!ctrl->queues) {
2569                ret = -ENOMEM;
2570                goto out_free_ctrl;
2571        }
2572
2573        ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2574        if (ret)
2575                goto out_kfree_queues;
2576
2577        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2578                WARN_ON_ONCE(1);
2579                ret = -EINTR;
2580                goto out_uninit_ctrl;
2581        }
2582
2583        ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2584        if (ret)
2585                goto out_uninit_ctrl;
2586
2587        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2588                ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2589
2590        mutex_lock(&nvme_tcp_ctrl_mutex);
2591        list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2592        mutex_unlock(&nvme_tcp_ctrl_mutex);
2593
2594        return &ctrl->ctrl;
2595
2596out_uninit_ctrl:
2597        nvme_uninit_ctrl(&ctrl->ctrl);
2598        nvme_put_ctrl(&ctrl->ctrl);
2599        if (ret > 0)
2600                ret = -EIO;
2601        return ERR_PTR(ret);
2602out_kfree_queues:
2603        kfree(ctrl->queues);
2604out_free_ctrl:
2605        kfree(ctrl);
2606        return ERR_PTR(ret);
2607}
2608
2609static struct nvmf_transport_ops nvme_tcp_transport = {
2610        .name           = "tcp",
2611        .module         = THIS_MODULE,
2612        .required_opts  = NVMF_OPT_TRADDR,
2613        .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2614                          NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2615                          NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2616                          NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2617                          NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
2618        .create_ctrl    = nvme_tcp_create_ctrl,
2619};
2620
2621static int __init nvme_tcp_init_module(void)
2622{
2623        nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
2624                        WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2625        if (!nvme_tcp_wq)
2626                return -ENOMEM;
2627
2628        nvmf_register_transport(&nvme_tcp_transport);
2629        return 0;
2630}
2631
2632static void __exit nvme_tcp_cleanup_module(void)
2633{
2634        struct nvme_tcp_ctrl *ctrl;
2635
2636        nvmf_unregister_transport(&nvme_tcp_transport);
2637
2638        mutex_lock(&nvme_tcp_ctrl_mutex);
2639        list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2640                nvme_delete_ctrl(&ctrl->ctrl);
2641        mutex_unlock(&nvme_tcp_ctrl_mutex);
2642        flush_workqueue(nvme_delete_wq);
2643
2644        destroy_workqueue(nvme_tcp_wq);
2645}
2646
2647module_init(nvme_tcp_init_module);
2648module_exit(nvme_tcp_cleanup_module);
2649
2650MODULE_LICENSE("GPL v2");
2651