linux/drivers/nvme/host/tcp.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * NVMe over Fabrics TCP host.
   4 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
   5 */
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7#include <linux/module.h>
   8#include <linux/init.h>
   9#include <linux/slab.h>
  10#include <linux/err.h>
  11#include <linux/nvme-tcp.h>
  12#include <net/sock.h>
  13#include <net/tcp.h>
  14#include <linux/blk-mq.h>
  15#include <crypto/hash.h>
  16#include <net/busy_poll.h>
  17
  18#include "nvme.h"
  19#include "fabrics.h"
  20
  21struct nvme_tcp_queue;
  22
  23/* Define the socket priority to use for connections where it is desirable
  24 * that the NIC consider performing optimized packet processing or filtering.
  25 * A non-zero value is sufficient to indicate general consideration of any
  26 * possible optimization.  Making it a module param allows for alternative
  27 * values that may be unique to some NIC implementations.
  28 */
  29static int so_priority;
  30module_param(so_priority, int, 0644);
  31MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
  32
  33enum nvme_tcp_send_state {
  34        NVME_TCP_SEND_CMD_PDU = 0,
  35        NVME_TCP_SEND_H2C_PDU,
  36        NVME_TCP_SEND_DATA,
  37        NVME_TCP_SEND_DDGST,
  38};
  39
  40struct nvme_tcp_request {
  41        struct nvme_request     req;
  42        void                    *pdu;
  43        struct nvme_tcp_queue   *queue;
  44        u32                     data_len;
  45        u32                     pdu_len;
  46        u32                     pdu_sent;
  47        u16                     ttag;
  48        struct list_head        entry;
  49        struct llist_node       lentry;
  50        __le32                  ddgst;
  51
  52        struct bio              *curr_bio;
  53        struct iov_iter         iter;
  54
  55        /* send state */
  56        size_t                  offset;
  57        size_t                  data_sent;
  58        enum nvme_tcp_send_state state;
  59};
  60
  61enum nvme_tcp_queue_flags {
  62        NVME_TCP_Q_ALLOCATED    = 0,
  63        NVME_TCP_Q_LIVE         = 1,
  64        NVME_TCP_Q_POLLING      = 2,
  65};
  66
  67enum nvme_tcp_recv_state {
  68        NVME_TCP_RECV_PDU = 0,
  69        NVME_TCP_RECV_DATA,
  70        NVME_TCP_RECV_DDGST,
  71};
  72
  73struct nvme_tcp_ctrl;
  74struct nvme_tcp_queue {
  75        struct socket           *sock;
  76        struct work_struct      io_work;
  77        int                     io_cpu;
  78
  79        struct mutex            queue_lock;
  80        struct mutex            send_mutex;
  81        struct llist_head       req_list;
  82        struct list_head        send_list;
  83        bool                    more_requests;
  84
  85        /* recv state */
  86        void                    *pdu;
  87        int                     pdu_remaining;
  88        int                     pdu_offset;
  89        size_t                  data_remaining;
  90        size_t                  ddgst_remaining;
  91        unsigned int            nr_cqe;
  92
  93        /* send state */
  94        struct nvme_tcp_request *request;
  95
  96        int                     queue_size;
  97        size_t                  cmnd_capsule_len;
  98        struct nvme_tcp_ctrl    *ctrl;
  99        unsigned long           flags;
 100        bool                    rd_enabled;
 101
 102        bool                    hdr_digest;
 103        bool                    data_digest;
 104        struct ahash_request    *rcv_hash;
 105        struct ahash_request    *snd_hash;
 106        __le32                  exp_ddgst;
 107        __le32                  recv_ddgst;
 108
 109        struct page_frag_cache  pf_cache;
 110
 111        void (*state_change)(struct sock *);
 112        void (*data_ready)(struct sock *);
 113        void (*write_space)(struct sock *);
 114};
 115
 116struct nvme_tcp_ctrl {
 117        /* read only in the hot path */
 118        struct nvme_tcp_queue   *queues;
 119        struct blk_mq_tag_set   tag_set;
 120
 121        /* other member variables */
 122        struct list_head        list;
 123        struct blk_mq_tag_set   admin_tag_set;
 124        struct sockaddr_storage addr;
 125        struct sockaddr_storage src_addr;
 126        struct nvme_ctrl        ctrl;
 127
 128        struct work_struct      err_work;
 129        struct delayed_work     connect_work;
 130        struct nvme_tcp_request async_req;
 131        u32                     io_queues[HCTX_MAX_TYPES];
 132};
 133
 134static LIST_HEAD(nvme_tcp_ctrl_list);
 135static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
 136static struct workqueue_struct *nvme_tcp_wq;
 137static const struct blk_mq_ops nvme_tcp_mq_ops;
 138static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
 139static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
 140
 141static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
 142{
 143        return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
 144}
 145
 146static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
 147{
 148        return queue - queue->ctrl->queues;
 149}
 150
 151static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
 152{
 153        u32 queue_idx = nvme_tcp_queue_id(queue);
 154
 155        if (queue_idx == 0)
 156                return queue->ctrl->admin_tag_set.tags[queue_idx];
 157        return queue->ctrl->tag_set.tags[queue_idx - 1];
 158}
 159
 160static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
 161{
 162        return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 163}
 164
 165static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
 166{
 167        return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 168}
 169
 170static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
 171{
 172        return queue->cmnd_capsule_len - sizeof(struct nvme_command);
 173}
 174
 175static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
 176{
 177        return req == &req->queue->ctrl->async_req;
 178}
 179
 180static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
 181{
 182        struct request *rq;
 183
 184        if (unlikely(nvme_tcp_async_req(req)))
 185                return false; /* async events don't have a request */
 186
 187        rq = blk_mq_rq_from_pdu(req);
 188
 189        return rq_data_dir(rq) == WRITE && req->data_len &&
 190                req->data_len <= nvme_tcp_inline_data_size(req->queue);
 191}
 192
 193static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
 194{
 195        return req->iter.bvec->bv_page;
 196}
 197
 198static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
 199{
 200        return req->iter.bvec->bv_offset + req->iter.iov_offset;
 201}
 202
 203static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
 204{
 205        return min_t(size_t, iov_iter_single_seg_count(&req->iter),
 206                        req->pdu_len - req->pdu_sent);
 207}
 208
 209static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
 210{
 211        return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
 212                        req->pdu_len - req->pdu_sent : 0;
 213}
 214
 215static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
 216                int len)
 217{
 218        return nvme_tcp_pdu_data_left(req) <= len;
 219}
 220
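/*
 * Set up req->iter as a bvec iterator over the request data: either the
 * single special payload vector, or the bvecs of the current bio starting
 * at the offset the bio's iterator has already consumed.
 */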
 221static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
 222                unsigned int dir)
 223{
 224        struct request *rq = blk_mq_rq_from_pdu(req);
 225        struct bio_vec *vec;
 226        unsigned int size;
 227        int nr_bvec;
 228        size_t offset;
 229
 230        if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
 231                vec = &rq->special_vec;
 232                nr_bvec = 1;
 233                size = blk_rq_payload_bytes(rq);
 234                offset = 0;
 235        } else {
 236                struct bio *bio = req->curr_bio;
 237                struct bvec_iter bi;
 238                struct bio_vec bv;
 239
 240                vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
 241                nr_bvec = 0;
 242                bio_for_each_bvec(bv, bio, bi) {
 243                        nr_bvec++;
 244                }
 245                size = bio->bi_iter.bi_size;
 246                offset = bio->bi_iter.bi_bvec_done;
 247        }
 248
 249        iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size);
 250        req->iter.iov_offset = offset;
 251}
 252
 253static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
 254                int len)
 255{
 256        req->data_sent += len;
 257        req->pdu_sent += len;
 258        iov_iter_advance(&req->iter, len);
 259        if (!iov_iter_count(&req->iter) &&
 260            req->data_sent < req->data_len) {
 261                req->curr_bio = req->curr_bio->bi_next;
 262                nvme_tcp_init_iter(req, WRITE);
 263        }
 264}
 265
 266static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
 267{
 268        int ret;
 269
 270        /* drain the send queue as much as we can... */
 271        do {
 272                ret = nvme_tcp_try_send(queue);
 273        } while (ret > 0);
 274}
 275
 276static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 277                bool sync, bool last)
 278{
 279        struct nvme_tcp_queue *queue = req->queue;
 280        bool empty;
 281
 282        empty = llist_add(&req->lentry, &queue->req_list) &&
 283                list_empty(&queue->send_list) && !queue->request;
 284
  285        /*
  286         * If we're the first on the send_list, try to send directly;
  287         * otherwise queue io_work. Only do the direct send if we are
  288         * on the same cpu, so we don't introduce contention.
  289         */
 290        if (queue->io_cpu == raw_smp_processor_id() &&
 291            sync && empty && mutex_trylock(&queue->send_mutex)) {
 292                queue->more_requests = !last;
 293                nvme_tcp_send_all(queue);
 294                queue->more_requests = false;
 295                mutex_unlock(&queue->send_mutex);
 296        } else if (last) {
 297                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 298        }
 299}
 300
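/*
 * Splice everything producers queued on the lockless req_list onto
 * send_list, which is only touched from the sending context.
 * llist_del_all() hands back the entries newest-first and list_add()
 * prepends each one, so send_list ends up ordered oldest-first again.
 */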
 301static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
 302{
 303        struct nvme_tcp_request *req;
 304        struct llist_node *node;
 305
 306        for (node = llist_del_all(&queue->req_list); node; node = node->next) {
 307                req = llist_entry(node, struct nvme_tcp_request, lentry);
 308                list_add(&req->entry, &queue->send_list);
 309        }
 310}
 311
 312static inline struct nvme_tcp_request *
 313nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
 314{
 315        struct nvme_tcp_request *req;
 316
 317        req = list_first_entry_or_null(&queue->send_list,
 318                        struct nvme_tcp_request, entry);
 319        if (!req) {
 320                nvme_tcp_process_req_list(queue);
 321                req = list_first_entry_or_null(&queue->send_list,
 322                                struct nvme_tcp_request, entry);
 323                if (unlikely(!req))
 324                        return NULL;
 325        }
 326
 327        list_del(&req->entry);
 328        return req;
 329}
 330
 331static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
 332                __le32 *dgst)
 333{
 334        ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
 335        crypto_ahash_final(hash);
 336}
 337
 338static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
 339                struct page *page, off_t off, size_t len)
 340{
 341        struct scatterlist sg;
 342
 343        sg_init_marker(&sg, 1);
 344        sg_set_page(&sg, page, len, off);
 345        ahash_request_set_crypt(hash, &sg, NULL, len);
 346        crypto_ahash_update(hash);
 347}
 348
 349static inline void nvme_tcp_hdgst(struct ahash_request *hash,
 350                void *pdu, size_t len)
 351{
 352        struct scatterlist sg;
 353
 354        sg_init_one(&sg, pdu, len);
 355        ahash_request_set_crypt(hash, &sg, pdu + len, len);
 356        crypto_ahash_digest(hash);
 357}
 358
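/*
 * Verify the header digest that trails the PDU header: save the received
 * value, recompute the digest with nvme_tcp_hdgst() (which writes its
 * result back into the same location at pdu + hlen), and compare the two.
 */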
 359static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
 360                void *pdu, size_t pdu_len)
 361{
 362        struct nvme_tcp_hdr *hdr = pdu;
 363        __le32 recv_digest;
 364        __le32 exp_digest;
 365
 366        if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
 367                dev_err(queue->ctrl->ctrl.device,
 368                        "queue %d: header digest flag is cleared\n",
 369                        nvme_tcp_queue_id(queue));
 370                return -EPROTO;
 371        }
 372
 373        recv_digest = *(__le32 *)(pdu + hdr->hlen);
 374        nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
 375        exp_digest = *(__le32 *)(pdu + hdr->hlen);
 376        if (recv_digest != exp_digest) {
 377                dev_err(queue->ctrl->ctrl.device,
 378                        "header digest error: recv %#x expected %#x\n",
 379                        le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
 380                return -EIO;
 381        }
 382
 383        return 0;
 384}
 385
 386static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
 387{
 388        struct nvme_tcp_hdr *hdr = pdu;
 389        u8 digest_len = nvme_tcp_hdgst_len(queue);
 390        u32 len;
 391
 392        len = le32_to_cpu(hdr->plen) - hdr->hlen -
 393                ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
 394
 395        if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
 396                dev_err(queue->ctrl->ctrl.device,
 397                        "queue %d: data digest flag is cleared\n",
  398                        nvme_tcp_queue_id(queue));
 399                return -EPROTO;
 400        }
 401        crypto_ahash_init(queue->rcv_hash);
 402
 403        return 0;
 404}
 405
 406static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
 407                struct request *rq, unsigned int hctx_idx)
 408{
 409        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 410
 411        page_frag_free(req->pdu);
 412}
 413
 414static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 415                struct request *rq, unsigned int hctx_idx,
 416                unsigned int numa_node)
 417{
 418        struct nvme_tcp_ctrl *ctrl = set->driver_data;
 419        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 420        struct nvme_tcp_cmd_pdu *pdu;
 421        int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
 422        struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
 423        u8 hdgst = nvme_tcp_hdgst_len(queue);
 424
 425        req->pdu = page_frag_alloc(&queue->pf_cache,
 426                sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 427                GFP_KERNEL | __GFP_ZERO);
 428        if (!req->pdu)
 429                return -ENOMEM;
 430
 431        pdu = req->pdu;
 432        req->queue = queue;
 433        nvme_req(rq)->ctrl = &ctrl->ctrl;
 434        nvme_req(rq)->cmd = &pdu->cmd;
 435
 436        return 0;
 437}
 438
 439static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 440                unsigned int hctx_idx)
 441{
 442        struct nvme_tcp_ctrl *ctrl = data;
 443        struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
 444
 445        hctx->driver_data = queue;
 446        return 0;
 447}
 448
 449static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 450                unsigned int hctx_idx)
 451{
 452        struct nvme_tcp_ctrl *ctrl = data;
 453        struct nvme_tcp_queue *queue = &ctrl->queues[0];
 454
 455        hctx->driver_data = queue;
 456        return 0;
 457}
 458
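/*
 * Derive the receive state from the per-queue byte counters: an outstanding
 * PDU header takes precedence, then an outstanding data digest, otherwise
 * we are receiving data.
 */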
 459static enum nvme_tcp_recv_state
 460nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
 461{
 462        return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
 463                (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
 464                NVME_TCP_RECV_DATA;
 465}
 466
 467static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
 468{
 469        queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
 470                                nvme_tcp_hdgst_len(queue);
 471        queue->pdu_offset = 0;
 472        queue->data_remaining = -1;
 473        queue->ddgst_remaining = 0;
 474}
 475
 476static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
 477{
 478        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 479                return;
 480
 481        dev_warn(ctrl->device, "starting error recovery\n");
 482        queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
 483}
 484
 485static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 486                struct nvme_completion *cqe)
 487{
 488        struct request *rq;
 489
 490        rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
 491        if (!rq) {
 492                dev_err(queue->ctrl->ctrl.device,
 493                        "queue %d tag 0x%x not found\n",
 494                        nvme_tcp_queue_id(queue), cqe->command_id);
 495                nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 496                return -EINVAL;
 497        }
 498
 499        if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
 500                nvme_complete_rq(rq);
 501        queue->nr_cqe++;
 502
 503        return 0;
 504}
 505
 506static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
 507                struct nvme_tcp_data_pdu *pdu)
 508{
 509        struct request *rq;
 510
 511        rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 512        if (!rq) {
 513                dev_err(queue->ctrl->ctrl.device,
 514                        "queue %d tag %#x not found\n",
 515                        nvme_tcp_queue_id(queue), pdu->command_id);
 516                return -ENOENT;
 517        }
 518
 519        if (!blk_rq_payload_bytes(rq)) {
 520                dev_err(queue->ctrl->ctrl.device,
 521                        "queue %d tag %#x unexpected data\n",
 522                        nvme_tcp_queue_id(queue), rq->tag);
 523                return -EIO;
 524        }
 525
 526        queue->data_remaining = le32_to_cpu(pdu->data_length);
 527
 528        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
 529            unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
 530                dev_err(queue->ctrl->ctrl.device,
 531                        "queue %d tag %#x SUCCESS set but not last PDU\n",
 532                        nvme_tcp_queue_id(queue), rq->tag);
 533                nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 534                return -EPROTO;
 535        }
 536
 537        return 0;
 538}
 539
 540static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
 541                struct nvme_tcp_rsp_pdu *pdu)
 542{
 543        struct nvme_completion *cqe = &pdu->cqe;
 544        int ret = 0;
 545
 546        /*
 547         * AEN requests are special as they don't time out and can
 548         * survive any kind of queue freeze and often don't respond to
 549         * aborts.  We don't even bother to allocate a struct request
 550         * for them but rather special case them here.
 551         */
 552        if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
 553                                     cqe->command_id)))
 554                nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
 555                                &cqe->result);
 556        else
 557                ret = nvme_tcp_process_nvme_cqe(queue, cqe);
 558
 559        return ret;
 560}
 561
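/*
 * Build the H2CData PDU answering an R2T: validate the requested length and
 * offset against what has already been sent, then fill in the header,
 * reusing the request's PDU buffer.
 */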
 562static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
 563                struct nvme_tcp_r2t_pdu *pdu)
 564{
 565        struct nvme_tcp_data_pdu *data = req->pdu;
 566        struct nvme_tcp_queue *queue = req->queue;
 567        struct request *rq = blk_mq_rq_from_pdu(req);
 568        u8 hdgst = nvme_tcp_hdgst_len(queue);
 569        u8 ddgst = nvme_tcp_ddgst_len(queue);
 570
 571        req->pdu_len = le32_to_cpu(pdu->r2t_length);
 572        req->pdu_sent = 0;
 573
 574        if (unlikely(!req->pdu_len)) {
 575                dev_err(queue->ctrl->ctrl.device,
 576                        "req %d r2t len is %u, probably a bug...\n",
 577                        rq->tag, req->pdu_len);
 578                return -EPROTO;
 579        }
 580
 581        if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
 582                dev_err(queue->ctrl->ctrl.device,
 583                        "req %d r2t len %u exceeded data len %u (%zu sent)\n",
 584                        rq->tag, req->pdu_len, req->data_len,
 585                        req->data_sent);
 586                return -EPROTO;
 587        }
 588
 589        if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
 590                dev_err(queue->ctrl->ctrl.device,
 591                        "req %d unexpected r2t offset %u (expected %zu)\n",
 592                        rq->tag, le32_to_cpu(pdu->r2t_offset),
 593                        req->data_sent);
 594                return -EPROTO;
 595        }
 596
 597        memset(data, 0, sizeof(*data));
 598        data->hdr.type = nvme_tcp_h2c_data;
 599        data->hdr.flags = NVME_TCP_F_DATA_LAST;
 600        if (queue->hdr_digest)
 601                data->hdr.flags |= NVME_TCP_F_HDGST;
 602        if (queue->data_digest)
 603                data->hdr.flags |= NVME_TCP_F_DDGST;
 604        data->hdr.hlen = sizeof(*data);
 605        data->hdr.pdo = data->hdr.hlen + hdgst;
 606        data->hdr.plen =
 607                cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
 608        data->ttag = pdu->ttag;
 609        data->command_id = rq->tag;
 610        data->data_offset = cpu_to_le32(req->data_sent);
 611        data->data_length = cpu_to_le32(req->pdu_len);
 612        return 0;
 613}
 614
 615static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
 616                struct nvme_tcp_r2t_pdu *pdu)
 617{
 618        struct nvme_tcp_request *req;
 619        struct request *rq;
 620        int ret;
 621
 622        rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 623        if (!rq) {
 624                dev_err(queue->ctrl->ctrl.device,
 625                        "queue %d tag %#x not found\n",
 626                        nvme_tcp_queue_id(queue), pdu->command_id);
 627                return -ENOENT;
 628        }
 629        req = blk_mq_rq_to_pdu(rq);
 630
 631        ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
 632        if (unlikely(ret))
 633                return ret;
 634
 635        req->state = NVME_TCP_SEND_H2C_PDU;
 636        req->offset = 0;
 637
 638        nvme_tcp_queue_request(req, false, true);
 639
 640        return 0;
 641}
 642
 643static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 644                unsigned int *offset, size_t *len)
 645{
 646        struct nvme_tcp_hdr *hdr;
 647        char *pdu = queue->pdu;
 648        size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
 649        int ret;
 650
 651        ret = skb_copy_bits(skb, *offset,
 652                &pdu[queue->pdu_offset], rcv_len);
 653        if (unlikely(ret))
 654                return ret;
 655
 656        queue->pdu_remaining -= rcv_len;
 657        queue->pdu_offset += rcv_len;
 658        *offset += rcv_len;
 659        *len -= rcv_len;
 660        if (queue->pdu_remaining)
 661                return 0;
 662
 663        hdr = queue->pdu;
 664        if (queue->hdr_digest) {
 665                ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
 666                if (unlikely(ret))
 667                        return ret;
 668        }
 669
 670
 671        if (queue->data_digest) {
 672                ret = nvme_tcp_check_ddgst(queue, queue->pdu);
 673                if (unlikely(ret))
 674                        return ret;
 675        }
 676
 677        switch (hdr->type) {
 678        case nvme_tcp_c2h_data:
 679                return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
 680        case nvme_tcp_rsp:
 681                nvme_tcp_init_recv_ctx(queue);
 682                return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
 683        case nvme_tcp_r2t:
 684                nvme_tcp_init_recv_ctx(queue);
 685                return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
 686        default:
 687                dev_err(queue->ctrl->ctrl.device,
 688                        "unsupported pdu type (%d)\n", hdr->type);
 689                return -EINVAL;
 690        }
 691}
 692
 693static inline void nvme_tcp_end_request(struct request *rq, u16 status)
 694{
 695        union nvme_result res = {};
 696
 697        if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
 698                nvme_complete_rq(rq);
 699}
 700
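/*
 * Copy C2HData payload from the skb into the request's bio pages, moving to
 * the next bio whenever the current iterator is exhausted and updating the
 * data digest on the fly when it is enabled.
 */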
 701static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 702                              unsigned int *offset, size_t *len)
 703{
 704        struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 705        struct nvme_tcp_request *req;
 706        struct request *rq;
 707
 708        rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 709        if (!rq) {
 710                dev_err(queue->ctrl->ctrl.device,
 711                        "queue %d tag %#x not found\n",
 712                        nvme_tcp_queue_id(queue), pdu->command_id);
 713                return -ENOENT;
 714        }
 715        req = blk_mq_rq_to_pdu(rq);
 716
 717        while (true) {
 718                int recv_len, ret;
 719
 720                recv_len = min_t(size_t, *len, queue->data_remaining);
 721                if (!recv_len)
 722                        break;
 723
 724                if (!iov_iter_count(&req->iter)) {
 725                        req->curr_bio = req->curr_bio->bi_next;
 726
  727                        /*
  728                         * If we don't have any bios it means that the controller
  729                         * sent more data than we requested, hence error
  730                         */
 731                        if (!req->curr_bio) {
 732                                dev_err(queue->ctrl->ctrl.device,
 733                                        "queue %d no space in request %#x",
 734                                        nvme_tcp_queue_id(queue), rq->tag);
 735                                nvme_tcp_init_recv_ctx(queue);
 736                                return -EIO;
 737                        }
 738                        nvme_tcp_init_iter(req, READ);
 739                }
 740
  741                /* we can only read what is left in this bio */
 742                recv_len = min_t(size_t, recv_len,
 743                                iov_iter_count(&req->iter));
 744
 745                if (queue->data_digest)
 746                        ret = skb_copy_and_hash_datagram_iter(skb, *offset,
 747                                &req->iter, recv_len, queue->rcv_hash);
 748                else
 749                        ret = skb_copy_datagram_iter(skb, *offset,
 750                                        &req->iter, recv_len);
 751                if (ret) {
 752                        dev_err(queue->ctrl->ctrl.device,
 753                                "queue %d failed to copy request %#x data",
 754                                nvme_tcp_queue_id(queue), rq->tag);
 755                        return ret;
 756                }
 757
 758                *len -= recv_len;
 759                *offset += recv_len;
 760                queue->data_remaining -= recv_len;
 761        }
 762
 763        if (!queue->data_remaining) {
 764                if (queue->data_digest) {
 765                        nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
 766                        queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
 767                } else {
 768                        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 769                                nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
 770                                queue->nr_cqe++;
 771                        }
 772                        nvme_tcp_init_recv_ctx(queue);
 773                }
 774        }
 775
 776        return 0;
 777}
 778
 779static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
 780                struct sk_buff *skb, unsigned int *offset, size_t *len)
 781{
 782        struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 783        char *ddgst = (char *)&queue->recv_ddgst;
 784        size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
 785        off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
 786        int ret;
 787
 788        ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
 789        if (unlikely(ret))
 790                return ret;
 791
 792        queue->ddgst_remaining -= recv_len;
 793        *offset += recv_len;
 794        *len -= recv_len;
 795        if (queue->ddgst_remaining)
 796                return 0;
 797
 798        if (queue->recv_ddgst != queue->exp_ddgst) {
 799                dev_err(queue->ctrl->ctrl.device,
 800                        "data digest error: recv %#x expected %#x\n",
 801                        le32_to_cpu(queue->recv_ddgst),
 802                        le32_to_cpu(queue->exp_ddgst));
 803                return -EIO;
 804        }
 805
 806        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 807                struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue),
 808                                                pdu->command_id);
 809
 810                nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
 811                queue->nr_cqe++;
 812        }
 813
 814        nvme_tcp_init_recv_ctx(queue);
 815        return 0;
 816}
 817
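/*
 * read_sock() callback: consume as much of the skb as the receive state
 * machine allows, alternating between PDU header, data and data digest
 * reception. Any error disables further reads and triggers error recovery.
 */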
 818static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
 819                             unsigned int offset, size_t len)
 820{
 821        struct nvme_tcp_queue *queue = desc->arg.data;
 822        size_t consumed = len;
 823        int result;
 824
 825        while (len) {
 826                switch (nvme_tcp_recv_state(queue)) {
 827                case NVME_TCP_RECV_PDU:
 828                        result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
 829                        break;
 830                case NVME_TCP_RECV_DATA:
 831                        result = nvme_tcp_recv_data(queue, skb, &offset, &len);
 832                        break;
 833                case NVME_TCP_RECV_DDGST:
 834                        result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
 835                        break;
 836                default:
 837                        result = -EFAULT;
 838                }
 839                if (result) {
 840                        dev_err(queue->ctrl->ctrl.device,
 841                                "receive failed:  %d\n", result);
 842                        queue->rd_enabled = false;
 843                        nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 844                        return result;
 845                }
 846        }
 847
 848        return consumed;
 849}
 850
 851static void nvme_tcp_data_ready(struct sock *sk)
 852{
 853        struct nvme_tcp_queue *queue;
 854
 855        read_lock_bh(&sk->sk_callback_lock);
 856        queue = sk->sk_user_data;
 857        if (likely(queue && queue->rd_enabled) &&
 858            !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
 859                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 860        read_unlock_bh(&sk->sk_callback_lock);
 861}
 862
 863static void nvme_tcp_write_space(struct sock *sk)
 864{
 865        struct nvme_tcp_queue *queue;
 866
 867        read_lock_bh(&sk->sk_callback_lock);
 868        queue = sk->sk_user_data;
 869        if (likely(queue && sk_stream_is_writeable(sk))) {
 870                clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 871                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 872        }
 873        read_unlock_bh(&sk->sk_callback_lock);
 874}
 875
 876static void nvme_tcp_state_change(struct sock *sk)
 877{
 878        struct nvme_tcp_queue *queue;
 879
 880        read_lock_bh(&sk->sk_callback_lock);
 881        queue = sk->sk_user_data;
 882        if (!queue)
 883                goto done;
 884
 885        switch (sk->sk_state) {
 886        case TCP_CLOSE:
 887        case TCP_CLOSE_WAIT:
 888        case TCP_LAST_ACK:
 889        case TCP_FIN_WAIT1:
 890        case TCP_FIN_WAIT2:
 891                nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 892                break;
 893        default:
 894                dev_info(queue->ctrl->ctrl.device,
 895                        "queue %d socket state %d\n",
 896                        nvme_tcp_queue_id(queue), sk->sk_state);
 897        }
 898
 899        queue->state_change(sk);
 900done:
 901        read_unlock_bh(&sk->sk_callback_lock);
 902}
 903
 904static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
 905{
 906        return !list_empty(&queue->send_list) ||
 907                !llist_empty(&queue->req_list) || queue->more_requests;
 908}
 909
 910static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
 911{
 912        queue->request = NULL;
 913}
 914
 915static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
 916{
 917        nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
 918}
 919
 920static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 921{
 922        struct nvme_tcp_queue *queue = req->queue;
 923
 924        while (true) {
 925                struct page *page = nvme_tcp_req_cur_page(req);
 926                size_t offset = nvme_tcp_req_cur_offset(req);
 927                size_t len = nvme_tcp_req_cur_length(req);
 928                bool last = nvme_tcp_pdu_last_send(req, len);
 929                int ret, flags = MSG_DONTWAIT;
 930
 931                if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
 932                        flags |= MSG_EOR;
 933                else
 934                        flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
 935
 936                if (sendpage_ok(page)) {
 937                        ret = kernel_sendpage(queue->sock, page, offset, len,
 938                                        flags);
 939                } else {
 940                        ret = sock_no_sendpage(queue->sock, page, offset, len,
 941                                        flags);
 942                }
 943                if (ret <= 0)
 944                        return ret;
 945
 946                if (queue->data_digest)
 947                        nvme_tcp_ddgst_update(queue->snd_hash, page,
 948                                        offset, ret);
 949
  950                /* fully successful last write */
 951                if (last && ret == len) {
 952                        if (queue->data_digest) {
 953                                nvme_tcp_ddgst_final(queue->snd_hash,
 954                                        &req->ddgst);
 955                                req->state = NVME_TCP_SEND_DDGST;
 956                                req->offset = 0;
 957                        } else {
 958                                nvme_tcp_done_send_req(queue);
 959                        }
 960                        return 1;
 961                }
 962                nvme_tcp_advance_req(req, ret);
 963        }
 964        return -EAGAIN;
 965}
 966
 967static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 968{
 969        struct nvme_tcp_queue *queue = req->queue;
 970        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
 971        bool inline_data = nvme_tcp_has_inline_data(req);
 972        u8 hdgst = nvme_tcp_hdgst_len(queue);
 973        int len = sizeof(*pdu) + hdgst - req->offset;
 974        int flags = MSG_DONTWAIT;
 975        int ret;
 976
 977        if (inline_data || nvme_tcp_queue_more(queue))
 978                flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
 979        else
 980                flags |= MSG_EOR;
 981
 982        if (queue->hdr_digest && !req->offset)
 983                nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 984
 985        ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
 986                        offset_in_page(pdu) + req->offset, len,  flags);
 987        if (unlikely(ret <= 0))
 988                return ret;
 989
 990        len -= ret;
 991        if (!len) {
 992                if (inline_data) {
 993                        req->state = NVME_TCP_SEND_DATA;
 994                        if (queue->data_digest)
 995                                crypto_ahash_init(queue->snd_hash);
 996                } else {
 997                        nvme_tcp_done_send_req(queue);
 998                }
 999                return 1;
1000        }
1001        req->offset += ret;
1002
1003        return -EAGAIN;
1004}
1005
1006static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
1007{
1008        struct nvme_tcp_queue *queue = req->queue;
1009        struct nvme_tcp_data_pdu *pdu = req->pdu;
1010        u8 hdgst = nvme_tcp_hdgst_len(queue);
1011        int len = sizeof(*pdu) - req->offset + hdgst;
1012        int ret;
1013
1014        if (queue->hdr_digest && !req->offset)
1015                nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
1016
1017        ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
1018                        offset_in_page(pdu) + req->offset, len,
1019                        MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
1020        if (unlikely(ret <= 0))
1021                return ret;
1022
1023        len -= ret;
1024        if (!len) {
1025                req->state = NVME_TCP_SEND_DATA;
1026                if (queue->data_digest)
1027                        crypto_ahash_init(queue->snd_hash);
1028                return 1;
1029        }
1030        req->offset += ret;
1031
1032        return -EAGAIN;
1033}
1034
1035static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
1036{
1037        struct nvme_tcp_queue *queue = req->queue;
1038        int ret;
1039        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1040        struct kvec iov = {
1041                .iov_base = &req->ddgst + req->offset,
1042                .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
1043        };
1044
1045        if (nvme_tcp_queue_more(queue))
1046                msg.msg_flags |= MSG_MORE;
1047        else
1048                msg.msg_flags |= MSG_EOR;
1049
1050        ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1051        if (unlikely(ret <= 0))
1052                return ret;
1053
1054        if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
1055                nvme_tcp_done_send_req(queue);
1056                return 1;
1057        }
1058
1059        req->offset += ret;
1060        return -EAGAIN;
1061}
1062
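/*
 * Send state machine: a request progresses CMD_PDU -> (H2C_PDU) -> DATA ->
 * DDGST. Each handler returns 1 once its part is fully sent, letting us fall
 * through to the next state in the same invocation; -EAGAIN means the socket
 * could not take more data and the send will be retried later.
 */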
1063static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
1064{
1065        struct nvme_tcp_request *req;
1066        int ret = 1;
1067
1068        if (!queue->request) {
1069                queue->request = nvme_tcp_fetch_request(queue);
1070                if (!queue->request)
1071                        return 0;
1072        }
1073        req = queue->request;
1074
1075        if (req->state == NVME_TCP_SEND_CMD_PDU) {
1076                ret = nvme_tcp_try_send_cmd_pdu(req);
1077                if (ret <= 0)
1078                        goto done;
1079                if (!nvme_tcp_has_inline_data(req))
1080                        return ret;
1081        }
1082
1083        if (req->state == NVME_TCP_SEND_H2C_PDU) {
1084                ret = nvme_tcp_try_send_data_pdu(req);
1085                if (ret <= 0)
1086                        goto done;
1087        }
1088
1089        if (req->state == NVME_TCP_SEND_DATA) {
1090                ret = nvme_tcp_try_send_data(req);
1091                if (ret <= 0)
1092                        goto done;
1093        }
1094
1095        if (req->state == NVME_TCP_SEND_DDGST)
1096                ret = nvme_tcp_try_send_ddgst(req);
1097done:
1098        if (ret == -EAGAIN) {
1099                ret = 0;
1100        } else if (ret < 0) {
1101                dev_err(queue->ctrl->ctrl.device,
1102                        "failed to send request %d\n", ret);
1103                if (ret != -EPIPE && ret != -ECONNRESET)
1104                        nvme_tcp_fail_request(queue->request);
1105                nvme_tcp_done_send_req(queue);
1106        }
1107        return ret;
1108}
1109
1110static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1111{
1112        struct socket *sock = queue->sock;
1113        struct sock *sk = sock->sk;
1114        read_descriptor_t rd_desc;
1115        int consumed;
1116
1117        rd_desc.arg.data = queue;
1118        rd_desc.count = 1;
1119        lock_sock(sk);
1120        queue->nr_cqe = 0;
1121        consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1122        release_sock(sk);
1123        return consumed;
1124}
1125
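/*
 * Queue I/O context: alternate between sending and receiving for roughly one
 * millisecond; if work is still pending when the budget expires, re-queue the
 * work item on the queue's io_cpu.
 */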
1126static void nvme_tcp_io_work(struct work_struct *w)
1127{
1128        struct nvme_tcp_queue *queue =
1129                container_of(w, struct nvme_tcp_queue, io_work);
1130        unsigned long deadline = jiffies + msecs_to_jiffies(1);
1131
1132        do {
1133                bool pending = false;
1134                int result;
1135
1136                if (mutex_trylock(&queue->send_mutex)) {
1137                        result = nvme_tcp_try_send(queue);
1138                        mutex_unlock(&queue->send_mutex);
1139                        if (result > 0)
1140                                pending = true;
1141                        else if (unlikely(result < 0))
1142                                break;
1143                } else
1144                        pending = !llist_empty(&queue->req_list);
1145
1146                result = nvme_tcp_try_recv(queue);
1147                if (result > 0)
1148                        pending = true;
1149                else if (unlikely(result < 0))
1150                        return;
1151
1152                if (!pending)
1153                        return;
1154
1155        } while (!time_after(jiffies, deadline)); /* quota is exhausted */
1156
1157        queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1158}
1159
1160static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1161{
1162        struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1163
1164        ahash_request_free(queue->rcv_hash);
1165        ahash_request_free(queue->snd_hash);
1166        crypto_free_ahash(tfm);
1167}
1168
1169static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1170{
1171        struct crypto_ahash *tfm;
1172
1173        tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1174        if (IS_ERR(tfm))
1175                return PTR_ERR(tfm);
1176
1177        queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1178        if (!queue->snd_hash)
1179                goto free_tfm;
1180        ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1181
1182        queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1183        if (!queue->rcv_hash)
1184                goto free_snd_hash;
1185        ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1186
1187        return 0;
1188free_snd_hash:
1189        ahash_request_free(queue->snd_hash);
1190free_tfm:
1191        crypto_free_ahash(tfm);
1192        return -ENOMEM;
1193}
1194
1195static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1196{
1197        struct nvme_tcp_request *async = &ctrl->async_req;
1198
1199        page_frag_free(async->pdu);
1200}
1201
1202static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1203{
1204        struct nvme_tcp_queue *queue = &ctrl->queues[0];
1205        struct nvme_tcp_request *async = &ctrl->async_req;
1206        u8 hdgst = nvme_tcp_hdgst_len(queue);
1207
1208        async->pdu = page_frag_alloc(&queue->pf_cache,
1209                sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1210                GFP_KERNEL | __GFP_ZERO);
1211        if (!async->pdu)
1212                return -ENOMEM;
1213
1214        async->queue = &ctrl->queues[0];
1215        return 0;
1216}
1217
1218static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1219{
1220        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1221        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1222
1223        if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1224                return;
1225
1226        if (queue->hdr_digest || queue->data_digest)
1227                nvme_tcp_free_crypto(queue);
1228
1229        sock_release(queue->sock);
1230        kfree(queue->pdu);
1231        mutex_destroy(&queue->queue_lock);
1232}
1233
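/*
 * NVMe/TCP connection initialization: send an ICReq PDU and validate the
 * controller's ICResp (PDU type and length, PFV, digest settings matching
 * what we requested, and no controller PDU data alignment requirement).
 */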
1234static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1235{
1236        struct nvme_tcp_icreq_pdu *icreq;
1237        struct nvme_tcp_icresp_pdu *icresp;
1238        struct msghdr msg = {};
1239        struct kvec iov;
1240        bool ctrl_hdgst, ctrl_ddgst;
1241        int ret;
1242
1243        icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1244        if (!icreq)
1245                return -ENOMEM;
1246
1247        icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1248        if (!icresp) {
1249                ret = -ENOMEM;
1250                goto free_icreq;
1251        }
1252
1253        icreq->hdr.type = nvme_tcp_icreq;
1254        icreq->hdr.hlen = sizeof(*icreq);
1255        icreq->hdr.pdo = 0;
1256        icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1257        icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1258        icreq->maxr2t = 0; /* single inflight r2t supported */
1259        icreq->hpda = 0; /* no alignment constraint */
1260        if (queue->hdr_digest)
1261                icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1262        if (queue->data_digest)
1263                icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1264
1265        iov.iov_base = icreq;
1266        iov.iov_len = sizeof(*icreq);
1267        ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1268        if (ret < 0)
1269                goto free_icresp;
1270
1271        memset(&msg, 0, sizeof(msg));
1272        iov.iov_base = icresp;
1273        iov.iov_len = sizeof(*icresp);
1274        ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1275                        iov.iov_len, msg.msg_flags);
1276        if (ret < 0)
1277                goto free_icresp;
1278
1279        ret = -EINVAL;
1280        if (icresp->hdr.type != nvme_tcp_icresp) {
1281                pr_err("queue %d: bad type returned %d\n",
1282                        nvme_tcp_queue_id(queue), icresp->hdr.type);
1283                goto free_icresp;
1284        }
1285
1286        if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1287                pr_err("queue %d: bad pdu length returned %d\n",
1288                        nvme_tcp_queue_id(queue), icresp->hdr.plen);
1289                goto free_icresp;
1290        }
1291
1292        if (icresp->pfv != NVME_TCP_PFV_1_0) {
1293                pr_err("queue %d: bad pfv returned %d\n",
1294                        nvme_tcp_queue_id(queue), icresp->pfv);
1295                goto free_icresp;
1296        }
1297
1298        ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1299        if ((queue->data_digest && !ctrl_ddgst) ||
1300            (!queue->data_digest && ctrl_ddgst)) {
1301                pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1302                        nvme_tcp_queue_id(queue),
1303                        queue->data_digest ? "enabled" : "disabled",
1304                        ctrl_ddgst ? "enabled" : "disabled");
1305                goto free_icresp;
1306        }
1307
1308        ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1309        if ((queue->hdr_digest && !ctrl_hdgst) ||
1310            (!queue->hdr_digest && ctrl_hdgst)) {
1311                pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1312                        nvme_tcp_queue_id(queue),
1313                        queue->hdr_digest ? "enabled" : "disabled",
1314                        ctrl_hdgst ? "enabled" : "disabled");
1315                goto free_icresp;
1316        }
1317
1318        if (icresp->cpda != 0) {
1319                pr_err("queue %d: unsupported cpda returned %d\n",
1320                        nvme_tcp_queue_id(queue), icresp->cpda);
1321                goto free_icresp;
1322        }
1323
1324        ret = 0;
1325free_icresp:
1326        kfree(icresp);
1327free_icreq:
1328        kfree(icreq);
1329        return ret;
1330}
1331
1332static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
1333{
1334        return nvme_tcp_queue_id(queue) == 0;
1335}
1336
1337static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
1338{
1339        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1340        int qid = nvme_tcp_queue_id(queue);
1341
1342        return !nvme_tcp_admin_queue(queue) &&
1343                qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
1344}
1345
1346static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
1347{
1348        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1349        int qid = nvme_tcp_queue_id(queue);
1350
1351        return !nvme_tcp_admin_queue(queue) &&
1352                !nvme_tcp_default_queue(queue) &&
1353                qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1354                          ctrl->io_queues[HCTX_TYPE_READ];
1355}
1356
1357static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
1358{
1359        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1360        int qid = nvme_tcp_queue_id(queue);
1361
1362        return !nvme_tcp_admin_queue(queue) &&
1363                !nvme_tcp_default_queue(queue) &&
1364                !nvme_tcp_read_queue(queue) &&
1365                qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1366                          ctrl->io_queues[HCTX_TYPE_READ] +
1367                          ctrl->io_queues[HCTX_TYPE_POLL];
1368}
1369
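/*
 * Pick the CPU that runs io_work for this queue: queues are indexed within
 * their type (default, read, poll) and the n-th queue of a type is mapped to
 * the n-th online CPU, wrapping around as needed.
 */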
1370static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
1371{
1372        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1373        int qid = nvme_tcp_queue_id(queue);
1374        int n = 0;
1375
1376        if (nvme_tcp_default_queue(queue))
1377                n = qid - 1;
1378        else if (nvme_tcp_read_queue(queue))
1379                n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
1380        else if (nvme_tcp_poll_queue(queue))
1381                n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
1382                                ctrl->io_queues[HCTX_TYPE_READ] - 1;
1383        queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1384}
1385
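/*
 * Allocate and connect a single queue: create the TCP socket, apply socket
 * options (SYN count, nodelay, no linger, optional priority/TOS, optional
 * source address or interface binding), set up digests and the receive PDU
 * buffer, connect to the target, run ICReq/ICResp, and finally install the
 * queue's socket callbacks.
 */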
1386static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
1387                int qid, size_t queue_size)
1388{
1389        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1390        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1391        int ret, rcv_pdu_size;
1392
1393        mutex_init(&queue->queue_lock);
1394        queue->ctrl = ctrl;
1395        init_llist_head(&queue->req_list);
1396        INIT_LIST_HEAD(&queue->send_list);
1397        mutex_init(&queue->send_mutex);
1398        INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1399        queue->queue_size = queue_size;
1400
1401        if (qid > 0)
1402                queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1403        else
1404                queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1405                                                NVME_TCP_ADMIN_CCSZ;
1406
1407        ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1408                        IPPROTO_TCP, &queue->sock);
1409        if (ret) {
1410                dev_err(nctrl->device,
1411                        "failed to create socket: %d\n", ret);
1412                goto err_destroy_mutex;
1413        }
1414
 1415        /* Single SYN retry */
1416        tcp_sock_set_syncnt(queue->sock->sk, 1);
1417
1418        /* Set TCP no delay */
1419        tcp_sock_set_nodelay(queue->sock->sk);
1420
1421        /*
1422         * Cleanup whatever is sitting in the TCP transmit queue on socket
1423         * close. This is done to prevent stale data from being sent should
1424         * the network connection be restored before TCP times out.
1425         */
1426        sock_no_linger(queue->sock->sk);
1427
1428        if (so_priority > 0)
1429                sock_set_priority(queue->sock->sk, so_priority);
1430
1431        /* Set socket type of service */
1432        if (nctrl->opts->tos >= 0)
1433                ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
1434
 1435        /* Set a 10 second timeout for icresp recvmsg */
1436        queue->sock->sk->sk_rcvtimeo = 10 * HZ;
1437
1438        queue->sock->sk->sk_allocation = GFP_ATOMIC;
1439        nvme_tcp_set_queue_io_cpu(queue);
1440        queue->request = NULL;
1441        queue->data_remaining = 0;
1442        queue->ddgst_remaining = 0;
1443        queue->pdu_remaining = 0;
1444        queue->pdu_offset = 0;
1445        sk_set_memalloc(queue->sock->sk);
1446
1447        if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1448                ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1449                        sizeof(ctrl->src_addr));
1450                if (ret) {
1451                        dev_err(nctrl->device,
1452                                "failed to bind queue %d socket %d\n",
1453                                qid, ret);
1454                        goto err_sock;
1455                }
1456        }
1457
1458        if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) {
1459                char *iface = nctrl->opts->host_iface;
1460                sockptr_t optval = KERNEL_SOCKPTR(iface);
1461
1462                ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE,
1463                                      optval, strlen(iface));
1464                if (ret) {
1465                        dev_err(nctrl->device,
1466                          "failed to bind to interface %s queue %d err %d\n",
1467                          iface, qid, ret);
1468                        goto err_sock;
1469                }
1470        }
1471
1472        queue->hdr_digest = nctrl->opts->hdr_digest;
1473        queue->data_digest = nctrl->opts->data_digest;
1474        if (queue->hdr_digest || queue->data_digest) {
1475                ret = nvme_tcp_alloc_crypto(queue);
1476                if (ret) {
1477                        dev_err(nctrl->device,
1478                                "failed to allocate queue %d crypto\n", qid);
1479                        goto err_sock;
1480                }
1481        }
1482
1483        rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1484                        nvme_tcp_hdgst_len(queue);
1485        queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1486        if (!queue->pdu) {
1487                ret = -ENOMEM;
1488                goto err_crypto;
1489        }
1490
1491        dev_dbg(nctrl->device, "connecting queue %d\n",
1492                        nvme_tcp_queue_id(queue));
1493
1494        ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1495                sizeof(ctrl->addr), 0);
1496        if (ret) {
1497                dev_err(nctrl->device,
1498                        "failed to connect socket: %d\n", ret);
1499                goto err_rcv_pdu;
1500        }
1501
1502        ret = nvme_tcp_init_connection(queue);
1503        if (ret)
1504                goto err_init_connect;
1505
1506        queue->rd_enabled = true;
1507        set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1508        nvme_tcp_init_recv_ctx(queue);
1509
1510        write_lock_bh(&queue->sock->sk->sk_callback_lock);
1511        queue->sock->sk->sk_user_data = queue;
1512        queue->state_change = queue->sock->sk->sk_state_change;
1513        queue->data_ready = queue->sock->sk->sk_data_ready;
1514        queue->write_space = queue->sock->sk->sk_write_space;
1515        queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1516        queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1517        queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1518#ifdef CONFIG_NET_RX_BUSY_POLL
1519        queue->sock->sk->sk_ll_usec = 1;
1520#endif
1521        write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1522
1523        return 0;
1524
1525err_init_connect:
1526        kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1527err_rcv_pdu:
1528        kfree(queue->pdu);
1529err_crypto:
1530        if (queue->hdr_digest || queue->data_digest)
1531                nvme_tcp_free_crypto(queue);
1532err_sock:
1533        sock_release(queue->sock);
1534        queue->sock = NULL;
1535err_destroy_mutex:
1536        mutex_destroy(&queue->queue_lock);
1537        return ret;
1538}
1539
1540static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1541{
1542        struct socket *sock = queue->sock;
1543
1544        write_lock_bh(&sock->sk->sk_callback_lock);
1545        sock->sk->sk_user_data  = NULL;
1546        sock->sk->sk_data_ready = queue->data_ready;
1547        sock->sk->sk_state_change = queue->state_change;
1548        sock->sk->sk_write_space  = queue->write_space;
1549        write_unlock_bh(&sock->sk->sk_callback_lock);
1550}
1551
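/*
 * Quiescing a queue is strictly ordered: shut the socket down first so no
 * new data arrives, restore the original sk callbacks so the network stack
 * stops calling into this driver, and only then cancel io_work, which may
 * still be running on behalf of the old callbacks.
 */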
1552static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1553{
1554        kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1555        nvme_tcp_restore_sock_calls(queue);
1556        cancel_work_sync(&queue->io_work);
1557}
1558
1559static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1560{
1561        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1562        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1563
1564        mutex_lock(&queue->queue_lock);
1565        if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1566                __nvme_tcp_stop_queue(queue);
1567        mutex_unlock(&queue->queue_lock);
1568}
1569
1570static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1571{
1572        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1573        int ret;
1574
1575        if (idx)
1576                ret = nvmf_connect_io_queue(nctrl, idx);
1577        else
1578                ret = nvmf_connect_admin_queue(nctrl);
1579
1580        if (!ret) {
1581                set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1582        } else {
1583                if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
1584                        __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1585                dev_err(nctrl->device,
1586                        "failed to connect queue: %d ret=%d\n", idx, ret);
1587        }
1588        return ret;
1589}
1590
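/*
 * Two blk-mq tag sets are used: a single-hw-queue admin set and an I/O set
 * whose queue_depth follows the negotiated sqsize (sqsize is zero-based,
 * hence the +1).  Both reserve NVMF_RESERVED_TAGS for fabrics connect and
 * keep-alive commands, and the I/O set maps all HCTX types only when poll
 * queues were requested.
 */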
1591static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
1592                bool admin)
1593{
1594        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1595        struct blk_mq_tag_set *set;
1596        int ret;
1597
1598        if (admin) {
1599                set = &ctrl->admin_tag_set;
1600                memset(set, 0, sizeof(*set));
1601                set->ops = &nvme_tcp_admin_mq_ops;
1602                set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1603                set->reserved_tags = NVMF_RESERVED_TAGS;
1604                set->numa_node = nctrl->numa_node;
1605                set->flags = BLK_MQ_F_BLOCKING;
1606                set->cmd_size = sizeof(struct nvme_tcp_request);
1607                set->driver_data = ctrl;
1608                set->nr_hw_queues = 1;
1609                set->timeout = NVME_ADMIN_TIMEOUT;
1610        } else {
1611                set = &ctrl->tag_set;
1612                memset(set, 0, sizeof(*set));
1613                set->ops = &nvme_tcp_mq_ops;
1614                set->queue_depth = nctrl->sqsize + 1;
1615                set->reserved_tags = NVMF_RESERVED_TAGS;
1616                set->numa_node = nctrl->numa_node;
1617                set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
1618                set->cmd_size = sizeof(struct nvme_tcp_request);
1619                set->driver_data = ctrl;
1620                set->nr_hw_queues = nctrl->queue_count - 1;
1621                set->timeout = NVME_IO_TIMEOUT;
1622                set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
1623        }
1624
1625        ret = blk_mq_alloc_tag_set(set);
1626        if (ret)
1627                return ERR_PTR(ret);
1628
1629        return set;
1630}
1631
1632static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1633{
1634        if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1635                cancel_work_sync(&ctrl->async_event_work);
1636                nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1637                to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1638        }
1639
1640        nvme_tcp_free_queue(ctrl, 0);
1641}
1642
1643static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1644{
1645        int i;
1646
1647        for (i = 1; i < ctrl->queue_count; i++)
1648                nvme_tcp_free_queue(ctrl, i);
1649}
1650
1651static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1652{
1653        int i;
1654
1655        for (i = 1; i < ctrl->queue_count; i++)
1656                nvme_tcp_stop_queue(ctrl, i);
1657}
1658
1659static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
1660{
1661        int i, ret = 0;
1662
1663        for (i = 1; i < ctrl->queue_count; i++) {
1664                ret = nvme_tcp_start_queue(ctrl, i);
1665                if (ret)
1666                        goto out_stop_queues;
1667        }
1668
1669        return 0;
1670
1671out_stop_queues:
1672        for (i--; i >= 1; i--)
1673                nvme_tcp_stop_queue(ctrl, i);
1674        return ret;
1675}
1676
1677static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1678{
1679        int ret;
1680
1681        ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
1682        if (ret)
1683                return ret;
1684
1685        ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1686        if (ret)
1687                goto out_free_queue;
1688
1689        return 0;
1690
1691out_free_queue:
1692        nvme_tcp_free_queue(ctrl, 0);
1693        return ret;
1694}
1695
1696static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1697{
1698        int i, ret;
1699
1700        for (i = 1; i < ctrl->queue_count; i++) {
1701                ret = nvme_tcp_alloc_queue(ctrl, i,
1702                                ctrl->sqsize + 1);
1703                if (ret)
1704                        goto out_free_queues;
1705        }
1706
1707        return 0;
1708
1709out_free_queues:
1710        for (i--; i >= 1; i--)
1711                nvme_tcp_free_queue(ctrl, i);
1712
1713        return ret;
1714}
1715
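/*
 * The requested I/O queue count is the sum of the default, write and poll
 * queue options, each capped at the number of online CPUs.  For example
 * (illustrative values only): nr_io_queues=4, nr_write_queues=4 and
 * nr_poll_queues=2 on a host with at least 4 online CPUs requests
 * 4 + 4 + 2 = 10 queues, before the target further limits the count via
 * nvme_set_queue_count().
 */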
1716static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1717{
1718        unsigned int nr_io_queues;
1719
1720        nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
1721        nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
1722        nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
1723
1724        return nr_io_queues;
1725}
1726
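/*
 * Distribute the granted queues across the HCTX types, satisfying read
 * queues before dedicated default (write) queues, with poll queues last.
 * Continuing the illustrative 10-queue example above (nr_io_queues=4,
 * nr_write_queues=4, nr_poll_queues=2): HCTX_TYPE_READ gets 4,
 * HCTX_TYPE_DEFAULT gets min(4, 6) = 4, and HCTX_TYPE_POLL gets the
 * remaining 2.
 */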
1727static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
1728                unsigned int nr_io_queues)
1729{
1730        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1731        struct nvmf_ctrl_options *opts = nctrl->opts;
1732
1733        if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
1734                /*
1735                 * Separate read/write queues: hand out dedicated default
1736                 * (write) queues only after the requested read queues
1737                 * have been satisfied.
1738                 */
1739                ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
1740                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
1741                ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1742                        min(opts->nr_write_queues, nr_io_queues);
1743                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1744        } else {
1745                /*
1746                 * Shared read/write queues: either no write queues were
1747                 * requested, or there are not enough queues available for
1748                 * dedicated default queues.
1749                 */
1750                ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1751                        min(opts->nr_io_queues, nr_io_queues);
1752                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1753        }
1754
1755        if (opts->nr_poll_queues && nr_io_queues) {
1756                /* map dedicated poll queues only if we have queues left */
1757                ctrl->io_queues[HCTX_TYPE_POLL] =
1758                        min(opts->nr_poll_queues, nr_io_queues);
1759        }
1760}
1761
1762static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1763{
1764        unsigned int nr_io_queues;
1765        int ret;
1766
1767        nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1768        ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1769        if (ret)
1770                return ret;
1771
1772        ctrl->queue_count = nr_io_queues + 1;
1773        if (ctrl->queue_count < 2) {
1774                dev_err(ctrl->device,
1775                        "unable to set any I/O queues\n");
1776                return -ENOMEM;
1777        }
1778
1779        dev_info(ctrl->device,
1780                "creating %d I/O queues.\n", nr_io_queues);
1781
1782        nvme_tcp_set_io_queues(ctrl, nr_io_queues);
1783
1784        return __nvme_tcp_alloc_io_queues(ctrl);
1785}
1786
1787static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1788{
1789        nvme_tcp_stop_io_queues(ctrl);
1790        if (remove) {
1791                blk_cleanup_queue(ctrl->connect_q);
1792                blk_mq_free_tag_set(ctrl->tagset);
1793        }
1794        nvme_tcp_free_io_queues(ctrl);
1795}
1796
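/*
 * Bring up the I/O side.  On first connect (new == true) the I/O tag set
 * and the connect_q used for fabrics connect commands are also created; on
 * reconnect they already exist, so the queues are only restarted, the
 * number of hardware queues is refreshed, and the request queues frozen
 * during teardown are released again.
 */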
1797static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1798{
1799        int ret;
1800
1801        ret = nvme_tcp_alloc_io_queues(ctrl);
1802        if (ret)
1803                return ret;
1804
1805        if (new) {
1806                ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
1807                if (IS_ERR(ctrl->tagset)) {
1808                        ret = PTR_ERR(ctrl->tagset);
1809                        goto out_free_io_queues;
1810                }
1811
1812                ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
1813                if (IS_ERR(ctrl->connect_q)) {
1814                        ret = PTR_ERR(ctrl->connect_q);
1815                        goto out_free_tag_set;
1816                }
1817        }
1818
1819        ret = nvme_tcp_start_io_queues(ctrl);
1820        if (ret)
1821                goto out_cleanup_connect_q;
1822
1823        if (!new) {
1824                nvme_start_queues(ctrl);
1825                if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
1826                        /*
1827                         * If we timed out waiting for freeze we are likely to
1828                         * be stuck.  Fail the controller initialization just
1829                         * to be safe.
1830                         */
1831                        ret = -ENODEV;
1832                        goto out_wait_freeze_timed_out;
1833                }
1834                blk_mq_update_nr_hw_queues(ctrl->tagset,
1835                        ctrl->queue_count - 1);
1836                nvme_unfreeze(ctrl);
1837        }
1838
1839        return 0;
1840
1841out_wait_freeze_timed_out:
1842        nvme_stop_queues(ctrl);
1843        nvme_sync_io_queues(ctrl);
1844        nvme_tcp_stop_io_queues(ctrl);
1845out_cleanup_connect_q:
1846        nvme_cancel_tagset(ctrl);
1847        if (new)
1848                blk_cleanup_queue(ctrl->connect_q);
1849out_free_tag_set:
1850        if (new)
1851                blk_mq_free_tag_set(ctrl->tagset);
1852out_free_io_queues:
1853        nvme_tcp_free_io_queues(ctrl);
1854        return ret;
1855}
1856
1857static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1858{
1859        nvme_tcp_stop_queue(ctrl, 0);
1860        if (remove) {
1861                blk_cleanup_queue(ctrl->admin_q);
1862                blk_cleanup_queue(ctrl->fabrics_q);
1863                blk_mq_free_tag_set(ctrl->admin_tagset);
1864        }
1865        nvme_tcp_free_admin_queue(ctrl);
1866}
1867
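/*
 * Admin queue bring-up: allocate queue 0 and the async event request,
 * create the admin tag set plus the fabrics_q/admin_q request queues
 * (first connect only), issue the fabrics connect, enable the controller
 * and finish generic controller initialization.  The error unwind below
 * releases these resources in reverse order.
 */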
1868static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1869{
1870        int error;
1871
1872        error = nvme_tcp_alloc_admin_queue(ctrl);
1873        if (error)
1874                return error;
1875
1876        if (new) {
1877                ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
1878                if (IS_ERR(ctrl->admin_tagset)) {
1879                        error = PTR_ERR(ctrl->admin_tagset);
1880                        goto out_free_queue;
1881                }
1882
1883                ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
1884                if (IS_ERR(ctrl->fabrics_q)) {
1885                        error = PTR_ERR(ctrl->fabrics_q);
1886                        goto out_free_tagset;
1887                }
1888
1889                ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
1890                if (IS_ERR(ctrl->admin_q)) {
1891                        error = PTR_ERR(ctrl->admin_q);
1892                        goto out_cleanup_fabrics_q;
1893                }
1894        }
1895
1896        error = nvme_tcp_start_queue(ctrl, 0);
1897        if (error)
1898                goto out_cleanup_queue;
1899
1900        error = nvme_enable_ctrl(ctrl);
1901        if (error)
1902                goto out_stop_queue;
1903
1904        blk_mq_unquiesce_queue(ctrl->admin_q);
1905
1906        error = nvme_init_ctrl_finish(ctrl);
1907        if (error)
1908                goto out_quiesce_queue;
1909
1910        return 0;
1911
1912out_quiesce_queue:
1913        blk_mq_quiesce_queue(ctrl->admin_q);
1914        blk_sync_queue(ctrl->admin_q);
1915out_stop_queue:
1916        nvme_tcp_stop_queue(ctrl, 0);
1917        nvme_cancel_admin_tagset(ctrl);
1918out_cleanup_queue:
1919        if (new)
1920                blk_cleanup_queue(ctrl->admin_q);
1921out_cleanup_fabrics_q:
1922        if (new)
1923                blk_cleanup_queue(ctrl->fabrics_q);
1924out_free_tagset:
1925        if (new)
1926                blk_mq_free_tag_set(ctrl->admin_tagset);
1927out_free_queue:
1928        nvme_tcp_free_admin_queue(ctrl);
1929        return error;
1930}
1931
1932static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1933                bool remove)
1934{
1935        blk_mq_quiesce_queue(ctrl->admin_q);
1936        blk_sync_queue(ctrl->admin_q);
1937        nvme_tcp_stop_queue(ctrl, 0);
1938        nvme_cancel_admin_tagset(ctrl);
1939        if (remove)
1940                blk_mq_unquiesce_queue(ctrl->admin_q);
1941        nvme_tcp_destroy_admin_queue(ctrl, remove);
1942}
1943
1944static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1945                bool remove)
1946{
1947        if (ctrl->queue_count <= 1)
1948                return;
1949        blk_mq_quiesce_queue(ctrl->admin_q);
1950        nvme_start_freeze(ctrl);
1951        nvme_stop_queues(ctrl);
1952        nvme_sync_io_queues(ctrl);
1953        nvme_tcp_stop_io_queues(ctrl);
1954        nvme_cancel_tagset(ctrl);
1955        if (remove)
1956                nvme_start_queues(ctrl);
1957        nvme_tcp_destroy_io_queues(ctrl, remove);
1958}
1959
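/*
 * Decide between another reconnect attempt and giving up on the
 * controller.  nvmf_should_reconnect() weighs the reconnect attempts made
 * so far against the ctrl_loss_tmo option; with the usual fabrics defaults
 * (reconnect_delay=10s, ctrl_loss_tmo=600s, both configurable) that means
 * retrying roughly every 10 seconds for up to 10 minutes before the
 * controller is deleted.
 */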
1960static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
1961{
1962        /* If we are resetting/deleting then do nothing */
1963        if (ctrl->state != NVME_CTRL_CONNECTING) {
1964                WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
1965                        ctrl->state == NVME_CTRL_LIVE);
1966                return;
1967        }
1968
1969        if (nvmf_should_reconnect(ctrl)) {
1970                dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
1971                        ctrl->opts->reconnect_delay);
1972                queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
1973                                ctrl->opts->reconnect_delay * HZ);
1974        } else {
1975                dev_info(ctrl->device, "Removing controller...\n");
1976                nvme_delete_ctrl(ctrl);
1977        }
1978}
1979
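/*
 * Common bring-up path shared by create, reset and reconnect.  Once the
 * admin queue is up, the controller capabilities are sanity checked: a
 * non-zero ICDOFF is rejected because this host always places in-capsule
 * data directly after the command capsule, and SGL support is mandatory
 * since NVMe/TCP describes every data transfer with an SGL descriptor.
 */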
1980static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
1981{
1982        struct nvmf_ctrl_options *opts = ctrl->opts;
1983        int ret;
1984
1985        ret = nvme_tcp_configure_admin_queue(ctrl, new);
1986        if (ret)
1987                return ret;
1988
1989        if (ctrl->icdoff) {
1990                ret = -EOPNOTSUPP;
1991                dev_err(ctrl->device, "icdoff is not supported!\n");
1992                goto destroy_admin;
1993        }
1994
1995        if (!nvme_ctrl_sgl_supported(ctrl)) {
1996                ret = -EOPNOTSUPP;
1997                dev_err(ctrl->device, "Mandatory sgls are not supported!\n");
1998                goto destroy_admin;
1999        }
2000
2001        if (opts->queue_size > ctrl->sqsize + 1)
2002                dev_warn(ctrl->device,
2003                        "queue_size %zu > ctrl sqsize %u, clamping down\n",
2004                        opts->queue_size, ctrl->sqsize + 1);
2005
2006        if (ctrl->sqsize + 1 > ctrl->maxcmd) {
2007                dev_warn(ctrl->device,
2008                        "sqsize %u > ctrl maxcmd %u, clamping down\n",
2009                        ctrl->sqsize + 1, ctrl->maxcmd);
2010                ctrl->sqsize = ctrl->maxcmd - 1;
2011        }
2012
2013        if (ctrl->queue_count > 1) {
2014                ret = nvme_tcp_configure_io_queues(ctrl, new);
2015                if (ret)
2016                        goto destroy_admin;
2017        }
2018
2019        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
2020                /*
2021                 * A state change failure is ok if we started ctrl delete,
2022                 * but it must not happen during creation of a new
2023                 * controller, to avoid races with the teardown flow.
2024                 */
2025                WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2026                             ctrl->state != NVME_CTRL_DELETING_NOIO);
2027                WARN_ON_ONCE(new);
2028                ret = -EINVAL;
2029                goto destroy_io;
2030        }
2031
2032        nvme_start_ctrl(ctrl);
2033        return 0;
2034
2035destroy_io:
2036        if (ctrl->queue_count > 1) {
2037                nvme_stop_queues(ctrl);
2038                nvme_sync_io_queues(ctrl);
2039                nvme_tcp_stop_io_queues(ctrl);
2040                nvme_cancel_tagset(ctrl);
2041                nvme_tcp_destroy_io_queues(ctrl, new);
2042        }
2043destroy_admin:
2044        blk_mq_quiesce_queue(ctrl->admin_q);
2045        blk_sync_queue(ctrl->admin_q);
2046        nvme_tcp_stop_queue(ctrl, 0);
2047        nvme_cancel_admin_tagset(ctrl);
2048        nvme_tcp_destroy_admin_queue(ctrl, new);
2049        return ret;
2050}
2051
2052static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
2053{
2054        struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
2055                        struct nvme_tcp_ctrl, connect_work);
2056        struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2057
2058        ++ctrl->nr_reconnects;
2059
2060        if (nvme_tcp_setup_ctrl(ctrl, false))
2061                goto requeue;
2062
2063        dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
2064                        ctrl->nr_reconnects);
2065
2066        ctrl->nr_reconnects = 0;
2067
2068        return;
2069
2070requeue:
2071        dev_info(ctrl->device, "Failed reconnect attempt %d\n",
2072                        ctrl->nr_reconnects);
2073        nvme_tcp_reconnect_or_remove(ctrl);
2074}
2075
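/*
 * Error recovery: stop keep-alive, tear down all queues while leaving the
 * request queues unquiesced so that pending requests are failed fast
 * rather than blocking, then move to CONNECTING and schedule a reconnect
 * (or remove the controller if reconnecting is no longer allowed).
 */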
2076static void nvme_tcp_error_recovery_work(struct work_struct *work)
2077{
2078        struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
2079                                struct nvme_tcp_ctrl, err_work);
2080        struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2081
2082        nvme_stop_keep_alive(ctrl);
2083        nvme_tcp_teardown_io_queues(ctrl, false);
2084        /* unquiesce so that pending requests are failed fast */
2085        nvme_start_queues(ctrl);
2086        nvme_tcp_teardown_admin_queue(ctrl, false);
2087        blk_mq_unquiesce_queue(ctrl->admin_q);
2088
2089        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2090                /* state change failure is ok if we started ctrl delete */
2091                WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2092                             ctrl->state != NVME_CTRL_DELETING_NOIO);
2093                return;
2094        }
2095
2096        nvme_tcp_reconnect_or_remove(ctrl);
2097}
2098
2099static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2100{
2101        cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
2102        cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
2103
2104        nvme_tcp_teardown_io_queues(ctrl, shutdown);
2105        blk_mq_quiesce_queue(ctrl->admin_q);
2106        if (shutdown)
2107                nvme_shutdown_ctrl(ctrl);
2108        else
2109                nvme_disable_ctrl(ctrl);
2110        nvme_tcp_teardown_admin_queue(ctrl, shutdown);
2111}
2112
2113static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
2114{
2115        nvme_tcp_teardown_ctrl(ctrl, true);
2116}
2117
2118static void nvme_reset_ctrl_work(struct work_struct *work)
2119{
2120        struct nvme_ctrl *ctrl =
2121                container_of(work, struct nvme_ctrl, reset_work);
2122
2123        nvme_stop_ctrl(ctrl);
2124        nvme_tcp_teardown_ctrl(ctrl, false);
2125
2126        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2127                /* state change failure is ok if we started ctrl delete */
2128                WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2129                             ctrl->state != NVME_CTRL_DELETING_NOIO);
2130                return;
2131        }
2132
2133        if (nvme_tcp_setup_ctrl(ctrl, false))
2134                goto out_fail;
2135
2136        return;
2137
2138out_fail:
2139        ++ctrl->nr_reconnects;
2140        nvme_tcp_reconnect_or_remove(ctrl);
2141}
2142
2143static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
2144{
2145        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
2146
2147        if (list_empty(&ctrl->list))
2148                goto free_ctrl;
2149
2150        mutex_lock(&nvme_tcp_ctrl_mutex);
2151        list_del(&ctrl->list);
2152        mutex_unlock(&nvme_tcp_ctrl_mutex);
2153
2154        nvmf_free_options(nctrl->opts);
2155free_ctrl:
2156        kfree(ctrl->queues);
2157        kfree(ctrl);
2158}
2159
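/*
 * The helpers below build the single SGL descriptor carried in every
 * command capsule: a null descriptor when the command carries no data, an
 * offset-based descriptor when write data is small enough to be sent
 * in-capsule (inline), and a transport descriptor when the data is
 * transferred separately via C2HData or R2T/H2CData PDUs.
 */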
2160static void nvme_tcp_set_sg_null(struct nvme_command *c)
2161{
2162        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2163
2164        sg->addr = 0;
2165        sg->length = 0;
2166        sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2167                        NVME_SGL_FMT_TRANSPORT_A;
2168}
2169
2170static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
2171                struct nvme_command *c, u32 data_len)
2172{
2173        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2174
2175        sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
2176        sg->length = cpu_to_le32(data_len);
2177        sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
2178}
2179
2180static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
2181                u32 data_len)
2182{
2183        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2184
2185        sg->addr = 0;
2186        sg->length = cpu_to_le32(data_len);
2187        sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2188                        NVME_SGL_FMT_TRANSPORT_A;
2189}
2190
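/*
 * The AER command is built by hand rather than through blk-mq: it uses the
 * pre-allocated async_req PDU and the command id NVME_AQ_BLK_MQ_DEPTH,
 * which lies outside the range handed out by the admin tag set, so its
 * completion can be told apart from regular admin requests.
 */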
2191static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
2192{
2193        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
2194        struct nvme_tcp_queue *queue = &ctrl->queues[0];
2195        struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
2196        struct nvme_command *cmd = &pdu->cmd;
2197        u8 hdgst = nvme_tcp_hdgst_len(queue);
2198
2199        memset(pdu, 0, sizeof(*pdu));
2200        pdu->hdr.type = nvme_tcp_cmd;
2201        if (queue->hdr_digest)
2202                pdu->hdr.flags |= NVME_TCP_F_HDGST;
2203        pdu->hdr.hlen = sizeof(*pdu);
2204        pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
2205
2206        cmd->common.opcode = nvme_admin_async_event;
2207        cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
2208        cmd->common.flags |= NVME_CMD_SGL_METABUF;
2209        nvme_tcp_set_sg_null(cmd);
2210
2211        ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
2212        ctrl->async_req.offset = 0;
2213        ctrl->async_req.curr_bio = NULL;
2214        ctrl->async_req.data_len = 0;
2215
2216        nvme_tcp_queue_request(&ctrl->async_req, true, true);
2217}
2218
2219static void nvme_tcp_complete_timed_out(struct request *rq)
2220{
2221        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2222        struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2223
2224        nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
2225        if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) {
2226                nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
2227                blk_mq_complete_request(rq);
2228        }
2229}
2230
2231static enum blk_eh_timer_return
2232nvme_tcp_timeout(struct request *rq, bool reserved)
2233{
2234        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2235        struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2236        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2237
2238        dev_warn(ctrl->device,
2239                "queue %d: timeout request %#x type %d\n",
2240                nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
2241
2242        if (ctrl->state != NVME_CTRL_LIVE) {
2243                /*
2244                 * If we are resetting, connecting or deleting we should
2245                 * complete immediately because we may block controller
2246                 * teardown or setup sequence
2247                 * - ctrl disable/shutdown fabrics requests
2248                 * - connect requests
2249                 * - initialization admin requests
2250                 * - I/O requests that entered after unquiescing and
2251                 *   the controller stopped responding
2252                 *
2253                 * All other requests should be cancelled by the error
2254                 * recovery work, so it's fine that we fail it here.
2255                 */
2256                nvme_tcp_complete_timed_out(rq);
2257                return BLK_EH_DONE;
2258        }
2259
2260        /*
2261         * LIVE state should trigger the normal error recovery which will
2262         * handle completing this request.
2263         */
2264        nvme_tcp_error_recovery(ctrl);
2265        return BLK_EH_RESET_TIMER;
2266}
2267
2268static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
2269                        struct request *rq)
2270{
2271        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2272        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2273        struct nvme_command *c = &pdu->cmd;
2274
2275        c->common.flags |= NVME_CMD_SGL_METABUF;
2276
2277        if (!blk_rq_nr_phys_segments(rq))
2278                nvme_tcp_set_sg_null(c);
2279        else if (rq_data_dir(rq) == WRITE &&
2280            req->data_len <= nvme_tcp_inline_data_size(queue))
2281                nvme_tcp_set_sg_inline(queue, c, req->data_len);
2282        else
2283                nvme_tcp_set_sg_host_data(c, req->data_len);
2284
2285        return BLK_STS_OK;
2286}
2287
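/*
 * Build the command capsule PDU header.  As an illustration (assuming a
 * 72-byte command PDU and 4-byte CRC32C digests), a 4KiB inline write with
 * header and data digests enabled ends up with hlen = 72, pdo = 76 and
 * plen = 72 + 4 + 4096 + 4 = 4176, while a read carries no in-capsule data
 * and gets pdo = 0 and plen = 76.
 */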
2288static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
2289                struct request *rq)
2290{
2291        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2292        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2293        struct nvme_tcp_queue *queue = req->queue;
2294        u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
2295        blk_status_t ret;
2296
2297        ret = nvme_setup_cmd(ns, rq);
2298        if (ret)
2299                return ret;
2300
2301        req->state = NVME_TCP_SEND_CMD_PDU;
2302        req->offset = 0;
2303        req->data_sent = 0;
2304        req->pdu_len = 0;
2305        req->pdu_sent = 0;
2306        req->data_len = blk_rq_nr_phys_segments(rq) ?
2307                                blk_rq_payload_bytes(rq) : 0;
2308        req->curr_bio = rq->bio;
2309        if (req->curr_bio && req->data_len)
2310                nvme_tcp_init_iter(req, rq_data_dir(rq));
2311
2312        if (rq_data_dir(rq) == WRITE &&
2313            req->data_len <= nvme_tcp_inline_data_size(queue))
2314                req->pdu_len = req->data_len;
2315
2316        pdu->hdr.type = nvme_tcp_cmd;
2317        pdu->hdr.flags = 0;
2318        if (queue->hdr_digest)
2319                pdu->hdr.flags |= NVME_TCP_F_HDGST;
2320        if (queue->data_digest && req->pdu_len) {
2321                pdu->hdr.flags |= NVME_TCP_F_DDGST;
2322                ddgst = nvme_tcp_ddgst_len(queue);
2323        }
2324        pdu->hdr.hlen = sizeof(*pdu);
2325        pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2326        pdu->hdr.plen =
2327                cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2328
2329        ret = nvme_tcp_map_data(queue, rq);
2330        if (unlikely(ret)) {
2331                nvme_cleanup_cmd(rq);
2332                dev_err(queue->ctrl->ctrl.device,
2333                        "Failed to map data (%d)\n", ret);
2334                return ret;
2335        }
2336
2337        return 0;
2338}
2339
2340static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
2341{
2342        struct nvme_tcp_queue *queue = hctx->driver_data;
2343
2344        if (!llist_empty(&queue->req_list))
2345                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
2346}
2347
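/*
 * queue_rq only prepares the command and places it on the queue's lockless
 * send list; the actual socket sends happen from io_work, or directly from
 * nvme_tcp_queue_request() when it runs on the queue's io_cpu and can take
 * the send lock.  bd->last together with the ->commit_rqs() callback above
 * ensures a batch of queued requests ends with a kick of io_work.
 */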
2348static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2349                const struct blk_mq_queue_data *bd)
2350{
2351        struct nvme_ns *ns = hctx->queue->queuedata;
2352        struct nvme_tcp_queue *queue = hctx->driver_data;
2353        struct request *rq = bd->rq;
2354        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2355        bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2356        blk_status_t ret;
2357
2358        if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2359                return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
2360
2361        ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2362        if (unlikely(ret))
2363                return ret;
2364
2365        blk_mq_start_request(rq);
2366
2367        nvme_tcp_queue_request(req, true, bd->last);
2368
2369        return BLK_STS_OK;
2370}
2371
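/*
 * Map blk-mq hardware contexts onto the connected queues.  With the
 * illustrative 4/4/2 default/read/poll split used earlier, the default map
 * covers hardware queues 0-3, the read map queues 4-7 and the poll map
 * queues 8-9; when no dedicated read queues exist, the default and read
 * maps share the same queues at offset 0.
 */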
2372static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2373{
2374        struct nvme_tcp_ctrl *ctrl = set->driver_data;
2375        struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
2376
2377        if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
2378                /* separate read/write queues */
2379                set->map[HCTX_TYPE_DEFAULT].nr_queues =
2380                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2381                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2382                set->map[HCTX_TYPE_READ].nr_queues =
2383                        ctrl->io_queues[HCTX_TYPE_READ];
2384                set->map[HCTX_TYPE_READ].queue_offset =
2385                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2386        } else {
2387                /* shared read/write queues */
2388                set->map[HCTX_TYPE_DEFAULT].nr_queues =
2389                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2390                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2391                set->map[HCTX_TYPE_READ].nr_queues =
2392                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2393                set->map[HCTX_TYPE_READ].queue_offset = 0;
2394        }
2395        blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
2396        blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
2397
2398        if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
2399                /* map dedicated poll queues only if we have queues left */
2400                set->map[HCTX_TYPE_POLL].nr_queues =
2401                                ctrl->io_queues[HCTX_TYPE_POLL];
2402                set->map[HCTX_TYPE_POLL].queue_offset =
2403                        ctrl->io_queues[HCTX_TYPE_DEFAULT] +
2404                        ctrl->io_queues[HCTX_TYPE_READ];
2405                blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
2406        }
2407
2408        dev_info(ctrl->ctrl.device,
2409                "mapped %d/%d/%d default/read/poll queues.\n",
2410                ctrl->io_queues[HCTX_TYPE_DEFAULT],
2411                ctrl->io_queues[HCTX_TYPE_READ],
2412                ctrl->io_queues[HCTX_TYPE_POLL]);
2413
2414        return 0;
2415}
2416
2417static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
2418{
2419        struct nvme_tcp_queue *queue = hctx->driver_data;
2420        struct sock *sk = queue->sock->sk;
2421
2422        if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2423                return 0;
2424
2425        set_bit(NVME_TCP_Q_POLLING, &queue->flags);
2426        if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
2427                sk_busy_loop(sk, true);
2428        nvme_tcp_try_recv(queue);
2429        clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
2430        return queue->nr_cqe;
2431}
2432
2433static const struct blk_mq_ops nvme_tcp_mq_ops = {
2434        .queue_rq       = nvme_tcp_queue_rq,
2435        .commit_rqs     = nvme_tcp_commit_rqs,
2436        .complete       = nvme_complete_rq,
2437        .init_request   = nvme_tcp_init_request,
2438        .exit_request   = nvme_tcp_exit_request,
2439        .init_hctx      = nvme_tcp_init_hctx,
2440        .timeout        = nvme_tcp_timeout,
2441        .map_queues     = nvme_tcp_map_queues,
2442        .poll           = nvme_tcp_poll,
2443};
2444
2445static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2446        .queue_rq       = nvme_tcp_queue_rq,
2447        .complete       = nvme_complete_rq,
2448        .init_request   = nvme_tcp_init_request,
2449        .exit_request   = nvme_tcp_exit_request,
2450        .init_hctx      = nvme_tcp_init_admin_hctx,
2451        .timeout        = nvme_tcp_timeout,
2452};
2453
2454static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2455        .name                   = "tcp",
2456        .module                 = THIS_MODULE,
2457        .flags                  = NVME_F_FABRICS,
2458        .reg_read32             = nvmf_reg_read32,
2459        .reg_read64             = nvmf_reg_read64,
2460        .reg_write32            = nvmf_reg_write32,
2461        .free_ctrl              = nvme_tcp_free_ctrl,
2462        .submit_async_event     = nvme_tcp_submit_async_event,
2463        .delete_ctrl            = nvme_tcp_delete_ctrl,
2464        .get_address            = nvmf_get_address,
2465};
2466
2467static bool
2468nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2469{
2470        struct nvme_tcp_ctrl *ctrl;
2471        bool found = false;
2472
2473        mutex_lock(&nvme_tcp_ctrl_mutex);
2474        list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2475                found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2476                if (found)
2477                        break;
2478        }
2479        mutex_unlock(&nvme_tcp_ctrl_mutex);
2480
2481        return found;
2482}
2483
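/*
 * Controller creation entry point, reached through the fabrics "nvme
 * connect" path.  A hypothetical invocation (addresses and NQN made up),
 * assuming the common NVMe/TCP I/O port on the target side, would be:
 *
 *   nvme connect -t tcp -a 192.168.1.20 -s 4420 -n nqn.2018-01.example:nvm1
 *
 * If no trsvcid is supplied, the discovery port (NVME_TCP_DISC_PORT) is
 * used instead.
 */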
2484static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2485                struct nvmf_ctrl_options *opts)
2486{
2487        struct nvme_tcp_ctrl *ctrl;
2488        int ret;
2489
2490        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2491        if (!ctrl)
2492                return ERR_PTR(-ENOMEM);
2493
2494        INIT_LIST_HEAD(&ctrl->list);
2495        ctrl->ctrl.opts = opts;
2496        ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2497                                opts->nr_poll_queues + 1;
2498        ctrl->ctrl.sqsize = opts->queue_size - 1;
2499        ctrl->ctrl.kato = opts->kato;
2500
2501        INIT_DELAYED_WORK(&ctrl->connect_work,
2502                        nvme_tcp_reconnect_ctrl_work);
2503        INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2504        INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2505
2506        if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2507                opts->trsvcid =
2508                        kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2509                if (!opts->trsvcid) {
2510                        ret = -ENOMEM;
2511                        goto out_free_ctrl;
2512                }
2513                opts->mask |= NVMF_OPT_TRSVCID;
2514        }
2515
2516        ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2517                        opts->traddr, opts->trsvcid, &ctrl->addr);
2518        if (ret) {
2519                pr_err("malformed address passed: %s:%s\n",
2520                        opts->traddr, opts->trsvcid);
2521                goto out_free_ctrl;
2522        }
2523
2524        if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2525                ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2526                        opts->host_traddr, NULL, &ctrl->src_addr);
2527                if (ret) {
2528                        pr_err("malformed src address passed: %s\n",
2529                               opts->host_traddr);
2530                        goto out_free_ctrl;
2531                }
2532        }
2533
2534        if (opts->mask & NVMF_OPT_HOST_IFACE) {
2535                if (!__dev_get_by_name(&init_net, opts->host_iface)) {
2536                        pr_err("invalid interface passed: %s\n",
2537                               opts->host_iface);
2538                        ret = -ENODEV;
2539                        goto out_free_ctrl;
2540                }
2541        }
2542
2543        if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2544                ret = -EALREADY;
2545                goto out_free_ctrl;
2546        }
2547
2548        ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2549                                GFP_KERNEL);
2550        if (!ctrl->queues) {
2551                ret = -ENOMEM;
2552                goto out_free_ctrl;
2553        }
2554
2555        ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2556        if (ret)
2557                goto out_kfree_queues;
2558
2559        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2560                WARN_ON_ONCE(1);
2561                ret = -EINTR;
2562                goto out_uninit_ctrl;
2563        }
2564
2565        ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2566        if (ret)
2567                goto out_uninit_ctrl;
2568
2569        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2570                ctrl->ctrl.opts->subsysnqn, &ctrl->addr);
2571
2572        mutex_lock(&nvme_tcp_ctrl_mutex);
2573        list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2574        mutex_unlock(&nvme_tcp_ctrl_mutex);
2575
2576        return &ctrl->ctrl;
2577
2578out_uninit_ctrl:
2579        nvme_uninit_ctrl(&ctrl->ctrl);
2580        nvme_put_ctrl(&ctrl->ctrl);
2581        if (ret > 0)
2582                ret = -EIO;
2583        return ERR_PTR(ret);
2584out_kfree_queues:
2585        kfree(ctrl->queues);
2586out_free_ctrl:
2587        kfree(ctrl);
2588        return ERR_PTR(ret);
2589}
2590
2591static struct nvmf_transport_ops nvme_tcp_transport = {
2592        .name           = "tcp",
2593        .module         = THIS_MODULE,
2594        .required_opts  = NVMF_OPT_TRADDR,
2595        .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2596                          NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2597                          NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2598                          NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2599                          NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
2600        .create_ctrl    = nvme_tcp_create_ctrl,
2601};
2602
2603static int __init nvme_tcp_init_module(void)
2604{
2605        nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
2606                        WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2607        if (!nvme_tcp_wq)
2608                return -ENOMEM;
2609
2610        nvmf_register_transport(&nvme_tcp_transport);
2611        return 0;
2612}
2613
2614static void __exit nvme_tcp_cleanup_module(void)
2615{
2616        struct nvme_tcp_ctrl *ctrl;
2617
2618        nvmf_unregister_transport(&nvme_tcp_transport);
2619
2620        mutex_lock(&nvme_tcp_ctrl_mutex);
2621        list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2622                nvme_delete_ctrl(&ctrl->ctrl);
2623        mutex_unlock(&nvme_tcp_ctrl_mutex);
2624        flush_workqueue(nvme_delete_wq);
2625
2626        destroy_workqueue(nvme_tcp_wq);
2627}
2628
2629module_init(nvme_tcp_init_module);
2630module_exit(nvme_tcp_cleanup_module);
2631
2632MODULE_LICENSE("GPL v2");
2633