linux/drivers/nvme/host/tcp.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * NVMe over Fabrics TCP host.
   4 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
   5 */
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7#include <linux/module.h>
   8#include <linux/init.h>
   9#include <linux/slab.h>
  10#include <linux/err.h>
  11#include <linux/nvme-tcp.h>
  12#include <net/sock.h>
  13#include <net/tcp.h>
  14#include <linux/blk-mq.h>
  15#include <crypto/hash.h>
  16#include <net/busy_poll.h>
  17
  18#include "nvme.h"
  19#include "fabrics.h"
  20
  21struct nvme_tcp_queue;
  22
   23/* Define the socket priority to use for connections where it is desirable
   24 * that the NIC consider performing optimized packet processing or filtering.
   25 * A non-zero value is sufficient to indicate general consideration of any
   26 * possible optimization.  Making it a module param allows for alternative
   27 * values that may be unique for some NIC implementations.
   28 */
  29static int so_priority;
  30module_param(so_priority, int, 0644);
  31MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
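     /*
      * Example (illustrative values): the priority can be set at load time,
      * e.g. "modprobe nvme_tcp so_priority=1", or at runtime through
      * /sys/module/nvme_tcp/parameters/so_priority before connecting. The
      * value is applied to each queue's socket via sock_set_priority() in
      * nvme_tcp_alloc_queue(), so it only affects newly created connections.
      */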
  32
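     /*
      * Per-request send state machine, advanced by nvme_tcp_try_send(): the
      * command capsule PDU goes out first, an H2CData PDU header is emitted
      * when answering an R2T, then any payload data, and finally the data
      * digest trailer when data digests are enabled.
      */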
  33enum nvme_tcp_send_state {
  34        NVME_TCP_SEND_CMD_PDU = 0,
  35        NVME_TCP_SEND_H2C_PDU,
  36        NVME_TCP_SEND_DATA,
  37        NVME_TCP_SEND_DDGST,
  38};
  39
  40struct nvme_tcp_request {
  41        struct nvme_request     req;
  42        void                    *pdu;
  43        struct nvme_tcp_queue   *queue;
  44        u32                     data_len;
  45        u32                     pdu_len;
  46        u32                     pdu_sent;
  47        u32                     h2cdata_left;
  48        u32                     h2cdata_offset;
  49        u16                     ttag;
  50        __le16                  status;
  51        struct list_head        entry;
  52        struct llist_node       lentry;
  53        __le32                  ddgst;
  54
  55        struct bio              *curr_bio;
  56        struct iov_iter         iter;
  57
  58        /* send state */
  59        size_t                  offset;
  60        size_t                  data_sent;
  61        enum nvme_tcp_send_state state;
  62};
  63
  64enum nvme_tcp_queue_flags {
  65        NVME_TCP_Q_ALLOCATED    = 0,
  66        NVME_TCP_Q_LIVE         = 1,
  67        NVME_TCP_Q_POLLING      = 2,
  68};
  69
  70enum nvme_tcp_recv_state {
  71        NVME_TCP_RECV_PDU = 0,
  72        NVME_TCP_RECV_DATA,
  73        NVME_TCP_RECV_DDGST,
  74};
  75
  76struct nvme_tcp_ctrl;
  77struct nvme_tcp_queue {
  78        struct socket           *sock;
  79        struct work_struct      io_work;
  80        int                     io_cpu;
  81
  82        struct mutex            queue_lock;
  83        struct mutex            send_mutex;
  84        struct llist_head       req_list;
  85        struct list_head        send_list;
  86        bool                    more_requests;
  87
  88        /* recv state */
  89        void                    *pdu;
  90        int                     pdu_remaining;
  91        int                     pdu_offset;
  92        size_t                  data_remaining;
  93        size_t                  ddgst_remaining;
  94        unsigned int            nr_cqe;
  95
  96        /* send state */
  97        struct nvme_tcp_request *request;
  98
  99        int                     queue_size;
 100        u32                     maxh2cdata;
 101        size_t                  cmnd_capsule_len;
 102        struct nvme_tcp_ctrl    *ctrl;
 103        unsigned long           flags;
 104        bool                    rd_enabled;
 105
 106        bool                    hdr_digest;
 107        bool                    data_digest;
 108        struct ahash_request    *rcv_hash;
 109        struct ahash_request    *snd_hash;
 110        __le32                  exp_ddgst;
 111        __le32                  recv_ddgst;
 112
 113        struct page_frag_cache  pf_cache;
 114
 115        void (*state_change)(struct sock *);
 116        void (*data_ready)(struct sock *);
 117        void (*write_space)(struct sock *);
 118};
 119
 120struct nvme_tcp_ctrl {
 121        /* read only in the hot path */
 122        struct nvme_tcp_queue   *queues;
 123        struct blk_mq_tag_set   tag_set;
 124
 125        /* other member variables */
 126        struct list_head        list;
 127        struct blk_mq_tag_set   admin_tag_set;
 128        struct sockaddr_storage addr;
 129        struct sockaddr_storage src_addr;
 130        struct nvme_ctrl        ctrl;
 131
 132        struct work_struct      err_work;
 133        struct delayed_work     connect_work;
 134        struct nvme_tcp_request async_req;
 135        u32                     io_queues[HCTX_MAX_TYPES];
 136};
 137
 138static LIST_HEAD(nvme_tcp_ctrl_list);
 139static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
 140static struct workqueue_struct *nvme_tcp_wq;
 141static const struct blk_mq_ops nvme_tcp_mq_ops;
 142static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
 143static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
 144
 145static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
 146{
 147        return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
 148}
 149
 150static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
 151{
 152        return queue - queue->ctrl->queues;
 153}
 154
 155static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
 156{
 157        u32 queue_idx = nvme_tcp_queue_id(queue);
 158
 159        if (queue_idx == 0)
 160                return queue->ctrl->admin_tag_set.tags[queue_idx];
 161        return queue->ctrl->tag_set.tags[queue_idx - 1];
 162}
 163
 164static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
 165{
 166        return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 167}
 168
 169static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
 170{
 171        return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 172}
 173
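     /*
      * Write data that fits in the command capsule beyond the 64-byte SQE can
      * be sent inline with the command PDU. The capsule length is set in
      * nvme_tcp_alloc_queue(): sizeof(struct nvme_command) + NVME_TCP_ADMIN_CCSZ
      * for the admin queue, and ioccsz * 16 (from the controller's Identify
      * data) for I/O queues.
      */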
 174static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
 175{
 176        return queue->cmnd_capsule_len - sizeof(struct nvme_command);
 177}
 178
 179static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
 180{
 181        return req == &req->queue->ctrl->async_req;
 182}
 183
 184static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
 185{
 186        struct request *rq;
 187
 188        if (unlikely(nvme_tcp_async_req(req)))
 189                return false; /* async events don't have a request */
 190
 191        rq = blk_mq_rq_from_pdu(req);
 192
 193        return rq_data_dir(rq) == WRITE && req->data_len &&
 194                req->data_len <= nvme_tcp_inline_data_size(req->queue);
 195}
 196
 197static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
 198{
 199        return req->iter.bvec->bv_page;
 200}
 201
 202static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
 203{
 204        return req->iter.bvec->bv_offset + req->iter.iov_offset;
 205}
 206
 207static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
 208{
 209        return min_t(size_t, iov_iter_single_seg_count(&req->iter),
 210                        req->pdu_len - req->pdu_sent);
 211}
 212
 213static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
 214{
 215        return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
 216                        req->pdu_len - req->pdu_sent : 0;
 217}
 218
 219static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
 220                int len)
 221{
 222        return nvme_tcp_pdu_data_left(req) <= len;
 223}
 224
 225static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
 226                unsigned int dir)
 227{
 228        struct request *rq = blk_mq_rq_from_pdu(req);
 229        struct bio_vec *vec;
 230        unsigned int size;
 231        int nr_bvec;
 232        size_t offset;
 233
 234        if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
 235                vec = &rq->special_vec;
 236                nr_bvec = 1;
 237                size = blk_rq_payload_bytes(rq);
 238                offset = 0;
 239        } else {
 240                struct bio *bio = req->curr_bio;
 241                struct bvec_iter bi;
 242                struct bio_vec bv;
 243
 244                vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
 245                nr_bvec = 0;
 246                bio_for_each_bvec(bv, bio, bi) {
 247                        nr_bvec++;
 248                }
 249                size = bio->bi_iter.bi_size;
 250                offset = bio->bi_iter.bi_bvec_done;
 251        }
 252
 253        iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size);
 254        req->iter.iov_offset = offset;
 255}
 256
 257static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
 258                int len)
 259{
 260        req->data_sent += len;
 261        req->pdu_sent += len;
 262        iov_iter_advance(&req->iter, len);
 263        if (!iov_iter_count(&req->iter) &&
 264            req->data_sent < req->data_len) {
 265                req->curr_bio = req->curr_bio->bi_next;
 266                nvme_tcp_init_iter(req, WRITE);
 267        }
 268}
 269
 270static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
 271{
 272        int ret;
 273
 274        /* drain the send queue as much as we can... */
 275        do {
 276                ret = nvme_tcp_try_send(queue);
 277        } while (ret > 0);
 278}
 279
 280static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
 281{
 282        return !list_empty(&queue->send_list) ||
 283                !llist_empty(&queue->req_list) || queue->more_requests;
 284}
 285
 286static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 287                bool sync, bool last)
 288{
 289        struct nvme_tcp_queue *queue = req->queue;
 290        bool empty;
 291
 292        empty = llist_add(&req->lentry, &queue->req_list) &&
 293                list_empty(&queue->send_list) && !queue->request;
 294
  295        /*
  296         * If we are the first on the send_list, try to send directly;
  297         * otherwise queue io_work. Also, only do that if we are on the
  298         * same cpu, so we don't introduce contention.
  299         */
 300        if (queue->io_cpu == raw_smp_processor_id() &&
 301            sync && empty && mutex_trylock(&queue->send_mutex)) {
 302                queue->more_requests = !last;
 303                nvme_tcp_send_all(queue);
 304                queue->more_requests = false;
 305                mutex_unlock(&queue->send_mutex);
 306        }
 307
 308        if (last && nvme_tcp_queue_more(queue))
 309                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 310}
 311
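     /*
      * Splice the lockless req_list into send_list. llist_del_all() returns
      * the entries in LIFO order and list_add() prepends each one, so the two
      * reversals cancel out and send_list ends up in submission (FIFO) order.
      */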
 312static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
 313{
 314        struct nvme_tcp_request *req;
 315        struct llist_node *node;
 316
 317        for (node = llist_del_all(&queue->req_list); node; node = node->next) {
 318                req = llist_entry(node, struct nvme_tcp_request, lentry);
 319                list_add(&req->entry, &queue->send_list);
 320        }
 321}
 322
 323static inline struct nvme_tcp_request *
 324nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
 325{
 326        struct nvme_tcp_request *req;
 327
 328        req = list_first_entry_or_null(&queue->send_list,
 329                        struct nvme_tcp_request, entry);
 330        if (!req) {
 331                nvme_tcp_process_req_list(queue);
 332                req = list_first_entry_or_null(&queue->send_list,
 333                                struct nvme_tcp_request, entry);
 334                if (unlikely(!req))
 335                        return NULL;
 336        }
 337
 338        list_del(&req->entry);
 339        return req;
 340}
 341
 342static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
 343                __le32 *dgst)
 344{
 345        ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
 346        crypto_ahash_final(hash);
 347}
 348
 349static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
 350                struct page *page, off_t off, size_t len)
 351{
 352        struct scatterlist sg;
 353
 354        sg_init_marker(&sg, 1);
 355        sg_set_page(&sg, page, len, off);
 356        ahash_request_set_crypt(hash, &sg, NULL, len);
 357        crypto_ahash_update(hash);
 358}
 359
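     /*
      * Header digest helper. With digests negotiated, a PDU on the wire looks
      * roughly like:
      *
      *   [ header (hlen) ][ HDGST (4) ][ optional pad ][ data ][ DDGST (4) ]
      *
      * Both digests are CRC32C (see nvme_tcp_alloc_crypto()); the header
      * digest lives directly after the header, which is why the hash result
      * below is written to pdu + len.
      */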
 360static inline void nvme_tcp_hdgst(struct ahash_request *hash,
 361                void *pdu, size_t len)
 362{
 363        struct scatterlist sg;
 364
 365        sg_init_one(&sg, pdu, len);
 366        ahash_request_set_crypt(hash, &sg, pdu + len, len);
 367        crypto_ahash_digest(hash);
 368}
 369
 370static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
 371                void *pdu, size_t pdu_len)
 372{
 373        struct nvme_tcp_hdr *hdr = pdu;
 374        __le32 recv_digest;
 375        __le32 exp_digest;
 376
 377        if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
 378                dev_err(queue->ctrl->ctrl.device,
 379                        "queue %d: header digest flag is cleared\n",
 380                        nvme_tcp_queue_id(queue));
 381                return -EPROTO;
 382        }
 383
 384        recv_digest = *(__le32 *)(pdu + hdr->hlen);
 385        nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
 386        exp_digest = *(__le32 *)(pdu + hdr->hlen);
 387        if (recv_digest != exp_digest) {
 388                dev_err(queue->ctrl->ctrl.device,
 389                        "header digest error: recv %#x expected %#x\n",
 390                        le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
 391                return -EIO;
 392        }
 393
 394        return 0;
 395}
 396
 397static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
 398{
 399        struct nvme_tcp_hdr *hdr = pdu;
 400        u8 digest_len = nvme_tcp_hdgst_len(queue);
 401        u32 len;
 402
 403        len = le32_to_cpu(hdr->plen) - hdr->hlen -
 404                ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
 405
 406        if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
 407                dev_err(queue->ctrl->ctrl.device,
 408                        "queue %d: data digest flag is cleared\n",
  409                        nvme_tcp_queue_id(queue));
 410                return -EPROTO;
 411        }
 412        crypto_ahash_init(queue->rcv_hash);
 413
 414        return 0;
 415}
 416
 417static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
 418                struct request *rq, unsigned int hctx_idx)
 419{
 420        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 421
 422        page_frag_free(req->pdu);
 423}
 424
 425static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 426                struct request *rq, unsigned int hctx_idx,
 427                unsigned int numa_node)
 428{
 429        struct nvme_tcp_ctrl *ctrl = set->driver_data;
 430        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 431        struct nvme_tcp_cmd_pdu *pdu;
 432        int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
 433        struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
 434        u8 hdgst = nvme_tcp_hdgst_len(queue);
 435
 436        req->pdu = page_frag_alloc(&queue->pf_cache,
 437                sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
 438                GFP_KERNEL | __GFP_ZERO);
 439        if (!req->pdu)
 440                return -ENOMEM;
 441
 442        pdu = req->pdu;
 443        req->queue = queue;
 444        nvme_req(rq)->ctrl = &ctrl->ctrl;
 445        nvme_req(rq)->cmd = &pdu->cmd;
 446
 447        return 0;
 448}
 449
 450static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 451                unsigned int hctx_idx)
 452{
 453        struct nvme_tcp_ctrl *ctrl = data;
 454        struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
 455
 456        hctx->driver_data = queue;
 457        return 0;
 458}
 459
 460static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 461                unsigned int hctx_idx)
 462{
 463        struct nvme_tcp_ctrl *ctrl = data;
 464        struct nvme_tcp_queue *queue = &ctrl->queues[0];
 465
 466        hctx->driver_data = queue;
 467        return 0;
 468}
 469
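     /*
      * Derive the receive state from the per-queue counters: outstanding PDU
      * header bytes mean we are still in RECV_PDU, a pending digest trailer
      * means RECV_DDGST, otherwise we are consuming C2HData payload.
      */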
 470static enum nvme_tcp_recv_state
 471nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
 472{
 473        return  (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
 474                (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
 475                NVME_TCP_RECV_DATA;
 476}
 477
 478static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
 479{
 480        queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
 481                                nvme_tcp_hdgst_len(queue);
 482        queue->pdu_offset = 0;
 483        queue->data_remaining = -1;
 484        queue->ddgst_remaining = 0;
 485}
 486
 487static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
 488{
 489        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 490                return;
 491
 492        dev_warn(ctrl->device, "starting error recovery\n");
 493        queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
 494}
 495
 496static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
 497                struct nvme_completion *cqe)
 498{
 499        struct nvme_tcp_request *req;
 500        struct request *rq;
 501
 502        rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id);
 503        if (!rq) {
 504                dev_err(queue->ctrl->ctrl.device,
 505                        "got bad cqe.command_id %#x on queue %d\n",
 506                        cqe->command_id, nvme_tcp_queue_id(queue));
 507                nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 508                return -EINVAL;
 509        }
 510
 511        req = blk_mq_rq_to_pdu(rq);
 512        if (req->status == cpu_to_le16(NVME_SC_SUCCESS))
 513                req->status = cqe->status;
 514
 515        if (!nvme_try_complete_req(rq, req->status, cqe->result))
 516                nvme_complete_rq(rq);
 517        queue->nr_cqe++;
 518
 519        return 0;
 520}
 521
 522static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
 523                struct nvme_tcp_data_pdu *pdu)
 524{
 525        struct request *rq;
 526
 527        rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
 528        if (!rq) {
 529                dev_err(queue->ctrl->ctrl.device,
 530                        "got bad c2hdata.command_id %#x on queue %d\n",
 531                        pdu->command_id, nvme_tcp_queue_id(queue));
 532                return -ENOENT;
 533        }
 534
 535        if (!blk_rq_payload_bytes(rq)) {
 536                dev_err(queue->ctrl->ctrl.device,
 537                        "queue %d tag %#x unexpected data\n",
 538                        nvme_tcp_queue_id(queue), rq->tag);
 539                return -EIO;
 540        }
 541
 542        queue->data_remaining = le32_to_cpu(pdu->data_length);
 543
 544        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
 545            unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
 546                dev_err(queue->ctrl->ctrl.device,
 547                        "queue %d tag %#x SUCCESS set but not last PDU\n",
 548                        nvme_tcp_queue_id(queue), rq->tag);
 549                nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 550                return -EPROTO;
 551        }
 552
 553        return 0;
 554}
 555
 556static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
 557                struct nvme_tcp_rsp_pdu *pdu)
 558{
 559        struct nvme_completion *cqe = &pdu->cqe;
 560        int ret = 0;
 561
 562        /*
 563         * AEN requests are special as they don't time out and can
 564         * survive any kind of queue freeze and often don't respond to
 565         * aborts.  We don't even bother to allocate a struct request
 566         * for them but rather special case them here.
 567         */
 568        if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
 569                                     cqe->command_id)))
 570                nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
 571                                &cqe->result);
 572        else
 573                ret = nvme_tcp_process_nvme_cqe(queue, cqe);
 574
 575        return ret;
 576}
 577
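     /*
      * Build the next H2CData PDU for an R2T. The controller's MAXH2CDATA
      * (negotiated in the ICResp) caps each PDU, so a large R2T is answered
      * with a series of H2CData PDUs of at most queue->maxh2cdata bytes each,
      * and NVME_TCP_F_DATA_LAST is only set on the final chunk.
      */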
 578static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req)
 579{
 580        struct nvme_tcp_data_pdu *data = req->pdu;
 581        struct nvme_tcp_queue *queue = req->queue;
 582        struct request *rq = blk_mq_rq_from_pdu(req);
 583        u32 h2cdata_sent = req->pdu_len;
 584        u8 hdgst = nvme_tcp_hdgst_len(queue);
 585        u8 ddgst = nvme_tcp_ddgst_len(queue);
 586
 587        req->state = NVME_TCP_SEND_H2C_PDU;
 588        req->offset = 0;
 589        req->pdu_len = min(req->h2cdata_left, queue->maxh2cdata);
 590        req->pdu_sent = 0;
 591        req->h2cdata_left -= req->pdu_len;
 592        req->h2cdata_offset += h2cdata_sent;
 593
 594        memset(data, 0, sizeof(*data));
 595        data->hdr.type = nvme_tcp_h2c_data;
 596        if (!req->h2cdata_left)
 597                data->hdr.flags = NVME_TCP_F_DATA_LAST;
 598        if (queue->hdr_digest)
 599                data->hdr.flags |= NVME_TCP_F_HDGST;
 600        if (queue->data_digest)
 601                data->hdr.flags |= NVME_TCP_F_DDGST;
 602        data->hdr.hlen = sizeof(*data);
 603        data->hdr.pdo = data->hdr.hlen + hdgst;
 604        data->hdr.plen =
 605                cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
 606        data->ttag = req->ttag;
 607        data->command_id = nvme_cid(rq);
 608        data->data_offset = cpu_to_le32(req->h2cdata_offset);
 609        data->data_length = cpu_to_le32(req->pdu_len);
 610}
 611
 612static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
 613                struct nvme_tcp_r2t_pdu *pdu)
 614{
 615        struct nvme_tcp_request *req;
 616        struct request *rq;
 617        u32 r2t_length = le32_to_cpu(pdu->r2t_length);
 618        u32 r2t_offset = le32_to_cpu(pdu->r2t_offset);
 619
 620        rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id);
 621        if (!rq) {
 622                dev_err(queue->ctrl->ctrl.device,
 623                        "got bad r2t.command_id %#x on queue %d\n",
 624                        pdu->command_id, nvme_tcp_queue_id(queue));
 625                return -ENOENT;
 626        }
 627        req = blk_mq_rq_to_pdu(rq);
 628
 629        if (unlikely(!r2t_length)) {
 630                dev_err(queue->ctrl->ctrl.device,
 631                        "req %d r2t len is %u, probably a bug...\n",
 632                        rq->tag, r2t_length);
 633                return -EPROTO;
 634        }
 635
 636        if (unlikely(req->data_sent + r2t_length > req->data_len)) {
 637                dev_err(queue->ctrl->ctrl.device,
 638                        "req %d r2t len %u exceeded data len %u (%zu sent)\n",
 639                        rq->tag, r2t_length, req->data_len, req->data_sent);
 640                return -EPROTO;
 641        }
 642
 643        if (unlikely(r2t_offset < req->data_sent)) {
 644                dev_err(queue->ctrl->ctrl.device,
 645                        "req %d unexpected r2t offset %u (expected %zu)\n",
 646                        rq->tag, r2t_offset, req->data_sent);
 647                return -EPROTO;
 648        }
 649
 650        req->pdu_len = 0;
 651        req->h2cdata_left = r2t_length;
 652        req->h2cdata_offset = r2t_offset;
 653        req->ttag = pdu->ttag;
 654
 655        nvme_tcp_setup_h2c_data_pdu(req);
 656        nvme_tcp_queue_request(req, false, true);
 657
 658        return 0;
 659}
 660
 661static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 662                unsigned int *offset, size_t *len)
 663{
 664        struct nvme_tcp_hdr *hdr;
 665        char *pdu = queue->pdu;
 666        size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
 667        int ret;
 668
 669        ret = skb_copy_bits(skb, *offset,
 670                &pdu[queue->pdu_offset], rcv_len);
 671        if (unlikely(ret))
 672                return ret;
 673
 674        queue->pdu_remaining -= rcv_len;
 675        queue->pdu_offset += rcv_len;
 676        *offset += rcv_len;
 677        *len -= rcv_len;
 678        if (queue->pdu_remaining)
 679                return 0;
 680
 681        hdr = queue->pdu;
 682        if (queue->hdr_digest) {
 683                ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
 684                if (unlikely(ret))
 685                        return ret;
 686        }
 687
 688
 689        if (queue->data_digest) {
 690                ret = nvme_tcp_check_ddgst(queue, queue->pdu);
 691                if (unlikely(ret))
 692                        return ret;
 693        }
 694
 695        switch (hdr->type) {
 696        case nvme_tcp_c2h_data:
 697                return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
 698        case nvme_tcp_rsp:
 699                nvme_tcp_init_recv_ctx(queue);
 700                return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
 701        case nvme_tcp_r2t:
 702                nvme_tcp_init_recv_ctx(queue);
 703                return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
 704        default:
 705                dev_err(queue->ctrl->ctrl.device,
 706                        "unsupported pdu type (%d)\n", hdr->type);
 707                return -EINVAL;
 708        }
 709}
 710
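     /*
      * Complete a request with a synthesized NVMe status. Bit 0 of the CQE
      * status word is the phase tag, so the status code is shifted left by
      * one to land in the status field proper.
      */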
 711static inline void nvme_tcp_end_request(struct request *rq, u16 status)
 712{
 713        union nvme_result res = {};
 714
 715        if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
 716                nvme_complete_rq(rq);
 717}
 718
 719static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
 720                              unsigned int *offset, size_t *len)
 721{
 722        struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 723        struct request *rq =
 724                nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
 725        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 726
 727        while (true) {
 728                int recv_len, ret;
 729
 730                recv_len = min_t(size_t, *len, queue->data_remaining);
 731                if (!recv_len)
 732                        break;
 733
 734                if (!iov_iter_count(&req->iter)) {
 735                        req->curr_bio = req->curr_bio->bi_next;
 736
  737                        /*
  738                         * If we don't have any bios it means that the controller
  739                         * sent more data than we requested, hence error.
  740                         */
 741                        if (!req->curr_bio) {
 742                                dev_err(queue->ctrl->ctrl.device,
 743                                        "queue %d no space in request %#x",
 744                                        nvme_tcp_queue_id(queue), rq->tag);
 745                                nvme_tcp_init_recv_ctx(queue);
 746                                return -EIO;
 747                        }
 748                        nvme_tcp_init_iter(req, READ);
 749                }
 750
 751                /* we can read only from what is left in this bio */
 752                recv_len = min_t(size_t, recv_len,
 753                                iov_iter_count(&req->iter));
 754
 755                if (queue->data_digest)
 756                        ret = skb_copy_and_hash_datagram_iter(skb, *offset,
 757                                &req->iter, recv_len, queue->rcv_hash);
 758                else
 759                        ret = skb_copy_datagram_iter(skb, *offset,
 760                                        &req->iter, recv_len);
 761                if (ret) {
 762                        dev_err(queue->ctrl->ctrl.device,
 763                                "queue %d failed to copy request %#x data",
 764                                nvme_tcp_queue_id(queue), rq->tag);
 765                        return ret;
 766                }
 767
 768                *len -= recv_len;
 769                *offset += recv_len;
 770                queue->data_remaining -= recv_len;
 771        }
 772
 773        if (!queue->data_remaining) {
 774                if (queue->data_digest) {
 775                        nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
 776                        queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
 777                } else {
 778                        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 779                                nvme_tcp_end_request(rq,
 780                                                le16_to_cpu(req->status));
 781                                queue->nr_cqe++;
 782                        }
 783                        nvme_tcp_init_recv_ctx(queue);
 784                }
 785        }
 786
 787        return 0;
 788}
 789
 790static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
 791                struct sk_buff *skb, unsigned int *offset, size_t *len)
 792{
 793        struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
 794        char *ddgst = (char *)&queue->recv_ddgst;
 795        size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
 796        off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
 797        int ret;
 798
 799        ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
 800        if (unlikely(ret))
 801                return ret;
 802
 803        queue->ddgst_remaining -= recv_len;
 804        *offset += recv_len;
 805        *len -= recv_len;
 806        if (queue->ddgst_remaining)
 807                return 0;
 808
 809        if (queue->recv_ddgst != queue->exp_ddgst) {
 810                struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
 811                                        pdu->command_id);
 812                struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 813
 814                req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR);
 815
 816                dev_err(queue->ctrl->ctrl.device,
 817                        "data digest error: recv %#x expected %#x\n",
 818                        le32_to_cpu(queue->recv_ddgst),
 819                        le32_to_cpu(queue->exp_ddgst));
 820        }
 821
 822        if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
 823                struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
 824                                        pdu->command_id);
 825                struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 826
 827                nvme_tcp_end_request(rq, le16_to_cpu(req->status));
 828                queue->nr_cqe++;
 829        }
 830
 831        nvme_tcp_init_recv_ctx(queue);
 832        return 0;
 833}
 834
 835static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
 836                             unsigned int offset, size_t len)
 837{
 838        struct nvme_tcp_queue *queue = desc->arg.data;
 839        size_t consumed = len;
 840        int result;
 841
 842        while (len) {
 843                switch (nvme_tcp_recv_state(queue)) {
 844                case NVME_TCP_RECV_PDU:
 845                        result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
 846                        break;
 847                case NVME_TCP_RECV_DATA:
 848                        result = nvme_tcp_recv_data(queue, skb, &offset, &len);
 849                        break;
 850                case NVME_TCP_RECV_DDGST:
 851                        result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
 852                        break;
 853                default:
 854                        result = -EFAULT;
 855                }
 856                if (result) {
 857                        dev_err(queue->ctrl->ctrl.device,
  858                                "receive failed: %d\n", result);
 859                        queue->rd_enabled = false;
 860                        nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 861                        return result;
 862                }
 863        }
 864
 865        return consumed;
 866}
 867
 868static void nvme_tcp_data_ready(struct sock *sk)
 869{
 870        struct nvme_tcp_queue *queue;
 871
 872        read_lock_bh(&sk->sk_callback_lock);
 873        queue = sk->sk_user_data;
 874        if (likely(queue && queue->rd_enabled) &&
 875            !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
 876                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 877        read_unlock_bh(&sk->sk_callback_lock);
 878}
 879
 880static void nvme_tcp_write_space(struct sock *sk)
 881{
 882        struct nvme_tcp_queue *queue;
 883
 884        read_lock_bh(&sk->sk_callback_lock);
 885        queue = sk->sk_user_data;
 886        if (likely(queue && sk_stream_is_writeable(sk))) {
 887                clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 888                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
 889        }
 890        read_unlock_bh(&sk->sk_callback_lock);
 891}
 892
 893static void nvme_tcp_state_change(struct sock *sk)
 894{
 895        struct nvme_tcp_queue *queue;
 896
 897        read_lock_bh(&sk->sk_callback_lock);
 898        queue = sk->sk_user_data;
 899        if (!queue)
 900                goto done;
 901
 902        switch (sk->sk_state) {
 903        case TCP_CLOSE:
 904        case TCP_CLOSE_WAIT:
 905        case TCP_LAST_ACK:
 906        case TCP_FIN_WAIT1:
 907        case TCP_FIN_WAIT2:
 908                nvme_tcp_error_recovery(&queue->ctrl->ctrl);
 909                break;
 910        default:
 911                dev_info(queue->ctrl->ctrl.device,
 912                        "queue %d socket state %d\n",
 913                        nvme_tcp_queue_id(queue), sk->sk_state);
 914        }
 915
 916        queue->state_change(sk);
 917done:
 918        read_unlock_bh(&sk->sk_callback_lock);
 919}
 920
 921static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
 922{
 923        queue->request = NULL;
 924}
 925
 926static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
 927{
 928        if (nvme_tcp_async_req(req)) {
 929                union nvme_result res = {};
 930
 931                nvme_complete_async_event(&req->queue->ctrl->ctrl,
 932                                cpu_to_le16(NVME_SC_HOST_PATH_ERROR), &res);
 933        } else {
 934                nvme_tcp_end_request(blk_mq_rq_from_pdu(req),
 935                                NVME_SC_HOST_PATH_ERROR);
 936        }
 937}
 938
 939static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 940{
 941        struct nvme_tcp_queue *queue = req->queue;
 942        int req_data_len = req->data_len;
 943        u32 h2cdata_left = req->h2cdata_left;
 944
 945        while (true) {
 946                struct page *page = nvme_tcp_req_cur_page(req);
 947                size_t offset = nvme_tcp_req_cur_offset(req);
 948                size_t len = nvme_tcp_req_cur_length(req);
 949                bool last = nvme_tcp_pdu_last_send(req, len);
 950                int req_data_sent = req->data_sent;
 951                int ret, flags = MSG_DONTWAIT;
 952
 953                if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
 954                        flags |= MSG_EOR;
 955                else
 956                        flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
 957
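                     /*
                      * kernel_sendpage() takes a reference on the page, so it
                      * must not be used for pages that cannot be refcounted
                      * (e.g. slab memory); sendpage_ok() checks for that, and
                      * we fall back to sock_no_sendpage(), which copies the
                      * data instead of sending it zero-copy.
                      */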
 958                if (sendpage_ok(page)) {
 959                        ret = kernel_sendpage(queue->sock, page, offset, len,
 960                                        flags);
 961                } else {
 962                        ret = sock_no_sendpage(queue->sock, page, offset, len,
 963                                        flags);
 964                }
 965                if (ret <= 0)
 966                        return ret;
 967
 968                if (queue->data_digest)
 969                        nvme_tcp_ddgst_update(queue->snd_hash, page,
 970                                        offset, ret);
 971
  972                /*
  973                 * Update the request iterator except for the last payload send
  974                 * in the request, where we must not modify it as we may
  975                 * race with the RX path completing the request.
  976                 */
 977                if (req_data_sent + ret < req_data_len)
 978                        nvme_tcp_advance_req(req, ret);
 979
 980                /* fully successful last send in current PDU */
 981                if (last && ret == len) {
 982                        if (queue->data_digest) {
 983                                nvme_tcp_ddgst_final(queue->snd_hash,
 984                                        &req->ddgst);
 985                                req->state = NVME_TCP_SEND_DDGST;
 986                                req->offset = 0;
 987                        } else {
 988                                if (h2cdata_left)
 989                                        nvme_tcp_setup_h2c_data_pdu(req);
 990                                else
 991                                        nvme_tcp_done_send_req(queue);
 992                        }
 993                        return 1;
 994                }
 995        }
 996        return -EAGAIN;
 997}
 998
 999static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
1000{
1001        struct nvme_tcp_queue *queue = req->queue;
1002        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
1003        bool inline_data = nvme_tcp_has_inline_data(req);
1004        u8 hdgst = nvme_tcp_hdgst_len(queue);
1005        int len = sizeof(*pdu) + hdgst - req->offset;
1006        int flags = MSG_DONTWAIT;
1007        int ret;
1008
1009        if (inline_data || nvme_tcp_queue_more(queue))
1010                flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
1011        else
1012                flags |= MSG_EOR;
1013
1014        if (queue->hdr_digest && !req->offset)
1015                nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
1016
1017        ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
1018                        offset_in_page(pdu) + req->offset, len,  flags);
1019        if (unlikely(ret <= 0))
1020                return ret;
1021
1022        len -= ret;
1023        if (!len) {
1024                if (inline_data) {
1025                        req->state = NVME_TCP_SEND_DATA;
1026                        if (queue->data_digest)
1027                                crypto_ahash_init(queue->snd_hash);
1028                } else {
1029                        nvme_tcp_done_send_req(queue);
1030                }
1031                return 1;
1032        }
1033        req->offset += ret;
1034
1035        return -EAGAIN;
1036}
1037
1038static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
1039{
1040        struct nvme_tcp_queue *queue = req->queue;
1041        struct nvme_tcp_data_pdu *pdu = req->pdu;
1042        u8 hdgst = nvme_tcp_hdgst_len(queue);
1043        int len = sizeof(*pdu) - req->offset + hdgst;
1044        int ret;
1045
1046        if (queue->hdr_digest && !req->offset)
1047                nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
1048
1049        if (!req->h2cdata_left)
1050                ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
1051                                offset_in_page(pdu) + req->offset, len,
1052                                MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
1053        else
1054                ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
1055                                offset_in_page(pdu) + req->offset, len,
1056                                MSG_DONTWAIT | MSG_MORE);
1057        if (unlikely(ret <= 0))
1058                return ret;
1059
1060        len -= ret;
1061        if (!len) {
1062                req->state = NVME_TCP_SEND_DATA;
1063                if (queue->data_digest)
1064                        crypto_ahash_init(queue->snd_hash);
1065                return 1;
1066        }
1067        req->offset += ret;
1068
1069        return -EAGAIN;
1070}
1071
1072static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
1073{
1074        struct nvme_tcp_queue *queue = req->queue;
1075        size_t offset = req->offset;
1076        u32 h2cdata_left = req->h2cdata_left;
1077        int ret;
1078        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1079        struct kvec iov = {
1080                .iov_base = (u8 *)&req->ddgst + req->offset,
1081                .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
1082        };
1083
1084        if (nvme_tcp_queue_more(queue))
1085                msg.msg_flags |= MSG_MORE;
1086        else
1087                msg.msg_flags |= MSG_EOR;
1088
1089        ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1090        if (unlikely(ret <= 0))
1091                return ret;
1092
1093        if (offset + ret == NVME_TCP_DIGEST_LENGTH) {
1094                if (h2cdata_left)
1095                        nvme_tcp_setup_h2c_data_pdu(req);
1096                else
1097                        nvme_tcp_done_send_req(queue);
1098                return 1;
1099        }
1100
1101        req->offset += ret;
1102        return -EAGAIN;
1103}
1104
1105static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
1106{
1107        struct nvme_tcp_request *req;
1108        int ret = 1;
1109
1110        if (!queue->request) {
1111                queue->request = nvme_tcp_fetch_request(queue);
1112                if (!queue->request)
1113                        return 0;
1114        }
1115        req = queue->request;
1116
1117        if (req->state == NVME_TCP_SEND_CMD_PDU) {
1118                ret = nvme_tcp_try_send_cmd_pdu(req);
1119                if (ret <= 0)
1120                        goto done;
1121                if (!nvme_tcp_has_inline_data(req))
1122                        return ret;
1123        }
1124
1125        if (req->state == NVME_TCP_SEND_H2C_PDU) {
1126                ret = nvme_tcp_try_send_data_pdu(req);
1127                if (ret <= 0)
1128                        goto done;
1129        }
1130
1131        if (req->state == NVME_TCP_SEND_DATA) {
1132                ret = nvme_tcp_try_send_data(req);
1133                if (ret <= 0)
1134                        goto done;
1135        }
1136
1137        if (req->state == NVME_TCP_SEND_DDGST)
1138                ret = nvme_tcp_try_send_ddgst(req);
1139done:
1140        if (ret == -EAGAIN) {
1141                ret = 0;
1142        } else if (ret < 0) {
1143                dev_err(queue->ctrl->ctrl.device,
1144                        "failed to send request %d\n", ret);
1145                if (ret != -EPIPE && ret != -ECONNRESET)
1146                        nvme_tcp_fail_request(queue->request);
1147                nvme_tcp_done_send_req(queue);
1148        }
1149        return ret;
1150}
1151
1152static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1153{
1154        struct socket *sock = queue->sock;
1155        struct sock *sk = sock->sk;
1156        read_descriptor_t rd_desc;
1157        int consumed;
1158
1159        rd_desc.arg.data = queue;
1160        rd_desc.count = 1;
1161        lock_sock(sk);
1162        queue->nr_cqe = 0;
1163        consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1164        release_sock(sk);
1165        return consumed;
1166}
1167
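     /*
      * Per-queue I/O work: alternate between sending and receiving under a
      * budget of roughly one millisecond, then requeue ourselves if there is
      * still work pending, so a busy queue cannot monopolize its workqueue
      * worker.
      */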
1168static void nvme_tcp_io_work(struct work_struct *w)
1169{
1170        struct nvme_tcp_queue *queue =
1171                container_of(w, struct nvme_tcp_queue, io_work);
1172        unsigned long deadline = jiffies + msecs_to_jiffies(1);
1173
1174        do {
1175                bool pending = false;
1176                int result;
1177
1178                if (mutex_trylock(&queue->send_mutex)) {
1179                        result = nvme_tcp_try_send(queue);
1180                        mutex_unlock(&queue->send_mutex);
1181                        if (result > 0)
1182                                pending = true;
1183                        else if (unlikely(result < 0))
1184                                break;
1185                }
1186
1187                result = nvme_tcp_try_recv(queue);
1188                if (result > 0)
1189                        pending = true;
1190                else if (unlikely(result < 0))
1191                        return;
1192
1193                if (!pending)
1194                        return;
1195
1196        } while (!time_after(jiffies, deadline)); /* quota is exhausted */
1197
1198        queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1199}
1200
1201static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1202{
1203        struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1204
1205        ahash_request_free(queue->rcv_hash);
1206        ahash_request_free(queue->snd_hash);
1207        crypto_free_ahash(tfm);
1208}
1209
1210static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1211{
1212        struct crypto_ahash *tfm;
1213
1214        tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1215        if (IS_ERR(tfm))
1216                return PTR_ERR(tfm);
1217
1218        queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1219        if (!queue->snd_hash)
1220                goto free_tfm;
1221        ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1222
1223        queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1224        if (!queue->rcv_hash)
1225                goto free_snd_hash;
1226        ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1227
1228        return 0;
1229free_snd_hash:
1230        ahash_request_free(queue->snd_hash);
1231free_tfm:
1232        crypto_free_ahash(tfm);
1233        return -ENOMEM;
1234}
1235
1236static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1237{
1238        struct nvme_tcp_request *async = &ctrl->async_req;
1239
1240        page_frag_free(async->pdu);
1241}
1242
1243static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1244{
1245        struct nvme_tcp_queue *queue = &ctrl->queues[0];
1246        struct nvme_tcp_request *async = &ctrl->async_req;
1247        u8 hdgst = nvme_tcp_hdgst_len(queue);
1248
1249        async->pdu = page_frag_alloc(&queue->pf_cache,
1250                sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1251                GFP_KERNEL | __GFP_ZERO);
1252        if (!async->pdu)
1253                return -ENOMEM;
1254
1255        async->queue = &ctrl->queues[0];
1256        return 0;
1257}
1258
1259static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1260{
1261        struct page *page;
1262        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1263        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1264
1265        if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1266                return;
1267
1268        if (queue->hdr_digest || queue->data_digest)
1269                nvme_tcp_free_crypto(queue);
1270
1271        if (queue->pf_cache.va) {
1272                page = virt_to_head_page(queue->pf_cache.va);
1273                __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
1274                queue->pf_cache.va = NULL;
1275        }
1276        sock_release(queue->sock);
1277        kfree(queue->pdu);
1278        mutex_destroy(&queue->send_mutex);
1279        mutex_destroy(&queue->queue_lock);
1280}
1281
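     /*
      * NVMe/TCP connection initialization: send an ICReq PDU (PFV 1.0, no PDU
      * data alignment, a single outstanding R2T, the requested digests) and
      * validate the controller's ICResp, which must agree on the digest
      * settings and supplies the MAXH2CDATA value used to size H2CData PDUs.
      */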
1282static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1283{
1284        struct nvme_tcp_icreq_pdu *icreq;
1285        struct nvme_tcp_icresp_pdu *icresp;
1286        struct msghdr msg = {};
1287        struct kvec iov;
1288        bool ctrl_hdgst, ctrl_ddgst;
1289        u32 maxh2cdata;
1290        int ret;
1291
1292        icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1293        if (!icreq)
1294                return -ENOMEM;
1295
1296        icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1297        if (!icresp) {
1298                ret = -ENOMEM;
1299                goto free_icreq;
1300        }
1301
1302        icreq->hdr.type = nvme_tcp_icreq;
1303        icreq->hdr.hlen = sizeof(*icreq);
1304        icreq->hdr.pdo = 0;
1305        icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1306        icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1307        icreq->maxr2t = 0; /* single inflight r2t supported */
1308        icreq->hpda = 0; /* no alignment constraint */
1309        if (queue->hdr_digest)
1310                icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1311        if (queue->data_digest)
1312                icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1313
1314        iov.iov_base = icreq;
1315        iov.iov_len = sizeof(*icreq);
1316        ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1317        if (ret < 0)
1318                goto free_icresp;
1319
1320        memset(&msg, 0, sizeof(msg));
1321        iov.iov_base = icresp;
1322        iov.iov_len = sizeof(*icresp);
1323        ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1324                        iov.iov_len, msg.msg_flags);
1325        if (ret < 0)
1326                goto free_icresp;
1327
1328        ret = -EINVAL;
1329        if (icresp->hdr.type != nvme_tcp_icresp) {
1330                pr_err("queue %d: bad type returned %d\n",
1331                        nvme_tcp_queue_id(queue), icresp->hdr.type);
1332                goto free_icresp;
1333        }
1334
1335        if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1336                pr_err("queue %d: bad pdu length returned %d\n",
1337                        nvme_tcp_queue_id(queue), icresp->hdr.plen);
1338                goto free_icresp;
1339        }
1340
1341        if (icresp->pfv != NVME_TCP_PFV_1_0) {
1342                pr_err("queue %d: bad pfv returned %d\n",
1343                        nvme_tcp_queue_id(queue), icresp->pfv);
1344                goto free_icresp;
1345        }
1346
1347        ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1348        if ((queue->data_digest && !ctrl_ddgst) ||
1349            (!queue->data_digest && ctrl_ddgst)) {
1350                pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1351                        nvme_tcp_queue_id(queue),
1352                        queue->data_digest ? "enabled" : "disabled",
1353                        ctrl_ddgst ? "enabled" : "disabled");
1354                goto free_icresp;
1355        }
1356
1357        ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1358        if ((queue->hdr_digest && !ctrl_hdgst) ||
1359            (!queue->hdr_digest && ctrl_hdgst)) {
1360                pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1361                        nvme_tcp_queue_id(queue),
1362                        queue->hdr_digest ? "enabled" : "disabled",
1363                        ctrl_hdgst ? "enabled" : "disabled");
1364                goto free_icresp;
1365        }
1366
1367        if (icresp->cpda != 0) {
1368                pr_err("queue %d: unsupported cpda returned %d\n",
1369                        nvme_tcp_queue_id(queue), icresp->cpda);
1370                goto free_icresp;
1371        }
1372
1373        maxh2cdata = le32_to_cpu(icresp->maxdata);
1374        if ((maxh2cdata % 4) || (maxh2cdata < NVME_TCP_MIN_MAXH2CDATA)) {
1375                pr_err("queue %d: invalid maxh2cdata returned %u\n",
1376                       nvme_tcp_queue_id(queue), maxh2cdata);
1377                goto free_icresp;
1378        }
1379        queue->maxh2cdata = maxh2cdata;
1380
1381        ret = 0;
1382free_icresp:
1383        kfree(icresp);
1384free_icreq:
1385        kfree(icreq);
1386        return ret;
1387}
1388
1389static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
1390{
1391        return nvme_tcp_queue_id(queue) == 0;
1392}
1393
1394static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
1395{
1396        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1397        int qid = nvme_tcp_queue_id(queue);
1398
1399        return !nvme_tcp_admin_queue(queue) &&
1400                qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
1401}
1402
1403static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
1404{
1405        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1406        int qid = nvme_tcp_queue_id(queue);
1407
1408        return !nvme_tcp_admin_queue(queue) &&
1409                !nvme_tcp_default_queue(queue) &&
1410                qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1411                          ctrl->io_queues[HCTX_TYPE_READ];
1412}
1413
1414static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
1415{
1416        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1417        int qid = nvme_tcp_queue_id(queue);
1418
1419        return !nvme_tcp_admin_queue(queue) &&
1420                !nvme_tcp_default_queue(queue) &&
1421                !nvme_tcp_read_queue(queue) &&
1422                qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1423                          ctrl->io_queues[HCTX_TYPE_READ] +
1424                          ctrl->io_queues[HCTX_TYPE_POLL];
1425}
1426
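     /*
      * Pick the CPU that runs this queue's io_work. qid 0 is the admin queue;
      * I/O queues are laid out as default, then read, then poll queues, and
      * each class is spread across the online CPUs based on the queue's index
      * within that class.
      */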
1427static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
1428{
1429        struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1430        int qid = nvme_tcp_queue_id(queue);
1431        int n = 0;
1432
1433        if (nvme_tcp_default_queue(queue))
1434                n = qid - 1;
1435        else if (nvme_tcp_read_queue(queue))
1436                n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
1437        else if (nvme_tcp_poll_queue(queue))
1438                n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
1439                                ctrl->io_queues[HCTX_TYPE_READ] - 1;
1440        queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1441}
1442
1443static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
1444                int qid, size_t queue_size)
1445{
1446        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1447        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1448        int ret, rcv_pdu_size;
1449
1450        mutex_init(&queue->queue_lock);
1451        queue->ctrl = ctrl;
1452        init_llist_head(&queue->req_list);
1453        INIT_LIST_HEAD(&queue->send_list);
1454        mutex_init(&queue->send_mutex);
1455        INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1456        queue->queue_size = queue_size;
1457
1458        if (qid > 0)
1459                queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1460        else
1461                queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1462                                                NVME_TCP_ADMIN_CCSZ;
1463
1464        ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1465                        IPPROTO_TCP, &queue->sock);
1466        if (ret) {
1467                dev_err(nctrl->device,
1468                        "failed to create socket: %d\n", ret);
1469                goto err_destroy_mutex;
1470        }
1471
1472        /* Single syn retry */
1473        tcp_sock_set_syncnt(queue->sock->sk, 1);
1474
1475        /* Set TCP no delay */
1476        tcp_sock_set_nodelay(queue->sock->sk);
1477
1478        /*
1479         * Cleanup whatever is sitting in the TCP transmit queue on socket
1480         * close. This is done to prevent stale data from being sent should
1481         * the network connection be restored before TCP times out.
1482         */
1483        sock_no_linger(queue->sock->sk);
1484
1485        if (so_priority > 0)
1486                sock_set_priority(queue->sock->sk, so_priority);
1487
1488        /* Set socket type of service */
1489        if (nctrl->opts->tos >= 0)
1490                ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);
1491
1492        /* Set 10 seconds timeout for icresp recvmsg */
1493        queue->sock->sk->sk_rcvtimeo = 10 * HZ;
1494
1495        queue->sock->sk->sk_allocation = GFP_ATOMIC;
1496        nvme_tcp_set_queue_io_cpu(queue);
1497        queue->request = NULL;
1498        queue->data_remaining = 0;
1499        queue->ddgst_remaining = 0;
1500        queue->pdu_remaining = 0;
1501        queue->pdu_offset = 0;
1502        sk_set_memalloc(queue->sock->sk);
1503
1504        if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1505                ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1506                        sizeof(ctrl->src_addr));
1507                if (ret) {
1508                        dev_err(nctrl->device,
1509                                "failed to bind queue %d socket %d\n",
1510                                qid, ret);
1511                        goto err_sock;
1512                }
1513        }
1514
1515        if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) {
1516                char *iface = nctrl->opts->host_iface;
1517                sockptr_t optval = KERNEL_SOCKPTR(iface);
1518
1519                ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE,
1520                                      optval, strlen(iface));
1521                if (ret) {
1522                        dev_err(nctrl->device,
1523                          "failed to bind to interface %s queue %d err %d\n",
1524                          iface, qid, ret);
1525                        goto err_sock;
1526                }
1527        }
1528
1529        queue->hdr_digest = nctrl->opts->hdr_digest;
1530        queue->data_digest = nctrl->opts->data_digest;
1531        if (queue->hdr_digest || queue->data_digest) {
1532                ret = nvme_tcp_alloc_crypto(queue);
1533                if (ret) {
1534                        dev_err(nctrl->device,
1535                                "failed to allocate queue %d crypto\n", qid);
1536                        goto err_sock;
1537                }
1538        }
1539
1540        rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1541                        nvme_tcp_hdgst_len(queue);
1542        queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1543        if (!queue->pdu) {
1544                ret = -ENOMEM;
1545                goto err_crypto;
1546        }
1547
1548        dev_dbg(nctrl->device, "connecting queue %d\n",
1549                        nvme_tcp_queue_id(queue));
1550
1551        ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1552                sizeof(ctrl->addr), 0);
1553        if (ret) {
1554                dev_err(nctrl->device,
1555                        "failed to connect socket: %d\n", ret);
1556                goto err_rcv_pdu;
1557        }
1558
1559        ret = nvme_tcp_init_connection(queue);
1560        if (ret)
1561                goto err_init_connect;
1562
1563        queue->rd_enabled = true;
1564        set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1565        nvme_tcp_init_recv_ctx(queue);
1566
1567        write_lock_bh(&queue->sock->sk->sk_callback_lock);
1568        queue->sock->sk->sk_user_data = queue;
1569        queue->state_change = queue->sock->sk->sk_state_change;
1570        queue->data_ready = queue->sock->sk->sk_data_ready;
1571        queue->write_space = queue->sock->sk->sk_write_space;
1572        queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1573        queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1574        queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1575#ifdef CONFIG_NET_RX_BUSY_POLL
1576        queue->sock->sk->sk_ll_usec = 1;
1577#endif
1578        write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1579
1580        return 0;
1581
1582err_init_connect:
1583        kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1584err_rcv_pdu:
1585        kfree(queue->pdu);
1586err_crypto:
1587        if (queue->hdr_digest || queue->data_digest)
1588                nvme_tcp_free_crypto(queue);
1589err_sock:
1590        sock_release(queue->sock);
1591        queue->sock = NULL;
1592err_destroy_mutex:
1593        mutex_destroy(&queue->send_mutex);
1594        mutex_destroy(&queue->queue_lock);
1595        return ret;
1596}
1597
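/* Put back the socket callbacks that were saved when the queue was allocated. */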
1598static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1599{
1600        struct socket *sock = queue->sock;
1601
1602        write_lock_bh(&sock->sk->sk_callback_lock);
1603        sock->sk->sk_user_data  = NULL;
1604        sock->sk->sk_data_ready = queue->data_ready;
1605        sock->sk->sk_state_change = queue->state_change;
1606        sock->sk->sk_write_space  = queue->write_space;
1607        write_unlock_bh(&sock->sk->sk_callback_lock);
1608}
1609
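/*
 * Quiesce a queue: shut the socket down both ways, detach our socket
 * callbacks and make sure io_work is no longer running.
 */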
1610static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1611{
1612        kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1613        nvme_tcp_restore_sock_calls(queue);
1614        cancel_work_sync(&queue->io_work);
1615}
1616
1617static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1618{
1619        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1620        struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1621
1622        mutex_lock(&queue->queue_lock);
1623        if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1624                __nvme_tcp_stop_queue(queue);
1625        mutex_unlock(&queue->queue_lock);
1626}
1627
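/*
 * Issue the fabrics Connect command for the admin queue (idx 0) or an
 * I/O queue and mark the queue live on success; on failure the queue
 * is stopped again if it had been allocated.
 */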
1628static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1629{
1630        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1631        int ret;
1632
1633        if (idx)
1634                ret = nvmf_connect_io_queue(nctrl, idx);
1635        else
1636                ret = nvmf_connect_admin_queue(nctrl);
1637
1638        if (!ret) {
1639                set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1640        } else {
1641                if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
1642                        __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1643                dev_err(nctrl->device,
1644                        "failed to connect queue: %d ret=%d\n", idx, ret);
1645        }
1646        return ret;
1647}
1648
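/*
 * Build the blk-mq tag set for either the admin queue or the I/O
 * queues; the I/O set only sizes extra maps when dedicated poll
 * queues were requested.
 */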
1649static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
1650                bool admin)
1651{
1652        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1653        struct blk_mq_tag_set *set;
1654        int ret;
1655
1656        if (admin) {
1657                set = &ctrl->admin_tag_set;
1658                memset(set, 0, sizeof(*set));
1659                set->ops = &nvme_tcp_admin_mq_ops;
1660                set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1661                set->reserved_tags = NVMF_RESERVED_TAGS;
1662                set->numa_node = nctrl->numa_node;
1663                set->flags = BLK_MQ_F_BLOCKING;
1664                set->cmd_size = sizeof(struct nvme_tcp_request);
1665                set->driver_data = ctrl;
1666                set->nr_hw_queues = 1;
1667                set->timeout = NVME_ADMIN_TIMEOUT;
1668        } else {
1669                set = &ctrl->tag_set;
1670                memset(set, 0, sizeof(*set));
1671                set->ops = &nvme_tcp_mq_ops;
1672                set->queue_depth = nctrl->sqsize + 1;
1673                set->reserved_tags = NVMF_RESERVED_TAGS;
1674                set->numa_node = nctrl->numa_node;
1675                set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
1676                set->cmd_size = sizeof(struct nvme_tcp_request);
1677                set->driver_data = ctrl;
1678                set->nr_hw_queues = nctrl->queue_count - 1;
1679                set->timeout = NVME_IO_TIMEOUT;
1680                set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
1681        }
1682
1683        ret = blk_mq_alloc_tag_set(set);
1684        if (ret)
1685                return ERR_PTR(ret);
1686
1687        return set;
1688}
1689
1690static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1691{
1692        if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1693                cancel_work_sync(&ctrl->async_event_work);
1694                nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1695                to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1696        }
1697
1698        nvme_tcp_free_queue(ctrl, 0);
1699}
1700
1701static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1702{
1703        int i;
1704
1705        for (i = 1; i < ctrl->queue_count; i++)
1706                nvme_tcp_free_queue(ctrl, i);
1707}
1708
1709static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1710{
1711        int i;
1712
1713        for (i = 1; i < ctrl->queue_count; i++)
1714                nvme_tcp_stop_queue(ctrl, i);
1715}
1716
1717static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
1718{
1719        int i, ret = 0;
1720
1721        for (i = 1; i < ctrl->queue_count; i++) {
1722                ret = nvme_tcp_start_queue(ctrl, i);
1723                if (ret)
1724                        goto out_stop_queues;
1725        }
1726
1727        return 0;
1728
1729out_stop_queues:
1730        for (i--; i >= 1; i--)
1731                nvme_tcp_stop_queue(ctrl, i);
1732        return ret;
1733}
1734
1735static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1736{
1737        int ret;
1738
1739        ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
1740        if (ret)
1741                return ret;
1742
1743        ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1744        if (ret)
1745                goto out_free_queue;
1746
1747        return 0;
1748
1749out_free_queue:
1750        nvme_tcp_free_queue(ctrl, 0);
1751        return ret;
1752}
1753
1754static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1755{
1756        int i, ret;
1757
1758        for (i = 1; i < ctrl->queue_count; i++) {
1759                ret = nvme_tcp_alloc_queue(ctrl, i,
1760                                ctrl->sqsize + 1);
1761                if (ret)
1762                        goto out_free_queues;
1763        }
1764
1765        return 0;
1766
1767out_free_queues:
1768        for (i--; i >= 1; i--)
1769                nvme_tcp_free_queue(ctrl, i);
1770
1771        return ret;
1772}
1773
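/*
 * Upper bound on the number of I/O queues to request from the
 * controller: each requested class (default, write, poll) is capped at
 * the number of online CPUs and the results are summed.
 */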
1774static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1775{
1776        unsigned int nr_io_queues;
1777
1778        nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
1779        nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
1780        nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
1781
1782        return nr_io_queues;
1783}
1784
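/*
 * Distribute the I/O queues the controller actually granted among the
 * default, read and poll HCTX types, handing out read queues before
 * dedicated default (write) queues when both were requested.
 */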
1785static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
1786                unsigned int nr_io_queues)
1787{
1788        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1789        struct nvmf_ctrl_options *opts = nctrl->opts;
1790
1791        if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
1792                /*
1793                 * separate read/write queues
1794                 * hand out dedicated default queues only after we have
1795                 * sufficient read queues.
1796                 */
1797                ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
1798                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
1799                ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1800                        min(opts->nr_write_queues, nr_io_queues);
1801                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1802        } else {
1803                /*
1804                 * shared read/write queues
1805                 * either no write queues were requested, or we don't have
1806                 * sufficient queue count to have dedicated default queues.
1807                 */
1808                ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1809                        min(opts->nr_io_queues, nr_io_queues);
1810                nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1811        }
1812
1813        if (opts->nr_poll_queues && nr_io_queues) {
1814                /* map dedicated poll queues only if we have queues left */
1815                ctrl->io_queues[HCTX_TYPE_POLL] =
1816                        min(opts->nr_poll_queues, nr_io_queues);
1817        }
1818}
1819
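/*
 * Ask the controller for the desired number of I/O queues, distribute
 * whatever was granted among the HCTX types and allocate the transport
 * queues accordingly.
 */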
1820static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1821{
1822        unsigned int nr_io_queues;
1823        int ret;
1824
1825        nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1826        ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1827        if (ret)
1828                return ret;
1829
1830        if (nr_io_queues == 0) {
1831                dev_err(ctrl->device,
1832                        "unable to set any I/O queues\n");
1833                return -ENOMEM;
1834        }
1835
1836        ctrl->queue_count = nr_io_queues + 1;
1837        dev_info(ctrl->device,
1838                "creating %d I/O queues.\n", nr_io_queues);
1839
1840        nvme_tcp_set_io_queues(ctrl, nr_io_queues);
1841
1842        return __nvme_tcp_alloc_io_queues(ctrl);
1843}
1844
1845static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1846{
1847        nvme_tcp_stop_io_queues(ctrl);
1848        if (remove) {
1849                blk_cleanup_queue(ctrl->connect_q);
1850                blk_mq_free_tag_set(ctrl->tagset);
1851        }
1852        nvme_tcp_free_io_queues(ctrl);
1853}
1854
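/*
 * Allocate and start the I/O queues. For a brand new controller this
 * also creates the I/O tag set and connect_q; on a reconnect it instead
 * restarts and unfreezes the existing request queues, failing the
 * reconnect if the freeze wait times out, and updates the number of
 * hardware queues to match what was re-established.
 */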
1855static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1856{
1857        int ret;
1858
1859        ret = nvme_tcp_alloc_io_queues(ctrl);
1860        if (ret)
1861                return ret;
1862
1863        if (new) {
1864                ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
1865                if (IS_ERR(ctrl->tagset)) {
1866                        ret = PTR_ERR(ctrl->tagset);
1867                        goto out_free_io_queues;
1868                }
1869
1870                ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
1871                if (IS_ERR(ctrl->connect_q)) {
1872                        ret = PTR_ERR(ctrl->connect_q);
1873                        goto out_free_tag_set;
1874                }
1875        }
1876
1877        ret = nvme_tcp_start_io_queues(ctrl);
1878        if (ret)
1879                goto out_cleanup_connect_q;
1880
1881        if (!new) {
1882                nvme_start_queues(ctrl);
1883                if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
1884                        /*
1885                         * If we timed out waiting for freeze we are likely to
1886                         * be stuck.  Fail the controller initialization just
1887                         * to be safe.
1888                         */
1889                        ret = -ENODEV;
1890                        goto out_wait_freeze_timed_out;
1891                }
1892                blk_mq_update_nr_hw_queues(ctrl->tagset,
1893                        ctrl->queue_count - 1);
1894                nvme_unfreeze(ctrl);
1895        }
1896
1897        return 0;
1898
1899out_wait_freeze_timed_out:
1900        nvme_stop_queues(ctrl);
1901        nvme_sync_io_queues(ctrl);
1902        nvme_tcp_stop_io_queues(ctrl);
1903out_cleanup_connect_q:
1904        nvme_cancel_tagset(ctrl);
1905        if (new)
1906                blk_cleanup_queue(ctrl->connect_q);
1907out_free_tag_set:
1908        if (new)
1909                blk_mq_free_tag_set(ctrl->tagset);
1910out_free_io_queues:
1911        nvme_tcp_free_io_queues(ctrl);
1912        return ret;
1913}
1914
1915static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1916{
1917        nvme_tcp_stop_queue(ctrl, 0);
1918        if (remove) {
1919                blk_cleanup_queue(ctrl->admin_q);
1920                blk_cleanup_queue(ctrl->fabrics_q);
1921                blk_mq_free_tag_set(ctrl->admin_tagset);
1922        }
1923        nvme_tcp_free_admin_queue(ctrl);
1924}
1925
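/*
 * Bring up the admin queue: allocate it (plus the async event request),
 * create the admin tag set and request queues when the controller is
 * new, connect the queue, enable the controller and finish the generic
 * controller initialization.
 */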
1926static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1927{
1928        int error;
1929
1930        error = nvme_tcp_alloc_admin_queue(ctrl);
1931        if (error)
1932                return error;
1933
1934        if (new) {
1935                ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
1936                if (IS_ERR(ctrl->admin_tagset)) {
1937                        error = PTR_ERR(ctrl->admin_tagset);
1938                        goto out_free_queue;
1939                }
1940
1941                ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
1942                if (IS_ERR(ctrl->fabrics_q)) {
1943                        error = PTR_ERR(ctrl->fabrics_q);
1944                        goto out_free_tagset;
1945                }
1946
1947                ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
1948                if (IS_ERR(ctrl->admin_q)) {
1949                        error = PTR_ERR(ctrl->admin_q);
1950                        goto out_cleanup_fabrics_q;
1951                }
1952        }
1953
1954        error = nvme_tcp_start_queue(ctrl, 0);
1955        if (error)
1956                goto out_cleanup_queue;
1957
1958        error = nvme_enable_ctrl(ctrl);
1959        if (error)
1960                goto out_stop_queue;
1961
1962        nvme_start_admin_queue(ctrl);
1963
1964        error = nvme_init_ctrl_finish(ctrl);
1965        if (error)
1966                goto out_quiesce_queue;
1967
1968        return 0;
1969
1970out_quiesce_queue:
1971        nvme_stop_admin_queue(ctrl);
1972        blk_sync_queue(ctrl->admin_q);
1973out_stop_queue:
1974        nvme_tcp_stop_queue(ctrl, 0);
1975        nvme_cancel_admin_tagset(ctrl);
1976out_cleanup_queue:
1977        if (new)
1978                blk_cleanup_queue(ctrl->admin_q);
1979out_cleanup_fabrics_q:
1980        if (new)
1981                blk_cleanup_queue(ctrl->fabrics_q);
1982out_free_tagset:
1983        if (new)
1984                blk_mq_free_tag_set(ctrl->admin_tagset);
1985out_free_queue:
1986        nvme_tcp_free_admin_queue(ctrl);
1987        return error;
1988}
1989
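/*
 * Quiesce the admin request queue, stop the transport queue and cancel
 * outstanding admin commands; block-layer resources are only released
 * when the controller is being removed.
 */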
1990static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1991                bool remove)
1992{
1993        nvme_stop_admin_queue(ctrl);
1994        blk_sync_queue(ctrl->admin_q);
1995        nvme_tcp_stop_queue(ctrl, 0);
1996        nvme_cancel_admin_tagset(ctrl);
1997        if (remove)
1998                nvme_start_admin_queue(ctrl);
1999        nvme_tcp_destroy_admin_queue(ctrl, remove);
2000}
2001
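/*
 * Same as above for the I/O queues: freeze and quiesce them, stop the
 * transport queues and cancel in-flight commands; block-layer resources
 * are only released when the controller is being removed.
 */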
2002static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
2003                bool remove)
2004{
2005        if (ctrl->queue_count <= 1)
2006                return;
2007        nvme_stop_admin_queue(ctrl);
2008        nvme_start_freeze(ctrl);
2009        nvme_stop_queues(ctrl);
2010        nvme_sync_io_queues(ctrl);
2011        nvme_tcp_stop_io_queues(ctrl);
2012        nvme_cancel_tagset(ctrl);
2013        if (remove)
2014                nvme_start_queues(ctrl);
2015        nvme_tcp_destroy_io_queues(ctrl, remove);
2016}
2017
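/*
 * Called after a failed (re)connect or after error recovery: either
 * schedule another reconnect attempt after reconnect_delay or give up
 * and delete the controller, depending on nvmf_should_reconnect().
 */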
2018static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
2019{
2020        /* If we are resetting/deleting then do nothing */
2021        if (ctrl->state != NVME_CTRL_CONNECTING) {
2022                WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
2023                        ctrl->state == NVME_CTRL_LIVE);
2024                return;
2025        }
2026
2027        if (nvmf_should_reconnect(ctrl)) {
2028                dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
2029                        ctrl->opts->reconnect_delay);
2030                queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
2031                                ctrl->opts->reconnect_delay * HZ);
2032        } else {
2033                dev_info(ctrl->device, "Removing controller...\n");
2034                nvme_delete_ctrl(ctrl);
2035        }
2036}
2037
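/*
 * Establish (or re-establish) the association: bring up the admin
 * queue, validate that the controller advertises capabilities we can
 * use (no ICDOFF, SGL support), clamp the queue sizes, bring up the
 * I/O queues and move the controller to LIVE.
 */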
2038static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
2039{
2040        struct nvmf_ctrl_options *opts = ctrl->opts;
2041        int ret;
2042
2043        ret = nvme_tcp_configure_admin_queue(ctrl, new);
2044        if (ret)
2045                return ret;
2046
2047        if (ctrl->icdoff) {
2048                ret = -EOPNOTSUPP;
2049                dev_err(ctrl->device, "icdoff is not supported!\n");
2050                goto destroy_admin;
2051        }
2052
2053        if (!nvme_ctrl_sgl_supported(ctrl)) {
2054                ret = -EOPNOTSUPP;
2055                dev_err(ctrl->device, "Mandatory sgls are not supported!\n");
2056                goto destroy_admin;
2057        }
2058
2059        if (opts->queue_size > ctrl->sqsize + 1)
2060                dev_warn(ctrl->device,
2061                        "queue_size %zu > ctrl sqsize %u, clamping down\n",
2062                        opts->queue_size, ctrl->sqsize + 1);
2063
2064        if (ctrl->sqsize + 1 > ctrl->maxcmd) {
2065                dev_warn(ctrl->device,
2066                        "sqsize %u > ctrl maxcmd %u, clamping down\n",
2067                        ctrl->sqsize + 1, ctrl->maxcmd);
2068                ctrl->sqsize = ctrl->maxcmd - 1;
2069        }
2070
2071        if (ctrl->queue_count > 1) {
2072                ret = nvme_tcp_configure_io_queues(ctrl, new);
2073                if (ret)
2074                        goto destroy_admin;
2075        }
2076
2077        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
2078                /*
2079                 * state change failure is ok if we started ctrl delete,
2080                 * but not during creation of a new controller, where it
2081                 * would indicate a race with the teardown flow.
2082                 */
2083                WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2084                             ctrl->state != NVME_CTRL_DELETING_NOIO);
2085                WARN_ON_ONCE(new);
2086                ret = -EINVAL;
2087                goto destroy_io;
2088        }
2089
2090        nvme_start_ctrl(ctrl);
2091        return 0;
2092
2093destroy_io:
2094        if (ctrl->queue_count > 1) {
2095                nvme_stop_queues(ctrl);
2096                nvme_sync_io_queues(ctrl);
2097                nvme_tcp_stop_io_queues(ctrl);
2098                nvme_cancel_tagset(ctrl);
2099                nvme_tcp_destroy_io_queues(ctrl, new);
2100        }
2101destroy_admin:
2102        nvme_stop_admin_queue(ctrl);
2103        blk_sync_queue(ctrl->admin_q);
2104        nvme_tcp_stop_queue(ctrl, 0);
2105        nvme_cancel_admin_tagset(ctrl);
2106        nvme_tcp_destroy_admin_queue(ctrl, new);
2107        return ret;
2108}
2109
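/*
 * Delayed work that retries nvme_tcp_setup_ctrl(); on failure it defers
 * back to nvme_tcp_reconnect_or_remove() for another attempt or removal.
 */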
2110static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
2111{
2112        struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
2113                        struct nvme_tcp_ctrl, connect_work);
2114        struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2115
2116        ++ctrl->nr_reconnects;
2117
2118        if (nvme_tcp_setup_ctrl(ctrl, false))
2119                goto requeue;
2120
2121        dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
2122                        ctrl->nr_reconnects);
2123
2124        ctrl->nr_reconnects = 0;
2125
2126        return;
2127
2128requeue:
2129        dev_info(ctrl->device, "Failed reconnect attempt %d\n",
2130                        ctrl->nr_reconnects);
2131        nvme_tcp_reconnect_or_remove(ctrl);
2132}
2133
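/*
 * Error recovery: tear down all queues (unquiescing them so pending
 * requests fail fast), then transition to CONNECTING and let the
 * reconnect logic decide whether to retry or remove the controller.
 */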
2134static void nvme_tcp_error_recovery_work(struct work_struct *work)
2135{
2136        struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
2137                                struct nvme_tcp_ctrl, err_work);
2138        struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2139
2140        nvme_stop_keep_alive(ctrl);
2141        flush_work(&ctrl->async_event_work);
2142        nvme_tcp_teardown_io_queues(ctrl, false);
2143        /* unquiesce so that pending requests fail fast */
2144        nvme_start_queues(ctrl);
2145        nvme_tcp_teardown_admin_queue(ctrl, false);
2146        nvme_start_admin_queue(ctrl);
2147
2148        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2149                /* state change failure is ok if we started ctrl delete */
2150                WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2151                             ctrl->state != NVME_CTRL_DELETING_NOIO);
2152                return;
2153        }
2154
2155        nvme_tcp_reconnect_or_remove(ctrl);
2156}
2157
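/*
 * Common teardown used by the delete (shutdown) and reset paths: cancel
 * error/reconnect work, tear down the I/O queues, then shut down or
 * disable the controller before tearing down the admin queue.
 */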
2158static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
2159{
2160        cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
2161        cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
2162
2163        nvme_tcp_teardown_io_queues(ctrl, shutdown);
2164        nvme_stop_admin_queue(ctrl);
2165        if (shutdown)
2166                nvme_shutdown_ctrl(ctrl);
2167        else
2168                nvme_disable_ctrl(ctrl);
2169        nvme_tcp_teardown_admin_queue(ctrl, shutdown);
2170}
2171
2172static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
2173{
2174        nvme_tcp_teardown_ctrl(ctrl, true);
2175}
2176
2177static void nvme_reset_ctrl_work(struct work_struct *work)
2178{
2179        struct nvme_ctrl *ctrl =
2180                container_of(work, struct nvme_ctrl, reset_work);
2181
2182        nvme_stop_ctrl(ctrl);
2183        nvme_tcp_teardown_ctrl(ctrl, false);
2184
2185        if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
2186                /* state change failure is ok if we started ctrl delete */
2187                WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2188                             ctrl->state != NVME_CTRL_DELETING_NOIO);
2189                return;
2190        }
2191
2192        if (nvme_tcp_setup_ctrl(ctrl, false))
2193                goto out_fail;
2194
2195        return;
2196
2197out_fail:
2198        ++ctrl->nr_reconnects;
2199        nvme_tcp_reconnect_or_remove(ctrl);
2200}
2201
2202static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
2203{
2204        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
2205
2206        if (list_empty(&ctrl->list))
2207                goto free_ctrl;
2208
2209        mutex_lock(&nvme_tcp_ctrl_mutex);
2210        list_del(&ctrl->list);
2211        mutex_unlock(&nvme_tcp_ctrl_mutex);
2212
2213        nvmf_free_options(nctrl->opts);
2214free_ctrl:
2215        kfree(ctrl->queues);
2216        kfree(ctrl);
2217}
2218
2219static void nvme_tcp_set_sg_null(struct nvme_command *c)
2220{
2221        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2222
2223        sg->addr = 0;
2224        sg->length = 0;
2225        sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2226                        NVME_SGL_FMT_TRANSPORT_A;
2227}
2228
2229static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
2230                struct nvme_command *c, u32 data_len)
2231{
2232        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2233
2234        sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
2235        sg->length = cpu_to_le32(data_len);
2236        sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
2237}
2238
2239static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
2240                u32 data_len)
2241{
2242        struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
2243
2244        sg->addr = 0;
2245        sg->length = cpu_to_le32(data_len);
2246        sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
2247                        NVME_SGL_FMT_TRANSPORT_A;
2248}
2249
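/*
 * Build and queue the AER command PDU on the admin queue. The async
 * request carries no data and uses the reserved NVME_AQ_BLK_MQ_DEPTH
 * command id.
 */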
2250static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
2251{
2252        struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
2253        struct nvme_tcp_queue *queue = &ctrl->queues[0];
2254        struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
2255        struct nvme_command *cmd = &pdu->cmd;
2256        u8 hdgst = nvme_tcp_hdgst_len(queue);
2257
2258        memset(pdu, 0, sizeof(*pdu));
2259        pdu->hdr.type = nvme_tcp_cmd;
2260        if (queue->hdr_digest)
2261                pdu->hdr.flags |= NVME_TCP_F_HDGST;
2262        pdu->hdr.hlen = sizeof(*pdu);
2263        pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
2264
2265        cmd->common.opcode = nvme_admin_async_event;
2266        cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
2267        cmd->common.flags |= NVME_CMD_SGL_METABUF;
2268        nvme_tcp_set_sg_null(cmd);
2269
2270        ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
2271        ctrl->async_req.offset = 0;
2272        ctrl->async_req.curr_bio = NULL;
2273        ctrl->async_req.data_len = 0;
2274
2275        nvme_tcp_queue_request(&ctrl->async_req, true, true);
2276}
2277
2278static void nvme_tcp_complete_timed_out(struct request *rq)
2279{
2280        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2281        struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2282
2283        nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
2284        if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) {
2285                nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
2286                blk_mq_complete_request(rq);
2287        }
2288}
2289
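/*
 * blk-mq timeout handler. Outside of LIVE state the request is failed
 * on the spot (after stopping the queue) so teardown and setup
 * sequences are not blocked; in LIVE state error recovery is kicked and
 * the timer is reset so the recovery path completes the request.
 */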
2290static enum blk_eh_timer_return
2291nvme_tcp_timeout(struct request *rq, bool reserved)
2292{
2293        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2294        struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
2295        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2296
2297        dev_warn(ctrl->device,
2298                "queue %d: timeout request %#x type %d\n",
2299                nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);
2300
2301        if (ctrl->state != NVME_CTRL_LIVE) {
2302                /*
2303                 * If we are resetting, connecting or deleting we should
2304                 * complete immediately because we may block controller
2305                 * teardown or setup sequence
2306                 * - ctrl disable/shutdown fabrics requests
2307                 * - connect requests
2308                 * - initialization admin requests
2309                 * - I/O requests that entered after unquiescing and
2310                 *   the controller stopped responding
2311                 *
2312                 * All other requests should be cancelled by the error
2313                 * recovery work, so it's fine that we fail it here.
2314                 */
2315                nvme_tcp_complete_timed_out(rq);
2316                return BLK_EH_DONE;
2317        }
2318
2319        /*
2320         * LIVE state should trigger the normal error recovery which will
2321         * handle completing this request.
2322         */
2323        nvme_tcp_error_recovery(ctrl);
2324        return BLK_EH_RESET_TIMER;
2325}
2326
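/*
 * Fill in the command's SGL descriptor: a NULL SGL for requests without
 * data, an in-capsule (inline) data descriptor for small enough writes,
 * and a transport SGL pointing at host data otherwise.
 */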
2327static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
2328                        struct request *rq)
2329{
2330        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2331        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2332        struct nvme_command *c = &pdu->cmd;
2333
2334        c->common.flags |= NVME_CMD_SGL_METABUF;
2335
2336        if (!blk_rq_nr_phys_segments(rq))
2337                nvme_tcp_set_sg_null(c);
2338        else if (rq_data_dir(rq) == WRITE &&
2339            req->data_len <= nvme_tcp_inline_data_size(queue))
2340                nvme_tcp_set_sg_inline(queue, c, req->data_len);
2341        else
2342                nvme_tcp_set_sg_host_data(c, req->data_len);
2343
2344        return 0;
2345}
2346
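/*
 * Prepare the command capsule PDU for a request: initialize the send
 * state, set up the data iterator, decide whether data is sent inline,
 * and fill in the PDU header including optional header/data digests.
 */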
2347static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
2348                struct request *rq)
2349{
2350        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2351        struct nvme_tcp_cmd_pdu *pdu = req->pdu;
2352        struct nvme_tcp_queue *queue = req->queue;
2353        u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
2354        blk_status_t ret;
2355
2356        ret = nvme_setup_cmd(ns, rq);
2357        if (ret)
2358                return ret;
2359
2360        req->state = NVME_TCP_SEND_CMD_PDU;
2361        req->status = cpu_to_le16(NVME_SC_SUCCESS);
2362        req->offset = 0;
2363        req->data_sent = 0;
2364        req->pdu_len = 0;
2365        req->pdu_sent = 0;
2366        req->h2cdata_left = 0;
2367        req->data_len = blk_rq_nr_phys_segments(rq) ?
2368                                blk_rq_payload_bytes(rq) : 0;
2369        req->curr_bio = rq->bio;
2370        if (req->curr_bio && req->data_len)
2371                nvme_tcp_init_iter(req, rq_data_dir(rq));
2372
2373        if (rq_data_dir(rq) == WRITE &&
2374            req->data_len <= nvme_tcp_inline_data_size(queue))
2375                req->pdu_len = req->data_len;
2376
2377        pdu->hdr.type = nvme_tcp_cmd;
2378        pdu->hdr.flags = 0;
2379        if (queue->hdr_digest)
2380                pdu->hdr.flags |= NVME_TCP_F_HDGST;
2381        if (queue->data_digest && req->pdu_len) {
2382                pdu->hdr.flags |= NVME_TCP_F_DDGST;
2383                ddgst = nvme_tcp_ddgst_len(queue);
2384        }
2385        pdu->hdr.hlen = sizeof(*pdu);
2386        pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
2387        pdu->hdr.plen =
2388                cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);
2389
2390        ret = nvme_tcp_map_data(queue, rq);
2391        if (unlikely(ret)) {
2392                nvme_cleanup_cmd(rq);
2393                dev_err(queue->ctrl->ctrl.device,
2394                        "Failed to map data (%d)\n", ret);
2395                return ret;
2396        }
2397
2398        return 0;
2399}
2400
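/* ->commit_rqs: make sure io_work runs for requests queued without the 'last' hint. */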
2401static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
2402{
2403        struct nvme_tcp_queue *queue = hctx->driver_data;
2404
2405        if (!llist_empty(&queue->req_list))
2406                queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
2407}
2408
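/*
 * blk-mq ->queue_rq: verify the queue/controller is ready, build the
 * command PDU and hand the request to the send path via
 * nvme_tcp_queue_request().
 */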
2409static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
2410                const struct blk_mq_queue_data *bd)
2411{
2412        struct nvme_ns *ns = hctx->queue->queuedata;
2413        struct nvme_tcp_queue *queue = hctx->driver_data;
2414        struct request *rq = bd->rq;
2415        struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
2416        bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
2417        blk_status_t ret;
2418
2419        if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
2420                return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
2421
2422        ret = nvme_tcp_setup_cmd_pdu(ns, rq);
2423        if (unlikely(ret))
2424                return ret;
2425
2426        blk_mq_start_request(rq);
2427
2428        nvme_tcp_queue_request(req, true, bd->last);
2429
2430        return BLK_STS_OK;
2431}
2432
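/*
 * Map blk-mq hardware contexts to the controller's queues. With
 * dedicated write queues the default and read maps get disjoint queue
 * ranges; otherwise they share the same queues. Poll queues, when
 * present, are mapped after the default and read ranges.
 */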
2433static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
2434{
2435        struct nvme_tcp_ctrl *ctrl = set->driver_data;
2436        struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;
2437
2438        if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
2439                /* separate read/write queues */
2440                set->map[HCTX_TYPE_DEFAULT].nr_queues =
2441                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2442                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2443                set->map[HCTX_TYPE_READ].nr_queues =
2444                        ctrl->io_queues[HCTX_TYPE_READ];
2445                set->map[HCTX_TYPE_READ].queue_offset =
2446                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2447        } else {
2448                /* shared read/write queues */
2449                set->map[HCTX_TYPE_DEFAULT].nr_queues =
2450                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2451                set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
2452                set->map[HCTX_TYPE_READ].nr_queues =
2453                        ctrl->io_queues[HCTX_TYPE_DEFAULT];
2454                set->map[HCTX_TYPE_READ].queue_offset = 0;
2455        }
2456        blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
2457        blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
2458
2459        if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
2460                /* map dedicated poll queues only if we have queues left */
2461                set->map[HCTX_TYPE_POLL].nr_queues =
2462                                ctrl->io_queues[HCTX_TYPE_POLL];
2463                set->map[HCTX_TYPE_POLL].queue_offset =
2464                        ctrl->io_queues[HCTX_TYPE_DEFAULT] +
2465                        ctrl->io_queues[HCTX_TYPE_READ];
2466                blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
2467        }
2468
2469        dev_info(ctrl->ctrl.device,
2470                "mapped %d/%d/%d default/read/poll queues.\n",
2471                ctrl->io_queues[HCTX_TYPE_DEFAULT],
2472                ctrl->io_queues[HCTX_TYPE_READ],
2473                ctrl->io_queues[HCTX_TYPE_POLL]);
2474
2475        return 0;
2476}
2477
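/*
 * blk-mq ->poll handler: busy-poll the socket when possible, then reap
 * completions directly from the receive path and report how many CQEs
 * were processed.
 */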
2478static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
2479{
2480        struct nvme_tcp_queue *queue = hctx->driver_data;
2481        struct sock *sk = queue->sock->sk;
2482
2483        if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
2484                return 0;
2485
2486        set_bit(NVME_TCP_Q_POLLING, &queue->flags);
2487        if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
2488                sk_busy_loop(sk, true);
2489        nvme_tcp_try_recv(queue);
2490        clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
2491        return queue->nr_cqe;
2492}
2493
2494static const struct blk_mq_ops nvme_tcp_mq_ops = {
2495        .queue_rq       = nvme_tcp_queue_rq,
2496        .commit_rqs     = nvme_tcp_commit_rqs,
2497        .complete       = nvme_complete_rq,
2498        .init_request   = nvme_tcp_init_request,
2499        .exit_request   = nvme_tcp_exit_request,
2500        .init_hctx      = nvme_tcp_init_hctx,
2501        .timeout        = nvme_tcp_timeout,
2502        .map_queues     = nvme_tcp_map_queues,
2503        .poll           = nvme_tcp_poll,
2504};
2505
2506static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
2507        .queue_rq       = nvme_tcp_queue_rq,
2508        .complete       = nvme_complete_rq,
2509        .init_request   = nvme_tcp_init_request,
2510        .exit_request   = nvme_tcp_exit_request,
2511        .init_hctx      = nvme_tcp_init_admin_hctx,
2512        .timeout        = nvme_tcp_timeout,
2513};
2514
2515static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
2516        .name                   = "tcp",
2517        .module                 = THIS_MODULE,
2518        .flags                  = NVME_F_FABRICS,
2519        .reg_read32             = nvmf_reg_read32,
2520        .reg_read64             = nvmf_reg_read64,
2521        .reg_write32            = nvmf_reg_write32,
2522        .free_ctrl              = nvme_tcp_free_ctrl,
2523        .submit_async_event     = nvme_tcp_submit_async_event,
2524        .delete_ctrl            = nvme_tcp_delete_ctrl,
2525        .get_address            = nvmf_get_address,
2526};
2527
2528static bool
2529nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
2530{
2531        struct nvme_tcp_ctrl *ctrl;
2532        bool found = false;
2533
2534        mutex_lock(&nvme_tcp_ctrl_mutex);
2535        list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
2536                found = nvmf_ip_options_match(&ctrl->ctrl, opts);
2537                if (found)
2538                        break;
2539        }
2540        mutex_unlock(&nvme_tcp_ctrl_mutex);
2541
2542        return found;
2543}
2544
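/*
 * Fabrics create_ctrl entry point for the tcp transport: allocate the
 * controller, parse and validate the addresses and options, register it
 * with the core and establish the initial association.
 */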
2545static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
2546                struct nvmf_ctrl_options *opts)
2547{
2548        struct nvme_tcp_ctrl *ctrl;
2549        int ret;
2550
2551        ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
2552        if (!ctrl)
2553                return ERR_PTR(-ENOMEM);
2554
2555        INIT_LIST_HEAD(&ctrl->list);
2556        ctrl->ctrl.opts = opts;
2557        ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
2558                                opts->nr_poll_queues + 1;
2559        ctrl->ctrl.sqsize = opts->queue_size - 1;
2560        ctrl->ctrl.kato = opts->kato;
2561
2562        INIT_DELAYED_WORK(&ctrl->connect_work,
2563                        nvme_tcp_reconnect_ctrl_work);
2564        INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
2565        INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
2566
2567        if (!(opts->mask & NVMF_OPT_TRSVCID)) {
2568                opts->trsvcid =
2569                        kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
2570                if (!opts->trsvcid) {
2571                        ret = -ENOMEM;
2572                        goto out_free_ctrl;
2573                }
2574                opts->mask |= NVMF_OPT_TRSVCID;
2575        }
2576
2577        ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2578                        opts->traddr, opts->trsvcid, &ctrl->addr);
2579        if (ret) {
2580                pr_err("malformed address passed: %s:%s\n",
2581                        opts->traddr, opts->trsvcid);
2582                goto out_free_ctrl;
2583        }
2584
2585        if (opts->mask & NVMF_OPT_HOST_TRADDR) {
2586                ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
2587                        opts->host_traddr, NULL, &ctrl->src_addr);
2588                if (ret) {
2589                        pr_err("malformed src address passed: %s\n",
2590                               opts->host_traddr);
2591                        goto out_free_ctrl;
2592                }
2593        }
2594
2595        if (opts->mask & NVMF_OPT_HOST_IFACE) {
2596                if (!__dev_get_by_name(&init_net, opts->host_iface)) {
2597                        pr_err("invalid interface passed: %s\n",
2598                               opts->host_iface);
2599                        ret = -ENODEV;
2600                        goto out_free_ctrl;
2601                }
2602        }
2603
2604        if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
2605                ret = -EALREADY;
2606                goto out_free_ctrl;
2607        }
2608
2609        ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
2610                                GFP_KERNEL);
2611        if (!ctrl->queues) {
2612                ret = -ENOMEM;
2613                goto out_free_ctrl;
2614        }
2615
2616        ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
2617        if (ret)
2618                goto out_kfree_queues;
2619
2620        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
2621                WARN_ON_ONCE(1);
2622                ret = -EINTR;
2623                goto out_uninit_ctrl;
2624        }
2625
2626        ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
2627        if (ret)
2628                goto out_uninit_ctrl;
2629
2630        dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
2631                nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr);
2632
2633        mutex_lock(&nvme_tcp_ctrl_mutex);
2634        list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
2635        mutex_unlock(&nvme_tcp_ctrl_mutex);
2636
2637        return &ctrl->ctrl;
2638
2639out_uninit_ctrl:
2640        nvme_uninit_ctrl(&ctrl->ctrl);
2641        nvme_put_ctrl(&ctrl->ctrl);
2642        if (ret > 0)
2643                ret = -EIO;
2644        return ERR_PTR(ret);
2645out_kfree_queues:
2646        kfree(ctrl->queues);
2647out_free_ctrl:
2648        kfree(ctrl);
2649        return ERR_PTR(ret);
2650}
2651
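/*
 * Transport registration. As a rough illustration (not taken from this
 * file), a host would typically reach this transport through nvme-cli
 * with something like:
 *
 *   nvme connect -t tcp -a <traddr> -s 4420 -n <subsysnqn>
 *
 * where optional flags map onto the allowed_opts below (for example
 * header/data digest or extra write/poll queues); exact nvme-cli option
 * names may differ between versions.
 */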
2652static struct nvmf_transport_ops nvme_tcp_transport = {
2653        .name           = "tcp",
2654        .module         = THIS_MODULE,
2655        .required_opts  = NVMF_OPT_TRADDR,
2656        .allowed_opts   = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
2657                          NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
2658                          NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
2659                          NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
2660                          NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
2661        .create_ctrl    = nvme_tcp_create_ctrl,
2662};
2663
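/*
 * Module init/exit: create the high-priority, memory-reclaim safe
 * workqueue used for per-queue io_work and register the transport; on
 * exit, delete any remaining controllers before destroying it.
 */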
2664static int __init nvme_tcp_init_module(void)
2665{
2666        nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
2667                        WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2668        if (!nvme_tcp_wq)
2669                return -ENOMEM;
2670
2671        nvmf_register_transport(&nvme_tcp_transport);
2672        return 0;
2673}
2674
2675static void __exit nvme_tcp_cleanup_module(void)
2676{
2677        struct nvme_tcp_ctrl *ctrl;
2678
2679        nvmf_unregister_transport(&nvme_tcp_transport);
2680
2681        mutex_lock(&nvme_tcp_ctrl_mutex);
2682        list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
2683                nvme_delete_ctrl(&ctrl->ctrl);
2684        mutex_unlock(&nvme_tcp_ctrl_mutex);
2685        flush_workqueue(nvme_delete_wq);
2686
2687        destroy_workqueue(nvme_tcp_wq);
2688}
2689
2690module_init(nvme_tcp_init_module);
2691module_exit(nvme_tcp_cleanup_module);
2692
2693MODULE_LICENSE("GPL v2");
2694