linux/drivers/nvme/target/tcp.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * NVMe over Fabrics TCP target.
   4 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
   5 */
   6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7#include <linux/module.h>
   8#include <linux/init.h>
   9#include <linux/slab.h>
  10#include <linux/err.h>
  11#include <linux/nvme-tcp.h>
  12#include <net/sock.h>
  13#include <net/tcp.h>
  14#include <linux/inet.h>
  15#include <linux/llist.h>
  16#include <crypto/hash.h>
  17
  18#include "nvmet.h"
  19
  20#define NVMET_TCP_DEF_INLINE_DATA_SIZE  (4 * PAGE_SIZE)
  21
   22/* Define the socket priority to use for connections where it is desirable
   23 * that the NIC consider performing optimized packet processing or filtering.
   24 * A non-zero value is sufficient to indicate general consideration of any
   25 * possible optimization.  Making it a module param allows for alternative
   26 * values that may be unique for some NIC implementations.
   27 */
  28static int so_priority;
  29module_param(so_priority, int, 0644);
  30MODULE_PARM_DESC(so_priority, "nvmet tcp socket optimize priority");
  31
  32#define NVMET_TCP_RECV_BUDGET           8
  33#define NVMET_TCP_SEND_BUDGET           8
  34#define NVMET_TCP_IO_WORK_BUDGET        64
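     /*
      * These budgets bound the work done by one nvmet_tcp_io_work() pass:
      * at most NVMET_TCP_RECV_BUDGET receive operations and
      * NVMET_TCP_SEND_BUDGET send operations per iteration, and the work
      * item requeues itself once NVMET_TCP_IO_WORK_BUDGET total operations
      * have been done while work is still pending.
      */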
  35
  36enum nvmet_tcp_send_state {
  37        NVMET_TCP_SEND_DATA_PDU,
  38        NVMET_TCP_SEND_DATA,
  39        NVMET_TCP_SEND_R2T,
  40        NVMET_TCP_SEND_DDGST,
  41        NVMET_TCP_SEND_RESPONSE
  42};
  43
  44enum nvmet_tcp_recv_state {
  45        NVMET_TCP_RECV_PDU,
  46        NVMET_TCP_RECV_DATA,
  47        NVMET_TCP_RECV_DDGST,
  48        NVMET_TCP_RECV_ERR,
  49};
  50
  51enum {
  52        NVMET_TCP_F_INIT_FAILED = (1 << 0),
  53};
  54
  55struct nvmet_tcp_cmd {
  56        struct nvmet_tcp_queue          *queue;
  57        struct nvmet_req                req;
  58
  59        struct nvme_tcp_cmd_pdu         *cmd_pdu;
  60        struct nvme_tcp_rsp_pdu         *rsp_pdu;
  61        struct nvme_tcp_data_pdu        *data_pdu;
  62        struct nvme_tcp_r2t_pdu         *r2t_pdu;
  63
  64        u32                             rbytes_done;
  65        u32                             wbytes_done;
  66
  67        u32                             pdu_len;
  68        u32                             pdu_recv;
  69        int                             sg_idx;
  70        int                             nr_mapped;
  71        struct msghdr                   recv_msg;
  72        struct kvec                     *iov;
  73        u32                             flags;
  74
  75        struct list_head                entry;
  76        struct llist_node               lentry;
  77
  78        /* send state */
  79        u32                             offset;
  80        struct scatterlist              *cur_sg;
  81        enum nvmet_tcp_send_state       state;
  82
  83        __le32                          exp_ddgst;
  84        __le32                          recv_ddgst;
  85};
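     /*
      * A command tracks receive progress (rbytes_done, pdu_recv) and send
      * progress (wbytes_done, offset, cur_sg) independently, and owns
      * preallocated buffers for every PDU it may emit (C2H data, response,
      * R2T) as well as the buffer holding the received command capsule.
      */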
  86
  87enum nvmet_tcp_queue_state {
  88        NVMET_TCP_Q_CONNECTING,
  89        NVMET_TCP_Q_LIVE,
  90        NVMET_TCP_Q_DISCONNECTING,
  91};
  92
  93struct nvmet_tcp_queue {
  94        struct socket           *sock;
  95        struct nvmet_tcp_port   *port;
  96        struct work_struct      io_work;
  97        int                     cpu;
  98        struct nvmet_cq         nvme_cq;
  99        struct nvmet_sq         nvme_sq;
 100
 101        /* send state */
 102        struct nvmet_tcp_cmd    *cmds;
 103        unsigned int            nr_cmds;
 104        struct list_head        free_list;
 105        struct llist_head       resp_list;
 106        struct list_head        resp_send_list;
 107        int                     send_list_len;
 108        struct nvmet_tcp_cmd    *snd_cmd;
 109
 110        /* recv state */
 111        int                     offset;
 112        int                     left;
 113        enum nvmet_tcp_recv_state rcv_state;
 114        struct nvmet_tcp_cmd    *cmd;
 115        union nvme_tcp_pdu      pdu;
 116
 117        /* digest state */
 118        bool                    hdr_digest;
 119        bool                    data_digest;
 120        struct ahash_request    *snd_hash;
 121        struct ahash_request    *rcv_hash;
 122
 123        spinlock_t              state_lock;
 124        enum nvmet_tcp_queue_state state;
 125
 126        struct sockaddr_storage sockaddr;
 127        struct sockaddr_storage sockaddr_peer;
 128        struct work_struct      release_work;
 129
 130        int                     idx;
 131        struct list_head        queue_list;
 132
 133        struct nvmet_tcp_cmd    connect;
 134
 135        struct page_frag_cache  pf_cache;
 136
 137        void (*data_ready)(struct sock *);
 138        void (*state_change)(struct sock *);
 139        void (*write_space)(struct sock *);
 140};
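     /*
      * data_ready/state_change/write_space hold the socket's original
      * callbacks so nvmet_tcp_restore_socket_callbacks() can reinstall them
      * when the queue is torn down.
      */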
 141
 142struct nvmet_tcp_port {
 143        struct socket           *sock;
 144        struct work_struct      accept_work;
 145        struct nvmet_port       *nport;
 146        struct sockaddr_storage addr;
 147        int                     last_cpu;
 148        void (*data_ready)(struct sock *);
 149};
 150
 151static DEFINE_IDA(nvmet_tcp_queue_ida);
 152static LIST_HEAD(nvmet_tcp_queue_list);
 153static DEFINE_MUTEX(nvmet_tcp_queue_mutex);
 154
 155static struct workqueue_struct *nvmet_tcp_wq;
 156static const struct nvmet_fabrics_ops nvmet_tcp_ops;
 157static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c);
 158static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd);
 159
 160static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue,
 161                struct nvmet_tcp_cmd *cmd)
 162{
 163        if (unlikely(!queue->nr_cmds)) {
 164                /* We didn't allocate cmds yet, send 0xffff */
 165                return USHRT_MAX;
 166        }
 167
 168        return cmd - queue->cmds;
 169}
 170
 171static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
 172{
 173        return nvme_is_write(cmd->req.cmd) &&
 174                cmd->rbytes_done < cmd->req.transfer_len;
 175}
 176
 177static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
 178{
 179        return nvmet_tcp_has_data_in(cmd) && !cmd->req.cqe->status;
 180}
 181
 182static inline bool nvmet_tcp_need_data_out(struct nvmet_tcp_cmd *cmd)
 183{
 184        return !nvme_is_write(cmd->req.cmd) &&
 185                cmd->req.transfer_len > 0 &&
 186                !cmd->req.cqe->status;
 187}
 188
 189static inline bool nvmet_tcp_has_inline_data(struct nvmet_tcp_cmd *cmd)
 190{
 191        return nvme_is_write(cmd->req.cmd) && cmd->pdu_len &&
 192                !cmd->rbytes_done;
 193}
 194
 195static inline struct nvmet_tcp_cmd *
 196nvmet_tcp_get_cmd(struct nvmet_tcp_queue *queue)
 197{
 198        struct nvmet_tcp_cmd *cmd;
 199
 200        cmd = list_first_entry_or_null(&queue->free_list,
 201                                struct nvmet_tcp_cmd, entry);
 202        if (!cmd)
 203                return NULL;
 204        list_del_init(&cmd->entry);
 205
 206        cmd->rbytes_done = cmd->wbytes_done = 0;
 207        cmd->pdu_len = 0;
 208        cmd->pdu_recv = 0;
 209        cmd->iov = NULL;
 210        cmd->flags = 0;
 211        return cmd;
 212}
 213
 214static inline void nvmet_tcp_put_cmd(struct nvmet_tcp_cmd *cmd)
 215{
 216        if (unlikely(cmd == &cmd->queue->connect))
 217                return;
 218
 219        list_add_tail(&cmd->entry, &cmd->queue->free_list);
 220}
 221
 222static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
 223{
 224        return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 225}
 226
 227static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
 228{
 229        return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
 230}
 231
 232static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
 233                void *pdu, size_t len)
 234{
 235        struct scatterlist sg;
 236
 237        sg_init_one(&sg, pdu, len);
 238        ahash_request_set_crypt(hash, &sg, pdu + len, len);
 239        crypto_ahash_digest(hash);
 240}
 241
 242static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue,
 243        void *pdu, size_t len)
 244{
 245        struct nvme_tcp_hdr *hdr = pdu;
 246        __le32 recv_digest;
 247        __le32 exp_digest;
 248
 249        if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
 250                pr_err("queue %d: header digest enabled but no header digest\n",
 251                        queue->idx);
 252                return -EPROTO;
 253        }
 254
 255        recv_digest = *(__le32 *)(pdu + hdr->hlen);
 256        nvmet_tcp_hdgst(queue->rcv_hash, pdu, len);
 257        exp_digest = *(__le32 *)(pdu + hdr->hlen);
 258        if (recv_digest != exp_digest) {
 259                pr_err("queue %d: header digest error: recv %#x expected %#x\n",
 260                        queue->idx, le32_to_cpu(recv_digest),
 261                        le32_to_cpu(exp_digest));
 262                return -EPROTO;
 263        }
 264
 265        return 0;
 266}
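     /*
      * The header digest is a crc32c over the PDU header.  nvmet_tcp_hdgst()
      * stores the 4-byte result directly behind the hashed bytes (at
      * pdu + len), which is where HDGST sits on the wire;
      * nvmet_tcp_verify_hdgst() saves the received digest, recomputes it in
      * place over hdr->hlen bytes and compares the two values.
      */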
 267
 268static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu)
 269{
 270        struct nvme_tcp_hdr *hdr = pdu;
 271        u8 digest_len = nvmet_tcp_hdgst_len(queue);
 272        u32 len;
 273
 274        len = le32_to_cpu(hdr->plen) - hdr->hlen -
 275                (hdr->flags & NVME_TCP_F_HDGST ? digest_len : 0);
 276
 277        if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
 278                pr_err("queue %d: data digest flag is cleared\n", queue->idx);
 279                return -EPROTO;
 280        }
 281
 282        return 0;
 283}
 284
 285static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd)
 286{
 287        struct scatterlist *sg;
 288        int i;
 289
 290        sg = &cmd->req.sg[cmd->sg_idx];
 291
 292        for (i = 0; i < cmd->nr_mapped; i++)
 293                kunmap(sg_page(&sg[i]));
 294}
 295
 296static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd)
 297{
 298        struct kvec *iov = cmd->iov;
 299        struct scatterlist *sg;
 300        u32 length, offset, sg_offset;
 301
 302        length = cmd->pdu_len;
 303        cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE);
 304        offset = cmd->rbytes_done;
 305        cmd->sg_idx = DIV_ROUND_UP(offset, PAGE_SIZE);
 306        sg_offset = offset % PAGE_SIZE;
 307        sg = &cmd->req.sg[cmd->sg_idx];
 308
 309        while (length) {
 310                u32 iov_len = min_t(u32, length, sg->length - sg_offset);
 311
 312                iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset;
 313                iov->iov_len = iov_len;
 314
 315                length -= iov_len;
 316                sg = sg_next(sg);
 317                iov++;
 318        }
 319
 320        iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov,
 321                cmd->nr_mapped, cmd->pdu_len);
 322}
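     /*
      * Map the scatterlist pages backing the current receive window
      * (starting at rbytes_done) into cmd->iov and prime recv_msg's
      * iterator, so nvmet_tcp_try_recv_data() can copy the next pdu_len
      * bytes from the socket straight into the command's data buffer.
      */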
 323
 324static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue)
 325{
 326        queue->rcv_state = NVMET_TCP_RECV_ERR;
 327        if (queue->nvme_sq.ctrl)
 328                nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
 329        else
 330                kernel_sock_shutdown(queue->sock, SHUT_RDWR);
 331}
 332
 333static void nvmet_tcp_socket_error(struct nvmet_tcp_queue *queue, int status)
 334{
 335        if (status == -EPIPE || status == -ECONNRESET)
 336                kernel_sock_shutdown(queue->sock, SHUT_RDWR);
 337        else
 338                nvmet_tcp_fatal_error(queue);
 339}
 340
 341static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd)
 342{
 343        struct nvme_sgl_desc *sgl = &cmd->req.cmd->common.dptr.sgl;
 344        u32 len = le32_to_cpu(sgl->length);
 345
 346        if (!len)
 347                return 0;
 348
 349        if (sgl->type == ((NVME_SGL_FMT_DATA_DESC << 4) |
 350                          NVME_SGL_FMT_OFFSET)) {
 351                if (!nvme_is_write(cmd->req.cmd))
 352                        return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 353
 354                if (len > cmd->req.port->inline_data_size)
 355                        return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
 356                cmd->pdu_len = len;
 357        }
 358        cmd->req.transfer_len += len;
 359
 360        cmd->req.sg = sgl_alloc(len, GFP_KERNEL, &cmd->req.sg_cnt);
 361        if (!cmd->req.sg)
 362                return NVME_SC_INTERNAL;
 363        cmd->cur_sg = cmd->req.sg;
 364
 365        if (nvmet_tcp_has_data_in(cmd)) {
 366                cmd->iov = kmalloc_array(cmd->req.sg_cnt,
 367                                sizeof(*cmd->iov), GFP_KERNEL);
 368                if (!cmd->iov)
 369                        goto err;
 370        }
 371
 372        return 0;
 373err:
 374        sgl_free(cmd->req.sg);
 375        return NVME_SC_INTERNAL;
 376}
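     /*
      * A data block SGL descriptor with the offset subtype means in-capsule
      * data: pdu_len is set so the payload that follows the command PDU is
      * received inline.  Any other transfer happens later, via R2T for
      * host-to-controller writes or C2H DATA PDUs for reads.  The iovec
      * array is only needed in the receive direction.
      */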
 377
 378static void nvmet_tcp_ddgst(struct ahash_request *hash,
 379                struct nvmet_tcp_cmd *cmd)
 380{
 381        ahash_request_set_crypt(hash, cmd->req.sg,
 382                (void *)&cmd->exp_ddgst, cmd->req.transfer_len);
 383        crypto_ahash_digest(hash);
 384}
 385
 386static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd)
 387{
 388        struct nvme_tcp_data_pdu *pdu = cmd->data_pdu;
 389        struct nvmet_tcp_queue *queue = cmd->queue;
 390        u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 391        u8 ddgst = nvmet_tcp_ddgst_len(cmd->queue);
 392
 393        cmd->offset = 0;
 394        cmd->state = NVMET_TCP_SEND_DATA_PDU;
 395
 396        pdu->hdr.type = nvme_tcp_c2h_data;
 397        pdu->hdr.flags = NVME_TCP_F_DATA_LAST | (queue->nvme_sq.sqhd_disabled ?
 398                                                NVME_TCP_F_DATA_SUCCESS : 0);
 399        pdu->hdr.hlen = sizeof(*pdu);
 400        pdu->hdr.pdo = pdu->hdr.hlen + hdgst;
 401        pdu->hdr.plen =
 402                cpu_to_le32(pdu->hdr.hlen + hdgst +
 403                                cmd->req.transfer_len + ddgst);
 404        pdu->command_id = cmd->req.cqe->command_id;
 405        pdu->data_length = cpu_to_le32(cmd->req.transfer_len);
 406        pdu->data_offset = cpu_to_le32(cmd->wbytes_done);
 407
 408        if (queue->data_digest) {
 409                pdu->hdr.flags |= NVME_TCP_F_DDGST;
 410                nvmet_tcp_ddgst(queue->snd_hash, cmd);
 411        }
 412
 413        if (cmd->queue->hdr_digest) {
 414                pdu->hdr.flags |= NVME_TCP_F_HDGST;
 415                nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 416        }
 417}
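     /*
      * The whole controller-to-host transfer goes out as one C2H DATA PDU
      * (DATA_LAST is always set), so plen covers the header, header digest,
      * full transfer length and trailing data digest, and the data digest
      * can be computed up front over the complete scatterlist.
      */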
 418
 419static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd)
 420{
 421        struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu;
 422        struct nvmet_tcp_queue *queue = cmd->queue;
 423        u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 424
 425        cmd->offset = 0;
 426        cmd->state = NVMET_TCP_SEND_R2T;
 427
 428        pdu->hdr.type = nvme_tcp_r2t;
 429        pdu->hdr.flags = 0;
 430        pdu->hdr.hlen = sizeof(*pdu);
 431        pdu->hdr.pdo = 0;
 432        pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
 433
 434        pdu->command_id = cmd->req.cmd->common.command_id;
 435        pdu->ttag = nvmet_tcp_cmd_tag(cmd->queue, cmd);
 436        pdu->r2t_length = cpu_to_le32(cmd->req.transfer_len - cmd->rbytes_done);
 437        pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done);
 438        if (cmd->queue->hdr_digest) {
 439                pdu->hdr.flags |= NVME_TCP_F_HDGST;
 440                nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 441        }
 442}
 443
 444static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
 445{
 446        struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu;
 447        struct nvmet_tcp_queue *queue = cmd->queue;
 448        u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 449
 450        cmd->offset = 0;
 451        cmd->state = NVMET_TCP_SEND_RESPONSE;
 452
 453        pdu->hdr.type = nvme_tcp_rsp;
 454        pdu->hdr.flags = 0;
 455        pdu->hdr.hlen = sizeof(*pdu);
 456        pdu->hdr.pdo = 0;
 457        pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);
 458        if (cmd->queue->hdr_digest) {
 459                pdu->hdr.flags |= NVME_TCP_F_HDGST;
 460                nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
 461        }
 462}
 463
 464static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
 465{
 466        struct llist_node *node;
 467        struct nvmet_tcp_cmd *cmd;
 468
 469        for (node = llist_del_all(&queue->resp_list); node; node = node->next) {
 470                cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry);
 471                list_add(&cmd->entry, &queue->resp_send_list);
 472                queue->send_list_len++;
 473        }
 474}
 475
 476static struct nvmet_tcp_cmd *nvmet_tcp_fetch_cmd(struct nvmet_tcp_queue *queue)
 477{
 478        queue->snd_cmd = list_first_entry_or_null(&queue->resp_send_list,
 479                                struct nvmet_tcp_cmd, entry);
 480        if (!queue->snd_cmd) {
 481                nvmet_tcp_process_resp_list(queue);
 482                queue->snd_cmd =
 483                        list_first_entry_or_null(&queue->resp_send_list,
 484                                        struct nvmet_tcp_cmd, entry);
 485                if (unlikely(!queue->snd_cmd))
 486                        return NULL;
 487        }
 488
 489        list_del_init(&queue->snd_cmd->entry);
 490        queue->send_list_len--;
 491
 492        if (nvmet_tcp_need_data_out(queue->snd_cmd))
 493                nvmet_setup_c2h_data_pdu(queue->snd_cmd);
 494        else if (nvmet_tcp_need_data_in(queue->snd_cmd))
 495                nvmet_setup_r2t_pdu(queue->snd_cmd);
 496        else
 497                nvmet_setup_response_pdu(queue->snd_cmd);
 498
 499        return queue->snd_cmd;
 500}
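     /*
      * Completions are queued lock-free on resp_list (an llist) by
      * nvmet_tcp_queue_response(), which may be called from any completion
      * context.  In io_work context, nvmet_tcp_fetch_cmd() splices them onto
      * resp_send_list and picks the next command to transmit, preparing a
      * C2H DATA PDU, an R2T or a plain response capsule depending on the
      * command's data direction.
      */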
 501
 502static void nvmet_tcp_queue_response(struct nvmet_req *req)
 503{
 504        struct nvmet_tcp_cmd *cmd =
 505                container_of(req, struct nvmet_tcp_cmd, req);
 506        struct nvmet_tcp_queue  *queue = cmd->queue;
 507
 508        llist_add(&cmd->lentry, &queue->resp_list);
 509        queue_work_on(cmd->queue->cpu, nvmet_tcp_wq, &cmd->queue->io_work);
 510}
 511
 512static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
 513{
 514        u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 515        int left = sizeof(*cmd->data_pdu) - cmd->offset + hdgst;
 516        int ret;
 517
 518        ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->data_pdu),
 519                        offset_in_page(cmd->data_pdu) + cmd->offset,
 520                        left, MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
 521        if (ret <= 0)
 522                return ret;
 523
 524        cmd->offset += ret;
 525        left -= ret;
 526
 527        if (left)
 528                return -EAGAIN;
 529
 530        cmd->state = NVMET_TCP_SEND_DATA;
 531        cmd->offset  = 0;
 532        return 1;
 533}
 534
 535static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
 536{
 537        struct nvmet_tcp_queue *queue = cmd->queue;
 538        int ret;
 539
 540        while (cmd->cur_sg) {
 541                struct page *page = sg_page(cmd->cur_sg);
 542                u32 left = cmd->cur_sg->length - cmd->offset;
 543                int flags = MSG_DONTWAIT;
 544
 545                if ((!last_in_batch && cmd->queue->send_list_len) ||
 546                    cmd->wbytes_done + left < cmd->req.transfer_len ||
 547                    queue->data_digest || !queue->nvme_sq.sqhd_disabled)
 548                        flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
 549
 550                ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
 551                                        left, flags);
 552                if (ret <= 0)
 553                        return ret;
 554
 555                cmd->offset += ret;
 556                cmd->wbytes_done += ret;
 557
  558                /* Done with sg? */
 559                if (cmd->offset == cmd->cur_sg->length) {
 560                        cmd->cur_sg = sg_next(cmd->cur_sg);
 561                        cmd->offset = 0;
 562                }
 563        }
 564
 565        if (queue->data_digest) {
 566                cmd->state = NVMET_TCP_SEND_DDGST;
 567                cmd->offset = 0;
 568        } else {
 569                if (queue->nvme_sq.sqhd_disabled) {
 570                        cmd->queue->snd_cmd = NULL;
 571                        nvmet_tcp_put_cmd(cmd);
 572                } else {
 573                        nvmet_setup_response_pdu(cmd);
 574                }
 575        }
 576
 577        if (queue->nvme_sq.sqhd_disabled) {
 578                kfree(cmd->iov);
 579                sgl_free(cmd->req.sg);
 580        }
 581
 582        return 1;
 583
 584}
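     /*
      * MSG_MORE/MSG_SENDPAGE_NOTLAST tell the stack more data follows so it
      * can coalesce segments; they are only dropped for the final fragment
      * of the last command in a batch when neither a data digest nor a
      * response PDU will follow, letting the payload go out immediately.
      */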
 585
 586static int nvmet_try_send_response(struct nvmet_tcp_cmd *cmd,
 587                bool last_in_batch)
 588{
 589        u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 590        int left = sizeof(*cmd->rsp_pdu) - cmd->offset + hdgst;
 591        int flags = MSG_DONTWAIT;
 592        int ret;
 593
 594        if (!last_in_batch && cmd->queue->send_list_len)
 595                flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
 596        else
 597                flags |= MSG_EOR;
 598
 599        ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->rsp_pdu),
 600                offset_in_page(cmd->rsp_pdu) + cmd->offset, left, flags);
 601        if (ret <= 0)
 602                return ret;
 603        cmd->offset += ret;
 604        left -= ret;
 605
 606        if (left)
 607                return -EAGAIN;
 608
 609        kfree(cmd->iov);
 610        sgl_free(cmd->req.sg);
 611        cmd->queue->snd_cmd = NULL;
 612        nvmet_tcp_put_cmd(cmd);
 613        return 1;
 614}
 615
 616static int nvmet_try_send_r2t(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
 617{
 618        u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue);
 619        int left = sizeof(*cmd->r2t_pdu) - cmd->offset + hdgst;
 620        int flags = MSG_DONTWAIT;
 621        int ret;
 622
 623        if (!last_in_batch && cmd->queue->send_list_len)
 624                flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
 625        else
 626                flags |= MSG_EOR;
 627
 628        ret = kernel_sendpage(cmd->queue->sock, virt_to_page(cmd->r2t_pdu),
 629                offset_in_page(cmd->r2t_pdu) + cmd->offset, left, flags);
 630        if (ret <= 0)
 631                return ret;
 632        cmd->offset += ret;
 633        left -= ret;
 634
 635        if (left)
 636                return -EAGAIN;
 637
 638        cmd->queue->snd_cmd = NULL;
 639        return 1;
 640}
 641
 642static int nvmet_try_send_ddgst(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
 643{
 644        struct nvmet_tcp_queue *queue = cmd->queue;
 645        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
 646        struct kvec iov = {
 647                .iov_base = &cmd->exp_ddgst + cmd->offset,
 648                .iov_len = NVME_TCP_DIGEST_LENGTH - cmd->offset
 649        };
 650        int ret;
 651
 652        if (!last_in_batch && cmd->queue->send_list_len)
 653                msg.msg_flags |= MSG_MORE;
 654        else
 655                msg.msg_flags |= MSG_EOR;
 656
 657        ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
 658        if (unlikely(ret <= 0))
 659                return ret;
 660
 661        cmd->offset += ret;
 662
 663        if (queue->nvme_sq.sqhd_disabled) {
 664                cmd->queue->snd_cmd = NULL;
 665                nvmet_tcp_put_cmd(cmd);
 666        } else {
 667                nvmet_setup_response_pdu(cmd);
 668        }
 669        return 1;
 670}
 671
 672static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
 673                bool last_in_batch)
 674{
 675        struct nvmet_tcp_cmd *cmd = queue->snd_cmd;
 676        int ret = 0;
 677
 678        if (!cmd || queue->state == NVMET_TCP_Q_DISCONNECTING) {
 679                cmd = nvmet_tcp_fetch_cmd(queue);
 680                if (unlikely(!cmd))
 681                        return 0;
 682        }
 683
 684        if (cmd->state == NVMET_TCP_SEND_DATA_PDU) {
 685                ret = nvmet_try_send_data_pdu(cmd);
 686                if (ret <= 0)
 687                        goto done_send;
 688        }
 689
 690        if (cmd->state == NVMET_TCP_SEND_DATA) {
 691                ret = nvmet_try_send_data(cmd, last_in_batch);
 692                if (ret <= 0)
 693                        goto done_send;
 694        }
 695
 696        if (cmd->state == NVMET_TCP_SEND_DDGST) {
 697                ret = nvmet_try_send_ddgst(cmd, last_in_batch);
 698                if (ret <= 0)
 699                        goto done_send;
 700        }
 701
 702        if (cmd->state == NVMET_TCP_SEND_R2T) {
 703                ret = nvmet_try_send_r2t(cmd, last_in_batch);
 704                if (ret <= 0)
 705                        goto done_send;
 706        }
 707
 708        if (cmd->state == NVMET_TCP_SEND_RESPONSE)
 709                ret = nvmet_try_send_response(cmd, last_in_batch);
 710
 711done_send:
 712        if (ret < 0) {
 713                if (ret == -EAGAIN)
 714                        return 0;
 715                return ret;
 716        }
 717
 718        return 1;
 719}
 720
 721static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
 722                int budget, int *sends)
 723{
 724        int i, ret = 0;
 725
 726        for (i = 0; i < budget; i++) {
 727                ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
 728                if (unlikely(ret < 0)) {
 729                        nvmet_tcp_socket_error(queue, ret);
 730                        goto done;
 731                } else if (ret == 0) {
 732                        break;
 733                }
 734                (*sends)++;
 735        }
 736done:
 737        return ret;
 738}
 739
 740static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue)
 741{
 742        queue->offset = 0;
 743        queue->left = sizeof(struct nvme_tcp_hdr);
 744        queue->cmd = NULL;
 745        queue->rcv_state = NVMET_TCP_RECV_PDU;
 746}
 747
 748static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue)
 749{
 750        struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
 751
 752        ahash_request_free(queue->rcv_hash);
 753        ahash_request_free(queue->snd_hash);
 754        crypto_free_ahash(tfm);
 755}
 756
 757static int nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue)
 758{
 759        struct crypto_ahash *tfm;
 760
 761        tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
 762        if (IS_ERR(tfm))
 763                return PTR_ERR(tfm);
 764
 765        queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
 766        if (!queue->snd_hash)
 767                goto free_tfm;
 768        ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
 769
 770        queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
 771        if (!queue->rcv_hash)
 772                goto free_snd_hash;
 773        ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
 774
 775        return 0;
 776free_snd_hash:
 777        ahash_request_free(queue->snd_hash);
 778free_tfm:
 779        crypto_free_ahash(tfm);
 780        return -ENOMEM;
 781}
 782
 783
 784static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
 785{
 786        struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq;
 787        struct nvme_tcp_icresp_pdu *icresp = &queue->pdu.icresp;
 788        struct msghdr msg = {};
 789        struct kvec iov;
 790        int ret;
 791
 792        if (le32_to_cpu(icreq->hdr.plen) != sizeof(struct nvme_tcp_icreq_pdu)) {
 793                pr_err("bad nvme-tcp pdu length (%d)\n",
 794                        le32_to_cpu(icreq->hdr.plen));
  795                nvmet_tcp_fatal_error(queue);
                     return -EPROTO;
  796        }
 797
 798        if (icreq->pfv != NVME_TCP_PFV_1_0) {
 799                pr_err("queue %d: bad pfv %d\n", queue->idx, icreq->pfv);
 800                return -EPROTO;
 801        }
 802
 803        if (icreq->hpda != 0) {
 804                pr_err("queue %d: unsupported hpda %d\n", queue->idx,
 805                        icreq->hpda);
 806                return -EPROTO;
 807        }
 808
 809        queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE);
 810        queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE);
 811        if (queue->hdr_digest || queue->data_digest) {
 812                ret = nvmet_tcp_alloc_crypto(queue);
 813                if (ret)
 814                        return ret;
 815        }
 816
 817        memset(icresp, 0, sizeof(*icresp));
 818        icresp->hdr.type = nvme_tcp_icresp;
 819        icresp->hdr.hlen = sizeof(*icresp);
 820        icresp->hdr.pdo = 0;
 821        icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
 822        icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
  823        icresp->maxdata = cpu_to_le32(0x400000); /* 4M arbitrary limit */
 824        icresp->cpda = 0;
 825        if (queue->hdr_digest)
 826                icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
 827        if (queue->data_digest)
 828                icresp->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
 829
 830        iov.iov_base = icresp;
 831        iov.iov_len = sizeof(*icresp);
 832        ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
 833        if (ret < 0)
 834                goto free_crypto;
 835
 836        queue->state = NVMET_TCP_Q_LIVE;
 837        nvmet_prepare_receive_pdu(queue);
 838        return 0;
 839free_crypto:
 840        if (queue->hdr_digest || queue->data_digest)
 841                nvmet_tcp_free_crypto(queue);
 842        return ret;
 843}
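     /*
      * ICReq/ICResp exchange: validate the PDU length, PFV and HPDA, record
      * which digests the host requested, allocate crc32c contexts if any
      * digest is enabled, then answer with an ICResp advertising a 4MB
      * maxdata limit and move the queue to NVMET_TCP_Q_LIVE.
      */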
 844
 845static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue,
 846                struct nvmet_tcp_cmd *cmd, struct nvmet_req *req)
 847{
 848        size_t data_len = le32_to_cpu(req->cmd->common.dptr.sgl.length);
 849        int ret;
 850
 851        if (!nvme_is_write(cmd->req.cmd) ||
 852            data_len > cmd->req.port->inline_data_size) {
 853                nvmet_prepare_receive_pdu(queue);
 854                return;
 855        }
 856
 857        ret = nvmet_tcp_map_data(cmd);
 858        if (unlikely(ret)) {
 859                pr_err("queue %d: failed to map data\n", queue->idx);
 860                nvmet_tcp_fatal_error(queue);
 861                return;
 862        }
 863
 864        queue->rcv_state = NVMET_TCP_RECV_DATA;
 865        nvmet_tcp_map_pdu_iovec(cmd);
 866        cmd->flags |= NVMET_TCP_F_INIT_FAILED;
 867}
 868
 869static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue)
 870{
 871        struct nvme_tcp_data_pdu *data = &queue->pdu.data;
 872        struct nvmet_tcp_cmd *cmd;
 873
 874        if (likely(queue->nr_cmds))
 875                cmd = &queue->cmds[data->ttag];
 876        else
 877                cmd = &queue->connect;
 878
 879        if (le32_to_cpu(data->data_offset) != cmd->rbytes_done) {
 880                pr_err("ttag %u unexpected data offset %u (expected %u)\n",
 881                        data->ttag, le32_to_cpu(data->data_offset),
 882                        cmd->rbytes_done);
 883                /* FIXME: use path and transport errors */
 884                nvmet_req_complete(&cmd->req,
 885                        NVME_SC_INVALID_FIELD | NVME_SC_DNR);
 886                return -EPROTO;
 887        }
 888
 889        cmd->pdu_len = le32_to_cpu(data->data_length);
 890        cmd->pdu_recv = 0;
 891        nvmet_tcp_map_pdu_iovec(cmd);
 892        queue->cmd = cmd;
 893        queue->rcv_state = NVMET_TCP_RECV_DATA;
 894
 895        return 0;
 896}
 897
 898static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
 899{
 900        struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
 901        struct nvme_command *nvme_cmd = &queue->pdu.cmd.cmd;
 902        struct nvmet_req *req;
 903        int ret;
 904
 905        if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
 906                if (hdr->type != nvme_tcp_icreq) {
 907                        pr_err("unexpected pdu type (%d) before icreq\n",
 908                                hdr->type);
 909                        nvmet_tcp_fatal_error(queue);
 910                        return -EPROTO;
 911                }
 912                return nvmet_tcp_handle_icreq(queue);
 913        }
 914
 915        if (hdr->type == nvme_tcp_h2c_data) {
 916                ret = nvmet_tcp_handle_h2c_data_pdu(queue);
 917                if (unlikely(ret))
 918                        return ret;
 919                return 0;
 920        }
 921
 922        queue->cmd = nvmet_tcp_get_cmd(queue);
 923        if (unlikely(!queue->cmd)) {
 924                /* This should never happen */
  925                pr_err("queue %d: out of commands (%d) send_list_len: %d, opcode: %d\n",
 926                        queue->idx, queue->nr_cmds, queue->send_list_len,
 927                        nvme_cmd->common.opcode);
 928                nvmet_tcp_fatal_error(queue);
 929                return -ENOMEM;
 930        }
 931
 932        req = &queue->cmd->req;
 933        memcpy(req->cmd, nvme_cmd, sizeof(*nvme_cmd));
 934
 935        if (unlikely(!nvmet_req_init(req, &queue->nvme_cq,
 936                        &queue->nvme_sq, &nvmet_tcp_ops))) {
 937                pr_err("failed cmd %p id %d opcode %d, data_len: %d\n",
 938                        req->cmd, req->cmd->common.command_id,
 939                        req->cmd->common.opcode,
 940                        le32_to_cpu(req->cmd->common.dptr.sgl.length));
 941
 942                nvmet_tcp_handle_req_failure(queue, queue->cmd, req);
 943                return -EAGAIN;
 944        }
 945
 946        ret = nvmet_tcp_map_data(queue->cmd);
 947        if (unlikely(ret)) {
 948                pr_err("queue %d: failed to map data\n", queue->idx);
 949                if (nvmet_tcp_has_inline_data(queue->cmd))
 950                        nvmet_tcp_fatal_error(queue);
 951                else
 952                        nvmet_req_complete(req, ret);
 953                ret = -EAGAIN;
 954                goto out;
 955        }
 956
 957        if (nvmet_tcp_need_data_in(queue->cmd)) {
 958                if (nvmet_tcp_has_inline_data(queue->cmd)) {
 959                        queue->rcv_state = NVMET_TCP_RECV_DATA;
 960                        nvmet_tcp_map_pdu_iovec(queue->cmd);
 961                        return 0;
 962                }
 963                /* send back R2T */
 964                nvmet_tcp_queue_response(&queue->cmd->req);
 965                goto out;
 966        }
 967
 968        queue->cmd->req.execute(&queue->cmd->req);
 969out:
 970        nvmet_prepare_receive_pdu(queue);
 971        return ret;
 972}
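     /*
      * PDU dispatch: before the queue is LIVE only an ICReq is accepted, and
      * an H2C DATA PDU continues a write command that is already in flight.
      * Anything else is a command capsule: a free command is taken, the
      * capsule copied and initialized via nvmet_req_init(), and the target
      * then either receives the in-capsule data, queues an R2T for the
      * remaining write data, or executes the request right away.
      */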
 973
 974static const u8 nvme_tcp_pdu_sizes[] = {
 975        [nvme_tcp_icreq]        = sizeof(struct nvme_tcp_icreq_pdu),
 976        [nvme_tcp_cmd]          = sizeof(struct nvme_tcp_cmd_pdu),
 977        [nvme_tcp_h2c_data]     = sizeof(struct nvme_tcp_data_pdu),
 978};
 979
 980static inline u8 nvmet_tcp_pdu_size(u8 type)
 981{
 982        size_t idx = type;
 983
 984        return (idx < ARRAY_SIZE(nvme_tcp_pdu_sizes) &&
 985                nvme_tcp_pdu_sizes[idx]) ?
 986                        nvme_tcp_pdu_sizes[idx] : 0;
 987}
 988
 989static inline bool nvmet_tcp_pdu_valid(u8 type)
 990{
 991        switch (type) {
 992        case nvme_tcp_icreq:
 993        case nvme_tcp_cmd:
 994        case nvme_tcp_h2c_data:
 995                /* fallthru */
 996                return true;
 997        }
 998
 999        return false;
1000}
1001
1002static int nvmet_tcp_try_recv_pdu(struct nvmet_tcp_queue *queue)
1003{
1004        struct nvme_tcp_hdr *hdr = &queue->pdu.cmd.hdr;
1005        int len;
1006        struct kvec iov;
1007        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1008
1009recv:
1010        iov.iov_base = (void *)&queue->pdu + queue->offset;
1011        iov.iov_len = queue->left;
1012        len = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1013                        iov.iov_len, msg.msg_flags);
1014        if (unlikely(len < 0))
1015                return len;
1016
1017        queue->offset += len;
1018        queue->left -= len;
1019        if (queue->left)
1020                return -EAGAIN;
1021
1022        if (queue->offset == sizeof(struct nvme_tcp_hdr)) {
1023                u8 hdgst = nvmet_tcp_hdgst_len(queue);
1024
1025                if (unlikely(!nvmet_tcp_pdu_valid(hdr->type))) {
1026                        pr_err("unexpected pdu type %d\n", hdr->type);
1027                        nvmet_tcp_fatal_error(queue);
1028                        return -EIO;
1029                }
1030
1031                if (unlikely(hdr->hlen != nvmet_tcp_pdu_size(hdr->type))) {
1032                        pr_err("pdu %d bad hlen %d\n", hdr->type, hdr->hlen);
1033                        return -EIO;
1034                }
1035
1036                queue->left = hdr->hlen - queue->offset + hdgst;
1037                goto recv;
1038        }
1039
1040        if (queue->hdr_digest &&
 1041            nvmet_tcp_verify_hdgst(queue, &queue->pdu, hdr->hlen)) {
1042                nvmet_tcp_fatal_error(queue); /* fatal */
1043                return -EPROTO;
1044        }
1045
1046        if (queue->data_digest &&
1047            nvmet_tcp_check_ddgst(queue, &queue->pdu)) {
1048                nvmet_tcp_fatal_error(queue); /* fatal */
1049                return -EPROTO;
1050        }
1051
1052        return nvmet_tcp_done_recv_pdu(queue);
1053}
1054
1055static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd)
1056{
1057        struct nvmet_tcp_queue *queue = cmd->queue;
1058
1059        nvmet_tcp_ddgst(queue->rcv_hash, cmd);
1060        queue->offset = 0;
1061        queue->left = NVME_TCP_DIGEST_LENGTH;
1062        queue->rcv_state = NVMET_TCP_RECV_DDGST;
1063}
1064
1065static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
1066{
1067        struct nvmet_tcp_cmd  *cmd = queue->cmd;
1068        int ret;
1069
1070        while (msg_data_left(&cmd->recv_msg)) {
1071                ret = sock_recvmsg(cmd->queue->sock, &cmd->recv_msg,
1072                        cmd->recv_msg.msg_flags);
1073                if (ret <= 0)
1074                        return ret;
1075
1076                cmd->pdu_recv += ret;
1077                cmd->rbytes_done += ret;
1078        }
1079
1080        nvmet_tcp_unmap_pdu_iovec(cmd);
1081
1082        if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1083            cmd->rbytes_done == cmd->req.transfer_len) {
1084                if (queue->data_digest) {
1085                        nvmet_tcp_prep_recv_ddgst(cmd);
1086                        return 0;
1087                }
1088                cmd->req.execute(&cmd->req);
1089        }
1090
1091        nvmet_prepare_receive_pdu(queue);
1092        return 0;
1093}
1094
1095static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
1096{
1097        struct nvmet_tcp_cmd *cmd = queue->cmd;
1098        int ret;
1099        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1100        struct kvec iov = {
1101                .iov_base = (void *)&cmd->recv_ddgst + queue->offset,
1102                .iov_len = queue->left
1103        };
1104
1105        ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1106                        iov.iov_len, msg.msg_flags);
1107        if (unlikely(ret < 0))
1108                return ret;
1109
1110        queue->offset += ret;
1111        queue->left -= ret;
1112        if (queue->left)
1113                return -EAGAIN;
1114
1115        if (queue->data_digest && cmd->exp_ddgst != cmd->recv_ddgst) {
1116                pr_err("queue %d: cmd %d pdu (%d) data digest error: recv %#x expected %#x\n",
1117                        queue->idx, cmd->req.cmd->common.command_id,
1118                        queue->pdu.cmd.hdr.type, le32_to_cpu(cmd->recv_ddgst),
1119                        le32_to_cpu(cmd->exp_ddgst));
1120                nvmet_tcp_finish_cmd(cmd);
1121                nvmet_tcp_fatal_error(queue);
1122                ret = -EPROTO;
1123                goto out;
1124        }
1125
1126        if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
1127            cmd->rbytes_done == cmd->req.transfer_len)
1128                cmd->req.execute(&cmd->req);
1129        ret = 0;
1130out:
1131        nvmet_prepare_receive_pdu(queue);
1132        return ret;
1133}
1134
1135static int nvmet_tcp_try_recv_one(struct nvmet_tcp_queue *queue)
1136{
1137        int result = 0;
1138
1139        if (unlikely(queue->rcv_state == NVMET_TCP_RECV_ERR))
1140                return 0;
1141
1142        if (queue->rcv_state == NVMET_TCP_RECV_PDU) {
1143                result = nvmet_tcp_try_recv_pdu(queue);
1144                if (result != 0)
1145                        goto done_recv;
1146        }
1147
1148        if (queue->rcv_state == NVMET_TCP_RECV_DATA) {
1149                result = nvmet_tcp_try_recv_data(queue);
1150                if (result != 0)
1151                        goto done_recv;
1152        }
1153
1154        if (queue->rcv_state == NVMET_TCP_RECV_DDGST) {
1155                result = nvmet_tcp_try_recv_ddgst(queue);
1156                if (result != 0)
1157                        goto done_recv;
1158        }
1159
1160done_recv:
1161        if (result < 0) {
1162                if (result == -EAGAIN)
1163                        return 0;
1164                return result;
1165        }
1166        return 1;
1167}
1168
1169static int nvmet_tcp_try_recv(struct nvmet_tcp_queue *queue,
1170                int budget, int *recvs)
1171{
1172        int i, ret = 0;
1173
1174        for (i = 0; i < budget; i++) {
1175                ret = nvmet_tcp_try_recv_one(queue);
1176                if (unlikely(ret < 0)) {
1177                        nvmet_tcp_socket_error(queue, ret);
1178                        goto done;
1179                } else if (ret == 0) {
1180                        break;
1181                }
1182                (*recvs)++;
1183        }
1184done:
1185        return ret;
1186}
1187
1188static void nvmet_tcp_schedule_release_queue(struct nvmet_tcp_queue *queue)
1189{
1190        spin_lock(&queue->state_lock);
1191        if (queue->state != NVMET_TCP_Q_DISCONNECTING) {
1192                queue->state = NVMET_TCP_Q_DISCONNECTING;
1193                schedule_work(&queue->release_work);
1194        }
1195        spin_unlock(&queue->state_lock);
1196}
1197
1198static void nvmet_tcp_io_work(struct work_struct *w)
1199{
1200        struct nvmet_tcp_queue *queue =
1201                container_of(w, struct nvmet_tcp_queue, io_work);
1202        bool pending;
1203        int ret, ops = 0;
1204
1205        do {
1206                pending = false;
1207
1208                ret = nvmet_tcp_try_recv(queue, NVMET_TCP_RECV_BUDGET, &ops);
1209                if (ret > 0)
1210                        pending = true;
1211                else if (ret < 0)
1212                        return;
1213
1214                ret = nvmet_tcp_try_send(queue, NVMET_TCP_SEND_BUDGET, &ops);
1215                if (ret > 0)
1216                        pending = true;
1217                else if (ret < 0)
1218                        return;
1219
1220        } while (pending && ops < NVMET_TCP_IO_WORK_BUDGET);
1221
1222        /*
 1223         * We exhausted our budget, requeue ourselves
1224         */
1225        if (pending)
1226                queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1227}
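     /*
      * io_work alternates bounded receive and send passes and requeues
      * itself on the same CPU when the budget runs out with work still
      * pending, so a busy queue cannot monopolize the workqueue and all
      * processing for a queue stays on the CPU chosen at accept time.
      */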
1228
1229static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
1230                struct nvmet_tcp_cmd *c)
1231{
1232        u8 hdgst = nvmet_tcp_hdgst_len(queue);
1233
1234        c->queue = queue;
1235        c->req.port = queue->port->nport;
1236
1237        c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
1238                        sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1239        if (!c->cmd_pdu)
1240                return -ENOMEM;
1241        c->req.cmd = &c->cmd_pdu->cmd;
1242
1243        c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
1244                        sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1245        if (!c->rsp_pdu)
1246                goto out_free_cmd;
1247        c->req.cqe = &c->rsp_pdu->cqe;
1248
1249        c->data_pdu = page_frag_alloc(&queue->pf_cache,
1250                        sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1251        if (!c->data_pdu)
1252                goto out_free_rsp;
1253
1254        c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
1255                        sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
1256        if (!c->r2t_pdu)
1257                goto out_free_data;
1258
1259        c->recv_msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
1260
1261        list_add_tail(&c->entry, &queue->free_list);
1262
1263        return 0;
1264out_free_data:
1265        page_frag_free(c->data_pdu);
1266out_free_rsp:
1267        page_frag_free(c->rsp_pdu);
1268out_free_cmd:
1269        page_frag_free(c->cmd_pdu);
1270        return -ENOMEM;
1271}
1272
1273static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c)
1274{
1275        page_frag_free(c->r2t_pdu);
1276        page_frag_free(c->data_pdu);
1277        page_frag_free(c->rsp_pdu);
1278        page_frag_free(c->cmd_pdu);
1279}
1280
1281static int nvmet_tcp_alloc_cmds(struct nvmet_tcp_queue *queue)
1282{
1283        struct nvmet_tcp_cmd *cmds;
1284        int i, ret = -EINVAL, nr_cmds = queue->nr_cmds;
1285
1286        cmds = kcalloc(nr_cmds, sizeof(struct nvmet_tcp_cmd), GFP_KERNEL);
1287        if (!cmds)
1288                goto out;
1289
1290        for (i = 0; i < nr_cmds; i++) {
1291                ret = nvmet_tcp_alloc_cmd(queue, cmds + i);
1292                if (ret)
1293                        goto out_free;
1294        }
1295
1296        queue->cmds = cmds;
1297
1298        return 0;
1299out_free:
1300        while (--i >= 0)
1301                nvmet_tcp_free_cmd(cmds + i);
1302        kfree(cmds);
1303out:
1304        return ret;
1305}
1306
1307static void nvmet_tcp_free_cmds(struct nvmet_tcp_queue *queue)
1308{
1309        struct nvmet_tcp_cmd *cmds = queue->cmds;
1310        int i;
1311
1312        for (i = 0; i < queue->nr_cmds; i++)
1313                nvmet_tcp_free_cmd(cmds + i);
1314
1315        nvmet_tcp_free_cmd(&queue->connect);
1316        kfree(cmds);
1317}
1318
1319static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue)
1320{
1321        struct socket *sock = queue->sock;
1322
1323        write_lock_bh(&sock->sk->sk_callback_lock);
1324        sock->sk->sk_data_ready =  queue->data_ready;
1325        sock->sk->sk_state_change = queue->state_change;
1326        sock->sk->sk_write_space = queue->write_space;
1327        sock->sk->sk_user_data = NULL;
1328        write_unlock_bh(&sock->sk->sk_callback_lock);
1329}
1330
1331static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd)
1332{
1333        nvmet_req_uninit(&cmd->req);
1334        nvmet_tcp_unmap_pdu_iovec(cmd);
1335        kfree(cmd->iov);
1336        sgl_free(cmd->req.sg);
1337}
1338
1339static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue)
1340{
1341        struct nvmet_tcp_cmd *cmd = queue->cmds;
1342        int i;
1343
1344        for (i = 0; i < queue->nr_cmds; i++, cmd++) {
1345                if (nvmet_tcp_need_data_in(cmd))
1346                        nvmet_tcp_finish_cmd(cmd);
1347        }
1348
1349        if (!queue->nr_cmds && nvmet_tcp_need_data_in(&queue->connect)) {
1350                /* failed in connect */
1351                nvmet_tcp_finish_cmd(&queue->connect);
1352        }
1353}
1354
1355static void nvmet_tcp_release_queue_work(struct work_struct *w)
1356{
1357        struct nvmet_tcp_queue *queue =
1358                container_of(w, struct nvmet_tcp_queue, release_work);
1359
1360        mutex_lock(&nvmet_tcp_queue_mutex);
1361        list_del_init(&queue->queue_list);
1362        mutex_unlock(&nvmet_tcp_queue_mutex);
1363
1364        nvmet_tcp_restore_socket_callbacks(queue);
1365        flush_work(&queue->io_work);
1366
1367        nvmet_tcp_uninit_data_in_cmds(queue);
1368        nvmet_sq_destroy(&queue->nvme_sq);
1369        cancel_work_sync(&queue->io_work);
1370        sock_release(queue->sock);
1371        nvmet_tcp_free_cmds(queue);
1372        if (queue->hdr_digest || queue->data_digest)
1373                nvmet_tcp_free_crypto(queue);
1374        ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1375
1376        kfree(queue);
1377}
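     /*
      * Teardown order: restoring the socket callbacks first ensures no new
      * io_work is scheduled from the socket, flush_work() drains a run
      * already in flight, commands still waiting for host data are released,
      * and nvmet_sq_destroy() waits for in-flight requests before the socket
      * and the command pool are freed.
      */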
1378
1379static void nvmet_tcp_data_ready(struct sock *sk)
1380{
1381        struct nvmet_tcp_queue *queue;
1382
1383        read_lock_bh(&sk->sk_callback_lock);
1384        queue = sk->sk_user_data;
1385        if (likely(queue))
1386                queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1387        read_unlock_bh(&sk->sk_callback_lock);
1388}
1389
1390static void nvmet_tcp_write_space(struct sock *sk)
1391{
1392        struct nvmet_tcp_queue *queue;
1393
1394        read_lock_bh(&sk->sk_callback_lock);
1395        queue = sk->sk_user_data;
1396        if (unlikely(!queue))
1397                goto out;
1398
1399        if (unlikely(queue->state == NVMET_TCP_Q_CONNECTING)) {
1400                queue->write_space(sk);
1401                goto out;
1402        }
1403
1404        if (sk_stream_is_writeable(sk)) {
1405                clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1406                queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1407        }
1408out:
1409        read_unlock_bh(&sk->sk_callback_lock);
1410}
1411
1412static void nvmet_tcp_state_change(struct sock *sk)
1413{
1414        struct nvmet_tcp_queue *queue;
1415
1416        write_lock_bh(&sk->sk_callback_lock);
1417        queue = sk->sk_user_data;
1418        if (!queue)
1419                goto done;
1420
1421        switch (sk->sk_state) {
1422        case TCP_FIN_WAIT1:
1423        case TCP_CLOSE_WAIT:
1424        case TCP_CLOSE:
1425                /* FALLTHRU */
1426                sk->sk_user_data = NULL;
1427                nvmet_tcp_schedule_release_queue(queue);
1428                break;
1429        default:
1430                pr_warn("queue %d unhandled state %d\n",
1431                        queue->idx, sk->sk_state);
1432        }
1433done:
1434        write_unlock_bh(&sk->sk_callback_lock);
1435}
1436
1437static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue)
1438{
1439        struct socket *sock = queue->sock;
1440        struct inet_sock *inet = inet_sk(sock->sk);
1441        int ret;
1442
1443        ret = kernel_getsockname(sock,
1444                (struct sockaddr *)&queue->sockaddr);
1445        if (ret < 0)
1446                return ret;
1447
1448        ret = kernel_getpeername(sock,
1449                (struct sockaddr *)&queue->sockaddr_peer);
1450        if (ret < 0)
1451                return ret;
1452
1453        /*
1454         * Cleanup whatever is sitting in the TCP transmit queue on socket
1455         * close. This is done to prevent stale data from being sent should
1456         * the network connection be restored before TCP times out.
1457         */
1458        sock_no_linger(sock->sk);
1459
1460        if (so_priority > 0)
1461                sock_set_priority(sock->sk, so_priority);
1462
1463        /* Set socket type of service */
1464        if (inet->rcv_tos > 0)
1465                ip_sock_set_tos(sock->sk, inet->rcv_tos);
1466
1467        write_lock_bh(&sock->sk->sk_callback_lock);
1468        sock->sk->sk_user_data = queue;
1469        queue->data_ready = sock->sk->sk_data_ready;
1470        sock->sk->sk_data_ready = nvmet_tcp_data_ready;
1471        queue->state_change = sock->sk->sk_state_change;
1472        sock->sk->sk_state_change = nvmet_tcp_state_change;
1473        queue->write_space = sock->sk->sk_write_space;
1474        sock->sk->sk_write_space = nvmet_tcp_write_space;
1475        write_unlock_bh(&sock->sk->sk_callback_lock);
1476
1477        return 0;
1478}
1479
1480static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
1481                struct socket *newsock)
1482{
1483        struct nvmet_tcp_queue *queue;
1484        int ret;
1485
1486        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1487        if (!queue)
1488                return -ENOMEM;
1489
1490        INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work);
1491        INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
1492        queue->sock = newsock;
1493        queue->port = port;
1494        queue->nr_cmds = 0;
1495        spin_lock_init(&queue->state_lock);
1496        queue->state = NVMET_TCP_Q_CONNECTING;
1497        INIT_LIST_HEAD(&queue->free_list);
1498        init_llist_head(&queue->resp_list);
1499        INIT_LIST_HEAD(&queue->resp_send_list);
1500
1501        queue->idx = ida_simple_get(&nvmet_tcp_queue_ida, 0, 0, GFP_KERNEL);
1502        if (queue->idx < 0) {
1503                ret = queue->idx;
1504                goto out_free_queue;
1505        }
1506
1507        ret = nvmet_tcp_alloc_cmd(queue, &queue->connect);
1508        if (ret)
1509                goto out_ida_remove;
1510
1511        ret = nvmet_sq_init(&queue->nvme_sq);
1512        if (ret)
1513                goto out_free_connect;
1514
1515        port->last_cpu = cpumask_next_wrap(port->last_cpu,
1516                                cpu_online_mask, -1, false);
1517        queue->cpu = port->last_cpu;
1518        nvmet_prepare_receive_pdu(queue);
1519
1520        mutex_lock(&nvmet_tcp_queue_mutex);
1521        list_add_tail(&queue->queue_list, &nvmet_tcp_queue_list);
1522        mutex_unlock(&nvmet_tcp_queue_mutex);
1523
1524        ret = nvmet_tcp_set_queue_sock(queue);
1525        if (ret)
1526                goto out_destroy_sq;
1527
1528        queue_work_on(queue->cpu, nvmet_tcp_wq, &queue->io_work);
1529
1530        return 0;
1531out_destroy_sq:
1532        mutex_lock(&nvmet_tcp_queue_mutex);
1533        list_del_init(&queue->queue_list);
1534        mutex_unlock(&nvmet_tcp_queue_mutex);
1535        nvmet_sq_destroy(&queue->nvme_sq);
1536out_free_connect:
1537        nvmet_tcp_free_cmd(&queue->connect);
1538out_ida_remove:
1539        ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
1540out_free_queue:
1541        kfree(queue);
1542        return ret;
1543}
1544
1545static void nvmet_tcp_accept_work(struct work_struct *w)
1546{
1547        struct nvmet_tcp_port *port =
1548                container_of(w, struct nvmet_tcp_port, accept_work);
1549        struct socket *newsock;
1550        int ret;
1551
1552        while (true) {
1553                ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
1554                if (ret < 0) {
1555                        if (ret != -EAGAIN)
1556                                pr_warn("failed to accept err=%d\n", ret);
1557                        return;
1558                }
1559                ret = nvmet_tcp_alloc_queue(port, newsock);
1560                if (ret) {
1561                        pr_err("failed to allocate queue\n");
1562                        sock_release(newsock);
1563                }
1564        }
1565}
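     /*
      * accept_work drains every pending connection each time the listening
      * socket signals data_ready: kernel_accept() is called with O_NONBLOCK
      * until it returns -EAGAIN, and each accepted socket gets its own
      * nvmet_tcp_queue.
      */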
1566
1567static void nvmet_tcp_listen_data_ready(struct sock *sk)
1568{
1569        struct nvmet_tcp_port *port;
1570
1571        read_lock_bh(&sk->sk_callback_lock);
1572        port = sk->sk_user_data;
1573        if (!port)
1574                goto out;
1575
1576        if (sk->sk_state == TCP_LISTEN)
1577                schedule_work(&port->accept_work);
1578out:
1579        read_unlock_bh(&sk->sk_callback_lock);
1580}
1581
1582static int nvmet_tcp_add_port(struct nvmet_port *nport)
1583{
1584        struct nvmet_tcp_port *port;
1585        __kernel_sa_family_t af;
1586        int ret;
1587
1588        port = kzalloc(sizeof(*port), GFP_KERNEL);
1589        if (!port)
1590                return -ENOMEM;
1591
1592        switch (nport->disc_addr.adrfam) {
1593        case NVMF_ADDR_FAMILY_IP4:
1594                af = AF_INET;
1595                break;
1596        case NVMF_ADDR_FAMILY_IP6:
1597                af = AF_INET6;
1598                break;
1599        default:
1600                pr_err("address family %d not supported\n",
1601                                nport->disc_addr.adrfam);
1602                ret = -EINVAL;
1603                goto err_port;
1604        }
1605
1606        ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
1607                        nport->disc_addr.trsvcid, &port->addr);
1608        if (ret) {
1609                pr_err("malformed ip/port passed: %s:%s\n",
1610                        nport->disc_addr.traddr, nport->disc_addr.trsvcid);
1611                goto err_port;
1612        }
1613
1614        port->nport = nport;
1615        port->last_cpu = -1;
1616        INIT_WORK(&port->accept_work, nvmet_tcp_accept_work);
1617        if (port->nport->inline_data_size < 0)
1618                port->nport->inline_data_size = NVMET_TCP_DEF_INLINE_DATA_SIZE;
1619
1620        ret = sock_create(port->addr.ss_family, SOCK_STREAM,
1621                                IPPROTO_TCP, &port->sock);
1622        if (ret) {
1623                pr_err("failed to create a socket\n");
1624                goto err_port;
1625        }
1626
1627        port->sock->sk->sk_user_data = port;
1628        port->data_ready = port->sock->sk->sk_data_ready;
1629        port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready;
1630        sock_set_reuseaddr(port->sock->sk);
1631        tcp_sock_set_nodelay(port->sock->sk);
1632        if (so_priority > 0)
1633                sock_set_priority(port->sock->sk, so_priority);
1634
1635        ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
1636                        sizeof(port->addr));
1637        if (ret) {
1638                pr_err("failed to bind port socket %d\n", ret);
1639                goto err_sock;
1640        }
1641
1642        ret = kernel_listen(port->sock, 128);
1643        if (ret) {
1644                pr_err("failed to listen %d on port sock\n", ret);
1645                goto err_sock;
1646        }
1647
1648        nport->priv = port;
1649        pr_info("enabling port %d (%pISpc)\n",
1650                le16_to_cpu(nport->disc_addr.portid), &port->addr);
1651
1652        return 0;
1653
1654err_sock:
1655        sock_release(port->sock);
1656err_port:
1657        kfree(port);
1658        return ret;
1659}
1660
1661static void nvmet_tcp_remove_port(struct nvmet_port *nport)
1662{
1663        struct nvmet_tcp_port *port = nport->priv;
1664
1665        write_lock_bh(&port->sock->sk->sk_callback_lock);
1666        port->sock->sk->sk_data_ready = port->data_ready;
1667        port->sock->sk->sk_user_data = NULL;
1668        write_unlock_bh(&port->sock->sk->sk_callback_lock);
1669        cancel_work_sync(&port->accept_work);
1670
1671        sock_release(port->sock);
1672        kfree(port);
1673}
1674
1675static void nvmet_tcp_delete_ctrl(struct nvmet_ctrl *ctrl)
1676{
1677        struct nvmet_tcp_queue *queue;
1678
1679        mutex_lock(&nvmet_tcp_queue_mutex);
1680        list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1681                if (queue->nvme_sq.ctrl == ctrl)
1682                        kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1683        mutex_unlock(&nvmet_tcp_queue_mutex);
1684}
1685
1686static u16 nvmet_tcp_install_queue(struct nvmet_sq *sq)
1687{
1688        struct nvmet_tcp_queue *queue =
1689                container_of(sq, struct nvmet_tcp_queue, nvme_sq);
1690
1691        if (sq->qid == 0) {
1692                /* Let inflight controller teardown complete */
1693                flush_scheduled_work();
1694        }
1695
1696        queue->nr_cmds = sq->size * 2;
1697        if (nvmet_tcp_alloc_cmds(queue))
1698                return NVME_SC_INTERNAL;
1699        return 0;
1700}
1701
1702static void nvmet_tcp_disc_port_addr(struct nvmet_req *req,
1703                struct nvmet_port *nport, char *traddr)
1704{
1705        struct nvmet_tcp_port *port = nport->priv;
1706
1707        if (inet_addr_is_any((struct sockaddr *)&port->addr)) {
1708                struct nvmet_tcp_cmd *cmd =
1709                        container_of(req, struct nvmet_tcp_cmd, req);
1710                struct nvmet_tcp_queue *queue = cmd->queue;
1711
1712                sprintf(traddr, "%pISc", (struct sockaddr *)&queue->sockaddr);
1713        } else {
1714                memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
1715        }
1716}
1717
1718static const struct nvmet_fabrics_ops nvmet_tcp_ops = {
1719        .owner                  = THIS_MODULE,
1720        .type                   = NVMF_TRTYPE_TCP,
1721        .msdbd                  = 1,
1722        .add_port               = nvmet_tcp_add_port,
1723        .remove_port            = nvmet_tcp_remove_port,
1724        .queue_response         = nvmet_tcp_queue_response,
1725        .delete_ctrl            = nvmet_tcp_delete_ctrl,
1726        .install_queue          = nvmet_tcp_install_queue,
1727        .disc_traddr            = nvmet_tcp_disc_port_addr,
1728};
1729
1730static int __init nvmet_tcp_init(void)
1731{
1732        int ret;
1733
1734        nvmet_tcp_wq = alloc_workqueue("nvmet_tcp_wq", WQ_HIGHPRI, 0);
1735        if (!nvmet_tcp_wq)
1736                return -ENOMEM;
1737
1738        ret = nvmet_register_transport(&nvmet_tcp_ops);
1739        if (ret)
1740                goto err;
1741
1742        return 0;
1743err:
1744        destroy_workqueue(nvmet_tcp_wq);
1745        return ret;
1746}
1747
1748static void __exit nvmet_tcp_exit(void)
1749{
1750        struct nvmet_tcp_queue *queue;
1751
1752        nvmet_unregister_transport(&nvmet_tcp_ops);
1753
1754        flush_scheduled_work();
1755        mutex_lock(&nvmet_tcp_queue_mutex);
1756        list_for_each_entry(queue, &nvmet_tcp_queue_list, queue_list)
1757                kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1758        mutex_unlock(&nvmet_tcp_queue_mutex);
1759        flush_scheduled_work();
1760
1761        destroy_workqueue(nvmet_tcp_wq);
1762}
1763
1764module_init(nvmet_tcp_init);
1765module_exit(nvmet_tcp_exit);
1766
1767MODULE_LICENSE("GPL v2");
1768MODULE_ALIAS("nvmet-transport-3"); /* 3 == NVMF_TRTYPE_TCP */
1769