// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe over Fabrics TCP host.
 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
 */
6#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
7#include <linux/module.h>
8#include <linux/init.h>
9#include <linux/slab.h>
10#include <linux/err.h>
11#include <linux/nvme-tcp.h>
12#include <net/sock.h>
13#include <net/tcp.h>
14#include <linux/blk-mq.h>
15#include <crypto/hash.h>
16#include <net/busy_poll.h>
17
18#include "nvme.h"
19#include "fabrics.h"
20
21struct nvme_tcp_queue;

/*
 * Define the socket priority to use for connections where it is desirable
 * that the NIC consider performing optimized packet processing or filtering.
 * A non-zero value being sufficient to indicate general consideration of any
 * possible optimization. Making it a module param allows for alternative
 * values that may be unique for some NIC implementations.
 */
29static int so_priority;
30module_param(so_priority, int, 0644);
31MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
32
33enum nvme_tcp_send_state {
34 NVME_TCP_SEND_CMD_PDU = 0,
35 NVME_TCP_SEND_H2C_PDU,
36 NVME_TCP_SEND_DATA,
37 NVME_TCP_SEND_DDGST,
38};
39
40struct nvme_tcp_request {
41 struct nvme_request req;
42 void *pdu;
43 struct nvme_tcp_queue *queue;
44 u32 data_len;
45 u32 pdu_len;
46 u32 pdu_sent;
47 u16 ttag;
48 struct list_head entry;
49 struct llist_node lentry;
50 __le32 ddgst;
51
52 struct bio *curr_bio;
53 struct iov_iter iter;

	/* send state */
56 size_t offset;
57 size_t data_sent;
58 enum nvme_tcp_send_state state;
59};
60
61enum nvme_tcp_queue_flags {
62 NVME_TCP_Q_ALLOCATED = 0,
63 NVME_TCP_Q_LIVE = 1,
64 NVME_TCP_Q_POLLING = 2,
65};
66
67enum nvme_tcp_recv_state {
68 NVME_TCP_RECV_PDU = 0,
69 NVME_TCP_RECV_DATA,
70 NVME_TCP_RECV_DDGST,
71};
72
73struct nvme_tcp_ctrl;
74struct nvme_tcp_queue {
75 struct socket *sock;
76 struct work_struct io_work;
77 int io_cpu;
78
79 struct mutex queue_lock;
80 struct mutex send_mutex;
81 struct llist_head req_list;
82 struct list_head send_list;
83 bool more_requests;

	/* recv state */
86 void *pdu;
87 int pdu_remaining;
88 int pdu_offset;
89 size_t data_remaining;
90 size_t ddgst_remaining;
91 unsigned int nr_cqe;

	/* send state */
94 struct nvme_tcp_request *request;
95
96 int queue_size;
97 size_t cmnd_capsule_len;
98 struct nvme_tcp_ctrl *ctrl;
99 unsigned long flags;
100 bool rd_enabled;
101
102 bool hdr_digest;
103 bool data_digest;
104 struct ahash_request *rcv_hash;
105 struct ahash_request *snd_hash;
106 __le32 exp_ddgst;
107 __le32 recv_ddgst;
108
109 struct page_frag_cache pf_cache;
110
111 void (*state_change)(struct sock *);
112 void (*data_ready)(struct sock *);
113 void (*write_space)(struct sock *);
114};
115
116struct nvme_tcp_ctrl {
	/* read only in the hot path */
118 struct nvme_tcp_queue *queues;
119 struct blk_mq_tag_set tag_set;

	/* other member variables */
122 struct list_head list;
123 struct blk_mq_tag_set admin_tag_set;
124 struct sockaddr_storage addr;
125 struct sockaddr_storage src_addr;
126 struct nvme_ctrl ctrl;
127
128 struct work_struct err_work;
129 struct delayed_work connect_work;
130 struct nvme_tcp_request async_req;
131 u32 io_queues[HCTX_MAX_TYPES];
132};
133
134static LIST_HEAD(nvme_tcp_ctrl_list);
135static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
136static struct workqueue_struct *nvme_tcp_wq;
137static const struct blk_mq_ops nvme_tcp_mq_ops;
138static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
139static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
140
141static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
142{
143 return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
144}
145
146static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
147{
148 return queue - queue->ctrl->queues;
149}
150
151static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
152{
153 u32 queue_idx = nvme_tcp_queue_id(queue);
154
155 if (queue_idx == 0)
156 return queue->ctrl->admin_tag_set.tags[queue_idx];
157 return queue->ctrl->tag_set.tags[queue_idx - 1];
158}
159
160static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
161{
162 return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
163}
164
165static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
166{
167 return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
168}
169
170static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue)
171{
172 return queue->cmnd_capsule_len - sizeof(struct nvme_command);
173}
174
175static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
176{
177 return req == &req->queue->ctrl->async_req;
178}
179
180static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
181{
182 struct request *rq;
183
184 if (unlikely(nvme_tcp_async_req(req)))
185 return false;
186
187 rq = blk_mq_rq_from_pdu(req);
188
189 return rq_data_dir(rq) == WRITE && req->data_len &&
190 req->data_len <= nvme_tcp_inline_data_size(req->queue);
191}
192
193static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
194{
195 return req->iter.bvec->bv_page;
196}
197
198static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
199{
200 return req->iter.bvec->bv_offset + req->iter.iov_offset;
201}
202
203static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
204{
205 return min_t(size_t, iov_iter_single_seg_count(&req->iter),
206 req->pdu_len - req->pdu_sent);
207}
208
209static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
210{
211 return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
212 req->pdu_len - req->pdu_sent : 0;
213}
214
215static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
216 int len)
217{
218 return nvme_tcp_pdu_data_left(req) <= len;
219}
220
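/*
 * Initialize req->iter as a bvec iterator over the request payload: the
 * special payload vector for RQF_SPECIAL_PAYLOAD requests, otherwise the
 * bvecs of the current bio, resuming at bi_bvec_done for partial progress.
 */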
221static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
222 unsigned int dir)
223{
224 struct request *rq = blk_mq_rq_from_pdu(req);
225 struct bio_vec *vec;
226 unsigned int size;
227 int nr_bvec;
228 size_t offset;
229
230 if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
231 vec = &rq->special_vec;
232 nr_bvec = 1;
233 size = blk_rq_payload_bytes(rq);
234 offset = 0;
235 } else {
236 struct bio *bio = req->curr_bio;
237 struct bvec_iter bi;
238 struct bio_vec bv;
239
240 vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
241 nr_bvec = 0;
242 bio_for_each_bvec(bv, bio, bi) {
243 nr_bvec++;
244 }
245 size = bio->bi_iter.bi_size;
246 offset = bio->bi_iter.bi_bvec_done;
247 }
248
249 iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size);
250 req->iter.iov_offset = offset;
251}
252
253static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
254 int len)
255{
256 req->data_sent += len;
257 req->pdu_sent += len;
258 iov_iter_advance(&req->iter, len);
259 if (!iov_iter_count(&req->iter) &&
260 req->data_sent < req->data_len) {
261 req->curr_bio = req->curr_bio->bi_next;
262 nvme_tcp_init_iter(req, WRITE);
263 }
264}
265
266static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
267{
268 int ret;

	/* drain the send list as much as we can */
271 do {
272 ret = nvme_tcp_try_send(queue);
273 } while (ret > 0);
274}
275
276static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
277 bool sync, bool last)
278{
279 struct nvme_tcp_queue *queue = req->queue;
280 bool empty;
281
282 empty = llist_add(&req->lentry, &queue->req_list) &&
283 list_empty(&queue->send_list) && !queue->request;

	/*
	 * if we're the first on the send_list and we can try to send
	 * directly, otherwise queue io_work. Also, only do that if we
	 * are on the same cpu, so we don't introduce contention.
	 */
290 if (queue->io_cpu == raw_smp_processor_id() &&
291 sync && empty && mutex_trylock(&queue->send_mutex)) {
292 queue->more_requests = !last;
293 nvme_tcp_send_all(queue);
294 queue->more_requests = false;
295 mutex_unlock(&queue->send_mutex);
296 } else if (last) {
297 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
298 }
299}
300
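/*
 * Splice everything queued on the lockless req_list onto the send_list.
 * llist_del_all() hands back the entries newest-first, and adding each one
 * to the head of send_list reverses them again into submission order.
 */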
301static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
302{
303 struct nvme_tcp_request *req;
304 struct llist_node *node;
305
306 for (node = llist_del_all(&queue->req_list); node; node = node->next) {
307 req = llist_entry(node, struct nvme_tcp_request, lentry);
308 list_add(&req->entry, &queue->send_list);
309 }
310}
311
312static inline struct nvme_tcp_request *
313nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
314{
315 struct nvme_tcp_request *req;
316
317 req = list_first_entry_or_null(&queue->send_list,
318 struct nvme_tcp_request, entry);
319 if (!req) {
320 nvme_tcp_process_req_list(queue);
321 req = list_first_entry_or_null(&queue->send_list,
322 struct nvme_tcp_request, entry);
323 if (unlikely(!req))
324 return NULL;
325 }
326
327 list_del(&req->entry);
328 return req;
329}
330
331static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
332 __le32 *dgst)
333{
334 ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0);
335 crypto_ahash_final(hash);
336}
337
338static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
339 struct page *page, off_t off, size_t len)
340{
341 struct scatterlist sg;
342
343 sg_init_marker(&sg, 1);
344 sg_set_page(&sg, page, len, off);
345 ahash_request_set_crypt(hash, &sg, NULL, len);
346 crypto_ahash_update(hash);
347}
348
349static inline void nvme_tcp_hdgst(struct ahash_request *hash,
350 void *pdu, size_t len)
351{
352 struct scatterlist sg;
353
354 sg_init_one(&sg, pdu, len);
355 ahash_request_set_crypt(hash, &sg, pdu + len, len);
356 crypto_ahash_digest(hash);
357}
358
359static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
360 void *pdu, size_t pdu_len)
361{
362 struct nvme_tcp_hdr *hdr = pdu;
363 __le32 recv_digest;
364 __le32 exp_digest;
365
366 if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
367 dev_err(queue->ctrl->ctrl.device,
368 "queue %d: header digest flag is cleared\n",
369 nvme_tcp_queue_id(queue));
370 return -EPROTO;
371 }
372
373 recv_digest = *(__le32 *)(pdu + hdr->hlen);
374 nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len);
375 exp_digest = *(__le32 *)(pdu + hdr->hlen);
376 if (recv_digest != exp_digest) {
377 dev_err(queue->ctrl->ctrl.device,
378 "header digest error: recv %#x expected %#x\n",
379 le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
380 return -EIO;
381 }
382
383 return 0;
384}
385
386static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
387{
388 struct nvme_tcp_hdr *hdr = pdu;
389 u8 digest_len = nvme_tcp_hdgst_len(queue);
390 u32 len;
391
392 len = le32_to_cpu(hdr->plen) - hdr->hlen -
393 ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
394
395 if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
396 dev_err(queue->ctrl->ctrl.device,
397 "queue %d: data digest flag is cleared\n",
398 nvme_tcp_queue_id(queue));
399 return -EPROTO;
400 }
401 crypto_ahash_init(queue->rcv_hash);
402
403 return 0;
404}
405
406static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
407 struct request *rq, unsigned int hctx_idx)
408{
409 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
410
411 page_frag_free(req->pdu);
412}
413
414static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
415 struct request *rq, unsigned int hctx_idx,
416 unsigned int numa_node)
417{
418 struct nvme_tcp_ctrl *ctrl = set->driver_data;
419 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
420 struct nvme_tcp_cmd_pdu *pdu;
421 int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
422 struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
423 u8 hdgst = nvme_tcp_hdgst_len(queue);
424
425 req->pdu = page_frag_alloc(&queue->pf_cache,
426 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
427 GFP_KERNEL | __GFP_ZERO);
428 if (!req->pdu)
429 return -ENOMEM;
430
431 pdu = req->pdu;
432 req->queue = queue;
433 nvme_req(rq)->ctrl = &ctrl->ctrl;
434 nvme_req(rq)->cmd = &pdu->cmd;
435
436 return 0;
437}
438
439static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
440 unsigned int hctx_idx)
441{
442 struct nvme_tcp_ctrl *ctrl = data;
443 struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
444
445 hctx->driver_data = queue;
446 return 0;
447}
448
449static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
450 unsigned int hctx_idx)
451{
452 struct nvme_tcp_ctrl *ctrl = data;
453 struct nvme_tcp_queue *queue = &ctrl->queues[0];
454
455 hctx->driver_data = queue;
456 return 0;
457}
458
459static enum nvme_tcp_recv_state
460nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
461{
462 return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
463 (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
464 NVME_TCP_RECV_DATA;
465}
466
467static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
468{
469 queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
470 nvme_tcp_hdgst_len(queue);
471 queue->pdu_offset = 0;
472 queue->data_remaining = -1;
473 queue->ddgst_remaining = 0;
474}
475
476static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
477{
478 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
479 return;
480
481 dev_warn(ctrl->device, "starting error recovery\n");
482 queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
483}
484
485static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
486 struct nvme_completion *cqe)
487{
488 struct request *rq;
489
490 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id);
491 if (!rq) {
492 dev_err(queue->ctrl->ctrl.device,
493 "queue %d tag 0x%x not found\n",
494 nvme_tcp_queue_id(queue), cqe->command_id);
495 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
496 return -EINVAL;
497 }
498
499 if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
500 nvme_complete_rq(rq);
501 queue->nr_cqe++;
502
503 return 0;
504}
505
506static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
507 struct nvme_tcp_data_pdu *pdu)
508{
509 struct request *rq;
510
511 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
512 if (!rq) {
513 dev_err(queue->ctrl->ctrl.device,
514 "queue %d tag %#x not found\n",
515 nvme_tcp_queue_id(queue), pdu->command_id);
516 return -ENOENT;
517 }
518
519 if (!blk_rq_payload_bytes(rq)) {
520 dev_err(queue->ctrl->ctrl.device,
521 "queue %d tag %#x unexpected data\n",
522 nvme_tcp_queue_id(queue), rq->tag);
523 return -EIO;
524 }
525
526 queue->data_remaining = le32_to_cpu(pdu->data_length);
527
528 if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
529 unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
530 dev_err(queue->ctrl->ctrl.device,
531 "queue %d tag %#x SUCCESS set but not last PDU\n",
532 nvme_tcp_queue_id(queue), rq->tag);
533 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
534 return -EPROTO;
535 }
536
537 return 0;
538}
539
540static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
541 struct nvme_tcp_rsp_pdu *pdu)
542{
543 struct nvme_completion *cqe = &pdu->cqe;
544 int ret = 0;

	/*
	 * AEN requests are special as they don't time out and can
	 * survive any kind of queue freeze and often don't respond to
	 * abort even when the controller is disconnected/reconnecting,
	 * so we should not attempt to free them on exit.
	 */
552 if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
553 cqe->command_id)))
554 nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status,
555 &cqe->result);
556 else
557 ret = nvme_tcp_process_nvme_cqe(queue, cqe);
558
559 return ret;
560}
561
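/*
 * Build the H2CData PDU header for a controller R2T, after sanity checking
 * that the solicited length and offset stay within the request's data and
 * match what has already been sent.
 */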
562static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
563 struct nvme_tcp_r2t_pdu *pdu)
564{
565 struct nvme_tcp_data_pdu *data = req->pdu;
566 struct nvme_tcp_queue *queue = req->queue;
567 struct request *rq = blk_mq_rq_from_pdu(req);
568 u8 hdgst = nvme_tcp_hdgst_len(queue);
569 u8 ddgst = nvme_tcp_ddgst_len(queue);
570
571 req->pdu_len = le32_to_cpu(pdu->r2t_length);
572 req->pdu_sent = 0;
573
574 if (unlikely(!req->pdu_len)) {
575 dev_err(queue->ctrl->ctrl.device,
576 "req %d r2t len is %u, probably a bug...\n",
577 rq->tag, req->pdu_len);
578 return -EPROTO;
579 }
580
581 if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
582 dev_err(queue->ctrl->ctrl.device,
583 "req %d r2t len %u exceeded data len %u (%zu sent)\n",
584 rq->tag, req->pdu_len, req->data_len,
585 req->data_sent);
586 return -EPROTO;
587 }
588
589 if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
590 dev_err(queue->ctrl->ctrl.device,
591 "req %d unexpected r2t offset %u (expected %zu)\n",
592 rq->tag, le32_to_cpu(pdu->r2t_offset),
593 req->data_sent);
594 return -EPROTO;
595 }
596
597 memset(data, 0, sizeof(*data));
598 data->hdr.type = nvme_tcp_h2c_data;
599 data->hdr.flags = NVME_TCP_F_DATA_LAST;
600 if (queue->hdr_digest)
601 data->hdr.flags |= NVME_TCP_F_HDGST;
602 if (queue->data_digest)
603 data->hdr.flags |= NVME_TCP_F_DDGST;
604 data->hdr.hlen = sizeof(*data);
605 data->hdr.pdo = data->hdr.hlen + hdgst;
606 data->hdr.plen =
607 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
608 data->ttag = pdu->ttag;
609 data->command_id = rq->tag;
610 data->data_offset = cpu_to_le32(req->data_sent);
611 data->data_length = cpu_to_le32(req->pdu_len);
612 return 0;
613}
614
615static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
616 struct nvme_tcp_r2t_pdu *pdu)
617{
618 struct nvme_tcp_request *req;
619 struct request *rq;
620 int ret;
621
622 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
623 if (!rq) {
624 dev_err(queue->ctrl->ctrl.device,
625 "queue %d tag %#x not found\n",
626 nvme_tcp_queue_id(queue), pdu->command_id);
627 return -ENOENT;
628 }
629 req = blk_mq_rq_to_pdu(rq);
630
631 ret = nvme_tcp_setup_h2c_data_pdu(req, pdu);
632 if (unlikely(ret))
633 return ret;
634
635 req->state = NVME_TCP_SEND_H2C_PDU;
636 req->offset = 0;
637
638 nvme_tcp_queue_request(req, false, true);
639
640 return 0;
641}
642
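/*
 * Accumulate PDU header bytes from the skb into queue->pdu; once the header
 * (and header digest, if enabled) is complete, verify it and dispatch on the
 * PDU type (C2HData, response capsule or R2T).
 */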
643static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
644 unsigned int *offset, size_t *len)
645{
646 struct nvme_tcp_hdr *hdr;
647 char *pdu = queue->pdu;
648 size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
649 int ret;
650
651 ret = skb_copy_bits(skb, *offset,
652 &pdu[queue->pdu_offset], rcv_len);
653 if (unlikely(ret))
654 return ret;
655
656 queue->pdu_remaining -= rcv_len;
657 queue->pdu_offset += rcv_len;
658 *offset += rcv_len;
659 *len -= rcv_len;
660 if (queue->pdu_remaining)
661 return 0;
662
663 hdr = queue->pdu;
664 if (queue->hdr_digest) {
665 ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen);
666 if (unlikely(ret))
667 return ret;
668 }
669
670
671 if (queue->data_digest) {
672 ret = nvme_tcp_check_ddgst(queue, queue->pdu);
673 if (unlikely(ret))
674 return ret;
675 }
676
677 switch (hdr->type) {
678 case nvme_tcp_c2h_data:
679 return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu);
680 case nvme_tcp_rsp:
681 nvme_tcp_init_recv_ctx(queue);
682 return nvme_tcp_handle_comp(queue, (void *)queue->pdu);
683 case nvme_tcp_r2t:
684 nvme_tcp_init_recv_ctx(queue);
685 return nvme_tcp_handle_r2t(queue, (void *)queue->pdu);
686 default:
687 dev_err(queue->ctrl->ctrl.device,
688 "unsupported pdu type (%d)\n", hdr->type);
689 return -EINVAL;
690 }
691}
692
693static inline void nvme_tcp_end_request(struct request *rq, u16 status)
694{
695 union nvme_result res = {};
696
697 if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res))
698 nvme_complete_rq(rq);
699}
700
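/*
 * Copy C2HData payload from the skb into the request's bio pages, advancing
 * the per-request iterator and updating the receive data digest when data
 * digest is negotiated.
 */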
701static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
702 unsigned int *offset, size_t *len)
703{
704 struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
705 struct nvme_tcp_request *req;
706 struct request *rq;
707
708 rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
709 if (!rq) {
710 dev_err(queue->ctrl->ctrl.device,
711 "queue %d tag %#x not found\n",
712 nvme_tcp_queue_id(queue), pdu->command_id);
713 return -ENOENT;
714 }
715 req = blk_mq_rq_to_pdu(rq);
716
717 while (true) {
718 int recv_len, ret;
719
720 recv_len = min_t(size_t, *len, queue->data_remaining);
721 if (!recv_len)
722 break;
723
724 if (!iov_iter_count(&req->iter)) {
725 req->curr_bio = req->curr_bio->bi_next;

			/*
			 * If we don't have any bios it means that controller
			 * sent more data than we requested, hence error
			 */
731 if (!req->curr_bio) {
732 dev_err(queue->ctrl->ctrl.device,
733 "queue %d no space in request %#x",
734 nvme_tcp_queue_id(queue), rq->tag);
735 nvme_tcp_init_recv_ctx(queue);
736 return -EIO;
737 }
738 nvme_tcp_init_iter(req, READ);
739 }

		/* we can read only from what is left in this bio */
742 recv_len = min_t(size_t, recv_len,
743 iov_iter_count(&req->iter));
744
745 if (queue->data_digest)
746 ret = skb_copy_and_hash_datagram_iter(skb, *offset,
747 &req->iter, recv_len, queue->rcv_hash);
748 else
749 ret = skb_copy_datagram_iter(skb, *offset,
750 &req->iter, recv_len);
751 if (ret) {
752 dev_err(queue->ctrl->ctrl.device,
753 "queue %d failed to copy request %#x data",
754 nvme_tcp_queue_id(queue), rq->tag);
755 return ret;
756 }
757
758 *len -= recv_len;
759 *offset += recv_len;
760 queue->data_remaining -= recv_len;
761 }
762
763 if (!queue->data_remaining) {
764 if (queue->data_digest) {
765 nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst);
766 queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
767 } else {
768 if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
769 nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
770 queue->nr_cqe++;
771 }
772 nvme_tcp_init_recv_ctx(queue);
773 }
774 }
775
776 return 0;
777}
778
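/* Receive and verify the data digest that trails a C2HData PDU. */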
779static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
780 struct sk_buff *skb, unsigned int *offset, size_t *len)
781{
782 struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
783 char *ddgst = (char *)&queue->recv_ddgst;
784 size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
785 off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
786 int ret;
787
788 ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
789 if (unlikely(ret))
790 return ret;
791
792 queue->ddgst_remaining -= recv_len;
793 *offset += recv_len;
794 *len -= recv_len;
795 if (queue->ddgst_remaining)
796 return 0;
797
798 if (queue->recv_ddgst != queue->exp_ddgst) {
799 dev_err(queue->ctrl->ctrl.device,
800 "data digest error: recv %#x expected %#x\n",
801 le32_to_cpu(queue->recv_ddgst),
802 le32_to_cpu(queue->exp_ddgst));
803 return -EIO;
804 }
805
806 if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
807 struct request *rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue),
808 pdu->command_id);
809
810 nvme_tcp_end_request(rq, NVME_SC_SUCCESS);
811 queue->nr_cqe++;
812 }
813
814 nvme_tcp_init_recv_ctx(queue);
815 return 0;
816}
817
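/*
 * read_sock() callback: consume skb bytes according to the current receive
 * state (PDU header, then data, then data digest) until the skb is drained
 * or an error triggers controller error recovery.
 */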
818static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
819 unsigned int offset, size_t len)
820{
821 struct nvme_tcp_queue *queue = desc->arg.data;
822 size_t consumed = len;
823 int result;
824
825 while (len) {
826 switch (nvme_tcp_recv_state(queue)) {
827 case NVME_TCP_RECV_PDU:
828 result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
829 break;
830 case NVME_TCP_RECV_DATA:
831 result = nvme_tcp_recv_data(queue, skb, &offset, &len);
832 break;
833 case NVME_TCP_RECV_DDGST:
834 result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
835 break;
836 default:
837 result = -EFAULT;
838 }
839 if (result) {
840 dev_err(queue->ctrl->ctrl.device,
841 "receive failed: %d\n", result);
842 queue->rd_enabled = false;
843 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
844 return result;
845 }
846 }
847
848 return consumed;
849}
850
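/*
 * Socket data_ready callback: schedule io_work on the queue's io_cpu unless
 * a polling context (NVME_TCP_Q_POLLING) is already reaping this queue.
 */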
851static void nvme_tcp_data_ready(struct sock *sk)
852{
853 struct nvme_tcp_queue *queue;
854
855 read_lock_bh(&sk->sk_callback_lock);
856 queue = sk->sk_user_data;
857 if (likely(queue && queue->rd_enabled) &&
858 !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
859 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
860 read_unlock_bh(&sk->sk_callback_lock);
861}
862
863static void nvme_tcp_write_space(struct sock *sk)
864{
865 struct nvme_tcp_queue *queue;
866
867 read_lock_bh(&sk->sk_callback_lock);
868 queue = sk->sk_user_data;
869 if (likely(queue && sk_stream_is_writeable(sk))) {
870 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
871 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
872 }
873 read_unlock_bh(&sk->sk_callback_lock);
874}
875
876static void nvme_tcp_state_change(struct sock *sk)
877{
878 struct nvme_tcp_queue *queue;
879
880 read_lock_bh(&sk->sk_callback_lock);
881 queue = sk->sk_user_data;
882 if (!queue)
883 goto done;
884
885 switch (sk->sk_state) {
886 case TCP_CLOSE:
887 case TCP_CLOSE_WAIT:
888 case TCP_LAST_ACK:
889 case TCP_FIN_WAIT1:
890 case TCP_FIN_WAIT2:
891 nvme_tcp_error_recovery(&queue->ctrl->ctrl);
892 break;
893 default:
894 dev_info(queue->ctrl->ctrl.device,
895 "queue %d socket state %d\n",
896 nvme_tcp_queue_id(queue), sk->sk_state);
897 }
898
899 queue->state_change(sk);
900done:
901 read_unlock_bh(&sk->sk_callback_lock);
902}
903
904static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
905{
906 return !list_empty(&queue->send_list) ||
907 !llist_empty(&queue->req_list) || queue->more_requests;
908}
909
910static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
911{
912 queue->request = NULL;
913}
914
915static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
916{
917 nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
918}
919
920static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
921{
922 struct nvme_tcp_queue *queue = req->queue;
923
924 while (true) {
925 struct page *page = nvme_tcp_req_cur_page(req);
926 size_t offset = nvme_tcp_req_cur_offset(req);
927 size_t len = nvme_tcp_req_cur_length(req);
928 bool last = nvme_tcp_pdu_last_send(req, len);
929 int ret, flags = MSG_DONTWAIT;
930
931 if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
932 flags |= MSG_EOR;
933 else
934 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
935
936 if (sendpage_ok(page)) {
937 ret = kernel_sendpage(queue->sock, page, offset, len,
938 flags);
939 } else {
940 ret = sock_no_sendpage(queue->sock, page, offset, len,
941 flags);
942 }
943 if (ret <= 0)
944 return ret;
945
946 if (queue->data_digest)
947 nvme_tcp_ddgst_update(queue->snd_hash, page,
948 offset, ret);

		/* fully successful last send in current PDU */
951 if (last && ret == len) {
952 if (queue->data_digest) {
953 nvme_tcp_ddgst_final(queue->snd_hash,
954 &req->ddgst);
955 req->state = NVME_TCP_SEND_DDGST;
956 req->offset = 0;
957 } else {
958 nvme_tcp_done_send_req(queue);
959 }
960 return 1;
961 }
962 nvme_tcp_advance_req(req, ret);
963 }
964 return -EAGAIN;
965}
966
967static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
968{
969 struct nvme_tcp_queue *queue = req->queue;
970 struct nvme_tcp_cmd_pdu *pdu = req->pdu;
971 bool inline_data = nvme_tcp_has_inline_data(req);
972 u8 hdgst = nvme_tcp_hdgst_len(queue);
973 int len = sizeof(*pdu) + hdgst - req->offset;
974 int flags = MSG_DONTWAIT;
975 int ret;
976
977 if (inline_data || nvme_tcp_queue_more(queue))
978 flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
979 else
980 flags |= MSG_EOR;
981
982 if (queue->hdr_digest && !req->offset)
983 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
984
985 ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
986 offset_in_page(pdu) + req->offset, len, flags);
987 if (unlikely(ret <= 0))
988 return ret;
989
990 len -= ret;
991 if (!len) {
992 if (inline_data) {
993 req->state = NVME_TCP_SEND_DATA;
994 if (queue->data_digest)
995 crypto_ahash_init(queue->snd_hash);
996 } else {
997 nvme_tcp_done_send_req(queue);
998 }
999 return 1;
1000 }
1001 req->offset += ret;
1002
1003 return -EAGAIN;
1004}
1005
1006static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
1007{
1008 struct nvme_tcp_queue *queue = req->queue;
1009 struct nvme_tcp_data_pdu *pdu = req->pdu;
1010 u8 hdgst = nvme_tcp_hdgst_len(queue);
1011 int len = sizeof(*pdu) - req->offset + hdgst;
1012 int ret;
1013
1014 if (queue->hdr_digest && !req->offset)
1015 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
1016
1017 ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
1018 offset_in_page(pdu) + req->offset, len,
1019 MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
1020 if (unlikely(ret <= 0))
1021 return ret;
1022
1023 len -= ret;
1024 if (!len) {
1025 req->state = NVME_TCP_SEND_DATA;
1026 if (queue->data_digest)
1027 crypto_ahash_init(queue->snd_hash);
1028 return 1;
1029 }
1030 req->offset += ret;
1031
1032 return -EAGAIN;
1033}
1034
1035static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
1036{
1037 struct nvme_tcp_queue *queue = req->queue;
1038 int ret;
1039 struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1040 struct kvec iov = {
1041 .iov_base = &req->ddgst + req->offset,
1042 .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
1043 };
1044
1045 if (nvme_tcp_queue_more(queue))
1046 msg.msg_flags |= MSG_MORE;
1047 else
1048 msg.msg_flags |= MSG_EOR;
1049
1050 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1051 if (unlikely(ret <= 0))
1052 return ret;
1053
1054 if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) {
1055 nvme_tcp_done_send_req(queue);
1056 return 1;
1057 }
1058
1059 req->offset += ret;
1060 return -EAGAIN;
1061}
1062
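/*
 * Drive the current request through its send state machine (command PDU,
 * optional H2CData PDU, data, data digest). Returns >0 if progress was made,
 * 0 if there is nothing to send or the socket is full, negative on failure.
 */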
1063static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
1064{
1065 struct nvme_tcp_request *req;
1066 int ret = 1;
1067
1068 if (!queue->request) {
1069 queue->request = nvme_tcp_fetch_request(queue);
1070 if (!queue->request)
1071 return 0;
1072 }
1073 req = queue->request;
1074
1075 if (req->state == NVME_TCP_SEND_CMD_PDU) {
1076 ret = nvme_tcp_try_send_cmd_pdu(req);
1077 if (ret <= 0)
1078 goto done;
1079 if (!nvme_tcp_has_inline_data(req))
1080 return ret;
1081 }
1082
1083 if (req->state == NVME_TCP_SEND_H2C_PDU) {
1084 ret = nvme_tcp_try_send_data_pdu(req);
1085 if (ret <= 0)
1086 goto done;
1087 }
1088
1089 if (req->state == NVME_TCP_SEND_DATA) {
1090 ret = nvme_tcp_try_send_data(req);
1091 if (ret <= 0)
1092 goto done;
1093 }
1094
1095 if (req->state == NVME_TCP_SEND_DDGST)
1096 ret = nvme_tcp_try_send_ddgst(req);
1097done:
1098 if (ret == -EAGAIN) {
1099 ret = 0;
1100 } else if (ret < 0) {
1101 dev_err(queue->ctrl->ctrl.device,
1102 "failed to send request %d\n", ret);
1103 if (ret != -EPIPE && ret != -ECONNRESET)
1104 nvme_tcp_fail_request(queue->request);
1105 nvme_tcp_done_send_req(queue);
1106 }
1107 return ret;
1108}
1109
1110static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1111{
1112 struct socket *sock = queue->sock;
1113 struct sock *sk = sock->sk;
1114 read_descriptor_t rd_desc;
1115 int consumed;
1116
1117 rd_desc.arg.data = queue;
1118 rd_desc.count = 1;
1119 lock_sock(sk);
1120 queue->nr_cqe = 0;
1121 consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1122 release_sock(sk);
1123 return consumed;
1124}
1125
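/*
 * Per-queue io context: alternate between sending queued requests and
 * receiving PDUs for roughly one millisecond, then requeue itself if work
 * is still pending.
 */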
1126static void nvme_tcp_io_work(struct work_struct *w)
1127{
1128 struct nvme_tcp_queue *queue =
1129 container_of(w, struct nvme_tcp_queue, io_work);
1130 unsigned long deadline = jiffies + msecs_to_jiffies(1);
1131
1132 do {
1133 bool pending = false;
1134 int result;
1135
1136 if (mutex_trylock(&queue->send_mutex)) {
1137 result = nvme_tcp_try_send(queue);
1138 mutex_unlock(&queue->send_mutex);
1139 if (result > 0)
1140 pending = true;
1141 else if (unlikely(result < 0))
1142 break;
1143 } else
1144 pending = !llist_empty(&queue->req_list);
1145
1146 result = nvme_tcp_try_recv(queue);
1147 if (result > 0)
1148 pending = true;
1149 else if (unlikely(result < 0))
1150 return;
1151
1152 if (!pending)
1153 return;
1154
1155 } while (!time_after(jiffies, deadline));
1156
1157 queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
1158}
1159
1160static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1161{
1162 struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash);
1163
1164 ahash_request_free(queue->rcv_hash);
1165 ahash_request_free(queue->snd_hash);
1166 crypto_free_ahash(tfm);
1167}
1168
1169static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1170{
1171 struct crypto_ahash *tfm;
1172
1173 tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC);
1174 if (IS_ERR(tfm))
1175 return PTR_ERR(tfm);
1176
1177 queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1178 if (!queue->snd_hash)
1179 goto free_tfm;
1180 ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL);
1181
1182 queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1183 if (!queue->rcv_hash)
1184 goto free_snd_hash;
1185 ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL);
1186
1187 return 0;
1188free_snd_hash:
1189 ahash_request_free(queue->snd_hash);
1190free_tfm:
1191 crypto_free_ahash(tfm);
1192 return -ENOMEM;
1193}
1194
1195static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1196{
1197 struct nvme_tcp_request *async = &ctrl->async_req;
1198
1199 page_frag_free(async->pdu);
1200}
1201
1202static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1203{
1204 struct nvme_tcp_queue *queue = &ctrl->queues[0];
1205 struct nvme_tcp_request *async = &ctrl->async_req;
1206 u8 hdgst = nvme_tcp_hdgst_len(queue);
1207
1208 async->pdu = page_frag_alloc(&queue->pf_cache,
1209 sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1210 GFP_KERNEL | __GFP_ZERO);
1211 if (!async->pdu)
1212 return -ENOMEM;
1213
1214 async->queue = &ctrl->queues[0];
1215 return 0;
1216}
1217
1218static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1219{
1220 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1221 struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1222
1223 if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1224 return;
1225
1226 if (queue->hdr_digest || queue->data_digest)
1227 nvme_tcp_free_crypto(queue);
1228
1229 sock_release(queue->sock);
1230 kfree(queue->pdu);
1231 mutex_destroy(&queue->queue_lock);
1232}
1233
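/*
 * NVMe/TCP connection initialization: send an ICReq PDU and validate the
 * controller's ICResp (type, length, PFV, digest negotiation and CPDA)
 * against what the host requested.
 */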
1234static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1235{
1236 struct nvme_tcp_icreq_pdu *icreq;
1237 struct nvme_tcp_icresp_pdu *icresp;
1238 struct msghdr msg = {};
1239 struct kvec iov;
1240 bool ctrl_hdgst, ctrl_ddgst;
1241 int ret;
1242
1243 icreq = kzalloc(sizeof(*icreq), GFP_KERNEL);
1244 if (!icreq)
1245 return -ENOMEM;
1246
1247 icresp = kzalloc(sizeof(*icresp), GFP_KERNEL);
1248 if (!icresp) {
1249 ret = -ENOMEM;
1250 goto free_icreq;
1251 }
1252
1253 icreq->hdr.type = nvme_tcp_icreq;
1254 icreq->hdr.hlen = sizeof(*icreq);
1255 icreq->hdr.pdo = 0;
1256 icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1257 icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1258 icreq->maxr2t = 0;
1259 icreq->hpda = 0;
1260 if (queue->hdr_digest)
1261 icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1262 if (queue->data_digest)
1263 icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1264
1265 iov.iov_base = icreq;
1266 iov.iov_len = sizeof(*icreq);
1267 ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
1268 if (ret < 0)
1269 goto free_icresp;
1270
1271 memset(&msg, 0, sizeof(msg));
1272 iov.iov_base = icresp;
1273 iov.iov_len = sizeof(*icresp);
1274 ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
1275 iov.iov_len, msg.msg_flags);
1276 if (ret < 0)
1277 goto free_icresp;
1278
1279 ret = -EINVAL;
1280 if (icresp->hdr.type != nvme_tcp_icresp) {
1281 pr_err("queue %d: bad type returned %d\n",
1282 nvme_tcp_queue_id(queue), icresp->hdr.type);
1283 goto free_icresp;
1284 }
1285
1286 if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1287 pr_err("queue %d: bad pdu length returned %d\n",
1288 nvme_tcp_queue_id(queue), icresp->hdr.plen);
1289 goto free_icresp;
1290 }
1291
1292 if (icresp->pfv != NVME_TCP_PFV_1_0) {
1293 pr_err("queue %d: bad pfv returned %d\n",
1294 nvme_tcp_queue_id(queue), icresp->pfv);
1295 goto free_icresp;
1296 }
1297
1298 ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1299 if ((queue->data_digest && !ctrl_ddgst) ||
1300 (!queue->data_digest && ctrl_ddgst)) {
1301 pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1302 nvme_tcp_queue_id(queue),
1303 queue->data_digest ? "enabled" : "disabled",
1304 ctrl_ddgst ? "enabled" : "disabled");
1305 goto free_icresp;
1306 }
1307
1308 ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1309 if ((queue->hdr_digest && !ctrl_hdgst) ||
1310 (!queue->hdr_digest && ctrl_hdgst)) {
1311 pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1312 nvme_tcp_queue_id(queue),
1313 queue->hdr_digest ? "enabled" : "disabled",
1314 ctrl_hdgst ? "enabled" : "disabled");
1315 goto free_icresp;
1316 }
1317
1318 if (icresp->cpda != 0) {
1319 pr_err("queue %d: unsupported cpda returned %d\n",
1320 nvme_tcp_queue_id(queue), icresp->cpda);
1321 goto free_icresp;
1322 }
1323
1324 ret = 0;
1325free_icresp:
1326 kfree(icresp);
1327free_icreq:
1328 kfree(icreq);
1329 return ret;
1330}
1331
1332static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
1333{
1334 return nvme_tcp_queue_id(queue) == 0;
1335}
1336
1337static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
1338{
1339 struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1340 int qid = nvme_tcp_queue_id(queue);
1341
1342 return !nvme_tcp_admin_queue(queue) &&
1343 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
1344}
1345
1346static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
1347{
1348 struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1349 int qid = nvme_tcp_queue_id(queue);
1350
1351 return !nvme_tcp_admin_queue(queue) &&
1352 !nvme_tcp_default_queue(queue) &&
1353 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1354 ctrl->io_queues[HCTX_TYPE_READ];
1355}
1356
1357static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
1358{
1359 struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1360 int qid = nvme_tcp_queue_id(queue);
1361
1362 return !nvme_tcp_admin_queue(queue) &&
1363 !nvme_tcp_default_queue(queue) &&
1364 !nvme_tcp_read_queue(queue) &&
1365 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1366 ctrl->io_queues[HCTX_TYPE_READ] +
1367 ctrl->io_queues[HCTX_TYPE_POLL];
1368}
1369
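/*
 * Pick the CPU that will run io_work for this queue, spreading default,
 * read and poll queues round-robin over the online CPUs.
 */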
1370static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
1371{
1372 struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1373 int qid = nvme_tcp_queue_id(queue);
1374 int n = 0;
1375
1376 if (nvme_tcp_default_queue(queue))
1377 n = qid - 1;
1378 else if (nvme_tcp_read_queue(queue))
1379 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
1380 else if (nvme_tcp_poll_queue(queue))
1381 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
1382 ctrl->io_queues[HCTX_TYPE_READ] - 1;
1383 queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
1384}
1385
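/*
 * Allocate and connect a single queue: create and tune the TCP socket,
 * optionally bind to a source address/interface, set up digest state and the
 * receive PDU buffer, run the ICReq/ICResp handshake and finally install the
 * socket callbacks.
 */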
1386static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
1387 int qid, size_t queue_size)
1388{
1389 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1390 struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1391 int ret, rcv_pdu_size;
1392
1393 mutex_init(&queue->queue_lock);
1394 queue->ctrl = ctrl;
1395 init_llist_head(&queue->req_list);
1396 INIT_LIST_HEAD(&queue->send_list);
1397 mutex_init(&queue->send_mutex);
1398 INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1399 queue->queue_size = queue_size;
1400
1401 if (qid > 0)
1402 queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1403 else
1404 queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1405 NVME_TCP_ADMIN_CCSZ;
1406
1407 ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM,
1408 IPPROTO_TCP, &queue->sock);
1409 if (ret) {
1410 dev_err(nctrl->device,
1411 "failed to create socket: %d\n", ret);
1412 goto err_destroy_mutex;
1413 }

	/* Single syn retry */
1416 tcp_sock_set_syncnt(queue->sock->sk, 1);

	/* Set TCP no delay */
1419 tcp_sock_set_nodelay(queue->sock->sk);

	/*
	 * Cleanup whatever is sitting in the TCP transmit queue on socket
	 * close. This is done to prevent stale data from being sent should
	 * the network connection be restored before TCP times out.
	 */
1426 sock_no_linger(queue->sock->sk);
1427
1428 if (so_priority > 0)
1429 sock_set_priority(queue->sock->sk, so_priority);

	/* Set socket type of service */
1432 if (nctrl->opts->tos >= 0)
1433 ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos);

	/* Set 10 seconds timeout for icresp recvmsg */
1436 queue->sock->sk->sk_rcvtimeo = 10 * HZ;
1437
1438 queue->sock->sk->sk_allocation = GFP_ATOMIC;
1439 nvme_tcp_set_queue_io_cpu(queue);
1440 queue->request = NULL;
1441 queue->data_remaining = 0;
1442 queue->ddgst_remaining = 0;
1443 queue->pdu_remaining = 0;
1444 queue->pdu_offset = 0;
1445 sk_set_memalloc(queue->sock->sk);
1446
1447 if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1448 ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr,
1449 sizeof(ctrl->src_addr));
1450 if (ret) {
1451 dev_err(nctrl->device,
1452 "failed to bind queue %d socket %d\n",
1453 qid, ret);
1454 goto err_sock;
1455 }
1456 }
1457
1458 if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) {
1459 char *iface = nctrl->opts->host_iface;
1460 sockptr_t optval = KERNEL_SOCKPTR(iface);
1461
1462 ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE,
1463 optval, strlen(iface));
1464 if (ret) {
1465 dev_err(nctrl->device,
1466 "failed to bind to interface %s queue %d err %d\n",
1467 iface, qid, ret);
1468 goto err_sock;
1469 }
1470 }
1471
1472 queue->hdr_digest = nctrl->opts->hdr_digest;
1473 queue->data_digest = nctrl->opts->data_digest;
1474 if (queue->hdr_digest || queue->data_digest) {
1475 ret = nvme_tcp_alloc_crypto(queue);
1476 if (ret) {
1477 dev_err(nctrl->device,
1478 "failed to allocate queue %d crypto\n", qid);
1479 goto err_sock;
1480 }
1481 }
1482
1483 rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1484 nvme_tcp_hdgst_len(queue);
1485 queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL);
1486 if (!queue->pdu) {
1487 ret = -ENOMEM;
1488 goto err_crypto;
1489 }
1490
1491 dev_dbg(nctrl->device, "connecting queue %d\n",
1492 nvme_tcp_queue_id(queue));
1493
1494 ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr,
1495 sizeof(ctrl->addr), 0);
1496 if (ret) {
1497 dev_err(nctrl->device,
1498 "failed to connect socket: %d\n", ret);
1499 goto err_rcv_pdu;
1500 }
1501
1502 ret = nvme_tcp_init_connection(queue);
1503 if (ret)
1504 goto err_init_connect;
1505
1506 queue->rd_enabled = true;
1507 set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags);
1508 nvme_tcp_init_recv_ctx(queue);
1509
1510 write_lock_bh(&queue->sock->sk->sk_callback_lock);
1511 queue->sock->sk->sk_user_data = queue;
1512 queue->state_change = queue->sock->sk->sk_state_change;
1513 queue->data_ready = queue->sock->sk->sk_data_ready;
1514 queue->write_space = queue->sock->sk->sk_write_space;
1515 queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1516 queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1517 queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1518#ifdef CONFIG_NET_RX_BUSY_POLL
1519 queue->sock->sk->sk_ll_usec = 1;
1520#endif
1521 write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1522
1523 return 0;
1524
1525err_init_connect:
1526 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1527err_rcv_pdu:
1528 kfree(queue->pdu);
1529err_crypto:
1530 if (queue->hdr_digest || queue->data_digest)
1531 nvme_tcp_free_crypto(queue);
1532err_sock:
1533 sock_release(queue->sock);
1534 queue->sock = NULL;
1535err_destroy_mutex:
1536 mutex_destroy(&queue->queue_lock);
1537 return ret;
1538}
1539
1540static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue)
1541{
1542 struct socket *sock = queue->sock;
1543
1544 write_lock_bh(&sock->sk->sk_callback_lock);
1545 sock->sk->sk_user_data = NULL;
1546 sock->sk->sk_data_ready = queue->data_ready;
1547 sock->sk->sk_state_change = queue->state_change;
1548 sock->sk->sk_write_space = queue->write_space;
1549 write_unlock_bh(&sock->sk->sk_callback_lock);
1550}
1551
1552static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1553{
1554 kernel_sock_shutdown(queue->sock, SHUT_RDWR);
1555 nvme_tcp_restore_sock_calls(queue);
1556 cancel_work_sync(&queue->io_work);
1557}
1558
1559static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1560{
1561 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1562 struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1563
1564 mutex_lock(&queue->queue_lock);
1565 if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
1566 __nvme_tcp_stop_queue(queue);
1567 mutex_unlock(&queue->queue_lock);
1568}
1569
1570static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1571{
1572 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1573 int ret;
1574
1575 if (idx)
1576 ret = nvmf_connect_io_queue(nctrl, idx);
1577 else
1578 ret = nvmf_connect_admin_queue(nctrl);
1579
1580 if (!ret) {
1581 set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags);
1582 } else {
1583 if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags))
1584 __nvme_tcp_stop_queue(&ctrl->queues[idx]);
1585 dev_err(nctrl->device,
1586 "failed to connect queue: %d ret=%d\n", idx, ret);
1587 }
1588 return ret;
1589}
1590
1591static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl,
1592 bool admin)
1593{
1594 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1595 struct blk_mq_tag_set *set;
1596 int ret;
1597
1598 if (admin) {
1599 set = &ctrl->admin_tag_set;
1600 memset(set, 0, sizeof(*set));
1601 set->ops = &nvme_tcp_admin_mq_ops;
1602 set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
1603 set->reserved_tags = NVMF_RESERVED_TAGS;
1604 set->numa_node = nctrl->numa_node;
1605 set->flags = BLK_MQ_F_BLOCKING;
1606 set->cmd_size = sizeof(struct nvme_tcp_request);
1607 set->driver_data = ctrl;
1608 set->nr_hw_queues = 1;
1609 set->timeout = NVME_ADMIN_TIMEOUT;
1610 } else {
1611 set = &ctrl->tag_set;
1612 memset(set, 0, sizeof(*set));
1613 set->ops = &nvme_tcp_mq_ops;
1614 set->queue_depth = nctrl->sqsize + 1;
1615 set->reserved_tags = NVMF_RESERVED_TAGS;
1616 set->numa_node = nctrl->numa_node;
1617 set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
1618 set->cmd_size = sizeof(struct nvme_tcp_request);
1619 set->driver_data = ctrl;
1620 set->nr_hw_queues = nctrl->queue_count - 1;
1621 set->timeout = NVME_IO_TIMEOUT;
1622 set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
1623 }
1624
1625 ret = blk_mq_alloc_tag_set(set);
1626 if (ret)
1627 return ERR_PTR(ret);
1628
1629 return set;
1630}
1631
1632static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1633{
1634 if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1635 cancel_work_sync(&ctrl->async_event_work);
1636 nvme_tcp_free_async_req(to_tcp_ctrl(ctrl));
1637 to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1638 }
1639
1640 nvme_tcp_free_queue(ctrl, 0);
1641}
1642
1643static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1644{
1645 int i;
1646
1647 for (i = 1; i < ctrl->queue_count; i++)
1648 nvme_tcp_free_queue(ctrl, i);
1649}
1650
1651static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1652{
1653 int i;
1654
1655 for (i = 1; i < ctrl->queue_count; i++)
1656 nvme_tcp_stop_queue(ctrl, i);
1657}
1658
1659static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl)
1660{
1661 int i, ret = 0;
1662
1663 for (i = 1; i < ctrl->queue_count; i++) {
1664 ret = nvme_tcp_start_queue(ctrl, i);
1665 if (ret)
1666 goto out_stop_queues;
1667 }
1668
1669 return 0;
1670
1671out_stop_queues:
1672 for (i--; i >= 1; i--)
1673 nvme_tcp_stop_queue(ctrl, i);
1674 return ret;
1675}
1676
1677static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
1678{
1679 int ret;
1680
1681 ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH);
1682 if (ret)
1683 return ret;
1684
1685 ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
1686 if (ret)
1687 goto out_free_queue;
1688
1689 return 0;
1690
1691out_free_queue:
1692 nvme_tcp_free_queue(ctrl, 0);
1693 return ret;
1694}
1695
1696static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1697{
1698 int i, ret;
1699
1700 for (i = 1; i < ctrl->queue_count; i++) {
1701 ret = nvme_tcp_alloc_queue(ctrl, i,
1702 ctrl->sqsize + 1);
1703 if (ret)
1704 goto out_free_queues;
1705 }
1706
1707 return 0;
1708
1709out_free_queues:
1710 for (i--; i >= 1; i--)
1711 nvme_tcp_free_queue(ctrl, i);
1712
1713 return ret;
1714}
1715
1716static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl)
1717{
1718 unsigned int nr_io_queues;
1719
1720 nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus());
1721 nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus());
1722 nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus());
1723
1724 return nr_io_queues;
1725}
1726
1727static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl,
1728 unsigned int nr_io_queues)
1729{
1730 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
1731 struct nvmf_ctrl_options *opts = nctrl->opts;
1732
1733 if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) {
		/*
		 * separate read/write queues
		 * hand out dedicated default queues only after we have
		 * sufficient read queues.
		 */
1739 ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues;
1740 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ];
1741 ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1742 min(opts->nr_write_queues, nr_io_queues);
1743 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1744 } else {
		/*
		 * shared read/write queues
		 * either no write queues were requested, or we don't have
		 * sufficient queue count to have dedicated default queues.
		 */
1750 ctrl->io_queues[HCTX_TYPE_DEFAULT] =
1751 min(opts->nr_io_queues, nr_io_queues);
1752 nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT];
1753 }
1754
1755 if (opts->nr_poll_queues && nr_io_queues) {
		/* map dedicated poll queues only if we have queues left */
1757 ctrl->io_queues[HCTX_TYPE_POLL] =
1758 min(opts->nr_poll_queues, nr_io_queues);
1759 }
1760}
1761
1762static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
1763{
1764 unsigned int nr_io_queues;
1765 int ret;
1766
1767 nr_io_queues = nvme_tcp_nr_io_queues(ctrl);
1768 ret = nvme_set_queue_count(ctrl, &nr_io_queues);
1769 if (ret)
1770 return ret;
1771
1772 ctrl->queue_count = nr_io_queues + 1;
1773 if (ctrl->queue_count < 2) {
1774 dev_err(ctrl->device,
1775 "unable to set any I/O queues\n");
1776 return -ENOMEM;
1777 }
1778
1779 dev_info(ctrl->device,
1780 "creating %d I/O queues.\n", nr_io_queues);
1781
1782 nvme_tcp_set_io_queues(ctrl, nr_io_queues);
1783
1784 return __nvme_tcp_alloc_io_queues(ctrl);
1785}
1786
1787static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
1788{
1789 nvme_tcp_stop_io_queues(ctrl);
1790 if (remove) {
1791 blk_cleanup_queue(ctrl->connect_q);
1792 blk_mq_free_tag_set(ctrl->tagset);
1793 }
1794 nvme_tcp_free_io_queues(ctrl);
1795}
1796
1797static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
1798{
1799 int ret;
1800
1801 ret = nvme_tcp_alloc_io_queues(ctrl);
1802 if (ret)
1803 return ret;
1804
1805 if (new) {
1806 ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false);
1807 if (IS_ERR(ctrl->tagset)) {
1808 ret = PTR_ERR(ctrl->tagset);
1809 goto out_free_io_queues;
1810 }
1811
1812 ctrl->connect_q = blk_mq_init_queue(ctrl->tagset);
1813 if (IS_ERR(ctrl->connect_q)) {
1814 ret = PTR_ERR(ctrl->connect_q);
1815 goto out_free_tag_set;
1816 }
1817 }
1818
1819 ret = nvme_tcp_start_io_queues(ctrl);
1820 if (ret)
1821 goto out_cleanup_connect_q;
1822
1823 if (!new) {
1824 nvme_start_queues(ctrl);
1825 if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
			/*
			 * If we timed out waiting for freeze we are likely to
			 * be stuck.  Fail the controller initialization just
			 * to be safe.
			 */
1831 ret = -ENODEV;
1832 goto out_wait_freeze_timed_out;
1833 }
1834 blk_mq_update_nr_hw_queues(ctrl->tagset,
1835 ctrl->queue_count - 1);
1836 nvme_unfreeze(ctrl);
1837 }
1838
1839 return 0;
1840
1841out_wait_freeze_timed_out:
1842 nvme_stop_queues(ctrl);
1843 nvme_sync_io_queues(ctrl);
1844 nvme_tcp_stop_io_queues(ctrl);
1845out_cleanup_connect_q:
1846 nvme_cancel_tagset(ctrl);
1847 if (new)
1848 blk_cleanup_queue(ctrl->connect_q);
1849out_free_tag_set:
1850 if (new)
1851 blk_mq_free_tag_set(ctrl->tagset);
1852out_free_io_queues:
1853 nvme_tcp_free_io_queues(ctrl);
1854 return ret;
1855}
1856
1857static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
1858{
1859 nvme_tcp_stop_queue(ctrl, 0);
1860 if (remove) {
1861 blk_cleanup_queue(ctrl->admin_q);
1862 blk_cleanup_queue(ctrl->fabrics_q);
1863 blk_mq_free_tag_set(ctrl->admin_tagset);
1864 }
1865 nvme_tcp_free_admin_queue(ctrl);
1866}
1867
1868static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
1869{
1870 int error;
1871
1872 error = nvme_tcp_alloc_admin_queue(ctrl);
1873 if (error)
1874 return error;
1875
1876 if (new) {
1877 ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true);
1878 if (IS_ERR(ctrl->admin_tagset)) {
1879 error = PTR_ERR(ctrl->admin_tagset);
1880 goto out_free_queue;
1881 }
1882
1883 ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset);
1884 if (IS_ERR(ctrl->fabrics_q)) {
1885 error = PTR_ERR(ctrl->fabrics_q);
1886 goto out_free_tagset;
1887 }
1888
1889 ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset);
1890 if (IS_ERR(ctrl->admin_q)) {
1891 error = PTR_ERR(ctrl->admin_q);
1892 goto out_cleanup_fabrics_q;
1893 }
1894 }
1895
1896 error = nvme_tcp_start_queue(ctrl, 0);
1897 if (error)
1898 goto out_cleanup_queue;
1899
1900 error = nvme_enable_ctrl(ctrl);
1901 if (error)
1902 goto out_stop_queue;
1903
1904 blk_mq_unquiesce_queue(ctrl->admin_q);
1905
1906 error = nvme_init_ctrl_finish(ctrl);
1907 if (error)
1908 goto out_quiesce_queue;
1909
1910 return 0;
1911
1912out_quiesce_queue:
1913 blk_mq_quiesce_queue(ctrl->admin_q);
1914 blk_sync_queue(ctrl->admin_q);
1915out_stop_queue:
1916 nvme_tcp_stop_queue(ctrl, 0);
1917 nvme_cancel_admin_tagset(ctrl);
1918out_cleanup_queue:
1919 if (new)
1920 blk_cleanup_queue(ctrl->admin_q);
1921out_cleanup_fabrics_q:
1922 if (new)
1923 blk_cleanup_queue(ctrl->fabrics_q);
1924out_free_tagset:
1925 if (new)
1926 blk_mq_free_tag_set(ctrl->admin_tagset);
1927out_free_queue:
1928 nvme_tcp_free_admin_queue(ctrl);
1929 return error;
1930}
1931
1932static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
1933 bool remove)
1934{
1935 blk_mq_quiesce_queue(ctrl->admin_q);
1936 blk_sync_queue(ctrl->admin_q);
1937 nvme_tcp_stop_queue(ctrl, 0);
1938 nvme_cancel_admin_tagset(ctrl);
1939 if (remove)
1940 blk_mq_unquiesce_queue(ctrl->admin_q);
1941 nvme_tcp_destroy_admin_queue(ctrl, remove);
1942}
1943
1944static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
1945 bool remove)
1946{
1947 if (ctrl->queue_count <= 1)
1948 return;
1949 blk_mq_quiesce_queue(ctrl->admin_q);
1950 nvme_start_freeze(ctrl);
1951 nvme_stop_queues(ctrl);
1952 nvme_sync_io_queues(ctrl);
1953 nvme_tcp_stop_io_queues(ctrl);
1954 nvme_cancel_tagset(ctrl);
1955 if (remove)
1956 nvme_start_queues(ctrl);
1957 nvme_tcp_destroy_io_queues(ctrl, remove);
1958}
1959
1960static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
1961{
	/* If we are resetting/deleting then do nothing */
1963 if (ctrl->state != NVME_CTRL_CONNECTING) {
1964 WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
1965 ctrl->state == NVME_CTRL_LIVE);
1966 return;
1967 }
1968
1969 if (nvmf_should_reconnect(ctrl)) {
1970 dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
1971 ctrl->opts->reconnect_delay);
1972 queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
1973 ctrl->opts->reconnect_delay * HZ);
1974 } else {
1975 dev_info(ctrl->device, "Removing controller...\n");
1976 nvme_delete_ctrl(ctrl);
1977 }
1978}
1979
1980static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
1981{
1982 struct nvmf_ctrl_options *opts = ctrl->opts;
1983 int ret;
1984
1985 ret = nvme_tcp_configure_admin_queue(ctrl, new);
1986 if (ret)
1987 return ret;
1988
1989 if (ctrl->icdoff) {
1990 ret = -EOPNOTSUPP;
1991 dev_err(ctrl->device, "icdoff is not supported!\n");
1992 goto destroy_admin;
1993 }
1994
1995 if (!nvme_ctrl_sgl_supported(ctrl)) {
1996 ret = -EOPNOTSUPP;
1997 dev_err(ctrl->device, "Mandatory sgls are not supported!\n");
1998 goto destroy_admin;
1999 }
2000
2001 if (opts->queue_size > ctrl->sqsize + 1)
2002 dev_warn(ctrl->device,
2003 "queue_size %zu > ctrl sqsize %u, clamping down\n",
2004 opts->queue_size, ctrl->sqsize + 1);
2005
2006 if (ctrl->sqsize + 1 > ctrl->maxcmd) {
2007 dev_warn(ctrl->device,
2008 "sqsize %u > ctrl maxcmd %u, clamping down\n",
2009 ctrl->sqsize + 1, ctrl->maxcmd);
2010 ctrl->sqsize = ctrl->maxcmd - 1;
2011 }
2012
2013 if (ctrl->queue_count > 1) {
2014 ret = nvme_tcp_configure_io_queues(ctrl, new);
2015 if (ret)
2016 goto destroy_admin;
2017 }
2018
2019 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
		/*
		 * state change failure is ok if we started ctrl delete,
		 * unless we're during creation of a new controller to
		 * avoid races with teardown flow.
		 */
2025 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2026 ctrl->state != NVME_CTRL_DELETING_NOIO);
2027 WARN_ON_ONCE(new);
2028 ret = -EINVAL;
2029 goto destroy_io;
2030 }
2031
2032 nvme_start_ctrl(ctrl);
2033 return 0;
2034
2035destroy_io:
2036 if (ctrl->queue_count > 1) {
2037 nvme_stop_queues(ctrl);
2038 nvme_sync_io_queues(ctrl);
2039 nvme_tcp_stop_io_queues(ctrl);
2040 nvme_cancel_tagset(ctrl);
2041 nvme_tcp_destroy_io_queues(ctrl, new);
2042 }
2043destroy_admin:
2044 blk_mq_quiesce_queue(ctrl->admin_q);
2045 blk_sync_queue(ctrl->admin_q);
2046 nvme_tcp_stop_queue(ctrl, 0);
2047 nvme_cancel_admin_tagset(ctrl);
2048 nvme_tcp_destroy_admin_queue(ctrl, new);
2049 return ret;
2050}
2051
2052static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
2053{
2054 struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
2055 struct nvme_tcp_ctrl, connect_work);
2056 struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2057
2058 ++ctrl->nr_reconnects;
2059
2060 if (nvme_tcp_setup_ctrl(ctrl, false))
2061 goto requeue;
2062
2063 dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
2064 ctrl->nr_reconnects);
2065
2066 ctrl->nr_reconnects = 0;
2067
2068 return;
2069
2070requeue:
2071 dev_info(ctrl->device, "Failed reconnect attempt %d\n",
2072 ctrl->nr_reconnects);
2073 nvme_tcp_reconnect_or_remove(ctrl);
2074}
2075
2076static void nvme_tcp_error_recovery_work(struct work_struct *work)
2077{
2078 struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
2079 struct nvme_tcp_ctrl, err_work);
2080 struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
2081
2082 nvme_stop_keep_alive(ctrl);
2083 nvme_tcp_teardown_io_queues(ctrl, false);
	/* unquiesce to fail fast pending requests */
2085 nvme_start_queues(ctrl);
2086 nvme_tcp_teardown_admin_queue(ctrl, false);
2087 blk_mq_unquiesce_queue(ctrl->admin_q);
2088
2089 if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
		/* state change failure is ok if we started ctrl delete */
2091 WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
2092 ctrl->state != NVME_CTRL_DELETING_NOIO);
2093 return;
2094 }
2095
2096 nvme_tcp_reconnect_or_remove(ctrl);
2097}
2098
static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
	cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
	cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);

	nvme_tcp_teardown_io_queues(ctrl, shutdown);
	blk_mq_quiesce_queue(ctrl->admin_q);
	if (shutdown)
		nvme_shutdown_ctrl(ctrl);
	else
		nvme_disable_ctrl(ctrl);
	nvme_tcp_teardown_admin_queue(ctrl, shutdown);
}

static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_tcp_teardown_ctrl(ctrl, true);
}

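/*
 * Reset worker: stop the controller, tear down the existing association,
 * transition to CONNECTING and re-run controller setup; failures fall back
 * to the reconnect path.
 */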
static void nvme_reset_ctrl_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, reset_work);

	nvme_stop_ctrl(ctrl);
	nvme_tcp_teardown_ctrl(ctrl, false);

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
		/* state change failure is ok if we started ctrl delete */
		WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
			     ctrl->state != NVME_CTRL_DELETING_NOIO);
		return;
	}

	if (nvme_tcp_setup_ctrl(ctrl, false))
		goto out_fail;

	return;

out_fail:
	++ctrl->nr_reconnects;
	nvme_tcp_reconnect_or_remove(ctrl);
}

static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);

	if (list_empty(&ctrl->list))
		goto free_ctrl;

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_del(&ctrl->list);
	mutex_unlock(&nvme_tcp_ctrl_mutex);

	nvmf_free_options(nctrl->opts);
free_ctrl:
	kfree(ctrl->queues);
	kfree(ctrl);
}

static void nvme_tcp_set_sg_null(struct nvme_command *c)
{
	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

	sg->addr = 0;
	sg->length = 0;
	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
			NVME_SGL_FMT_TRANSPORT_A;
}

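/*
 * In-capsule (inline) data: the payload is carried in the same PDU as the
 * command capsule, so the SGL address holds the controller's ICDOFF and the
 * descriptor uses the offset-based data block format.
 */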
static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
		struct nvme_command *c, u32 data_len)
{
	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

	sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
	sg->length = cpu_to_le32(data_len);
	sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
}

static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
		u32 data_len)
{
	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

	sg->addr = 0;
	sg->length = cpu_to_le32(data_len);
	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
			NVME_SGL_FMT_TRANSPORT_A;
}

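/*
 * Build and queue the reserved Asynchronous Event Request on the admin
 * queue. The command carries no data, so only a command PDU (plus an
 * optional header digest) is sent.
 */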
static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
	struct nvme_tcp_queue *queue = &ctrl->queues[0];
	struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
	struct nvme_command *cmd = &pdu->cmd;
	u8 hdgst = nvme_tcp_hdgst_len(queue);

	memset(pdu, 0, sizeof(*pdu));
	pdu->hdr.type = nvme_tcp_cmd;
	if (queue->hdr_digest)
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);

	cmd->common.opcode = nvme_admin_async_event;
	cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
	cmd->common.flags |= NVME_CMD_SGL_METABUF;
	nvme_tcp_set_sg_null(cmd);

	ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
	ctrl->async_req.offset = 0;
	ctrl->async_req.curr_bio = NULL;
	ctrl->async_req.data_len = 0;

	nvme_tcp_queue_request(&ctrl->async_req, true, true);
}

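/*
 * Abort a timed-out request: stop its queue first so no late completion can
 * arrive from the socket, then complete it with NVME_SC_HOST_ABORTED_CMD.
 */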
static void nvme_tcp_complete_timed_out(struct request *rq)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;

	nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
	if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq)) {
		nvme_req(rq)->status = NVME_SC_HOST_ABORTED_CMD;
		blk_mq_complete_request(rq);
	}
}

static enum blk_eh_timer_return
nvme_tcp_timeout(struct request *rq, bool reserved)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
	struct nvme_tcp_cmd_pdu *pdu = req->pdu;

	dev_warn(ctrl->device,
		"queue %d: timeout request %#x type %d\n",
		nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type);

	if (ctrl->state != NVME_CTRL_LIVE) {
		/*
		 * If we are resetting, connecting or deleting we should
		 * complete immediately because we may block controller
		 * setup or teardown.
		 *  - ctrl disable/shutdown fabrics requests
		 *  - connect requests
		 *  - initialization admin requests
		 *  - I/O requests that entered after unquiescing and
		 *    defer_request check
		 *
		 * All other requests should be cancelled by the error
		 * recovery work, so it's fine that we fail it here.
		 */
		nvme_tcp_complete_timed_out(rq);
		return BLK_EH_DONE;
	}

	/*
	 * LIVE state should trigger the normal error recovery which will
	 * handle completing this request.
	 */
	nvme_tcp_error_recovery(ctrl);
	return BLK_EH_RESET_TIMER;
}

static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
		struct request *rq)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
	struct nvme_command *c = &pdu->cmd;

	c->common.flags |= NVME_CMD_SGL_METABUF;

	if (!blk_rq_nr_phys_segments(rq))
		nvme_tcp_set_sg_null(c);
	else if (rq_data_dir(rq) == WRITE &&
	    req->data_len <= nvme_tcp_inline_data_size(queue))
		nvme_tcp_set_sg_inline(queue, c, req->data_len);
	else
		nvme_tcp_set_sg_host_data(c, req->data_len);

	return 0;
}

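/*
 * Translate a block layer request into an NVMe/TCP command PDU: set up the
 * NVMe command, initialize the per-request send state and data iterator,
 * and fill in the PDU header (digest flags, lengths and data offset).
 */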
static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
		struct request *rq)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_tcp_cmd_pdu *pdu = req->pdu;
	struct nvme_tcp_queue *queue = req->queue;
	u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
	blk_status_t ret;

	ret = nvme_setup_cmd(ns, rq);
	if (ret)
		return ret;

	req->state = NVME_TCP_SEND_CMD_PDU;
	req->offset = 0;
	req->data_sent = 0;
	req->pdu_len = 0;
	req->pdu_sent = 0;
	req->data_len = blk_rq_nr_phys_segments(rq) ?
				blk_rq_payload_bytes(rq) : 0;
	req->curr_bio = rq->bio;
	if (req->curr_bio && req->data_len)
		nvme_tcp_init_iter(req, rq_data_dir(rq));

	if (rq_data_dir(rq) == WRITE &&
	    req->data_len <= nvme_tcp_inline_data_size(queue))
		req->pdu_len = req->data_len;

	pdu->hdr.type = nvme_tcp_cmd;
	pdu->hdr.flags = 0;
	if (queue->hdr_digest)
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
	if (queue->data_digest && req->pdu_len) {
		pdu->hdr.flags |= NVME_TCP_F_DDGST;
		ddgst = nvme_tcp_ddgst_len(queue);
	}
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
	pdu->hdr.plen =
		cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);

	ret = nvme_tcp_map_data(queue, rq);
	if (unlikely(ret)) {
		nvme_cleanup_cmd(rq);
		dev_err(queue->ctrl->ctrl.device,
			"Failed to map data (%d)\n", ret);
		return ret;
	}

	return 0;
}

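/*
 * Kick the queue's io_work if requests were queued without a final "last"
 * doorbell, so batched submissions are not left unsent.
 */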
static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_tcp_queue *queue = hctx->driver_data;

	if (!llist_empty(&queue->req_list))
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}

static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_tcp_queue *queue = hctx->driver_data;
	struct request *rq = bd->rq;
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
	blk_status_t ret;

	if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
		return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);

	ret = nvme_tcp_setup_cmd_pdu(ns, rq);
	if (unlikely(ret))
		return ret;

	blk_mq_start_request(rq);

	nvme_tcp_queue_request(req, true, bd->last);

	return BLK_STS_OK;
}

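/*
 * Map blk-mq hardware contexts onto the controller's TCP queues. With
 * dedicated write queues configured, reads get their own queue range;
 * otherwise the default and read maps share the same queues. Poll queues,
 * if any, are mapped after the default/read ranges.
 */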
static int nvme_tcp_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_tcp_ctrl *ctrl = set->driver_data;
	struct nvmf_ctrl_options *opts = ctrl->ctrl.opts;

	if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) {
		/* separate read/write queues */
		set->map[HCTX_TYPE_DEFAULT].nr_queues =
			ctrl->io_queues[HCTX_TYPE_DEFAULT];
		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
		set->map[HCTX_TYPE_READ].nr_queues =
			ctrl->io_queues[HCTX_TYPE_READ];
		set->map[HCTX_TYPE_READ].queue_offset =
			ctrl->io_queues[HCTX_TYPE_DEFAULT];
	} else {
		/* shared read/write queues */
		set->map[HCTX_TYPE_DEFAULT].nr_queues =
			ctrl->io_queues[HCTX_TYPE_DEFAULT];
		set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
		set->map[HCTX_TYPE_READ].nr_queues =
			ctrl->io_queues[HCTX_TYPE_DEFAULT];
		set->map[HCTX_TYPE_READ].queue_offset = 0;
	}
	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
	blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);

	if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) {
		/* map dedicated poll queues only if we have queues left */
		set->map[HCTX_TYPE_POLL].nr_queues =
			ctrl->io_queues[HCTX_TYPE_POLL];
		set->map[HCTX_TYPE_POLL].queue_offset =
			ctrl->io_queues[HCTX_TYPE_DEFAULT] +
			ctrl->io_queues[HCTX_TYPE_READ];
		blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
	}

	dev_info(ctrl->ctrl.device,
		"mapped %d/%d/%d default/read/poll queues.\n",
		ctrl->io_queues[HCTX_TYPE_DEFAULT],
		ctrl->io_queues[HCTX_TYPE_READ],
		ctrl->io_queues[HCTX_TYPE_POLL]);

	return 0;
}

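/*
 * Polled completion path: busy-poll the socket while its receive queue is
 * empty, then reap whatever has arrived. Returns the number of completions
 * processed.
 */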
static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_tcp_queue *queue = hctx->driver_data;
	struct sock *sk = queue->sock->sk;

	if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
		return 0;

	set_bit(NVME_TCP_Q_POLLING, &queue->flags);
	if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
		sk_busy_loop(sk, true);
	nvme_tcp_try_recv(queue);
	clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
	return queue->nr_cqe;
}

static const struct blk_mq_ops nvme_tcp_mq_ops = {
	.queue_rq	= nvme_tcp_queue_rq,
	.commit_rqs	= nvme_tcp_commit_rqs,
	.complete	= nvme_complete_rq,
	.init_request	= nvme_tcp_init_request,
	.exit_request	= nvme_tcp_exit_request,
	.init_hctx	= nvme_tcp_init_hctx,
	.timeout	= nvme_tcp_timeout,
	.map_queues	= nvme_tcp_map_queues,
	.poll		= nvme_tcp_poll,
};

static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
	.queue_rq	= nvme_tcp_queue_rq,
	.complete	= nvme_complete_rq,
	.init_request	= nvme_tcp_init_request,
	.exit_request	= nvme_tcp_exit_request,
	.init_hctx	= nvme_tcp_init_admin_hctx,
	.timeout	= nvme_tcp_timeout,
};

static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
	.name			= "tcp",
	.module			= THIS_MODULE,
	.flags			= NVME_F_FABRICS,
	.reg_read32		= nvmf_reg_read32,
	.reg_read64		= nvmf_reg_read64,
	.reg_write32		= nvmf_reg_write32,
	.free_ctrl		= nvme_tcp_free_ctrl,
	.submit_async_event	= nvme_tcp_submit_async_event,
	.delete_ctrl		= nvme_tcp_delete_ctrl,
	.get_address		= nvmf_get_address,
};

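/*
 * Used to reject duplicate connects unless duplicate_connect was requested:
 * returns true if an existing controller already matches the given IP
 * options.
 */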
static bool
nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
{
	struct nvme_tcp_ctrl *ctrl;
	bool found = false;

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
		found = nvmf_ip_options_match(&ctrl->ctrl, opts);
		if (found)
			break;
	}
	mutex_unlock(&nvme_tcp_ctrl_mutex);

	return found;
}

static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
		struct nvmf_ctrl_options *opts)
{
	struct nvme_tcp_ctrl *ctrl;
	int ret;

	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
	if (!ctrl)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&ctrl->list);
	ctrl->ctrl.opts = opts;
	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
				opts->nr_poll_queues + 1;
	ctrl->ctrl.sqsize = opts->queue_size - 1;
	ctrl->ctrl.kato = opts->kato;

	INIT_DELAYED_WORK(&ctrl->connect_work,
			nvme_tcp_reconnect_ctrl_work);
	INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
	INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);

	if (!(opts->mask & NVMF_OPT_TRSVCID)) {
		opts->trsvcid =
			kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
		if (!opts->trsvcid) {
			ret = -ENOMEM;
			goto out_free_ctrl;
		}
		opts->mask |= NVMF_OPT_TRSVCID;
	}

	ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
			opts->traddr, opts->trsvcid, &ctrl->addr);
	if (ret) {
		pr_err("malformed address passed: %s:%s\n",
			opts->traddr, opts->trsvcid);
		goto out_free_ctrl;
	}

	if (opts->mask & NVMF_OPT_HOST_TRADDR) {
		ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
			opts->host_traddr, NULL, &ctrl->src_addr);
		if (ret) {
			pr_err("malformed src address passed: %s\n",
			       opts->host_traddr);
			goto out_free_ctrl;
		}
	}

	if (opts->mask & NVMF_OPT_HOST_IFACE) {
		if (!__dev_get_by_name(&init_net, opts->host_iface)) {
			pr_err("invalid interface passed: %s\n",
			       opts->host_iface);
			ret = -ENODEV;
			goto out_free_ctrl;
		}
	}

	if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
		ret = -EALREADY;
		goto out_free_ctrl;
	}

	ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
				GFP_KERNEL);
	if (!ctrl->queues) {
		ret = -ENOMEM;
		goto out_free_ctrl;
	}

	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
	if (ret)
		goto out_kfree_queues;

	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
		WARN_ON_ONCE(1);
		ret = -EINTR;
		goto out_uninit_ctrl;
	}

	ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
	if (ret)
		goto out_uninit_ctrl;

	dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n",
		ctrl->ctrl.opts->subsysnqn, &ctrl->addr);

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
	mutex_unlock(&nvme_tcp_ctrl_mutex);

	return &ctrl->ctrl;

out_uninit_ctrl:
	nvme_uninit_ctrl(&ctrl->ctrl);
	nvme_put_ctrl(&ctrl->ctrl);
	if (ret > 0)
		ret = -EIO;
	return ERR_PTR(ret);
out_kfree_queues:
	kfree(ctrl->queues);
out_free_ctrl:
	kfree(ctrl);
	return ERR_PTR(ret);
}

static struct nvmf_transport_ops nvme_tcp_transport = {
	.name		= "tcp",
	.module		= THIS_MODULE,
	.required_opts	= NVMF_OPT_TRADDR,
	.allowed_opts	= NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
			  NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
			  NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE,
	.create_ctrl	= nvme_tcp_create_ctrl,
};

static int __init nvme_tcp_init_module(void)
{
	nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq",
			WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
	if (!nvme_tcp_wq)
		return -ENOMEM;

	nvmf_register_transport(&nvme_tcp_transport);
	return 0;
}

static void __exit nvme_tcp_cleanup_module(void)
{
	struct nvme_tcp_ctrl *ctrl;

	nvmf_unregister_transport(&nvme_tcp_transport);

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
		nvme_delete_ctrl(&ctrl->ctrl);
	mutex_unlock(&nvme_tcp_ctrl_mutex);
	flush_workqueue(nvme_delete_wq);

	destroy_workqueue(nvme_tcp_wq);
}

module_init(nvme_tcp_init_module);
module_exit(nvme_tcp_cleanup_module);

MODULE_LICENSE("GPL v2");