Diffstat (limited to 'drivers/nvme/host/tcp.c')
| -rw-r--r-- | drivers/nvme/host/tcp.c | 1027 |
1 file changed, 700 insertions, 327 deletions
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 8cedc1ef496c..69cb04406b47 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -8,12 +8,17 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/err.h> +#include <linux/crc32.h> #include <linux/nvme-tcp.h> +#include <linux/nvme-keyring.h> #include <net/sock.h> #include <net/tcp.h> +#include <net/tls.h> +#include <net/tls_prot.h> +#include <net/handshake.h> #include <linux/blk-mq.h> -#include <crypto/hash.h> #include <net/busy_poll.h> +#include <trace/events/sock.h> #include "nvme.h" #include "fabrics.h" @@ -30,6 +35,26 @@ static int so_priority; module_param(so_priority, int, 0644); MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority"); +/* + * Use the unbound workqueue for nvme_tcp_wq, then we can set the cpu affinity + * from sysfs. + */ +static bool wq_unbound; +module_param(wq_unbound, bool, 0644); +MODULE_PARM_DESC(wq_unbound, "Use unbound workqueue for nvme-tcp IO context (default false)"); + +/* + * TLS handshake timeout + */ +static int tls_handshake_timeout = 10; +#ifdef CONFIG_NVME_TCP_TLS +module_param(tls_handshake_timeout, int, 0644); +MODULE_PARM_DESC(tls_handshake_timeout, + "nvme TLS handshake timeout in seconds (default 10)"); +#endif + +static atomic_t nvme_tcp_cpu_queues[NR_CPUS]; + #ifdef CONFIG_DEBUG_LOCK_ALLOC /* lockdep can detect a circular dependency of the form * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock @@ -103,6 +128,7 @@ enum nvme_tcp_queue_flags { NVME_TCP_Q_ALLOCATED = 0, NVME_TCP_Q_LIVE = 1, NVME_TCP_Q_POLLING = 2, + NVME_TCP_Q_IO_CPU_SET = 3, }; enum nvme_tcp_recv_state { @@ -141,11 +167,13 @@ struct nvme_tcp_queue { bool hdr_digest; bool data_digest; - struct ahash_request *rcv_hash; - struct ahash_request *snd_hash; + bool tls_enabled; + u32 rcv_crc; + u32 snd_crc; __le32 exp_ddgst; __le32 recv_ddgst; - + struct completion tls_complete; + int tls_err; struct page_frag_cache pf_cache; void (*state_change)(struct sock *); @@ -188,6 +216,41 @@ static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue) return queue - queue->ctrl->queues; } +static inline bool nvme_tcp_recv_pdu_supported(enum nvme_tcp_pdu_type type) +{ + switch (type) { + case nvme_tcp_c2h_term: + case nvme_tcp_c2h_data: + case nvme_tcp_r2t: + case nvme_tcp_rsp: + return true; + default: + return false; + } +} + +/* + * Check if the queue is TLS encrypted + */ +static inline bool nvme_tcp_queue_tls(struct nvme_tcp_queue *queue) +{ + if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) + return 0; + + return queue->tls_enabled; +} + +/* + * Check if TLS is configured for the controller. + */ +static inline bool nvme_tcp_tls_configured(struct nvme_ctrl *ctrl) +{ + if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) + return 0; + + return ctrl->opts->tls || ctrl->opts->concat; +} + static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue) { u32 queue_idx = nvme_tcp_queue_id(queue); @@ -207,6 +270,18 @@ static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue) return queue->data_digest ? 
NVME_TCP_DIGEST_LENGTH : 0; } +static inline void *nvme_tcp_req_cmd_pdu(struct nvme_tcp_request *req) +{ + return req->pdu; +} + +static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req) +{ + /* use the pdu space in the back for the data pdu */ + return req->pdu + sizeof(struct nvme_tcp_cmd_pdu) - + sizeof(struct nvme_tcp_data_pdu); +} + static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req) { if (nvme_is_fabrics(req->req.cmd)) @@ -315,14 +390,20 @@ static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue) } while (ret > 0); } -static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) +static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue *queue) { return !list_empty(&queue->send_list) || !llist_empty(&queue->req_list); } +static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) +{ + return !nvme_tcp_queue_tls(queue) && + nvme_tcp_queue_has_pending(queue); +} + static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, - bool sync, bool last) + bool last) { struct nvme_tcp_queue *queue = req->queue; bool empty; @@ -336,12 +417,12 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, * are on the same cpu, so we don't introduce contention. */ if (queue->io_cpu == raw_smp_processor_id() && - sync && empty && mutex_trylock(&queue->send_mutex)) { + empty && mutex_trylock(&queue->send_mutex)) { nvme_tcp_send_all(queue); mutex_unlock(&queue->send_mutex); } - if (last && nvme_tcp_queue_more(queue)) + if (last && nvme_tcp_queue_has_pending(queue)) queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); } @@ -371,36 +452,43 @@ nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) return NULL; } - list_del(&req->entry); + list_del_init(&req->entry); + init_llist_node(&req->lentry); return req; } -static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, - __le32 *dgst) +#define NVME_TCP_CRC_SEED (~0) + +static inline void nvme_tcp_ddgst_update(u32 *crcp, + struct page *page, size_t off, size_t len) { - ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0); - crypto_ahash_final(hash); + page += off / PAGE_SIZE; + off %= PAGE_SIZE; + while (len) { + const void *vaddr = kmap_local_page(page); + size_t n = min(len, (size_t)PAGE_SIZE - off); + + *crcp = crc32c(*crcp, vaddr + off, n); + kunmap_local(vaddr); + page++; + off = 0; + len -= n; + } } -static inline void nvme_tcp_ddgst_update(struct ahash_request *hash, - struct page *page, off_t off, size_t len) +static inline __le32 nvme_tcp_ddgst_final(u32 crc) { - struct scatterlist sg; - - sg_init_table(&sg, 1); - sg_set_page(&sg, page, len, off); - ahash_request_set_crypt(hash, &sg, NULL, len); - crypto_ahash_update(hash); + return cpu_to_le32(~crc); } -static inline void nvme_tcp_hdgst(struct ahash_request *hash, - void *pdu, size_t len) +static inline __le32 nvme_tcp_hdgst(const void *pdu, size_t len) { - struct scatterlist sg; + return cpu_to_le32(~crc32c(NVME_TCP_CRC_SEED, pdu, len)); +} - sg_init_one(&sg, pdu, len); - ahash_request_set_crypt(hash, &sg, pdu + len, len); - crypto_ahash_digest(hash); +static inline void nvme_tcp_set_hdgst(void *pdu, size_t len) +{ + *(__le32 *)(pdu + len) = nvme_tcp_hdgst(pdu, len); } static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, @@ -418,8 +506,7 @@ static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, } recv_digest = *(__le32 *)(pdu + hdr->hlen); - nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len); - exp_digest = *(__le32 *)(pdu + hdr->hlen); + exp_digest = 
nvme_tcp_hdgst(pdu, pdu_len); if (recv_digest != exp_digest) { dev_err(queue->ctrl->ctrl.device, "header digest error: recv %#x expected %#x\n", @@ -445,7 +532,7 @@ static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu) nvme_tcp_queue_id(queue)); return -EPROTO; } - crypto_ahash_init(queue->rcv_hash); + queue->rcv_crc = NVME_TCP_CRC_SEED; return 0; } @@ -479,6 +566,8 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set, req->queue = queue; nvme_req(rq)->ctrl = &ctrl->ctrl; nvme_req(rq)->cmd = &pdu->cmd; + init_llist_node(&req->lentry); + INIT_LIST_HEAD(&req->entry); return 0; } @@ -613,7 +702,7 @@ static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req) { - struct nvme_tcp_data_pdu *data = req->pdu; + struct nvme_tcp_data_pdu *data = nvme_tcp_req_data_pdu(req); struct nvme_tcp_queue *queue = req->queue; struct request *rq = blk_mq_rq_from_pdu(req); u32 h2cdata_sent = req->pdu_len; @@ -683,17 +772,61 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, return -EPROTO; } + if (llist_on_list(&req->lentry) || + !list_empty(&req->entry)) { + dev_err(queue->ctrl->ctrl.device, + "req %d unexpected r2t while processing request\n", + rq->tag); + return -EPROTO; + } + req->pdu_len = 0; req->h2cdata_left = r2t_length; req->h2cdata_offset = r2t_offset; req->ttag = pdu->ttag; nvme_tcp_setup_h2c_data_pdu(req); - nvme_tcp_queue_request(req, false, true); + + llist_add(&req->lentry, &queue->req_list); + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); return 0; } +static void nvme_tcp_handle_c2h_term(struct nvme_tcp_queue *queue, + struct nvme_tcp_term_pdu *pdu) +{ + u16 fes; + const char *msg; + u32 plen = le32_to_cpu(pdu->hdr.plen); + + static const char * const msg_table[] = { + [NVME_TCP_FES_INVALID_PDU_HDR] = "Invalid PDU Header Field", + [NVME_TCP_FES_PDU_SEQ_ERR] = "PDU Sequence Error", + [NVME_TCP_FES_HDR_DIGEST_ERR] = "Header Digest Error", + [NVME_TCP_FES_DATA_OUT_OF_RANGE] = "Data Transfer Out Of Range", + [NVME_TCP_FES_DATA_LIMIT_EXCEEDED] = "Data Transfer Limit Exceeded", + [NVME_TCP_FES_UNSUPPORTED_PARAM] = "Unsupported Parameter", + }; + + if (plen < NVME_TCP_MIN_C2HTERM_PLEN || + plen > NVME_TCP_MAX_C2HTERM_PLEN) { + dev_err(queue->ctrl->ctrl.device, + "Received a malformed C2HTermReq PDU (plen = %u)\n", + plen); + return; + } + + fes = le16_to_cpu(pdu->fes); + if (fes && fes < ARRAY_SIZE(msg_table)) + msg = msg_table[fes]; + else + msg = "Unknown"; + + dev_err(queue->ctrl->ctrl.device, + "Received C2HTermReq (FES = %s)\n", msg); +} + static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { @@ -715,6 +848,25 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, return 0; hdr = queue->pdu; + if (unlikely(hdr->hlen != sizeof(struct nvme_tcp_rsp_pdu))) { + if (!nvme_tcp_recv_pdu_supported(hdr->type)) + goto unsupported_pdu; + + dev_err(queue->ctrl->ctrl.device, + "pdu type %d has unexpected header length (%d)\n", + hdr->type, hdr->hlen); + return -EPROTO; + } + + if (unlikely(hdr->type == nvme_tcp_c2h_term)) { + /* + * C2HTermReq never includes Header or Data digests. + * Skip the checks. 
+ */ + nvme_tcp_handle_c2h_term(queue, (void *)queue->pdu); + return -EINVAL; + } + if (queue->hdr_digest) { ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen); if (unlikely(ret)) @@ -738,10 +890,13 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, nvme_tcp_init_recv_ctx(queue); return nvme_tcp_handle_r2t(queue, (void *)queue->pdu); default: - dev_err(queue->ctrl->ctrl.device, - "unsupported pdu type (%d)\n", hdr->type); - return -EINVAL; + goto unsupported_pdu; } + +unsupported_pdu: + dev_err(queue->ctrl->ctrl.device, + "unsupported pdu type (%d)\n", hdr->type); + return -EINVAL; } static inline void nvme_tcp_end_request(struct request *rq, u16 status) @@ -789,8 +944,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, iov_iter_count(&req->iter)); if (queue->data_digest) - ret = skb_copy_and_hash_datagram_iter(skb, *offset, - &req->iter, recv_len, queue->rcv_hash); + ret = skb_copy_and_crc32c_datagram_iter(skb, *offset, + &req->iter, recv_len, &queue->rcv_crc); else ret = skb_copy_datagram_iter(skb, *offset, &req->iter, recv_len); @@ -808,7 +963,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, if (!queue->data_remaining) { if (queue->data_digest) { - nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); + queue->exp_ddgst = nvme_tcp_ddgst_final(queue->rcv_crc); queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; } else { if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { @@ -875,6 +1030,9 @@ static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, size_t consumed = len; int result; + if (unlikely(!queue->rd_enabled)) + return -EFAULT; + while (len) { switch (nvme_tcp_recv_state(queue)) { case NVME_TCP_RECV_PDU: @@ -905,6 +1063,8 @@ static void nvme_tcp_data_ready(struct sock *sk) { struct nvme_tcp_queue *queue; + trace_sk_data_ready(sk); + read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; if (likely(queue && queue->rd_enabled) && @@ -921,6 +1081,9 @@ static void nvme_tcp_write_space(struct sock *sk) queue = sk->sk_user_data; if (likely(queue && sk_stream_is_writeable(sk))) { clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + /* Ensure pending TLS partial records are retried */ + if (nvme_tcp_queue_tls(queue)) + queue->write_space(sk); queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); } read_unlock_bh(&sk->sk_callback_lock); @@ -979,30 +1142,33 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) u32 h2cdata_left = req->h2cdata_left; while (true) { + struct bio_vec bvec; + struct msghdr msg = { + .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, + }; struct page *page = nvme_tcp_req_cur_page(req); size_t offset = nvme_tcp_req_cur_offset(req); size_t len = nvme_tcp_req_cur_length(req); bool last = nvme_tcp_pdu_last_send(req, len); int req_data_sent = req->data_sent; - int ret, flags = MSG_DONTWAIT; + int ret; if (last && !queue->data_digest && !nvme_tcp_queue_more(queue)) - flags |= MSG_EOR; + msg.msg_flags |= MSG_EOR; else - flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; + msg.msg_flags |= MSG_MORE; - if (sendpage_ok(page)) { - ret = kernel_sendpage(queue->sock, page, offset, len, - flags); - } else { - ret = sock_no_sendpage(queue->sock, page, offset, len, - flags); - } + if (!sendpages_ok(page, len, offset)) + msg.msg_flags &= ~MSG_SPLICE_PAGES; + + bvec_set_page(&bvec, page, len, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); + ret = sock_sendmsg(queue->sock, &msg); if (ret <= 0) return ret; if (queue->data_digest) 
- nvme_tcp_ddgst_update(queue->snd_hash, page, + nvme_tcp_ddgst_update(&queue->snd_crc, page, offset, ret); /* @@ -1016,8 +1182,8 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) /* fully successful last send in current PDU */ if (last && ret == len) { if (queue->data_digest) { - nvme_tcp_ddgst_final(queue->snd_hash, - &req->ddgst); + req->ddgst = + nvme_tcp_ddgst_final(queue->snd_crc); req->state = NVME_TCP_SEND_DDGST; req->offset = 0; } else { @@ -1035,23 +1201,25 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); + struct bio_vec bvec; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, }; bool inline_data = nvme_tcp_has_inline_data(req); u8 hdgst = nvme_tcp_hdgst_len(queue); int len = sizeof(*pdu) + hdgst - req->offset; - int flags = MSG_DONTWAIT; int ret; if (inline_data || nvme_tcp_queue_more(queue)) - flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; + msg.msg_flags |= MSG_MORE; else - flags |= MSG_EOR; + msg.msg_flags |= MSG_EOR; if (queue->hdr_digest && !req->offset) - nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + nvme_tcp_set_hdgst(pdu, sizeof(*pdu)); - ret = kernel_sendpage(queue->sock, virt_to_page(pdu), - offset_in_page(pdu) + req->offset, len, flags); + bvec_set_virt(&bvec, (void *)pdu + req->offset, len); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); + ret = sock_sendmsg(queue->sock, &msg); if (unlikely(ret <= 0)) return ret; @@ -1060,7 +1228,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) if (inline_data) { req->state = NVME_TCP_SEND_DATA; if (queue->data_digest) - crypto_ahash_init(queue->snd_hash); + queue->snd_crc = NVME_TCP_CRC_SEED; } else { nvme_tcp_done_send_req(queue); } @@ -1074,22 +1242,22 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; - struct nvme_tcp_data_pdu *pdu = req->pdu; + struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req); + struct bio_vec bvec; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, }; u8 hdgst = nvme_tcp_hdgst_len(queue); int len = sizeof(*pdu) - req->offset + hdgst; int ret; if (queue->hdr_digest && !req->offset) - nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + nvme_tcp_set_hdgst(pdu, sizeof(*pdu)); if (!req->h2cdata_left) - ret = kernel_sendpage(queue->sock, virt_to_page(pdu), - offset_in_page(pdu) + req->offset, len, - MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST); - else - ret = sock_no_sendpage(queue->sock, virt_to_page(pdu), - offset_in_page(pdu) + req->offset, len, - MSG_DONTWAIT | MSG_MORE); + msg.msg_flags |= MSG_SPLICE_PAGES; + + bvec_set_virt(&bvec, (void *)pdu + req->offset, len); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); + ret = sock_sendmsg(queue->sock, &msg); if (unlikely(ret <= 0)) return ret; @@ -1097,7 +1265,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) if (!len) { req->state = NVME_TCP_SEND_DATA; if (queue->data_digest) - crypto_ahash_init(queue->snd_hash); + queue->snd_crc = NVME_TCP_CRC_SEED; return 1; } req->offset += ret; @@ -1201,7 +1369,7 @@ static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue) queue->nr_cqe = 0; consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb); release_sock(sk); - return 
consumed; + return consumed == -EAGAIN ? 0 : consumed; } static void nvme_tcp_io_work(struct work_struct *w) @@ -1229,6 +1397,11 @@ static void nvme_tcp_io_work(struct work_struct *w) else if (unlikely(result < 0)) return; + /* did we get some space after spending time in recv? */ + if (nvme_tcp_queue_has_pending(queue) && + sk_stream_is_writeable(queue->sock->sk)) + pending = true; + if (!pending || !queue->rd_enabled) return; @@ -1237,41 +1410,6 @@ static void nvme_tcp_io_work(struct work_struct *w) queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); } -static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); - - ahash_request_free(queue->rcv_hash); - ahash_request_free(queue->snd_hash); - crypto_free_ahash(tfm); -} - -static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue) -{ - struct crypto_ahash *tfm; - - tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); - if (!queue->snd_hash) - goto free_tfm; - ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); - - queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); - if (!queue->rcv_hash) - goto free_snd_hash; - ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); - - return 0; -free_snd_hash: - ahash_request_free(queue->snd_hash); -free_tfm: - crypto_free_ahash(tfm); - return -ENOMEM; -} - static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl) { struct nvme_tcp_request *async = &ctrl->async_req; @@ -1297,7 +1435,6 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl) static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) { - struct page *page; struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; unsigned int noreclaim_flag; @@ -1305,17 +1442,12 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) return; - if (queue->hdr_digest || queue->data_digest) - nvme_tcp_free_crypto(queue); - - if (queue->pf_cache.va) { - page = virt_to_head_page(queue->pf_cache.va); - __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); - queue->pf_cache.va = NULL; - } + page_frag_cache_drain(&queue->pf_cache); noreclaim_flag = memalloc_noreclaim_save(); - sock_release(queue->sock); + /* ->sock will be released by fput() */ + fput(queue->sock->file); + queue->sock = NULL; memalloc_noreclaim_restore(noreclaim_flag); kfree(queue->pdu); @@ -1327,6 +1459,8 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) { struct nvme_tcp_icreq_pdu *icreq; struct nvme_tcp_icresp_pdu *icresp; + char cbuf[CMSG_LEN(sizeof(char))] = {}; + u8 ctype; struct msghdr msg = {}; struct kvec iov; bool ctrl_hdgst, ctrl_ddgst; @@ -1358,17 +1492,39 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) iov.iov_base = icreq; iov.iov_len = sizeof(*icreq); ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); - if (ret < 0) + if (ret < 0) { + pr_warn("queue %d: failed to send icreq, error %d\n", + nvme_tcp_queue_id(queue), ret); goto free_icresp; + } memset(&msg, 0, sizeof(msg)); iov.iov_base = icresp; iov.iov_len = sizeof(*icresp); + if (nvme_tcp_queue_tls(queue)) { + msg.msg_control = cbuf; + msg.msg_controllen = sizeof(cbuf); + } + msg.msg_flags = MSG_WAITALL; ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); - if (ret < 0) + if (ret >= 0 && ret < 
sizeof(*icresp)) + ret = -ECONNRESET; + if (ret < 0) { + pr_warn("queue %d: failed to receive icresp, error %d\n", + nvme_tcp_queue_id(queue), ret); goto free_icresp; - + } + ret = -ENOTCONN; + if (nvme_tcp_queue_tls(queue)) { + ctype = tls_get_record_type(queue->sock->sk, + (struct cmsghdr *)cbuf); + if (ctype != TLS_RECORD_TYPE_DATA) { + pr_err("queue %d: unhandled TLS record %d\n", + nvme_tcp_queue_id(queue), ctype); + goto free_icresp; + } + } ret = -EINVAL; if (icresp->hdr.type != nvme_tcp_icresp) { pr_err("queue %d: bad type returned %d\n", @@ -1468,27 +1624,150 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue) ctrl->io_queues[HCTX_TYPE_POLL]; } +/* + * Track the number of queues assigned to each cpu using a global per-cpu + * counter and select the least used cpu from the mq_map. Our goal is to spread + * different controllers I/O threads across different cpu cores. + * + * Note that the accounting is not 100% perfect, but we don't need to be, we're + * simply putting our best effort to select the best candidate cpu core that we + * find at any given point. + */ static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) { struct nvme_tcp_ctrl *ctrl = queue->ctrl; - int qid = nvme_tcp_queue_id(queue); - int n = 0; + struct blk_mq_tag_set *set = &ctrl->tag_set; + int qid = nvme_tcp_queue_id(queue) - 1; + unsigned int *mq_map = NULL; + int cpu, min_queues = INT_MAX, io_cpu; + + if (wq_unbound) + goto out; if (nvme_tcp_default_queue(queue)) - n = qid - 1; + mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map; else if (nvme_tcp_read_queue(queue)) - n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1; + mq_map = set->map[HCTX_TYPE_READ].mq_map; else if (nvme_tcp_poll_queue(queue)) - n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - - ctrl->io_queues[HCTX_TYPE_READ] - 1; - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); + mq_map = set->map[HCTX_TYPE_POLL].mq_map; + + if (WARN_ON(!mq_map)) + goto out; + + /* Search for the least used cpu from the mq_map */ + io_cpu = WORK_CPU_UNBOUND; + for_each_online_cpu(cpu) { + int num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]); + + if (mq_map[cpu] != qid) + continue; + if (num_queues < min_queues) { + io_cpu = cpu; + min_queues = num_queues; + } + } + if (io_cpu != WORK_CPU_UNBOUND) { + queue->io_cpu = io_cpu; + atomic_inc(&nvme_tcp_cpu_queues[io_cpu]); + set_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags); + } +out: + dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n", + qid, queue->io_cpu); +} + +static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) +{ + struct nvme_tcp_queue *queue = data; + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + struct key *tls_key; + + dev_dbg(ctrl->ctrl.device, "queue %d: TLS handshake done, key %x, status %d\n", + qid, pskid, status); + + if (status) { + queue->tls_err = -status; + goto out_complete; + } + + tls_key = nvme_tls_key_lookup(pskid); + if (IS_ERR(tls_key)) { + dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n", + qid, pskid); + queue->tls_err = -ENOKEY; + } else { + queue->tls_enabled = true; + if (qid == 0) + ctrl->ctrl.tls_pskid = key_serial(tls_key); + key_put(tls_key); + queue->tls_err = 0; + } + +out_complete: + complete(&queue->tls_complete); +} + +static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl, + struct nvme_tcp_queue *queue, + key_serial_t pskid) +{ + int qid = nvme_tcp_queue_id(queue); + int ret; + struct tls_handshake_args args; + unsigned long tmo = tls_handshake_timeout * HZ; + key_serial_t 
keyring = nvme_keyring_id(); + + dev_dbg(nctrl->device, "queue %d: start TLS with key %x\n", + qid, pskid); + memset(&args, 0, sizeof(args)); + args.ta_sock = queue->sock; + args.ta_done = nvme_tcp_tls_done; + args.ta_data = queue; + args.ta_my_peerids[0] = pskid; + args.ta_num_peerids = 1; + if (nctrl->opts->keyring) + keyring = key_serial(nctrl->opts->keyring); + args.ta_keyring = keyring; + args.ta_timeout_ms = tls_handshake_timeout * 1000; + queue->tls_err = -EOPNOTSUPP; + init_completion(&queue->tls_complete); + ret = tls_client_hello_psk(&args, GFP_KERNEL); + if (ret) { + dev_err(nctrl->device, "queue %d: failed to start TLS: %d\n", + qid, ret); + return ret; + } + ret = wait_for_completion_interruptible_timeout(&queue->tls_complete, tmo); + if (ret <= 0) { + if (ret == 0) + ret = -ETIMEDOUT; + + dev_err(nctrl->device, + "queue %d: TLS handshake failed, error %d\n", + qid, ret); + tls_handshake_cancel(queue->sock->sk); + } else { + if (queue->tls_err) { + dev_err(nctrl->device, + "queue %d: TLS handshake complete, error %d\n", + qid, queue->tls_err); + } else { + dev_dbg(nctrl->device, + "queue %d: TLS handshake complete\n", qid); + } + ret = queue->tls_err; + } + return ret; } -static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) +static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, + key_serial_t pskid) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; int ret, rcv_pdu_size; + struct file *sock_file; mutex_init(&queue->queue_lock); queue->ctrl = ctrl; @@ -1503,7 +1782,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) queue->cmnd_capsule_len = sizeof(struct nvme_command) + NVME_TCP_ADMIN_CCSZ; - ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM, + ret = sock_create_kern(current->nsproxy->net_ns, + ctrl->addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &queue->sock); if (ret) { dev_err(nctrl->device, @@ -1511,6 +1791,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) goto err_destroy_mutex; } + sock_file = sock_alloc_file(queue->sock, O_CLOEXEC, NULL); + if (IS_ERR(sock_file)) { + ret = PTR_ERR(sock_file); + goto err_destroy_mutex; + } + + sk_net_refcnt_upgrade(queue->sock->sk); nvme_tcp_reclassify_socket(queue->sock); /* Single syn retry */ @@ -1538,7 +1825,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) queue->sock->sk->sk_allocation = GFP_ATOMIC; queue->sock->sk->sk_use_task_frag = false; - nvme_tcp_set_queue_io_cpu(queue); + queue->io_cpu = WORK_CPU_UNBOUND; queue->request = NULL; queue->data_remaining = 0; queue->ddgst_remaining = 0; @@ -1547,7 +1834,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) sk_set_memalloc(queue->sock->sk); if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) { - ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr, + ret = kernel_bind(queue->sock, (struct sockaddr_unsized *)&ctrl->src_addr, sizeof(ctrl->src_addr)); if (ret) { dev_err(nctrl->device, @@ -1573,27 +1860,19 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) queue->hdr_digest = nctrl->opts->hdr_digest; queue->data_digest = nctrl->opts->data_digest; - if (queue->hdr_digest || queue->data_digest) { - ret = nvme_tcp_alloc_crypto(queue); - if (ret) { - dev_err(nctrl->device, - "failed to allocate queue %d crypto\n", qid); - goto err_sock; - } - } rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) + nvme_tcp_hdgst_len(queue); queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL); if (!queue->pdu) { ret = -ENOMEM; - goto 
err_crypto; + goto err_sock; } dev_dbg(nctrl->device, "connecting queue %d\n", nvme_tcp_queue_id(queue)); - ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr, + ret = kernel_connect(queue->sock, (struct sockaddr_unsized *)&ctrl->addr, sizeof(ctrl->addr), 0); if (ret) { dev_err(nctrl->device, @@ -1601,26 +1880,18 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid) goto err_rcv_pdu; } + /* If PSKs are configured try to start TLS */ + if (nvme_tcp_tls_configured(nctrl) && pskid) { + ret = nvme_tcp_start_tls(nctrl, queue, pskid); + if (ret) + goto err_init_connect; + } + ret = nvme_tcp_init_connection(queue); if (ret) goto err_init_connect; - queue->rd_enabled = true; set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags); - nvme_tcp_init_recv_ctx(queue); - - write_lock_bh(&queue->sock->sk->sk_callback_lock); - queue->sock->sk->sk_user_data = queue; - queue->state_change = queue->sock->sk->sk_state_change; - queue->data_ready = queue->sock->sk->sk_data_ready; - queue->write_space = queue->sock->sk->sk_write_space; - queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; - queue->sock->sk->sk_state_change = nvme_tcp_state_change; - queue->sock->sk->sk_write_space = nvme_tcp_write_space; -#ifdef CONFIG_NET_RX_BUSY_POLL - queue->sock->sk->sk_ll_usec = 1; -#endif - write_unlock_bh(&queue->sock->sk->sk_callback_lock); return 0; @@ -1628,11 +1899,9 @@ err_init_connect: kernel_sock_shutdown(queue->sock, SHUT_RDWR); err_rcv_pdu: kfree(queue->pdu); -err_crypto: - if (queue->hdr_digest || queue->data_digest) - nvme_tcp_free_crypto(queue); err_sock: - sock_release(queue->sock); + /* ->sock will be released by fput() */ + fput(queue->sock->file); queue->sock = NULL; err_destroy_mutex: mutex_destroy(&queue->send_mutex); @@ -1640,7 +1909,7 @@ err_destroy_mutex: return ret; } -static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue) +static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue) { struct socket *sock = queue->sock; @@ -1655,11 +1924,11 @@ static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue) static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue) { kernel_sock_shutdown(queue->sock, SHUT_RDWR); - nvme_tcp_restore_sock_calls(queue); + nvme_tcp_restore_sock_ops(queue); cancel_work_sync(&queue->io_work); } -static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) +static void nvme_tcp_stop_queue_nowait(struct nvme_ctrl *nctrl, int qid) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; @@ -1667,27 +1936,79 @@ static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) return; + if (test_and_clear_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags)) + atomic_dec(&nvme_tcp_cpu_queues[queue->io_cpu]); + mutex_lock(&queue->queue_lock); if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) __nvme_tcp_stop_queue(queue); + /* Stopping the queue will disable TLS */ + queue->tls_enabled = false; mutex_unlock(&queue->queue_lock); } +static void nvme_tcp_wait_queue(struct nvme_ctrl *nctrl, int qid) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + int timeout = 100; + + while (timeout > 0) { + if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags) || + !sk_wmem_alloc_get(queue->sock->sk)) + return; + msleep(2); + timeout -= 2; + } + dev_warn(nctrl->device, + "qid %d: timeout draining sock wmem allocation expired\n", + qid); +} + +static void nvme_tcp_stop_queue(struct nvme_ctrl 
*nctrl, int qid) +{ + nvme_tcp_stop_queue_nowait(nctrl, qid); + nvme_tcp_wait_queue(nctrl, qid); +} + + +static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue) +{ + write_lock_bh(&queue->sock->sk->sk_callback_lock); + queue->sock->sk->sk_user_data = queue; + queue->state_change = queue->sock->sk->sk_state_change; + queue->data_ready = queue->sock->sk->sk_data_ready; + queue->write_space = queue->sock->sk->sk_write_space; + queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; + queue->sock->sk->sk_state_change = nvme_tcp_state_change; + queue->sock->sk->sk_write_space = nvme_tcp_write_space; +#ifdef CONFIG_NET_RX_BUSY_POLL + queue->sock->sk->sk_ll_usec = 1; +#endif + write_unlock_bh(&queue->sock->sk->sk_callback_lock); +} + static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[idx]; int ret; - if (idx) + queue->rd_enabled = true; + nvme_tcp_init_recv_ctx(queue); + nvme_tcp_setup_sock_ops(queue); + + if (idx) { + nvme_tcp_set_queue_io_cpu(queue); ret = nvmf_connect_io_queue(nctrl, idx); - else + } else ret = nvmf_connect_admin_queue(nctrl); if (!ret) { - set_bit(NVME_TCP_Q_LIVE, &ctrl->queues[idx].flags); + set_bit(NVME_TCP_Q_LIVE, &queue->flags); } else { - if (test_bit(NVME_TCP_Q_ALLOCATED, &ctrl->queues[idx].flags)) - __nvme_tcp_stop_queue(&ctrl->queues[idx]); + if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) + __nvme_tcp_stop_queue(queue); dev_err(nctrl->device, "failed to connect queue: %d ret=%d\n", idx, ret); } @@ -1718,7 +2039,9 @@ static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl) int i; for (i = 1; i < ctrl->queue_count; i++) - nvme_tcp_stop_queue(ctrl, i); + nvme_tcp_stop_queue_nowait(ctrl, i); + for (i = 1; i < ctrl->queue_count; i++) + nvme_tcp_wait_queue(ctrl, i); } static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl, @@ -1743,8 +2066,23 @@ out_stop_queues: static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) { int ret; + key_serial_t pskid = 0; + + if (nvme_tcp_tls_configured(ctrl)) { + if (ctrl->opts->tls_key) + pskid = key_serial(ctrl->opts->tls_key); + else if (ctrl->opts->tls) { + pskid = nvme_tls_psk_default(ctrl->opts->keyring, + ctrl->opts->host->nqn, + ctrl->opts->subsysnqn); + if (!pskid) { + dev_err(ctrl->device, "no valid PSK found\n"); + return -ENOKEY; + } + } + } - ret = nvme_tcp_alloc_queue(ctrl, 0); + ret = nvme_tcp_alloc_queue(ctrl, 0, pskid); if (ret) return ret; @@ -1763,8 +2101,30 @@ static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { int i, ret; + if (nvme_tcp_tls_configured(ctrl)) { + if (ctrl->opts->concat) { + /* + * The generated PSK is stored in the + * fabric options + */ + if (!ctrl->opts->tls_key) { + dev_err(ctrl->device, "no PSK generated\n"); + return -ENOKEY; + } + if (ctrl->tls_pskid && + ctrl->tls_pskid != key_serial(ctrl->opts->tls_key)) { + dev_err(ctrl->device, "Stale PSK id %08x\n", ctrl->tls_pskid); + ctrl->tls_pskid = 0; + } + } else if (!ctrl->tls_pskid) { + dev_err(ctrl->device, "no PSK negotiated\n"); + return -ENOKEY; + } + } + for (i = 1; i < ctrl->queue_count; i++) { - ret = nvme_tcp_alloc_queue(ctrl, i); + ret = nvme_tcp_alloc_queue(ctrl, i, + ctrl->tls_pskid); if (ret) goto out_free_queues; } @@ -1778,58 +2138,12 @@ out_free_queues: return ret; } -static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl) -{ - unsigned int nr_io_queues; - - nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus()); - nr_io_queues += min(ctrl->opts->nr_write_queues, 
num_online_cpus()); - nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus()); - - return nr_io_queues; -} - -static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl, - unsigned int nr_io_queues) -{ - struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); - struct nvmf_ctrl_options *opts = nctrl->opts; - - if (opts->nr_write_queues && opts->nr_io_queues < nr_io_queues) { - /* - * separate read/write queues - * hand out dedicated default queues only after we have - * sufficient read queues. - */ - ctrl->io_queues[HCTX_TYPE_READ] = opts->nr_io_queues; - nr_io_queues -= ctrl->io_queues[HCTX_TYPE_READ]; - ctrl->io_queues[HCTX_TYPE_DEFAULT] = - min(opts->nr_write_queues, nr_io_queues); - nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; - } else { - /* - * shared read/write queues - * either no write queues were requested, or we don't have - * sufficient queue count to have dedicated default queues. - */ - ctrl->io_queues[HCTX_TYPE_DEFAULT] = - min(opts->nr_io_queues, nr_io_queues); - nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; - } - - if (opts->nr_poll_queues && nr_io_queues) { - /* map dedicated poll queues only if we have queues left */ - ctrl->io_queues[HCTX_TYPE_POLL] = - min(opts->nr_poll_queues, nr_io_queues); - } -} - static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { unsigned int nr_io_queues; int ret; - nr_io_queues = nvme_tcp_nr_io_queues(ctrl); + nr_io_queues = nvmf_nr_io_queues(ctrl->opts); ret = nvme_set_queue_count(ctrl, &nr_io_queues); if (ret) return ret; @@ -1844,19 +2158,11 @@ static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) dev_info(ctrl->device, "creating %d I/O queues.\n", nr_io_queues); - nvme_tcp_set_io_queues(ctrl, nr_io_queues); - + nvmf_set_io_queues(ctrl->opts, nr_io_queues, + to_tcp_ctrl(ctrl)->io_queues); return __nvme_tcp_alloc_io_queues(ctrl); } -static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) -{ - nvme_tcp_stop_io_queues(ctrl); - if (remove) - nvme_remove_io_tag_set(ctrl); - nvme_tcp_free_io_queues(ctrl); -} - static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) { int ret, nr_queues; @@ -1876,7 +2182,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) /* * Only start IO queues for which we have allocated the tagset - * and limitted it to the available queues. On reconnects, the + * and limited it to the available queues. On reconnects, the * queue number might have changed. */ nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count); @@ -1885,6 +2191,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) goto out_cleanup_connect_q; if (!new) { + nvme_start_freeze(ctrl); nvme_unquiesce_io_queues(ctrl); if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) { /* @@ -1893,6 +2200,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) * to be safe. 
*/ ret = -ENODEV; + nvme_unfreeze(ctrl); goto out_wait_freeze_timed_out; } blk_mq_update_nr_hw_queues(ctrl->tagset, @@ -1924,14 +2232,6 @@ out_free_io_queues: return ret; } -static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove) -{ - nvme_tcp_stop_queue(ctrl, 0); - if (remove) - nvme_remove_admin_tag_set(ctrl); - nvme_tcp_free_admin_queue(ctrl); -} - static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) { int error; @@ -1953,6 +2253,9 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (error) goto out_cleanup_tagset; + if (ctrl->opts->concat && !ctrl->tls_pskid) + return 0; + error = nvme_enable_ctrl(ctrl); if (error) goto out_stop_queue; @@ -1986,9 +2289,16 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); nvme_cancel_admin_tagset(ctrl); - if (remove) + if (remove) { nvme_unquiesce_admin_queue(ctrl); - nvme_tcp_destroy_admin_queue(ctrl, remove); + nvme_remove_admin_tag_set(ctrl); + } + nvme_tcp_free_admin_queue(ctrl); + if (ctrl->tls_pskid) { + dev_dbg(ctrl->device, "Wipe negotiated TLS_PSK %08x\n", + ctrl->tls_pskid); + ctrl->tls_pskid = 0; + } } static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, @@ -1996,37 +2306,61 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, { if (ctrl->queue_count <= 1) return; - nvme_quiesce_admin_queue(ctrl); - nvme_start_freeze(ctrl); nvme_quiesce_io_queues(ctrl); nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); - if (remove) + if (remove) { nvme_unquiesce_io_queues(ctrl); - nvme_tcp_destroy_io_queues(ctrl, remove); + nvme_remove_io_tag_set(ctrl); + } + nvme_tcp_free_io_queues(ctrl); } -static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) +static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl, + int status) { + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); + /* If we are resetting/deleting then do nothing */ - if (ctrl->state != NVME_CTRL_CONNECTING) { - WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW || - ctrl->state == NVME_CTRL_LIVE); + if (state != NVME_CTRL_CONNECTING) { + WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE); return; } - if (nvmf_should_reconnect(ctrl)) { + if (nvmf_should_reconnect(ctrl, status)) { dev_info(ctrl->device, "Reconnecting in %d seconds...\n", ctrl->opts->reconnect_delay); queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work, ctrl->opts->reconnect_delay * HZ); } else { - dev_info(ctrl->device, "Removing controller...\n"); + dev_info(ctrl->device, "Removing controller (%d)...\n", + status); nvme_delete_ctrl(ctrl); } } +/* + * The TLS key is set by secure concatenation after negotiation has been + * completed on the admin queue. We need to revoke the key when: + * - concatenation is enabled (otherwise it's a static key set by the user) + * and + * - the generated key is present in ctrl->tls_key (otherwise there's nothing + * to revoke) + * and + * - a valid PSK key ID has been set in ctrl->tls_pskid (otherwise TLS + * negotiation has not run). + * + * We cannot always revoke the key as nvme_tcp_alloc_admin_queue() is called + * twice during secure concatenation, once on a 'normal' connection to run the + * DH-HMAC-CHAP negotiation (which generates the key, so it _must not_ be set), + * and once after the negotiation (which uses the key, so it _must_ be set). 
+ */ +static bool nvme_tcp_key_revoke_needed(struct nvme_ctrl *ctrl) +{ + return ctrl->opts->concat && ctrl->opts->tls_key && ctrl->tls_pskid; +} + static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) { struct nvmf_ctrl_options *opts = ctrl->opts; @@ -2036,6 +2370,16 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) if (ret) return ret; + if (ctrl->opts->concat && !ctrl->tls_pskid) { + /* See comments for nvme_tcp_key_revoke_needed() */ + dev_dbg(ctrl->device, "restart admin queue for secure concatenation\n"); + nvme_stop_keep_alive(ctrl); + nvme_tcp_teardown_admin_queue(ctrl, false); + ret = nvme_tcp_configure_admin_queue(ctrl, false); + if (ret) + goto destroy_admin; + } + if (ctrl->icdoff) { ret = -EOPNOTSUPP; dev_err(ctrl->device, "icdoff is not supported!\n"); @@ -2072,8 +2416,10 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) * unless we're during creation of a new controller to * avoid races with teardown flow. */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && - ctrl->state != NVME_CTRL_DELETING_NOIO); + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); + + WARN_ON_ONCE(state != NVME_CTRL_DELETING && + state != NVME_CTRL_DELETING_NOIO); WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; @@ -2088,14 +2434,13 @@ destroy_io: nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); nvme_cancel_tagset(ctrl); - nvme_tcp_destroy_io_queues(ctrl, new); + if (new) + nvme_remove_io_tag_set(ctrl); + nvme_tcp_free_io_queues(ctrl); } destroy_admin: - nvme_quiesce_admin_queue(ctrl); - blk_sync_queue(ctrl->admin_q); - nvme_tcp_stop_queue(ctrl, 0); - nvme_cancel_admin_tagset(ctrl); - nvme_tcp_destroy_admin_queue(ctrl, new); + nvme_stop_keep_alive(ctrl); + nvme_tcp_teardown_admin_queue(ctrl, new); return ret; } @@ -2104,23 +2449,25 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work) struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work), struct nvme_tcp_ctrl, connect_work); struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; + int ret; ++ctrl->nr_reconnects; - if (nvme_tcp_setup_ctrl(ctrl, false)) + ret = nvme_tcp_setup_ctrl(ctrl, false); + if (ret) goto requeue; - dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n", - ctrl->nr_reconnects); + dev_info(ctrl->device, "Successfully reconnected (attempt %d/%d)\n", + ctrl->nr_reconnects, ctrl->opts->max_reconnects); ctrl->nr_reconnects = 0; return; requeue: - dev_info(ctrl->device, "Failed reconnect attempt %d\n", - ctrl->nr_reconnects); - nvme_tcp_reconnect_or_remove(ctrl); + dev_info(ctrl->device, "Failed reconnect attempt %d/%d\n", + ctrl->nr_reconnects, ctrl->opts->max_reconnects); + nvme_tcp_reconnect_or_remove(ctrl, ret); } static void nvme_tcp_error_recovery_work(struct work_struct *work) @@ -2129,6 +2476,8 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) struct nvme_tcp_ctrl, err_work); struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; + if (nvme_tcp_key_revoke_needed(ctrl)) + nvme_auth_revoke_tls_key(ctrl); nvme_stop_keep_alive(ctrl); flush_work(&ctrl->async_event_work); nvme_tcp_teardown_io_queues(ctrl, false); @@ -2140,12 +2489,14 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && - ctrl->state != NVME_CTRL_DELETING_NOIO); + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); + + WARN_ON_ONCE(state != NVME_CTRL_DELETING && + state != 
NVME_CTRL_DELETING_NOIO); return; } - nvme_tcp_reconnect_or_remove(ctrl); + nvme_tcp_reconnect_or_remove(ctrl, 0); } static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) @@ -2165,25 +2516,31 @@ static void nvme_reset_ctrl_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, reset_work); + int ret; + if (nvme_tcp_key_revoke_needed(ctrl)) + nvme_auth_revoke_tls_key(ctrl); nvme_stop_ctrl(ctrl); nvme_tcp_teardown_ctrl(ctrl, false); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we started ctrl delete */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING && - ctrl->state != NVME_CTRL_DELETING_NOIO); + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); + + WARN_ON_ONCE(state != NVME_CTRL_DELETING && + state != NVME_CTRL_DELETING_NOIO); return; } - if (nvme_tcp_setup_ctrl(ctrl, false)) + ret = nvme_tcp_setup_ctrl(ctrl, false); + if (ret) goto out_fail; return; out_fail: ++ctrl->nr_reconnects; - nvme_tcp_reconnect_or_remove(ctrl); + nvme_tcp_reconnect_or_remove(ctrl, ret); } static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl) @@ -2264,8 +2621,10 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) ctrl->async_req.offset = 0; ctrl->async_req.curr_bio = NULL; ctrl->async_req.data_len = 0; + init_llist_node(&ctrl->async_req.lentry); + INIT_LIST_HEAD(&ctrl->async_req.entry); - nvme_tcp_queue_request(&ctrl->async_req, true, true); + nvme_tcp_queue_request(&ctrl->async_req, true); } static void nvme_tcp_complete_timed_out(struct request *rq) @@ -2281,13 +2640,16 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); + struct nvme_command *cmd = &pdu->cmd; + int qid = nvme_tcp_queue_id(req->queue); dev_warn(ctrl->device, - "queue %d: timeout request %#x type %d\n", - nvme_tcp_queue_id(req->queue), rq->tag, pdu->hdr.type); + "I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n", + rq->tag, nvme_cid(rq), pdu->hdr.type, cmd->common.opcode, + nvme_fabrics_opcode_str(qid, cmd), qid); - if (ctrl->state != NVME_CTRL_LIVE) { + if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) { /* * If we are resetting, connecting or deleting we should * complete immediately because we may block controller @@ -2317,7 +2679,7 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue, struct request *rq) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); struct nvme_command *c = &pdu->cmd; c->common.flags |= NVME_CMD_SGL_METABUF; @@ -2337,7 +2699,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, struct request *rq) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); struct nvme_tcp_queue *queue = req->queue; u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0; blk_status_t ret; @@ -2414,7 +2776,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, nvme_start_request(rq); - nvme_tcp_queue_request(req, true, bd->last); + nvme_tcp_queue_request(req, bd->last); return BLK_STS_OK; } @@ -2422,50 +2784,15 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, static void nvme_tcp_map_queues(struct blk_mq_tag_set *set) { struct 
nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data); - struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; - - if (opts->nr_write_queues && ctrl->io_queues[HCTX_TYPE_READ]) { - /* separate read/write queues */ - set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; - set->map[HCTX_TYPE_READ].nr_queues = - ctrl->io_queues[HCTX_TYPE_READ]; - set->map[HCTX_TYPE_READ].queue_offset = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - } else { - /* shared read/write queues */ - set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; - set->map[HCTX_TYPE_READ].nr_queues = - ctrl->io_queues[HCTX_TYPE_DEFAULT]; - set->map[HCTX_TYPE_READ].queue_offset = 0; - } - blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); - blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); - - if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) { - /* map dedicated poll queues only if we have queues left */ - set->map[HCTX_TYPE_POLL].nr_queues = - ctrl->io_queues[HCTX_TYPE_POLL]; - set->map[HCTX_TYPE_POLL].queue_offset = - ctrl->io_queues[HCTX_TYPE_DEFAULT] + - ctrl->io_queues[HCTX_TYPE_READ]; - blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); - } - - dev_info(ctrl->ctrl.device, - "mapped %d/%d/%d default/read/poll queues.\n", - ctrl->io_queues[HCTX_TYPE_DEFAULT], - ctrl->io_queues[HCTX_TYPE_READ], - ctrl->io_queues[HCTX_TYPE_POLL]); + + nvmf_map_queues(set, &ctrl->ctrl, ctrl->io_queues); } static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) { struct nvme_tcp_queue *queue = hctx->driver_data; struct sock *sk = queue->sock->sk; + int ret; if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags)) return 0; @@ -2473,9 +2800,9 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) set_bit(NVME_TCP_Q_POLLING, &queue->flags); if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue)) sk_busy_loop(sk, true); - nvme_tcp_try_recv(queue); + ret = nvme_tcp_try_recv(queue); clear_bit(NVME_TCP_Q_POLLING, &queue->flags); - return queue->nr_cqe; + return ret < 0 ? ret : queue->nr_cqe; } static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size) @@ -2486,6 +2813,11 @@ static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size) len = nvmf_get_address(ctrl, buf, size); + if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags)) + return len; + + mutex_lock(&queue->queue_lock); + ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr); if (ret > 0) { if (len > 0) @@ -2494,6 +2826,8 @@ static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size) (len) ? 
"," : "", &src_addr); } + mutex_unlock(&queue->queue_lock); + return len; } @@ -2525,11 +2859,13 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = { .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, + .subsystem_reset = nvmf_subsystem_reset, .free_ctrl = nvme_tcp_free_ctrl, .submit_async_event = nvme_tcp_submit_async_event, .delete_ctrl = nvme_tcp_delete_ctrl, .get_address = nvme_tcp_get_address, .stop_ctrl = nvme_tcp_stop_ctrl, + .get_virt_boundary = nvmf_get_virt_boundary, }; static bool @@ -2549,7 +2885,7 @@ nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts) return found; } -static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, +static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { struct nvme_tcp_ctrl *ctrl; @@ -2624,6 +2960,28 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, if (ret) goto out_kfree_queues; + return ctrl; +out_kfree_queues: + kfree(ctrl->queues); +out_free_ctrl: + kfree(ctrl); + return ERR_PTR(ret); +} + +static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_tcp_ctrl *ctrl; + int ret; + + ctrl = nvme_tcp_alloc_ctrl(dev, opts); + if (IS_ERR(ctrl)) + return ERR_CAST(ctrl); + + ret = nvme_add_ctrl(&ctrl->ctrl); + if (ret) + goto out_put_ctrl; + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { WARN_ON_ONCE(1); ret = -EINTR; @@ -2634,8 +2992,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, if (ret) goto out_uninit_ctrl; - dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", - nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr); + dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp, hostnqn: %s\n", + nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr, opts->host->nqn); mutex_lock(&nvme_tcp_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list); @@ -2645,15 +3003,11 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); +out_put_ctrl: nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) ret = -EIO; return ERR_PTR(ret); -out_kfree_queues: - kfree(ctrl->queues); -out_free_ctrl: - kfree(ctrl); - return ERR_PTR(ret); } static struct nvmf_transport_ops nvme_tcp_transport = { @@ -2664,17 +3018,35 @@ static struct nvmf_transport_ops nvme_tcp_transport = { NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | - NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE, + NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS | + NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY | NVMF_OPT_CONCAT, .create_ctrl = nvme_tcp_create_ctrl, }; static int __init nvme_tcp_init_module(void) { - nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", - WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS; + int cpu; + + BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8); + BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72); + BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24); + BUILD_BUG_ON(sizeof(struct nvme_tcp_rsp_pdu) != 24); + BUILD_BUG_ON(sizeof(struct nvme_tcp_r2t_pdu) != 24); + BUILD_BUG_ON(sizeof(struct nvme_tcp_icreq_pdu) != 128); + BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128); + BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24); + + if (wq_unbound) + wq_flags |= WQ_UNBOUND; + + nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0); if (!nvme_tcp_wq) return -ENOMEM; + 
for_each_possible_cpu(cpu) + atomic_set(&nvme_tcp_cpu_queues[cpu], 0); + nvmf_register_transport(&nvme_tcp_transport); return 0; } @@ -2697,4 +3069,5 @@ static void __exit nvme_tcp_cleanup_module(void) module_init(nvme_tcp_init_module); module_exit(nvme_tcp_cleanup_module); +MODULE_DESCRIPTION("NVMe host TCP transport driver"); MODULE_LICENSE("GPL v2"); |
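
One of the larger conversions in this diff drops the crypto_ahash("crc32c") transform in favour of direct crc32c() calls: each queue seeds snd_crc/rcv_crc with NVME_TCP_CRC_SEED (~0), updates the running CRC as payload is sent or received, and nvme_tcp_hdgst()/nvme_tcp_ddgst_final() invert the result before storing it as __le32. A minimal user-space sketch of that same digest arithmetic, for illustration only — the crc32c_update() helper below is a stand-in written here, not a kernel API:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Reflected CRC-32C (Castagnoli), bitwise, polynomial 0x82F63B78. */
static uint32_t crc32c_update(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc & 1) ? (crc >> 1) ^ 0x82F63B78 : crc >> 1;
	}
	return crc;
}

int main(void)
{
	/* Same pattern as the driver: seed with ~0, invert at the end. */
	uint32_t crc = ~0u;			/* NVME_TCP_CRC_SEED */
	const char pdu[] = "123456789";		/* stand-in payload */

	crc = crc32c_update(crc, pdu, sizeof(pdu) - 1);
	/* The driver stores this as cpu_to_le32(~crc). */
	printf("digest: 0x%08x\n", (unsigned)~crc);	/* 0xe3069283 for "123456789" */
	return 0;
}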
