Diffstat (limited to 'drivers/nvme/host/tcp.c')
| -rw-r--r-- | drivers/nvme/host/tcp.c | 1879 |
1 file changed, 1339 insertions, 540 deletions
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 265a0543b381..69cb04406b47 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -8,17 +8,91 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/err.h> +#include <linux/crc32.h> #include <linux/nvme-tcp.h> +#include <linux/nvme-keyring.h> #include <net/sock.h> #include <net/tcp.h> +#include <net/tls.h> +#include <net/tls_prot.h> +#include <net/handshake.h> #include <linux/blk-mq.h> -#include <crypto/hash.h> +#include <net/busy_poll.h> +#include <trace/events/sock.h> #include "nvme.h" #include "fabrics.h" struct nvme_tcp_queue; +/* Define the socket priority to use for connections were it is desirable + * that the NIC consider performing optimized packet processing or filtering. + * A non-zero value being sufficient to indicate general consideration of any + * possible optimization. Making it a module param allows for alternative + * values that may be unique for some NIC implementations. + */ +static int so_priority; +module_param(so_priority, int, 0644); +MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority"); + +/* + * Use the unbound workqueue for nvme_tcp_wq, then we can set the cpu affinity + * from sysfs. + */ +static bool wq_unbound; +module_param(wq_unbound, bool, 0644); +MODULE_PARM_DESC(wq_unbound, "Use unbound workqueue for nvme-tcp IO context (default false)"); + +/* + * TLS handshake timeout + */ +static int tls_handshake_timeout = 10; +#ifdef CONFIG_NVME_TCP_TLS +module_param(tls_handshake_timeout, int, 0644); +MODULE_PARM_DESC(tls_handshake_timeout, + "nvme TLS handshake timeout in seconds (default 10)"); +#endif + +static atomic_t nvme_tcp_cpu_queues[NR_CPUS]; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +/* lockdep can detect a circular dependency of the form + * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock + * because dependencies are tracked for both nvme-tcp and user contexts. Using + * a separate class prevents lockdep from conflating nvme-tcp socket use with + * user-space socket API use. 
+ */ +static struct lock_class_key nvme_tcp_sk_key[2]; +static struct lock_class_key nvme_tcp_slock_key[2]; + +static void nvme_tcp_reclassify_socket(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (WARN_ON_ONCE(!sock_allow_reclassification(sk))) + return; + + switch (sk->sk_family) { + case AF_INET: + sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME", + &nvme_tcp_slock_key[0], + "sk_lock-AF_INET-NVME", + &nvme_tcp_sk_key[0]); + break; + case AF_INET6: + sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME", + &nvme_tcp_slock_key[1], + "sk_lock-AF_INET6-NVME", + &nvme_tcp_sk_key[1]); + break; + default: + WARN_ON_ONCE(1); + } +} +#else +static void nvme_tcp_reclassify_socket(struct socket *sock) { } +#endif + enum nvme_tcp_send_state { NVME_TCP_SEND_CMD_PDU = 0, NVME_TCP_SEND_H2C_PDU, @@ -33,8 +107,12 @@ struct nvme_tcp_request { u32 data_len; u32 pdu_len; u32 pdu_sent; + u32 h2cdata_left; + u32 h2cdata_offset; u16 ttag; + __le16 status; struct list_head entry; + struct llist_node lentry; __le32 ddgst; struct bio *curr_bio; @@ -49,6 +127,8 @@ struct nvme_tcp_request { enum nvme_tcp_queue_flags { NVME_TCP_Q_ALLOCATED = 0, NVME_TCP_Q_LIVE = 1, + NVME_TCP_Q_POLLING = 2, + NVME_TCP_Q_IO_CPU_SET = 3, }; enum nvme_tcp_recv_state { @@ -63,7 +143,9 @@ struct nvme_tcp_queue { struct work_struct io_work; int io_cpu; - spinlock_t lock; + struct mutex queue_lock; + struct mutex send_mutex; + struct llist_head req_list; struct list_head send_list; /* recv state */ @@ -72,11 +154,12 @@ struct nvme_tcp_queue { int pdu_offset; size_t data_remaining; size_t ddgst_remaining; + unsigned int nr_cqe; /* send state */ struct nvme_tcp_request *request; - int queue_size; + u32 maxh2cdata; size_t cmnd_capsule_len; struct nvme_tcp_ctrl *ctrl; unsigned long flags; @@ -84,11 +167,13 @@ struct nvme_tcp_queue { bool hdr_digest; bool data_digest; - struct ahash_request *rcv_hash; - struct ahash_request *snd_hash; + bool tls_enabled; + u32 rcv_crc; + u32 snd_crc; __le32 exp_ddgst; __le32 recv_ddgst; - + struct completion tls_complete; + int tls_err; struct page_frag_cache pf_cache; void (*state_change)(struct sock *); @@ -111,13 +196,15 @@ struct nvme_tcp_ctrl { struct work_struct err_work; struct delayed_work connect_work; struct nvme_tcp_request async_req; + u32 io_queues[HCTX_MAX_TYPES]; }; static LIST_HEAD(nvme_tcp_ctrl_list); static DEFINE_MUTEX(nvme_tcp_ctrl_mutex); static struct workqueue_struct *nvme_tcp_wq; -static struct blk_mq_ops nvme_tcp_mq_ops; -static struct blk_mq_ops nvme_tcp_admin_mq_ops; +static const struct blk_mq_ops nvme_tcp_mq_ops; +static const struct blk_mq_ops nvme_tcp_admin_mq_ops; +static int nvme_tcp_try_send(struct nvme_tcp_queue *queue); static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl) { @@ -129,6 +216,41 @@ static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue) return queue - queue->ctrl->queues; } +static inline bool nvme_tcp_recv_pdu_supported(enum nvme_tcp_pdu_type type) +{ + switch (type) { + case nvme_tcp_c2h_term: + case nvme_tcp_c2h_data: + case nvme_tcp_r2t: + case nvme_tcp_rsp: + return true; + default: + return false; + } +} + +/* + * Check if the queue is TLS encrypted + */ +static inline bool nvme_tcp_queue_tls(struct nvme_tcp_queue *queue) +{ + if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) + return 0; + + return queue->tls_enabled; +} + +/* + * Check if TLS is configured for the controller. 
+ */ +static inline bool nvme_tcp_tls_configured(struct nvme_ctrl *ctrl) +{ + if (!IS_ENABLED(CONFIG_NVME_TCP_TLS)) + return 0; + + return ctrl->opts->tls || ctrl->opts->concat; +} + static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue) { u32 queue_idx = nvme_tcp_queue_id(queue); @@ -148,9 +270,23 @@ static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue) return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; } -static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_queue *queue) +static inline void *nvme_tcp_req_cmd_pdu(struct nvme_tcp_request *req) { - return queue->cmnd_capsule_len - sizeof(struct nvme_command); + return req->pdu; +} + +static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req) +{ + /* use the pdu space in the back for the data pdu */ + return req->pdu + sizeof(struct nvme_tcp_cmd_pdu) - + sizeof(struct nvme_tcp_data_pdu); +} + +static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req) +{ + if (nvme_is_fabrics(req->req.cmd)) + return NVME_TCP_ADMIN_CCSZ; + return req->queue->cmnd_capsule_len - sizeof(struct nvme_command); } static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req) @@ -161,16 +297,14 @@ static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req) static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req) { struct request *rq; - unsigned int bytes; if (unlikely(nvme_tcp_async_req(req))) return false; /* async events don't have a request */ rq = blk_mq_rq_from_pdu(req); - bytes = blk_rq_payload_bytes(rq); - return rq_data_dir(rq) == WRITE && bytes && - bytes <= nvme_tcp_inline_data_size(req->queue); + return rq_data_dir(rq) == WRITE && req->data_len && + req->data_len <= nvme_tcp_inline_data_size(req); } static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req) @@ -185,15 +319,10 @@ static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req) static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req) { - return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset, + return min_t(size_t, iov_iter_single_seg_count(&req->iter), req->pdu_len - req->pdu_sent); } -static inline size_t nvme_tcp_req_offset(struct nvme_tcp_request *req) -{ - return req->iter.iov_offset; -} - static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req) { return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ? 
@@ -212,24 +341,29 @@ static void nvme_tcp_init_iter(struct nvme_tcp_request *req, struct request *rq = blk_mq_rq_from_pdu(req); struct bio_vec *vec; unsigned int size; - int nsegs; + int nr_bvec; size_t offset; if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) { vec = &rq->special_vec; - nsegs = 1; + nr_bvec = 1; size = blk_rq_payload_bytes(rq); offset = 0; } else { struct bio *bio = req->curr_bio; + struct bvec_iter bi; + struct bio_vec bv; vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); - nsegs = bio_segments(bio); + nr_bvec = 0; + bio_for_each_bvec(bv, bio, bi) { + nr_bvec++; + } size = bio->bi_iter.bi_size; offset = bio->bi_iter.bi_bvec_done; } - iov_iter_bvec(&req->iter, dir, vec, nsegs, size); + iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size); req->iter.iov_offset = offset; } @@ -242,19 +376,65 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req, if (!iov_iter_count(&req->iter) && req->data_sent < req->data_len) { req->curr_bio = req->curr_bio->bi_next; - nvme_tcp_init_iter(req, WRITE); + nvme_tcp_init_iter(req, ITER_SOURCE); } } -static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req) +static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue) +{ + int ret; + + /* drain the send queue as much as we can... */ + do { + ret = nvme_tcp_try_send(queue); + } while (ret > 0); +} + +static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue *queue) +{ + return !list_empty(&queue->send_list) || + !llist_empty(&queue->req_list); +} + +static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue) +{ + return !nvme_tcp_queue_tls(queue) && + nvme_tcp_queue_has_pending(queue); +} + +static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, + bool last) { struct nvme_tcp_queue *queue = req->queue; + bool empty; - spin_lock(&queue->lock); - list_add_tail(&req->entry, &queue->send_list); - spin_unlock(&queue->lock); + empty = llist_add(&req->lentry, &queue->req_list) && + list_empty(&queue->send_list) && !queue->request; - queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); + /* + * if we're the first on the send_list and we can try to send + * directly, otherwise queue io_work. Also, only do that if we + * are on the same cpu, so we don't introduce contention. 
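For illustration, the queueing scheme the comment above describes splits submission from transmission: submitters push onto the lock-free req_list from any CPU, while only the io context splices those entries onto the ordered send_list, optionally transmitting directly when it wins send_mutex. A minimal sketch of that split follows; the demo_* names are stand-ins for the driver's types, and the io_cpu check and io_work kick are omitted.

#include <linux/list.h>
#include <linux/llist.h>
#include <linux/mutex.h>

struct demo_req {
        struct llist_node lentry;       /* producer side: lock-free */
        struct list_head entry;         /* consumer side: io context only */
};

struct demo_queue {
        struct llist_head req_list;     /* filled by submitters on any CPU */
        struct list_head send_list;     /* drained only under send_mutex */
        struct mutex send_mutex;
};

/* submission context: O(1), never contends with other submitters */
static void demo_queue_req(struct demo_queue *q, struct demo_req *req)
{
        bool was_empty = llist_add(&req->lentry, &q->req_list);

        /* opportunistic direct send if nothing else is in flight */
        if (was_empty && mutex_trylock(&q->send_mutex)) {
                /* ... transmit pending requests ... */
                mutex_unlock(&q->send_mutex);
        }
}

/* io context: llist_del_all() returns LIFO order, list_add() re-reverses it */
static void demo_process_req_list(struct demo_queue *q)
{
        struct llist_node *node;

        for (node = llist_del_all(&q->req_list); node; node = node->next)
                list_add(&llist_entry(node, struct demo_req, lentry)->entry,
                         &q->send_list);
}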
+ */ + if (queue->io_cpu == raw_smp_processor_id() && + empty && mutex_trylock(&queue->send_mutex)) { + nvme_tcp_send_all(queue); + mutex_unlock(&queue->send_mutex); + } + + if (last && nvme_tcp_queue_has_pending(queue)) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + +static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_request *req; + struct llist_node *node; + + for (node = llist_del_all(&queue->req_list); node; node = node->next) { + req = llist_entry(node, struct nvme_tcp_request, lentry); + list_add(&req->entry, &queue->send_list); + } } static inline struct nvme_tcp_request * @@ -262,42 +442,53 @@ nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) { struct nvme_tcp_request *req; - spin_lock(&queue->lock); req = list_first_entry_or_null(&queue->send_list, struct nvme_tcp_request, entry); - if (req) - list_del(&req->entry); - spin_unlock(&queue->lock); + if (!req) { + nvme_tcp_process_req_list(queue); + req = list_first_entry_or_null(&queue->send_list, + struct nvme_tcp_request, entry); + if (unlikely(!req)) + return NULL; + } + list_del_init(&req->entry); + init_llist_node(&req->lentry); return req; } -static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, - __le32 *dgst) +#define NVME_TCP_CRC_SEED (~0) + +static inline void nvme_tcp_ddgst_update(u32 *crcp, + struct page *page, size_t off, size_t len) { - ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0); - crypto_ahash_final(hash); + page += off / PAGE_SIZE; + off %= PAGE_SIZE; + while (len) { + const void *vaddr = kmap_local_page(page); + size_t n = min(len, (size_t)PAGE_SIZE - off); + + *crcp = crc32c(*crcp, vaddr + off, n); + kunmap_local(vaddr); + page++; + off = 0; + len -= n; + } } -static inline void nvme_tcp_ddgst_update(struct ahash_request *hash, - struct page *page, off_t off, size_t len) +static inline __le32 nvme_tcp_ddgst_final(u32 crc) { - struct scatterlist sg; - - sg_init_marker(&sg, 1); - sg_set_page(&sg, page, len, off); - ahash_request_set_crypt(hash, &sg, NULL, len); - crypto_ahash_update(hash); + return cpu_to_le32(~crc); } -static inline void nvme_tcp_hdgst(struct ahash_request *hash, - void *pdu, size_t len) +static inline __le32 nvme_tcp_hdgst(const void *pdu, size_t len) { - struct scatterlist sg; + return cpu_to_le32(~crc32c(NVME_TCP_CRC_SEED, pdu, len)); +} - sg_init_one(&sg, pdu, len); - ahash_request_set_crypt(hash, &sg, pdu + len, len); - crypto_ahash_digest(hash); +static inline void nvme_tcp_set_hdgst(void *pdu, size_t len) +{ + *(__le32 *)(pdu + len) = nvme_tcp_hdgst(pdu, len); } static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, @@ -315,8 +506,7 @@ static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, } recv_digest = *(__le32 *)(pdu + hdr->hlen); - nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len); - exp_digest = *(__le32 *)(pdu + hdr->hlen); + exp_digest = nvme_tcp_hdgst(pdu, pdu_len); if (recv_digest != exp_digest) { dev_err(queue->ctrl->ctrl.device, "header digest error: recv %#x expected %#x\n", @@ -342,7 +532,7 @@ static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu) nvme_tcp_queue_id(queue)); return -EPROTO; } - crypto_ahash_init(queue->rcv_hash); + queue->rcv_crc = NVME_TCP_CRC_SEED; return 0; } @@ -359,8 +549,9 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, unsigned int numa_node) { - struct nvme_tcp_ctrl *ctrl = set->driver_data; + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data); struct nvme_tcp_request *req = 
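The digest helpers above replace the crypto_ahash("crc32c") requests with plain crc32c() calls: a running CRC is seeded with ~0, updated over each contiguous chunk (payload pages are mapped one at a time with kmap_local_page()), and finalized by complementing the result before it goes on the wire. A small self-contained sketch of the header-digest arithmetic, assuming the PDU header sits in ordinary contiguous memory (the my_* names are illustrative):

#include <linux/crc32.h>
#include <linux/types.h>

#define MY_CRC_SEED     (~0U)

/* NVMe/TCP header digest: CRC32C over the header, complemented, little endian */
static __le32 my_hdgst(const void *pdu, size_t hlen)
{
        return cpu_to_le32(~crc32c(MY_CRC_SEED, pdu, hlen));
}

/* the 4-byte digest is transmitted immediately after the header */
static bool my_hdgst_ok(const void *pdu, size_t hlen)
{
        __le32 recv_digest = *(const __le32 *)(pdu + hlen);

        return recv_digest == my_hdgst(pdu, hlen);
}

The data digest works the same way, except that the CRC is accumulated incrementally across send/receive calls (snd_crc/rcv_crc in the queue) and only complemented once the payload is complete.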
blk_mq_rq_to_pdu(rq); + struct nvme_tcp_cmd_pdu *pdu; int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0; struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx]; u8 hdgst = nvme_tcp_hdgst_len(queue); @@ -371,8 +562,12 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set, if (!req->pdu) return -ENOMEM; + pdu = req->pdu; req->queue = queue; nvme_req(rq)->ctrl = &ctrl->ctrl; + nvme_req(rq)->cmd = &pdu->cmd; + init_llist_node(&req->lentry); + INIT_LIST_HEAD(&req->entry); return 0; } @@ -380,7 +575,7 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set, static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_tcp_ctrl *ctrl = data; + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data); struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1]; hctx->driver_data = queue; @@ -390,7 +585,7 @@ static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { - struct nvme_tcp_ctrl *ctrl = data; + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(data); struct nvme_tcp_queue *queue = &ctrl->queues[0]; hctx->driver_data = queue; @@ -419,24 +614,32 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl) if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) return; - queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work); + dev_warn(ctrl->device, "starting error recovery\n"); + queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work); } static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, struct nvme_completion *cqe) { + struct nvme_tcp_request *req; struct request *rq; - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), cqe->command_id); + rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "queue %d tag 0x%x not found\n", - nvme_tcp_queue_id(queue), cqe->command_id); + "got bad cqe.command_id %#x on queue %d\n", + cqe->command_id, nvme_tcp_queue_id(queue)); nvme_tcp_error_recovery(&queue->ctrl->ctrl); return -EINVAL; } - nvme_end_request(rq, cqe->status, cqe->result); + req = blk_mq_rq_to_pdu(rq); + if (req->status == cpu_to_le16(NVME_SC_SUCCESS)) + req->status = cqe->status; + + if (!nvme_try_complete_req(rq, req->status, cqe->result)) + nvme_complete_rq(rq); + queue->nr_cqe++; return 0; } @@ -446,11 +649,11 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue, { struct request *rq; - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); + "got bad c2hdata.command_id %#x on queue %d\n", + pdu->command_id, nvme_tcp_queue_id(queue)); return -ENOENT; } @@ -463,8 +666,16 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue, queue->data_remaining = le32_to_cpu(pdu->data_length); - return 0; + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS && + unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) { + dev_err(queue->ctrl->ctrl.device, + "queue %d tag %#x SUCCESS set but not last PDU\n", + nvme_tcp_queue_id(queue), rq->tag); + nvme_tcp_error_recovery(&queue->ctrl->ctrl); + return -EPROTO; + } + return 0; } static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, @@ -479,8 +690,8 @@ static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, * aborts. We don't even bother to allocate a struct request * for them but rather special case them here. 
*/ - if (unlikely(nvme_tcp_queue_id(queue) == 0 && - cqe->command_id >= NVME_AQ_BLK_MQ_DEPTH)) + if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue), + cqe->command_id))) nvme_complete_async_event(&queue->ctrl->ctrl, cqe->status, &cqe->result); else @@ -489,37 +700,26 @@ static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, return ret; } -static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req, - struct nvme_tcp_r2t_pdu *pdu) +static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req) { - struct nvme_tcp_data_pdu *data = req->pdu; + struct nvme_tcp_data_pdu *data = nvme_tcp_req_data_pdu(req); struct nvme_tcp_queue *queue = req->queue; struct request *rq = blk_mq_rq_from_pdu(req); + u32 h2cdata_sent = req->pdu_len; u8 hdgst = nvme_tcp_hdgst_len(queue); u8 ddgst = nvme_tcp_ddgst_len(queue); - req->pdu_len = le32_to_cpu(pdu->r2t_length); + req->state = NVME_TCP_SEND_H2C_PDU; + req->offset = 0; + req->pdu_len = min(req->h2cdata_left, queue->maxh2cdata); req->pdu_sent = 0; - - if (unlikely(req->data_sent + req->pdu_len > req->data_len)) { - dev_err(queue->ctrl->ctrl.device, - "req %d r2t len %u exceeded data len %u (%zu sent)\n", - rq->tag, req->pdu_len, req->data_len, - req->data_sent); - return -EPROTO; - } - - if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) { - dev_err(queue->ctrl->ctrl.device, - "req %d unexpected r2t offset %u (expected %zu)\n", - rq->tag, le32_to_cpu(pdu->r2t_offset), - req->data_sent); - return -EPROTO; - } + req->h2cdata_left -= req->pdu_len; + req->h2cdata_offset += h2cdata_sent; memset(data, 0, sizeof(*data)); data->hdr.type = nvme_tcp_h2c_data; - data->hdr.flags = NVME_TCP_F_DATA_LAST; + if (!req->h2cdata_left) + data->hdr.flags = NVME_TCP_F_DATA_LAST; if (queue->hdr_digest) data->hdr.flags |= NVME_TCP_F_HDGST; if (queue->data_digest) @@ -528,11 +728,10 @@ static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req, data->hdr.pdo = data->hdr.hlen + hdgst; data->hdr.plen = cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst); - data->ttag = pdu->ttag; - data->command_id = rq->tag; - data->data_offset = cpu_to_le32(req->data_sent); + data->ttag = req->ttag; + data->command_id = nvme_cid(rq); + data->data_offset = cpu_to_le32(req->h2cdata_offset); data->data_length = cpu_to_le32(req->pdu_len); - return 0; } static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, @@ -540,29 +739,94 @@ static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue, { struct nvme_tcp_request *req; struct request *rq; - int ret; + u32 r2t_length = le32_to_cpu(pdu->r2t_length); + u32 r2t_offset = le32_to_cpu(pdu->r2t_offset); - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + rq = nvme_find_rq(nvme_tcp_tagset(queue), pdu->command_id); if (!rq) { dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); + "got bad r2t.command_id %#x on queue %d\n", + pdu->command_id, nvme_tcp_queue_id(queue)); return -ENOENT; } req = blk_mq_rq_to_pdu(rq); - ret = nvme_tcp_setup_h2c_data_pdu(req, pdu); - if (unlikely(ret)) - return ret; + if (unlikely(!r2t_length)) { + dev_err(queue->ctrl->ctrl.device, + "req %d r2t len is %u, probably a bug...\n", + rq->tag, r2t_length); + return -EPROTO; + } - req->state = NVME_TCP_SEND_H2C_PDU; - req->offset = 0; + if (unlikely(req->data_sent + r2t_length > req->data_len)) { + dev_err(queue->ctrl->ctrl.device, + "req %d r2t len %u exceeded data len %u (%zu sent)\n", + rq->tag, r2t_length, req->data_len, req->data_sent); + return -EPROTO; + } 
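With the R2T sanity checks above in place, the host answers an R2T by slicing the requested range into H2CData PDUs no larger than the MAXH2CDATA value negotiated in ICResp, tracking the remainder in h2cdata_left. The driver chains the PDUs across send completions rather than looping, but the arithmetic matches this sketch (my_answer_r2t() and the 128 KiB / 64 KiB numbers are purely illustrative):

#include <linux/minmax.h>
#include <linux/types.h>

/*
 * Example: the target asks for 128 KiB at offset 0 and advertised
 * MAXH2CDATA = 64 KiB.  Two 64 KiB H2CData PDUs are emitted and only
 * the second one carries NVME_TCP_F_DATA_LAST.
 */
static void my_answer_r2t(u32 r2t_length, u32 r2t_offset, u32 maxh2cdata)
{
        u32 left = r2t_length;
        u32 offset = r2t_offset;

        while (left) {
                u32 pdu_len = min(left, maxh2cdata);

                /*
                 * Emit one H2CData PDU: data_offset = offset,
                 * data_length = pdu_len.  NVME_TCP_F_DATA_LAST is set
                 * only on the final chunk (when left == pdu_len), as
                 * nvme_tcp_setup_h2c_data_pdu() does.
                 */
                offset += pdu_len;
                left -= pdu_len;
        }
}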
+ + if (unlikely(r2t_offset < req->data_sent)) { + dev_err(queue->ctrl->ctrl.device, + "req %d unexpected r2t offset %u (expected %zu)\n", + rq->tag, r2t_offset, req->data_sent); + return -EPROTO; + } - nvme_tcp_queue_request(req); + if (llist_on_list(&req->lentry) || + !list_empty(&req->entry)) { + dev_err(queue->ctrl->ctrl.device, + "req %d unexpected r2t while processing request\n", + rq->tag); + return -EPROTO; + } + + req->pdu_len = 0; + req->h2cdata_left = r2t_length; + req->h2cdata_offset = r2t_offset; + req->ttag = pdu->ttag; + + nvme_tcp_setup_h2c_data_pdu(req); + + llist_add(&req->lentry, &queue->req_list); + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); return 0; } +static void nvme_tcp_handle_c2h_term(struct nvme_tcp_queue *queue, + struct nvme_tcp_term_pdu *pdu) +{ + u16 fes; + const char *msg; + u32 plen = le32_to_cpu(pdu->hdr.plen); + + static const char * const msg_table[] = { + [NVME_TCP_FES_INVALID_PDU_HDR] = "Invalid PDU Header Field", + [NVME_TCP_FES_PDU_SEQ_ERR] = "PDU Sequence Error", + [NVME_TCP_FES_HDR_DIGEST_ERR] = "Header Digest Error", + [NVME_TCP_FES_DATA_OUT_OF_RANGE] = "Data Transfer Out Of Range", + [NVME_TCP_FES_DATA_LIMIT_EXCEEDED] = "Data Transfer Limit Exceeded", + [NVME_TCP_FES_UNSUPPORTED_PARAM] = "Unsupported Parameter", + }; + + if (plen < NVME_TCP_MIN_C2HTERM_PLEN || + plen > NVME_TCP_MAX_C2HTERM_PLEN) { + dev_err(queue->ctrl->ctrl.device, + "Received a malformed C2HTermReq PDU (plen = %u)\n", + plen); + return; + } + + fes = le16_to_cpu(pdu->fes); + if (fes && fes < ARRAY_SIZE(msg_table)) + msg = msg_table[fes]; + else + msg = "Unknown"; + + dev_err(queue->ctrl->ctrl.device, + "Received C2HTermReq (FES = %s)\n", msg); +} + static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { @@ -584,6 +848,25 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, return 0; hdr = queue->pdu; + if (unlikely(hdr->hlen != sizeof(struct nvme_tcp_rsp_pdu))) { + if (!nvme_tcp_recv_pdu_supported(hdr->type)) + goto unsupported_pdu; + + dev_err(queue->ctrl->ctrl.device, + "pdu type %d has unexpected header length (%d)\n", + hdr->type, hdr->hlen); + return -EPROTO; + } + + if (unlikely(hdr->type == nvme_tcp_c2h_term)) { + /* + * C2HTermReq never includes Header or Data digests. + * Skip the checks. 
+ */ + nvme_tcp_handle_c2h_term(queue, (void *)queue->pdu); + return -EINVAL; + } + if (queue->hdr_digest) { ret = nvme_tcp_verify_hdgst(queue, queue->pdu, hdr->hlen); if (unlikely(ret)) @@ -599,40 +882,38 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, switch (hdr->type) { case nvme_tcp_c2h_data: - ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); - break; + return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); case nvme_tcp_rsp: nvme_tcp_init_recv_ctx(queue); - ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu); - break; + return nvme_tcp_handle_comp(queue, (void *)queue->pdu); case nvme_tcp_r2t: nvme_tcp_init_recv_ctx(queue); - ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu); - break; + return nvme_tcp_handle_r2t(queue, (void *)queue->pdu); default: - dev_err(queue->ctrl->ctrl.device, - "unsupported pdu type (%d)\n", hdr->type); - return -EINVAL; + goto unsupported_pdu; } - return ret; +unsupported_pdu: + dev_err(queue->ctrl->ctrl.device, + "unsupported pdu type (%d)\n", hdr->type); + return -EINVAL; +} + +static inline void nvme_tcp_end_request(struct request *rq, u16 status) +{ + union nvme_result res = {}; + + if (!nvme_try_complete_req(rq, cpu_to_le16(status << 1), res)) + nvme_complete_rq(rq); } static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; - struct nvme_tcp_request *req; - struct request *rq; - - rq = blk_mq_tag_to_rq(nvme_tcp_tagset(queue), pdu->command_id); - if (!rq) { - dev_err(queue->ctrl->ctrl.device, - "queue %d tag %#x not found\n", - nvme_tcp_queue_id(queue), pdu->command_id); - return -ENOENT; - } - req = blk_mq_rq_to_pdu(rq); + struct request *rq = + nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id); + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); while (true) { int recv_len, ret; @@ -655,7 +936,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, nvme_tcp_init_recv_ctx(queue); return -EIO; } - nvme_tcp_init_iter(req, READ); + nvme_tcp_init_iter(req, ITER_DEST); } /* we can read only from what is left in this bio */ @@ -663,8 +944,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, iov_iter_count(&req->iter)); if (queue->data_digest) - ret = skb_copy_and_hash_datagram_iter(skb, *offset, - &req->iter, recv_len, queue->rcv_hash); + ret = skb_copy_and_crc32c_datagram_iter(skb, *offset, + &req->iter, recv_len, &queue->rcv_crc); else ret = skb_copy_datagram_iter(skb, *offset, &req->iter, recv_len); @@ -682,9 +963,14 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, if (!queue->data_remaining) { if (queue->data_digest) { - nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); + queue->exp_ddgst = nvme_tcp_ddgst_final(queue->rcv_crc); queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; } else { + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { + nvme_tcp_end_request(rq, + le16_to_cpu(req->status)); + queue->nr_cqe++; + } nvme_tcp_init_recv_ctx(queue); } } @@ -695,6 +981,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, struct sk_buff *skb, unsigned int *offset, size_t *len) { + struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu; char *ddgst = (char *)&queue->recv_ddgst; size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining); off_t off = NVME_TCP_DIGEST_LENGTH - 
queue->ddgst_remaining; @@ -711,11 +998,25 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, return 0; if (queue->recv_ddgst != queue->exp_ddgst) { + struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue), + pdu->command_id); + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + + req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR); + dev_err(queue->ctrl->ctrl.device, "data digest error: recv %#x expected %#x\n", le32_to_cpu(queue->recv_ddgst), le32_to_cpu(queue->exp_ddgst)); - return -EIO; + } + + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { + struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue), + pdu->command_id); + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + + nvme_tcp_end_request(rq, le16_to_cpu(req->status)); + queue->nr_cqe++; } nvme_tcp_init_recv_ctx(queue); @@ -729,6 +1030,9 @@ static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb, size_t consumed = len; int result; + if (unlikely(!queue->rd_enabled)) + return -EFAULT; + while (len) { switch (nvme_tcp_recv_state(queue)) { case NVME_TCP_RECV_PDU: @@ -759,11 +1063,14 @@ static void nvme_tcp_data_ready(struct sock *sk) { struct nvme_tcp_queue *queue; - read_lock(&sk->sk_callback_lock); + trace_sk_data_ready(sk); + + read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; - if (likely(queue && queue->rd_enabled)) + if (likely(queue && queue->rd_enabled) && + !test_bit(NVME_TCP_Q_POLLING, &queue->flags)) queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } static void nvme_tcp_write_space(struct sock *sk) @@ -774,6 +1081,9 @@ static void nvme_tcp_write_space(struct sock *sk) queue = sk->sk_user_data; if (likely(queue && sk_stream_is_writeable(sk))) { clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + /* Ensure pending TLS partial records are retried */ + if (nvme_tcp_queue_tls(queue)) + queue->write_space(sk); queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); } read_unlock_bh(&sk->sk_callback_lock); @@ -783,7 +1093,7 @@ static void nvme_tcp_state_change(struct sock *sk) { struct nvme_tcp_queue *queue; - read_lock(&sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); queue = sk->sk_user_data; if (!queue) goto done; @@ -794,7 +1104,6 @@ static void nvme_tcp_state_change(struct sock *sk) case TCP_LAST_ACK: case TCP_FIN_WAIT1: case TCP_FIN_WAIT2: - /* fallthrough */ nvme_tcp_error_recovery(&queue->ctrl->ctrl); break; default: @@ -805,7 +1114,7 @@ static void nvme_tcp_state_change(struct sock *sk) queue->state_change(sk); done: - read_unlock(&sk->sk_callback_lock); + read_unlock_bh(&sk->sk_callback_lock); } static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) @@ -815,46 +1124,73 @@ static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) static void nvme_tcp_fail_request(struct nvme_tcp_request *req) { - union nvme_result res = {}; + if (nvme_tcp_async_req(req)) { + union nvme_result res = {}; - nvme_end_request(blk_mq_rq_from_pdu(req), - cpu_to_le16(NVME_SC_DATA_XFER_ERROR), res); + nvme_complete_async_event(&req->queue->ctrl->ctrl, + cpu_to_le16(NVME_SC_HOST_PATH_ERROR), &res); + } else { + nvme_tcp_end_request(blk_mq_rq_from_pdu(req), + NVME_SC_HOST_PATH_ERROR); + } } static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; + int req_data_len = req->data_len; + u32 h2cdata_left = req->h2cdata_left; while (true) { + struct bio_vec bvec; + struct msghdr msg = { + .msg_flags = 
MSG_DONTWAIT | MSG_SPLICE_PAGES, + }; struct page *page = nvme_tcp_req_cur_page(req); size_t offset = nvme_tcp_req_cur_offset(req); size_t len = nvme_tcp_req_cur_length(req); bool last = nvme_tcp_pdu_last_send(req, len); - int ret, flags = MSG_DONTWAIT; + int req_data_sent = req->data_sent; + int ret; - if (last && !queue->data_digest) - flags |= MSG_EOR; + if (last && !queue->data_digest && !nvme_tcp_queue_more(queue)) + msg.msg_flags |= MSG_EOR; else - flags |= MSG_MORE; + msg.msg_flags |= MSG_MORE; + + if (!sendpages_ok(page, len, offset)) + msg.msg_flags &= ~MSG_SPLICE_PAGES; - ret = kernel_sendpage(queue->sock, page, offset, len, flags); + bvec_set_page(&bvec, page, len, offset); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); + ret = sock_sendmsg(queue->sock, &msg); if (ret <= 0) return ret; - nvme_tcp_advance_req(req, ret); if (queue->data_digest) - nvme_tcp_ddgst_update(queue->snd_hash, page, + nvme_tcp_ddgst_update(&queue->snd_crc, page, offset, ret); - /* fully successful last write*/ + /* + * update the request iterator except for the last payload send + * in the request where we don't want to modify it as we may + * compete with the RX path completing the request. + */ + if (req_data_sent + ret < req_data_len) + nvme_tcp_advance_req(req, ret); + + /* fully successful last send in current PDU */ if (last && ret == len) { if (queue->data_digest) { - nvme_tcp_ddgst_final(queue->snd_hash, - &req->ddgst); + req->ddgst = + nvme_tcp_ddgst_final(queue->snd_crc); req->state = NVME_TCP_SEND_DDGST; req->offset = 0; } else { - nvme_tcp_done_send_req(queue); + if (h2cdata_left) + nvme_tcp_setup_h2c_data_pdu(req); + else + nvme_tcp_done_send_req(queue); } return 1; } @@ -865,18 +1201,25 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); + struct bio_vec bvec; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, }; bool inline_data = nvme_tcp_has_inline_data(req); - int flags = MSG_DONTWAIT | (inline_data ? 
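The send paths above and below replace kernel_sendpage() with sock_sendmsg() over a one-element bvec iterator, asking for zero-copy page splicing with MSG_SPLICE_PAGES when the memory is eligible (sendpages_ok()) and silently falling back to copying when it is not. A condensed sketch of that pattern, assuming the caller already holds a page/offset/length triple (my_send_page() is illustrative):

#include <linux/bvec.h>
#include <linux/net.h>
#include <linux/socket.h>
#include <linux/uio.h>

static int my_send_page(struct socket *sock, struct page *page,
                        size_t off, size_t len, bool more_to_come)
{
        struct bio_vec bvec;
        struct msghdr msg = {
                .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
        };

        msg.msg_flags |= more_to_come ? MSG_MORE : MSG_EOR;

        /* slab objects and other unspliceable memory must be copied */
        if (!sendpages_ok(page, len, off))
                msg.msg_flags &= ~MSG_SPLICE_PAGES;

        bvec_set_page(&bvec, page, len, off);
        iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);

        return sock_sendmsg(sock, &msg);        /* bytes sent or -errno */
}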
MSG_MORE : MSG_EOR); u8 hdgst = nvme_tcp_hdgst_len(queue); int len = sizeof(*pdu) + hdgst - req->offset; int ret; + if (inline_data || nvme_tcp_queue_more(queue)) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; + if (queue->hdr_digest && !req->offset) - nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + nvme_tcp_set_hdgst(pdu, sizeof(*pdu)); - ret = kernel_sendpage(queue->sock, virt_to_page(pdu), - offset_in_page(pdu) + req->offset, len, flags); + bvec_set_virt(&bvec, (void *)pdu + req->offset, len); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); + ret = sock_sendmsg(queue->sock, &msg); if (unlikely(ret <= 0)) return ret; @@ -885,8 +1228,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) if (inline_data) { req->state = NVME_TCP_SEND_DATA; if (queue->data_digest) - crypto_ahash_init(queue->snd_hash); - nvme_tcp_init_iter(req, WRITE); + queue->snd_crc = NVME_TCP_CRC_SEED; } else { nvme_tcp_done_send_req(queue); } @@ -900,17 +1242,22 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; - struct nvme_tcp_data_pdu *pdu = req->pdu; + struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req); + struct bio_vec bvec; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, }; u8 hdgst = nvme_tcp_hdgst_len(queue); int len = sizeof(*pdu) - req->offset + hdgst; int ret; if (queue->hdr_digest && !req->offset) - nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); + nvme_tcp_set_hdgst(pdu, sizeof(*pdu)); + + if (!req->h2cdata_left) + msg.msg_flags |= MSG_SPLICE_PAGES; - ret = kernel_sendpage(queue->sock, virt_to_page(pdu), - offset_in_page(pdu) + req->offset, len, - MSG_DONTWAIT | MSG_MORE); + bvec_set_virt(&bvec, (void *)pdu + req->offset, len); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); + ret = sock_sendmsg(queue->sock, &msg); if (unlikely(ret <= 0)) return ret; @@ -918,9 +1265,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) if (!len) { req->state = NVME_TCP_SEND_DATA; if (queue->data_digest) - crypto_ahash_init(queue->snd_hash); - if (!req->data_sent) - nvme_tcp_init_iter(req, WRITE); + queue->snd_crc = NVME_TCP_CRC_SEED; return 1; } req->offset += ret; @@ -931,19 +1276,29 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; + size_t offset = req->offset; + u32 h2cdata_left = req->h2cdata_left; int ret; - struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR }; + struct msghdr msg = { .msg_flags = MSG_DONTWAIT }; struct kvec iov = { - .iov_base = &req->ddgst + req->offset, + .iov_base = (u8 *)&req->ddgst + req->offset, .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset }; + if (nvme_tcp_queue_more(queue)) + msg.msg_flags |= MSG_MORE; + else + msg.msg_flags |= MSG_EOR; + ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); if (unlikely(ret <= 0)) return ret; - if (req->offset + ret == NVME_TCP_DIGEST_LENGTH) { - nvme_tcp_done_send_req(queue); + if (offset + ret == NVME_TCP_DIGEST_LENGTH) { + if (h2cdata_left) + nvme_tcp_setup_h2c_data_pdu(req); + else + nvme_tcp_done_send_req(queue); return 1; } @@ -954,6 +1309,7 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) { struct nvme_tcp_request *req; + unsigned int noreclaim_flag; int ret = 1; if (!queue->request) { @@ 
-963,12 +1319,13 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) } req = queue->request; + noreclaim_flag = memalloc_noreclaim_save(); if (req->state == NVME_TCP_SEND_CMD_PDU) { ret = nvme_tcp_try_send_cmd_pdu(req); if (ret <= 0) goto done; if (!nvme_tcp_has_inline_data(req)) - return ret; + goto out; } if (req->state == NVME_TCP_SEND_H2C_PDU) { @@ -986,92 +1343,71 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) if (req->state == NVME_TCP_SEND_DDGST) ret = nvme_tcp_try_send_ddgst(req); done: - if (ret == -EAGAIN) + if (ret == -EAGAIN) { ret = 0; + } else if (ret < 0) { + dev_err(queue->ctrl->ctrl.device, + "failed to send request %d\n", ret); + nvme_tcp_fail_request(queue->request); + nvme_tcp_done_send_req(queue); + } +out: + memalloc_noreclaim_restore(noreclaim_flag); return ret; } static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue) { - struct sock *sk = queue->sock->sk; + struct socket *sock = queue->sock; + struct sock *sk = sock->sk; read_descriptor_t rd_desc; int consumed; rd_desc.arg.data = queue; rd_desc.count = 1; lock_sock(sk); - consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb); + queue->nr_cqe = 0; + consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb); release_sock(sk); - return consumed; + return consumed == -EAGAIN ? 0 : consumed; } static void nvme_tcp_io_work(struct work_struct *w) { struct nvme_tcp_queue *queue = container_of(w, struct nvme_tcp_queue, io_work); - unsigned long start = jiffies + msecs_to_jiffies(1); + unsigned long deadline = jiffies + msecs_to_jiffies(1); do { bool pending = false; int result; - result = nvme_tcp_try_send(queue); - if (result > 0) { - pending = true; - } else if (unlikely(result < 0)) { - dev_err(queue->ctrl->ctrl.device, - "failed to send request %d\n", result); - if (result != -EPIPE) - nvme_tcp_fail_request(queue->request); - nvme_tcp_done_send_req(queue); - return; + if (mutex_trylock(&queue->send_mutex)) { + result = nvme_tcp_try_send(queue); + mutex_unlock(&queue->send_mutex); + if (result > 0) + pending = true; + else if (unlikely(result < 0)) + break; } result = nvme_tcp_try_recv(queue); if (result > 0) pending = true; - - if (!pending) + else if (unlikely(result < 0)) return; - } while (time_after(jiffies, start)); /* quota is exhausted */ - - queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); -} - -static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue) -{ - struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); - - ahash_request_free(queue->rcv_hash); - ahash_request_free(queue->snd_hash); - crypto_free_ahash(tfm); -} - -static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue) -{ - struct crypto_ahash *tfm; - - tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); + /* did we get some space after spending time in recv? 
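nvme_tcp_io_work() above is deliberately time-boxed: it alternates sends (only when it can trylock send_mutex, so it never blocks behind the direct-send fast path) with receives, and once roughly one millisecond has elapsed it requeues itself instead of monopolizing the CPU. A stripped-down sketch of that structure with the error handling elided; the demo_* names are stand-ins, and demo_try_send()/demo_try_recv() are assumed to return a positive value while they make progress:

#include <linux/jiffies.h>
#include <linux/mutex.h>
#include <linux/workqueue.h>

struct demo_io_queue {
        struct work_struct io_work;
        struct mutex send_mutex;
        int io_cpu;
};

static struct workqueue_struct *demo_wq;                /* illustrative */
static int demo_try_send(struct demo_io_queue *q);      /* >0 while progressing */
static int demo_try_recv(struct demo_io_queue *q);

static void demo_io_work(struct work_struct *w)
{
        struct demo_io_queue *q = container_of(w, struct demo_io_queue, io_work);
        unsigned long deadline = jiffies + msecs_to_jiffies(1);

        do {
                bool pending = false;

                if (mutex_trylock(&q->send_mutex)) {
                        if (demo_try_send(q) > 0)
                                pending = true;
                        mutex_unlock(&q->send_mutex);
                }
                if (demo_try_recv(q) > 0)
                        pending = true;

                if (!pending)
                        return;         /* idle: wait for the next wakeup */
        } while (!time_after(jiffies, deadline));

        /* quota exhausted: yield the CPU and continue later */
        queue_work_on(q->io_cpu, demo_wq, &q->io_work);
}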
*/ + if (nvme_tcp_queue_has_pending(queue) && + sk_stream_is_writeable(queue->sock->sk)) + pending = true; - queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); - if (!queue->snd_hash) - goto free_tfm; - ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); + if (!pending || !queue->rd_enabled) + return; - queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); - if (!queue->rcv_hash) - goto free_snd_hash; - ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); + } while (!time_after(jiffies, deadline)); /* quota is exhausted */ - return 0; -free_snd_hash: - ahash_request_free(queue->snd_hash); -free_tfm: - crypto_free_ahash(tfm); - return -ENOMEM; + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); } static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl) @@ -1101,24 +1437,34 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + unsigned int noreclaim_flag; if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) return; - if (queue->hdr_digest || queue->data_digest) - nvme_tcp_free_crypto(queue); + page_frag_cache_drain(&queue->pf_cache); + + noreclaim_flag = memalloc_noreclaim_save(); + /* ->sock will be released by fput() */ + fput(queue->sock->file); + queue->sock = NULL; + memalloc_noreclaim_restore(noreclaim_flag); - sock_release(queue->sock); kfree(queue->pdu); + mutex_destroy(&queue->send_mutex); + mutex_destroy(&queue->queue_lock); } static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) { struct nvme_tcp_icreq_pdu *icreq; struct nvme_tcp_icresp_pdu *icresp; + char cbuf[CMSG_LEN(sizeof(char))] = {}; + u8 ctype; struct msghdr msg = {}; struct kvec iov; bool ctrl_hdgst, ctrl_ddgst; + u32 maxh2cdata; int ret; icreq = kzalloc(sizeof(*icreq), GFP_KERNEL); @@ -1146,17 +1492,39 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) iov.iov_base = icreq; iov.iov_len = sizeof(*icreq); ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len); - if (ret < 0) + if (ret < 0) { + pr_warn("queue %d: failed to send icreq, error %d\n", + nvme_tcp_queue_id(queue), ret); goto free_icresp; + } memset(&msg, 0, sizeof(msg)); iov.iov_base = icresp; iov.iov_len = sizeof(*icresp); + if (nvme_tcp_queue_tls(queue)) { + msg.msg_control = cbuf; + msg.msg_controllen = sizeof(cbuf); + } + msg.msg_flags = MSG_WAITALL; ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len, msg.msg_flags); - if (ret < 0) + if (ret >= 0 && ret < sizeof(*icresp)) + ret = -ECONNRESET; + if (ret < 0) { + pr_warn("queue %d: failed to receive icresp, error %d\n", + nvme_tcp_queue_id(queue), ret); goto free_icresp; - + } + ret = -ENOTCONN; + if (nvme_tcp_queue_tls(queue)) { + ctype = tls_get_record_type(queue->sock->sk, + (struct cmsghdr *)cbuf); + if (ctype != TLS_RECORD_TYPE_DATA) { + pr_err("queue %d: unhandled TLS record %d\n", + nvme_tcp_queue_id(queue), ctype); + goto free_icresp; + } + } ret = -EINVAL; if (icresp->hdr.type != nvme_tcp_icresp) { pr_err("queue %d: bad type returned %d\n", @@ -1202,6 +1570,14 @@ static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue) goto free_icresp; } + maxh2cdata = le32_to_cpu(icresp->maxdata); + if ((maxh2cdata % 4) || (maxh2cdata < NVME_TCP_MIN_MAXH2CDATA)) { + pr_err("queue %d: invalid maxh2cdata returned %u\n", + nvme_tcp_queue_id(queue), maxh2cdata); + goto free_icresp; + } + queue->maxh2cdata = maxh2cdata; + ret = 0; free_icresp: kfree(icresp); @@ -1210,73 +1586,246 @@ free_icreq: return 
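When the queue runs over kTLS, the ICResp recvmsg above passes a small control-message buffer so the TLS layer can report the record type of what was read; anything other than TLS_RECORD_TYPE_DATA (an alert, for instance) aborts connection setup. A sketch of that check in isolation, assuming a socket that already has the TLS ULP established (my_recv_tls_data() is illustrative):

#include <linux/net.h>
#include <linux/socket.h>
#include <linux/uio.h>
#include <net/tls.h>
#include <net/tls_prot.h>

static int my_recv_tls_data(struct socket *sock, void *buf, size_t len)
{
        char cbuf[CMSG_LEN(sizeof(char))] = {};
        struct kvec iov = { .iov_base = buf, .iov_len = len };
        struct msghdr msg = {
                .msg_control = cbuf,
                .msg_controllen = sizeof(cbuf),
                .msg_flags = MSG_WAITALL,
        };
        int ret;

        ret = kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
        if (ret < 0)
                return ret;

        /* only application data records are acceptable here */
        if (tls_get_record_type(sock->sk, (struct cmsghdr *)cbuf) !=
            TLS_RECORD_TYPE_DATA)
                return -ENOTCONN;

        return ret;
}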
ret; } -static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, - int qid, size_t queue_size) +static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue) +{ + return nvme_tcp_queue_id(queue) == 0; +} + +static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT]; +} + +static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + !nvme_tcp_default_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ]; +} + +static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + + return !nvme_tcp_admin_queue(queue) && + !nvme_tcp_default_queue(queue) && + !nvme_tcp_read_queue(queue) && + qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ] + + ctrl->io_queues[HCTX_TYPE_POLL]; +} + +/* + * Track the number of queues assigned to each cpu using a global per-cpu + * counter and select the least used cpu from the mq_map. Our goal is to spread + * different controllers I/O threads across different cpu cores. + * + * Note that the accounting is not 100% perfect, but we don't need to be, we're + * simply putting our best effort to select the best candidate cpu core that we + * find at any given point. + */ +static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue) +{ + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + struct blk_mq_tag_set *set = &ctrl->tag_set; + int qid = nvme_tcp_queue_id(queue) - 1; + unsigned int *mq_map = NULL; + int cpu, min_queues = INT_MAX, io_cpu; + + if (wq_unbound) + goto out; + + if (nvme_tcp_default_queue(queue)) + mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map; + else if (nvme_tcp_read_queue(queue)) + mq_map = set->map[HCTX_TYPE_READ].mq_map; + else if (nvme_tcp_poll_queue(queue)) + mq_map = set->map[HCTX_TYPE_POLL].mq_map; + + if (WARN_ON(!mq_map)) + goto out; + + /* Search for the least used cpu from the mq_map */ + io_cpu = WORK_CPU_UNBOUND; + for_each_online_cpu(cpu) { + int num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]); + + if (mq_map[cpu] != qid) + continue; + if (num_queues < min_queues) { + io_cpu = cpu; + min_queues = num_queues; + } + } + if (io_cpu != WORK_CPU_UNBOUND) { + queue->io_cpu = io_cpu; + atomic_inc(&nvme_tcp_cpu_queues[io_cpu]); + set_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags); + } +out: + dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n", + qid, queue->io_cpu); +} + +static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid) +{ + struct nvme_tcp_queue *queue = data; + struct nvme_tcp_ctrl *ctrl = queue->ctrl; + int qid = nvme_tcp_queue_id(queue); + struct key *tls_key; + + dev_dbg(ctrl->ctrl.device, "queue %d: TLS handshake done, key %x, status %d\n", + qid, pskid, status); + + if (status) { + queue->tls_err = -status; + goto out_complete; + } + + tls_key = nvme_tls_key_lookup(pskid); + if (IS_ERR(tls_key)) { + dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n", + qid, pskid); + queue->tls_err = -ENOKEY; + } else { + queue->tls_enabled = true; + if (qid == 0) + ctrl->ctrl.tls_pskid = key_serial(tls_key); + key_put(tls_key); + queue->tls_err = 0; + } + +out_complete: + complete(&queue->tls_complete); +} + +static int nvme_tcp_start_tls(struct nvme_ctrl 
*nctrl, + struct nvme_tcp_queue *queue, + key_serial_t pskid) +{ + int qid = nvme_tcp_queue_id(queue); + int ret; + struct tls_handshake_args args; + unsigned long tmo = tls_handshake_timeout * HZ; + key_serial_t keyring = nvme_keyring_id(); + + dev_dbg(nctrl->device, "queue %d: start TLS with key %x\n", + qid, pskid); + memset(&args, 0, sizeof(args)); + args.ta_sock = queue->sock; + args.ta_done = nvme_tcp_tls_done; + args.ta_data = queue; + args.ta_my_peerids[0] = pskid; + args.ta_num_peerids = 1; + if (nctrl->opts->keyring) + keyring = key_serial(nctrl->opts->keyring); + args.ta_keyring = keyring; + args.ta_timeout_ms = tls_handshake_timeout * 1000; + queue->tls_err = -EOPNOTSUPP; + init_completion(&queue->tls_complete); + ret = tls_client_hello_psk(&args, GFP_KERNEL); + if (ret) { + dev_err(nctrl->device, "queue %d: failed to start TLS: %d\n", + qid, ret); + return ret; + } + ret = wait_for_completion_interruptible_timeout(&queue->tls_complete, tmo); + if (ret <= 0) { + if (ret == 0) + ret = -ETIMEDOUT; + + dev_err(nctrl->device, + "queue %d: TLS handshake failed, error %d\n", + qid, ret); + tls_handshake_cancel(queue->sock->sk); + } else { + if (queue->tls_err) { + dev_err(nctrl->device, + "queue %d: TLS handshake complete, error %d\n", + qid, queue->tls_err); + } else { + dev_dbg(nctrl->device, + "queue %d: TLS handshake complete\n", qid); + } + ret = queue->tls_err; + } + return ret; +} + +static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, + key_serial_t pskid) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; - struct linger sol = { .l_onoff = 1, .l_linger = 0 }; - int ret, opt, rcv_pdu_size, n; + int ret, rcv_pdu_size; + struct file *sock_file; + mutex_init(&queue->queue_lock); queue->ctrl = ctrl; + init_llist_head(&queue->req_list); INIT_LIST_HEAD(&queue->send_list); - spin_lock_init(&queue->lock); + mutex_init(&queue->send_mutex); INIT_WORK(&queue->io_work, nvme_tcp_io_work); - queue->queue_size = queue_size; if (qid > 0) - queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; + queue->cmnd_capsule_len = nctrl->ioccsz * 16; else queue->cmnd_capsule_len = sizeof(struct nvme_command) + NVME_TCP_ADMIN_CCSZ; - ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM, + ret = sock_create_kern(current->nsproxy->net_ns, + ctrl->addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &queue->sock); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to create socket: %d\n", ret); - return ret; + goto err_destroy_mutex; } - /* Single syn retry */ - opt = 1; - ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT, - (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(ctrl->ctrl.device, - "failed to set TCP_SYNCNT sock opt %d\n", ret); - goto err_sock; + sock_file = sock_alloc_file(queue->sock, O_CLOEXEC, NULL); + if (IS_ERR(sock_file)) { + ret = PTR_ERR(sock_file); + goto err_destroy_mutex; } + sk_net_refcnt_upgrade(queue->sock->sk); + nvme_tcp_reclassify_socket(queue->sock); + + /* Single syn retry */ + tcp_sock_set_syncnt(queue->sock->sk, 1); + /* Set TCP no delay */ - opt = 1; - ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, - TCP_NODELAY, (char *)&opt, sizeof(opt)); - if (ret) { - dev_err(ctrl->ctrl.device, - "failed to set TCP_NODELAY sock opt %d\n", ret); - goto err_sock; - } + tcp_sock_set_nodelay(queue->sock->sk); /* * Cleanup whatever is sitting in the TCP transmit queue on socket * close. This is done to prevent stale data from being sent should * the network connection be restored before TCP times out. 
*/ - ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER, - (char *)&sol, sizeof(sol)); - if (ret) { - dev_err(ctrl->ctrl.device, - "failed to set SO_LINGER sock opt %d\n", ret); - goto err_sock; - } + sock_no_linger(queue->sock->sk); + + if (so_priority > 0) + sock_set_priority(queue->sock->sk, so_priority); + + /* Set socket type of service */ + if (nctrl->opts->tos >= 0) + ip_sock_set_tos(queue->sock->sk, nctrl->opts->tos); + + /* Set 10 seconds timeout for icresp recvmsg */ + queue->sock->sk->sk_rcvtimeo = 10 * HZ; queue->sock->sk->sk_allocation = GFP_ATOMIC; - if (!qid) - n = 0; - else - n = (qid - 1) % num_online_cpus(); - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false); + queue->sock->sk->sk_use_task_frag = false; + queue->io_cpu = WORK_CPU_UNBOUND; queue->request = NULL; queue->data_remaining = 0; queue->ddgst_remaining = 0; @@ -1284,64 +1833,65 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, queue->pdu_offset = 0; sk_set_memalloc(queue->sock->sk); - if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) { - ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr, + if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) { + ret = kernel_bind(queue->sock, (struct sockaddr_unsized *)&ctrl->src_addr, sizeof(ctrl->src_addr)); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to bind queue %d socket %d\n", qid, ret); goto err_sock; } } - queue->hdr_digest = nctrl->opts->hdr_digest; - queue->data_digest = nctrl->opts->data_digest; - if (queue->hdr_digest || queue->data_digest) { - ret = nvme_tcp_alloc_crypto(queue); + if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) { + char *iface = nctrl->opts->host_iface; + sockptr_t optval = KERNEL_SOCKPTR(iface); + + ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE, + optval, strlen(iface)); if (ret) { - dev_err(ctrl->ctrl.device, - "failed to allocate queue %d crypto\n", qid); + dev_err(nctrl->device, + "failed to bind to interface %s queue %d err %d\n", + iface, qid, ret); goto err_sock; } } + queue->hdr_digest = nctrl->opts->hdr_digest; + queue->data_digest = nctrl->opts->data_digest; + rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) + nvme_tcp_hdgst_len(queue); queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL); if (!queue->pdu) { ret = -ENOMEM; - goto err_crypto; + goto err_sock; } - dev_dbg(ctrl->ctrl.device, "connecting queue %d\n", + dev_dbg(nctrl->device, "connecting queue %d\n", nvme_tcp_queue_id(queue)); - ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr, + ret = kernel_connect(queue->sock, (struct sockaddr_unsized *)&ctrl->addr, sizeof(ctrl->addr), 0); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to connect socket: %d\n", ret); goto err_rcv_pdu; } + /* If PSKs are configured try to start TLS */ + if (nvme_tcp_tls_configured(nctrl) && pskid) { + ret = nvme_tcp_start_tls(nctrl, queue, pskid); + if (ret) + goto err_init_connect; + } + ret = nvme_tcp_init_connection(queue); if (ret) goto err_init_connect; - queue->rd_enabled = true; set_bit(NVME_TCP_Q_ALLOCATED, &queue->flags); - nvme_tcp_init_recv_ctx(queue); - - write_lock_bh(&queue->sock->sk->sk_callback_lock); - queue->sock->sk->sk_user_data = queue; - queue->state_change = queue->sock->sk->sk_state_change; - queue->data_ready = queue->sock->sk->sk_data_ready; - queue->write_space = queue->sock->sk->sk_write_space; - queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; - queue->sock->sk->sk_state_change = nvme_tcp_state_change; - queue->sock->sk->sk_write_space = 
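Queue allocation above drops the old kernel_setsockopt() calls in favor of the dedicated in-kernel helpers, which take typed values directly instead of user-style option buffers. A condensed version of that socket preparation, with the so_priority and tos parameters standing in for the module parameter and the fabrics 'tos' option (my_prepare_socket() is illustrative):

#include <linux/gfp.h>
#include <linux/net.h>
#include <linux/tcp.h>
#include <net/ip.h>
#include <net/sock.h>

static void my_prepare_socket(struct socket *sock, int so_priority, int tos)
{
        struct sock *sk = sock->sk;

        tcp_sock_set_syncnt(sk, 1);     /* a single SYN retry */
        tcp_sock_set_nodelay(sk);       /* no Nagle delays on command PDUs */
        sock_no_linger(sk);             /* discard unsent data on close */

        if (so_priority > 0)
                sock_set_priority(sk, so_priority);
        if (tos >= 0)
                ip_sock_set_tos(sk, tos);

        sk->sk_rcvtimeo = 10 * HZ;      /* bound the ICResp recvmsg */
        sk->sk_allocation = GFP_ATOMIC;
        sk->sk_use_task_frag = false;
}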
nvme_tcp_write_space; - write_unlock_bh(&queue->sock->sk->sk_callback_lock); return 0; @@ -1349,16 +1899,17 @@ err_init_connect: kernel_sock_shutdown(queue->sock, SHUT_RDWR); err_rcv_pdu: kfree(queue->pdu); -err_crypto: - if (queue->hdr_digest || queue->data_digest) - nvme_tcp_free_crypto(queue); err_sock: - sock_release(queue->sock); + /* ->sock will be released by fput() */ + fput(queue->sock->file); queue->sock = NULL; +err_destroy_mutex: + mutex_destroy(&queue->send_mutex); + mutex_destroy(&queue->queue_lock); return ret; } -static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue) +static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue) { struct socket *sock = queue->sock; @@ -1373,84 +1924,101 @@ static void nvme_tcp_restore_sock_calls(struct nvme_tcp_queue *queue) static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue) { kernel_sock_shutdown(queue->sock, SHUT_RDWR); - nvme_tcp_restore_sock_calls(queue); + nvme_tcp_restore_sock_ops(queue); cancel_work_sync(&queue->io_work); } -static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) +static void nvme_tcp_stop_queue_nowait(struct nvme_ctrl *nctrl, int qid) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; - if (!test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) + if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) return; - __nvme_tcp_stop_queue(queue); + if (test_and_clear_bit(NVME_TCP_Q_IO_CPU_SET, &queue->flags)) + atomic_dec(&nvme_tcp_cpu_queues[queue->io_cpu]); + + mutex_lock(&queue->queue_lock); + if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags)) + __nvme_tcp_stop_queue(queue); + /* Stopping the queue will disable TLS */ + queue->tls_enabled = false; + mutex_unlock(&queue->queue_lock); +} + +static void nvme_tcp_wait_queue(struct nvme_ctrl *nctrl, int qid) +{ + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + int timeout = 100; + + while (timeout > 0) { + if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags) || + !sk_wmem_alloc_get(queue->sock->sk)) + return; + msleep(2); + timeout -= 2; + } + dev_warn(nctrl->device, + "qid %d: timeout draining sock wmem allocation expired\n", + qid); +} + +static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid) +{ + nvme_tcp_stop_queue_nowait(nctrl, qid); + nvme_tcp_wait_queue(nctrl, qid); +} + + +static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue) +{ + write_lock_bh(&queue->sock->sk->sk_callback_lock); + queue->sock->sk->sk_user_data = queue; + queue->state_change = queue->sock->sk->sk_state_change; + queue->data_ready = queue->sock->sk->sk_data_ready; + queue->write_space = queue->sock->sk->sk_write_space; + queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; + queue->sock->sk->sk_state_change = nvme_tcp_state_change; + queue->sock->sk->sk_write_space = nvme_tcp_write_space; +#ifdef CONFIG_NET_RX_BUSY_POLL + queue->sock->sk->sk_ll_usec = 1; +#endif + write_unlock_bh(&queue->sock->sk->sk_callback_lock); } static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx) { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); + struct nvme_tcp_queue *queue = &ctrl->queues[idx]; int ret; - if (idx) - ret = nvmf_connect_io_queue(nctrl, idx, false); - else + queue->rd_enabled = true; + nvme_tcp_init_recv_ctx(queue); + nvme_tcp_setup_sock_ops(queue); + + if (idx) { + nvme_tcp_set_queue_io_cpu(queue); + ret = nvmf_connect_io_queue(nctrl, idx); + } else ret = nvmf_connect_admin_queue(nctrl); if (!ret) { - set_bit(NVME_TCP_Q_LIVE, 
&ctrl->queues[idx].flags); + set_bit(NVME_TCP_Q_LIVE, &queue->flags); } else { - __nvme_tcp_stop_queue(&ctrl->queues[idx]); + if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) + __nvme_tcp_stop_queue(queue); dev_err(nctrl->device, "failed to connect queue: %d ret=%d\n", idx, ret); } return ret; } -static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, - bool admin) -{ - struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); - struct blk_mq_tag_set *set; - int ret; - - if (admin) { - set = &ctrl->admin_tag_set; - memset(set, 0, sizeof(*set)); - set->ops = &nvme_tcp_admin_mq_ops; - set->queue_depth = NVME_AQ_MQ_TAG_DEPTH; - set->reserved_tags = 2; /* connect + keep-alive */ - set->numa_node = NUMA_NO_NODE; - set->cmd_size = sizeof(struct nvme_tcp_request); - set->driver_data = ctrl; - set->nr_hw_queues = 1; - set->timeout = ADMIN_TIMEOUT; - } else { - set = &ctrl->tag_set; - memset(set, 0, sizeof(*set)); - set->ops = &nvme_tcp_mq_ops; - set->queue_depth = nctrl->sqsize + 1; - set->reserved_tags = 1; /* fabric connect */ - set->numa_node = NUMA_NO_NODE; - set->flags = BLK_MQ_F_SHOULD_MERGE; - set->cmd_size = sizeof(struct nvme_tcp_request); - set->driver_data = ctrl; - set->nr_hw_queues = nctrl->queue_count - 1; - set->timeout = NVME_IO_TIMEOUT; - set->nr_maps = 2 /* default + read */; - } - - ret = blk_mq_alloc_tag_set(set); - if (ret) - return ERR_PTR(ret); - - return set; -} - static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl) { if (to_tcp_ctrl(ctrl)->async_req.pdu) { + cancel_work_sync(&ctrl->async_event_work); nvme_tcp_free_async_req(to_tcp_ctrl(ctrl)); to_tcp_ctrl(ctrl)->async_req.pdu = NULL; } @@ -1471,14 +2039,17 @@ static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl) int i; for (i = 1; i < ctrl->queue_count; i++) - nvme_tcp_stop_queue(ctrl, i); + nvme_tcp_stop_queue_nowait(ctrl, i); + for (i = 1; i < ctrl->queue_count; i++) + nvme_tcp_wait_queue(ctrl, i); } -static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl) +static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl, + int first, int last) { - int i, ret = 0; + int i, ret; - for (i = 1; i < ctrl->queue_count; i++) { + for (i = first; i < last; i++) { ret = nvme_tcp_start_queue(ctrl, i); if (ret) goto out_stop_queues; @@ -1487,7 +2058,7 @@ static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl) return 0; out_stop_queues: - for (i--; i >= 1; i--) + for (i--; i >= first; i--) nvme_tcp_stop_queue(ctrl, i); return ret; } @@ -1495,8 +2066,23 @@ out_stop_queues: static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl) { int ret; + key_serial_t pskid = 0; + + if (nvme_tcp_tls_configured(ctrl)) { + if (ctrl->opts->tls_key) + pskid = key_serial(ctrl->opts->tls_key); + else if (ctrl->opts->tls) { + pskid = nvme_tls_psk_default(ctrl->opts->keyring, + ctrl->opts->host->nqn, + ctrl->opts->subsysnqn); + if (!pskid) { + dev_err(ctrl->device, "no valid PSK found\n"); + return -ENOKEY; + } + } + } - ret = nvme_tcp_alloc_queue(ctrl, 0, NVME_AQ_DEPTH); + ret = nvme_tcp_alloc_queue(ctrl, 0, pskid); if (ret) return ret; @@ -1511,13 +2097,34 @@ out_free_queue: return ret; } -static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) +static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { int i, ret; + if (nvme_tcp_tls_configured(ctrl)) { + if (ctrl->opts->concat) { + /* + * The generated PSK is stored in the + * fabric options + */ + if (!ctrl->opts->tls_key) { + dev_err(ctrl->device, "no PSK generated\n"); + return -ENOKEY; + } + if (ctrl->tls_pskid && + ctrl->tls_pskid != 
key_serial(ctrl->opts->tls_key)) { + dev_err(ctrl->device, "Stale PSK id %08x\n", ctrl->tls_pskid); + ctrl->tls_pskid = 0; + } + } else if (!ctrl->tls_pskid) { + dev_err(ctrl->device, "no PSK negotiated\n"); + return -ENOKEY; + } + } + for (i = 1; i < ctrl->queue_count; i++) { ret = nvme_tcp_alloc_queue(ctrl, i, - ctrl->sqsize + 1); + ctrl->tls_pskid); if (ret) goto out_free_queues; } @@ -1531,98 +2138,100 @@ out_free_queues: return ret; } -static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl) -{ - unsigned int nr_io_queues; - - nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus()); - nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus()); - - return nr_io_queues; -} - -static int nvme_alloc_io_queues(struct nvme_ctrl *ctrl) +static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) { unsigned int nr_io_queues; int ret; - nr_io_queues = nvme_tcp_nr_io_queues(ctrl); + nr_io_queues = nvmf_nr_io_queues(ctrl->opts); ret = nvme_set_queue_count(ctrl, &nr_io_queues); if (ret) return ret; - ctrl->queue_count = nr_io_queues + 1; - if (ctrl->queue_count < 2) - return 0; + if (nr_io_queues == 0) { + dev_err(ctrl->device, + "unable to set any I/O queues\n"); + return -ENOMEM; + } + ctrl->queue_count = nr_io_queues + 1; dev_info(ctrl->device, "creating %d I/O queues.\n", nr_io_queues); - return nvme_tcp_alloc_io_queues(ctrl); -} - -static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) -{ - nvme_tcp_stop_io_queues(ctrl); - if (remove) { - blk_cleanup_queue(ctrl->connect_q); - blk_mq_free_tag_set(ctrl->tagset); - } - nvme_tcp_free_io_queues(ctrl); + nvmf_set_io_queues(ctrl->opts, nr_io_queues, + to_tcp_ctrl(ctrl)->io_queues); + return __nvme_tcp_alloc_io_queues(ctrl); } static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) { - int ret; + int ret, nr_queues; - ret = nvme_alloc_io_queues(ctrl); + ret = nvme_tcp_alloc_io_queues(ctrl); if (ret) return ret; if (new) { - ctrl->tagset = nvme_tcp_alloc_tagset(ctrl, false); - if (IS_ERR(ctrl->tagset)) { - ret = PTR_ERR(ctrl->tagset); + ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set, + &nvme_tcp_mq_ops, + ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2, + sizeof(struct nvme_tcp_request)); + if (ret) goto out_free_io_queues; - } + } + + /* + * Only start IO queues for which we have allocated the tagset + * and limited it to the available queues. On reconnects, the + * queue number might have changed. + */ + nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count); + ret = nvme_tcp_start_io_queues(ctrl, 1, nr_queues); + if (ret) + goto out_cleanup_connect_q; - ctrl->connect_q = blk_mq_init_queue(ctrl->tagset); - if (IS_ERR(ctrl->connect_q)) { - ret = PTR_ERR(ctrl->connect_q); - goto out_free_tag_set; + if (!new) { + nvme_start_freeze(ctrl); + nvme_unquiesce_io_queues(ctrl); + if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) { + /* + * If we timed out waiting for freeze we are likely to + * be stuck. Fail the controller initialization just + * to be safe. + */ + ret = -ENODEV; + nvme_unfreeze(ctrl); + goto out_wait_freeze_timed_out; } - } else { blk_mq_update_nr_hw_queues(ctrl->tagset, ctrl->queue_count - 1); + nvme_unfreeze(ctrl); } - ret = nvme_tcp_start_io_queues(ctrl); + /* + * If the number of queues has increased (reconnect case) + * start all new queues now. 
+ */ + ret = nvme_tcp_start_io_queues(ctrl, nr_queues, + ctrl->tagset->nr_hw_queues + 1); if (ret) - goto out_cleanup_connect_q; + goto out_wait_freeze_timed_out; return 0; +out_wait_freeze_timed_out: + nvme_quiesce_io_queues(ctrl); + nvme_sync_io_queues(ctrl); + nvme_tcp_stop_io_queues(ctrl); out_cleanup_connect_q: + nvme_cancel_tagset(ctrl); if (new) - blk_cleanup_queue(ctrl->connect_q); -out_free_tag_set: - if (new) - blk_mq_free_tag_set(ctrl->tagset); + nvme_remove_io_tag_set(ctrl); out_free_io_queues: nvme_tcp_free_io_queues(ctrl); return ret; } -static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove) -{ - nvme_tcp_stop_queue(ctrl, 0); - if (remove) { - blk_cleanup_queue(ctrl->admin_q); - blk_mq_free_tag_set(ctrl->admin_tagset); - } - nvme_tcp_free_admin_queue(ctrl); -} - static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) { int error; @@ -1632,50 +2241,42 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) return error; if (new) { - ctrl->admin_tagset = nvme_tcp_alloc_tagset(ctrl, true); - if (IS_ERR(ctrl->admin_tagset)) { - error = PTR_ERR(ctrl->admin_tagset); + error = nvme_alloc_admin_tag_set(ctrl, + &to_tcp_ctrl(ctrl)->admin_tag_set, + &nvme_tcp_admin_mq_ops, + sizeof(struct nvme_tcp_request)); + if (error) goto out_free_queue; - } - - ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset); - if (IS_ERR(ctrl->admin_q)) { - error = PTR_ERR(ctrl->admin_q); - goto out_free_tagset; - } } error = nvme_tcp_start_queue(ctrl, 0); if (error) - goto out_cleanup_queue; - - error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); - if (error) { - dev_err(ctrl->device, - "prop_get NVME_REG_CAP failed\n"); - goto out_stop_queue; - } + goto out_cleanup_tagset; - ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); + if (ctrl->opts->concat && !ctrl->tls_pskid) + return 0; - error = nvme_enable_ctrl(ctrl, ctrl->cap); + error = nvme_enable_ctrl(ctrl); if (error) goto out_stop_queue; - error = nvme_init_identify(ctrl); + nvme_unquiesce_admin_queue(ctrl); + + error = nvme_init_ctrl_finish(ctrl, false); if (error) - goto out_stop_queue; + goto out_quiesce_queue; return 0; +out_quiesce_queue: + nvme_quiesce_admin_queue(ctrl); + blk_sync_queue(ctrl->admin_q); out_stop_queue: nvme_tcp_stop_queue(ctrl, 0); -out_cleanup_queue: + nvme_cancel_admin_tagset(ctrl); +out_cleanup_tagset: if (new) - blk_cleanup_queue(ctrl->admin_q); -out_free_tagset: - if (new) - blk_mq_free_tag_set(ctrl->admin_tagset); + nvme_remove_admin_tag_set(ctrl); out_free_queue: nvme_tcp_free_admin_queue(ctrl); return error; @@ -1684,11 +2285,20 @@ out_free_queue: static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, bool remove) { - blk_mq_quiesce_queue(ctrl->admin_q); + nvme_quiesce_admin_queue(ctrl); + blk_sync_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); - blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl); - blk_mq_unquiesce_queue(ctrl->admin_q); - nvme_tcp_destroy_admin_queue(ctrl, remove); + nvme_cancel_admin_tagset(ctrl); + if (remove) { + nvme_unquiesce_admin_queue(ctrl); + nvme_remove_admin_tag_set(ctrl); + } + nvme_tcp_free_admin_queue(ctrl); + if (ctrl->tls_pskid) { + dev_dbg(ctrl->device, "Wipe negotiated TLS_PSK %08x\n", + ctrl->tls_pskid); + ctrl->tls_pskid = 0; + } } static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, @@ -1696,48 +2306,92 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, { if (ctrl->queue_count <= 1) return; - nvme_stop_queues(ctrl); + 
nvme_quiesce_io_queues(ctrl); + nvme_sync_io_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); - blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl); - if (remove) - nvme_start_queues(ctrl); - nvme_tcp_destroy_io_queues(ctrl, remove); + nvme_cancel_tagset(ctrl); + if (remove) { + nvme_unquiesce_io_queues(ctrl); + nvme_remove_io_tag_set(ctrl); + } + nvme_tcp_free_io_queues(ctrl); } -static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) +static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl, + int status) { + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); + /* If we are resetting/deleting then do nothing */ - if (ctrl->state != NVME_CTRL_CONNECTING) { - WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW || - ctrl->state == NVME_CTRL_LIVE); + if (state != NVME_CTRL_CONNECTING) { + WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE); return; } - if (nvmf_should_reconnect(ctrl)) { + if (nvmf_should_reconnect(ctrl, status)) { dev_info(ctrl->device, "Reconnecting in %d seconds...\n", ctrl->opts->reconnect_delay); queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work, ctrl->opts->reconnect_delay * HZ); } else { - dev_info(ctrl->device, "Removing controller...\n"); + dev_info(ctrl->device, "Removing controller (%d)...\n", + status); nvme_delete_ctrl(ctrl); } } +/* + * The TLS key is set by secure concatenation after negotiation has been + * completed on the admin queue. We need to revoke the key when: + * - concatenation is enabled (otherwise it's a static key set by the user) + * and + * - the generated key is present in ctrl->tls_key (otherwise there's nothing + * to revoke) + * and + * - a valid PSK key ID has been set in ctrl->tls_pskid (otherwise TLS + * negotiation has not run). + * + * We cannot always revoke the key as nvme_tcp_alloc_admin_queue() is called + * twice during secure concatenation, once on a 'normal' connection to run the + * DH-HMAC-CHAP negotiation (which generates the key, so it _must not_ be set), + * and once after the negotiation (which uses the key, so it _must_ be set). 
+ */ +static bool nvme_tcp_key_revoke_needed(struct nvme_ctrl *ctrl) +{ + return ctrl->opts->concat && ctrl->opts->tls_key && ctrl->tls_pskid; +} + static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) { struct nvmf_ctrl_options *opts = ctrl->opts; - int ret = -EINVAL; + int ret; ret = nvme_tcp_configure_admin_queue(ctrl, new); if (ret) return ret; + if (ctrl->opts->concat && !ctrl->tls_pskid) { + /* See comments for nvme_tcp_key_revoke_needed() */ + dev_dbg(ctrl->device, "restart admin queue for secure concatenation\n"); + nvme_stop_keep_alive(ctrl); + nvme_tcp_teardown_admin_queue(ctrl, false); + ret = nvme_tcp_configure_admin_queue(ctrl, false); + if (ret) + goto destroy_admin; + } + if (ctrl->icdoff) { + ret = -EOPNOTSUPP; dev_err(ctrl->device, "icdoff is not supported!\n"); goto destroy_admin; } + if (!nvme_ctrl_sgl_supported(ctrl)) { + ret = -EOPNOTSUPP; + dev_err(ctrl->device, "Mandatory sgls are not supported!\n"); + goto destroy_admin; + } + if (opts->queue_size > ctrl->sqsize + 1) dev_warn(ctrl->device, "queue_size %zu > ctrl sqsize %u, clamping down\n", @@ -1757,8 +2411,16 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) } if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + /* + * state change failure is ok if we started ctrl delete, + * unless we're during creation of a new controller to + * avoid races with teardown flow. + */ + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); + + WARN_ON_ONCE(state != NVME_CTRL_DELETING && + state != NVME_CTRL_DELETING_NOIO); + WARN_ON_ONCE(new); ret = -EINVAL; goto destroy_io; } @@ -1767,11 +2429,18 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) return 0; destroy_io: - if (ctrl->queue_count > 1) - nvme_tcp_destroy_io_queues(ctrl, new); + if (ctrl->queue_count > 1) { + nvme_quiesce_io_queues(ctrl); + nvme_sync_io_queues(ctrl); + nvme_tcp_stop_io_queues(ctrl); + nvme_cancel_tagset(ctrl); + if (new) + nvme_remove_io_tag_set(ctrl); + nvme_tcp_free_io_queues(ctrl); + } destroy_admin: - nvme_tcp_stop_queue(ctrl, 0); - nvme_tcp_destroy_admin_queue(ctrl, new); + nvme_stop_keep_alive(ctrl); + nvme_tcp_teardown_admin_queue(ctrl, new); return ret; } @@ -1780,23 +2449,25 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work) struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work), struct nvme_tcp_ctrl, connect_work); struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; + int ret; ++ctrl->nr_reconnects; - if (nvme_tcp_setup_ctrl(ctrl, false)) + ret = nvme_tcp_setup_ctrl(ctrl, false); + if (ret) goto requeue; - dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n", - ctrl->nr_reconnects); + dev_info(ctrl->device, "Successfully reconnected (attempt %d/%d)\n", + ctrl->nr_reconnects, ctrl->opts->max_reconnects); ctrl->nr_reconnects = 0; return; requeue: - dev_info(ctrl->device, "Failed reconnect attempt %d\n", - ctrl->nr_reconnects); - nvme_tcp_reconnect_or_remove(ctrl); + dev_info(ctrl->device, "Failed reconnect attempt %d/%d\n", + ctrl->nr_reconnects, ctrl->opts->max_reconnects); + nvme_tcp_reconnect_or_remove(ctrl, ret); } static void nvme_tcp_error_recovery_work(struct work_struct *work) @@ -1805,28 +2476,34 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) struct nvme_tcp_ctrl, err_work); struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl; + if (nvme_tcp_key_revoke_needed(ctrl)) + nvme_auth_revoke_tls_key(ctrl); nvme_stop_keep_alive(ctrl); + 
flush_work(&ctrl->async_event_work); nvme_tcp_teardown_io_queues(ctrl, false); /* unquiesce to fail fast pending requests */ - nvme_start_queues(ctrl); + nvme_unquiesce_io_queues(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); + nvme_unquiesce_admin_queue(ctrl); + nvme_auth_stop(ctrl); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); + + WARN_ON_ONCE(state != NVME_CTRL_DELETING && + state != NVME_CTRL_DELETING_NOIO); return; } - nvme_tcp_reconnect_or_remove(ctrl); + nvme_tcp_reconnect_or_remove(ctrl, 0); } static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) { nvme_tcp_teardown_io_queues(ctrl, shutdown); - if (shutdown) - nvme_shutdown_ctrl(ctrl); - else - nvme_disable_ctrl(ctrl, ctrl->cap); + nvme_quiesce_admin_queue(ctrl); + nvme_disable_ctrl(ctrl, shutdown); nvme_tcp_teardown_admin_queue(ctrl, shutdown); } @@ -1839,29 +2516,36 @@ static void nvme_reset_ctrl_work(struct work_struct *work) { struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, reset_work); + int ret; + if (nvme_tcp_key_revoke_needed(ctrl)) + nvme_auth_revoke_tls_key(ctrl); nvme_stop_ctrl(ctrl); nvme_tcp_teardown_ctrl(ctrl, false); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { - /* state change failure is ok if we're in DELETING state */ - WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING); + /* state change failure is ok if we started ctrl delete */ + enum nvme_ctrl_state state = nvme_ctrl_state(ctrl); + + WARN_ON_ONCE(state != NVME_CTRL_DELETING && + state != NVME_CTRL_DELETING_NOIO); return; } - if (nvme_tcp_setup_ctrl(ctrl, false)) + ret = nvme_tcp_setup_ctrl(ctrl, false); + if (ret) goto out_fail; return; out_fail: ++ctrl->nr_reconnects; - nvme_tcp_reconnect_or_remove(ctrl); + nvme_tcp_reconnect_or_remove(ctrl, ret); } static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl) { - cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work); + flush_work(&to_tcp_ctrl(ctrl)->err_work); cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); } @@ -1937,33 +2621,57 @@ static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg) ctrl->async_req.offset = 0; ctrl->async_req.curr_bio = NULL; ctrl->async_req.data_len = 0; + init_llist_node(&ctrl->async_req.lentry); + INIT_LIST_HEAD(&ctrl->async_req.entry); - nvme_tcp_queue_request(&ctrl->async_req); + nvme_tcp_queue_request(&ctrl->async_req, true); } -static enum blk_eh_timer_return -nvme_tcp_timeout(struct request *rq, bool reserved) +static void nvme_tcp_complete_timed_out(struct request *rq) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_tcp_ctrl *ctrl = req->queue->ctrl; - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; - dev_dbg(ctrl->ctrl.device, - "queue %d: timeout request %#x type %d\n", - nvme_tcp_queue_id(req->queue), rq->tag, - pdu->hdr.type); - - if (ctrl->ctrl.state != NVME_CTRL_LIVE) { - union nvme_result res = {}; + nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue)); + nvmf_complete_timed_out_request(rq); +} - nvme_req(rq)->flags |= NVME_REQ_CANCELLED; - nvme_end_request(rq, cpu_to_le16(NVME_SC_ABORT_REQ), res); +static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq) +{ + struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); + struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; + struct nvme_tcp_cmd_pdu *pdu = 
nvme_tcp_req_cmd_pdu(req); + struct nvme_command *cmd = &pdu->cmd; + int qid = nvme_tcp_queue_id(req->queue); + + dev_warn(ctrl->device, + "I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n", + rq->tag, nvme_cid(rq), pdu->hdr.type, cmd->common.opcode, + nvme_fabrics_opcode_str(qid, cmd), qid); + + if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) { + /* + * If we are resetting, connecting or deleting we should + * complete immediately because we may block controller + * teardown or setup sequence + * - ctrl disable/shutdown fabrics requests + * - connect requests + * - initialization admin requests + * - I/O requests that entered after unquiescing and + * the controller stopped responding + * + * All other requests should be cancelled by the error + * recovery work, so it's fine that we fail it here. + */ + nvme_tcp_complete_timed_out(rq); return BLK_EH_DONE; } - /* queue error recovery */ - nvme_tcp_error_recovery(&ctrl->ctrl); - + /* + * LIVE state should trigger the normal error recovery which will + * handle completing this request. + */ + nvme_tcp_error_recovery(ctrl); return BLK_EH_RESET_TIMER; } @@ -1971,13 +2679,15 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue, struct request *rq) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); struct nvme_command *c = &pdu->cmd; c->common.flags |= NVME_CMD_SGL_METABUF; - if (rq_data_dir(rq) == WRITE && req->data_len && - req->data_len <= nvme_tcp_inline_data_size(queue)) + if (!blk_rq_nr_phys_segments(rq)) + nvme_tcp_set_sg_null(c); + else if (rq_data_dir(rq) == WRITE && + req->data_len <= nvme_tcp_inline_data_size(req)) nvme_tcp_set_sg_inline(queue, c, req->data_len); else nvme_tcp_set_sg_host_data(c, req->data_len); @@ -1989,28 +2699,31 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, struct request *rq) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); struct nvme_tcp_queue *queue = req->queue; u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0; blk_status_t ret; - ret = nvme_setup_cmd(ns, rq, &pdu->cmd); + ret = nvme_setup_cmd(ns, rq); if (ret) return ret; req->state = NVME_TCP_SEND_CMD_PDU; + req->status = cpu_to_le16(NVME_SC_SUCCESS); req->offset = 0; req->data_sent = 0; req->pdu_len = 0; req->pdu_sent = 0; - req->data_len = blk_rq_payload_bytes(rq); + req->h2cdata_left = 0; + req->data_len = blk_rq_nr_phys_segments(rq) ? 
+ blk_rq_payload_bytes(rq) : 0; req->curr_bio = rq->bio; + if (req->curr_bio && req->data_len) + nvme_tcp_init_iter(req, rq_data_dir(rq)); if (rq_data_dir(rq) == WRITE && - req->data_len <= nvme_tcp_inline_data_size(queue)) + req->data_len <= nvme_tcp_inline_data_size(req)) req->pdu_len = req->data_len; - else if (req->curr_bio) - nvme_tcp_init_iter(req, READ); pdu->hdr.type = nvme_tcp_cmd; pdu->hdr.flags = 0; @@ -2027,6 +2740,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, ret = nvme_tcp_map_data(queue, rq); if (unlikely(ret)) { + nvme_cleanup_cmd(rq); dev_err(queue->ctrl->ctrl.device, "Failed to map data (%d)\n", ret); return ret; @@ -2035,6 +2749,14 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, return 0; } +static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_tcp_queue *queue = hctx->driver_data; + + if (!llist_empty(&queue->req_list)) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); +} + static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { @@ -2045,54 +2767,83 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx, bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags); blk_status_t ret; - if (!nvmf_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) - return nvmf_fail_nonready_command(&queue->ctrl->ctrl, rq); + if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready)) + return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq); ret = nvme_tcp_setup_cmd_pdu(ns, rq); if (unlikely(ret)) return ret; - blk_mq_start_request(rq); + nvme_start_request(rq); - nvme_tcp_queue_request(req); + nvme_tcp_queue_request(req, bd->last); return BLK_STS_OK; } -static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) +static void nvme_tcp_map_queues(struct blk_mq_tag_set *set) { - struct nvme_tcp_ctrl *ctrl = set->driver_data; + struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data); - set->map[HCTX_TYPE_DEFAULT].queue_offset = 0; - set->map[HCTX_TYPE_READ].nr_queues = ctrl->ctrl.opts->nr_io_queues; - if (ctrl->ctrl.opts->nr_write_queues) { - /* separate read/write queues */ - set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->ctrl.opts->nr_write_queues; - set->map[HCTX_TYPE_READ].queue_offset = - ctrl->ctrl.opts->nr_write_queues; - } else { - /* mixed read/write queues */ - set->map[HCTX_TYPE_DEFAULT].nr_queues = - ctrl->ctrl.opts->nr_io_queues; - set->map[HCTX_TYPE_READ].queue_offset = 0; + nvmf_map_queues(set, &ctrl->ctrl, ctrl->io_queues); +} + +static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) +{ + struct nvme_tcp_queue *queue = hctx->driver_data; + struct sock *sk = queue->sock->sk; + int ret; + + if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags)) + return 0; + + set_bit(NVME_TCP_Q_POLLING, &queue->flags); + if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue)) + sk_busy_loop(sk, true); + ret = nvme_tcp_try_recv(queue); + clear_bit(NVME_TCP_Q_POLLING, &queue->flags); + return ret < 0 ? 
ret : queue->nr_cqe; +} + +static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size) +{ + struct nvme_tcp_queue *queue = &to_tcp_ctrl(ctrl)->queues[0]; + struct sockaddr_storage src_addr; + int ret, len; + + len = nvmf_get_address(ctrl, buf, size); + + if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags)) + return len; + + mutex_lock(&queue->queue_lock); + + ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr); + if (ret > 0) { + if (len > 0) + len--; /* strip trailing newline */ + len += scnprintf(buf + len, size - len, "%ssrc_addr=%pISc\n", + (len) ? "," : "", &src_addr); } - blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); - blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); - return 0; + + mutex_unlock(&queue->queue_lock); + + return len; } -static struct blk_mq_ops nvme_tcp_mq_ops = { +static const struct blk_mq_ops nvme_tcp_mq_ops = { .queue_rq = nvme_tcp_queue_rq, + .commit_rqs = nvme_tcp_commit_rqs, .complete = nvme_complete_rq, .init_request = nvme_tcp_init_request, .exit_request = nvme_tcp_exit_request, .init_hctx = nvme_tcp_init_hctx, .timeout = nvme_tcp_timeout, .map_queues = nvme_tcp_map_queues, + .poll = nvme_tcp_poll, }; -static struct blk_mq_ops nvme_tcp_admin_mq_ops = { +static const struct blk_mq_ops nvme_tcp_admin_mq_ops = { .queue_rq = nvme_tcp_queue_rq, .complete = nvme_complete_rq, .init_request = nvme_tcp_init_request, @@ -2104,15 +2855,17 @@ static struct blk_mq_ops nvme_tcp_admin_mq_ops = { static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = { .name = "tcp", .module = THIS_MODULE, - .flags = NVME_F_FABRICS, + .flags = NVME_F_FABRICS | NVME_F_BLOCKING, .reg_read32 = nvmf_reg_read32, .reg_read64 = nvmf_reg_read64, .reg_write32 = nvmf_reg_write32, + .subsystem_reset = nvmf_subsystem_reset, .free_ctrl = nvme_tcp_free_ctrl, .submit_async_event = nvme_tcp_submit_async_event, .delete_ctrl = nvme_tcp_delete_ctrl, - .get_address = nvmf_get_address, + .get_address = nvme_tcp_get_address, .stop_ctrl = nvme_tcp_stop_ctrl, + .get_virt_boundary = nvmf_get_virt_boundary, }; static bool @@ -2132,7 +2885,7 @@ nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts) return found; } -static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, +static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { struct nvme_tcp_ctrl *ctrl; @@ -2144,7 +2897,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, INIT_LIST_HEAD(&ctrl->list); ctrl->ctrl.opts = opts; - ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1; + ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + + opts->nr_poll_queues + 1; ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; @@ -2181,6 +2935,15 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, } } + if (opts->mask & NVMF_OPT_HOST_IFACE) { + if (!__dev_get_by_name(&init_net, opts->host_iface)) { + pr_err("invalid interface passed: %s\n", + opts->host_iface); + ret = -ENODEV; + goto out_free_ctrl; + } + } + if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) { ret = -EALREADY; goto out_free_ctrl; @@ -2197,6 +2960,28 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, if (ret) goto out_kfree_queues; + return ctrl; +out_kfree_queues: + kfree(ctrl->queues); +out_free_ctrl: + kfree(ctrl); + return ERR_PTR(ret); +} + +static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_tcp_ctrl *ctrl; + int ret; + + ctrl = 
nvme_tcp_alloc_ctrl(dev, opts); + if (IS_ERR(ctrl)) + return ERR_CAST(ctrl); + + ret = nvme_add_ctrl(&ctrl->ctrl); + if (ret) + goto out_put_ctrl; + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { WARN_ON_ONCE(1); ret = -EINTR; @@ -2207,10 +2992,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, if (ret) goto out_uninit_ctrl; - dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", - ctrl->ctrl.opts->subsysnqn, &ctrl->addr); - - nvme_get_ctrl(&ctrl->ctrl); + dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp, hostnqn: %s\n", + nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr, opts->host->nqn); mutex_lock(&nvme_tcp_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list); @@ -2220,15 +3003,11 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, out_uninit_ctrl: nvme_uninit_ctrl(&ctrl->ctrl); +out_put_ctrl: nvme_put_ctrl(&ctrl->ctrl); if (ret > 0) ret = -EIO; return ERR_PTR(ret); -out_kfree_queues: - kfree(ctrl->queues); -out_free_ctrl: - kfree(ctrl); - return ERR_PTR(ret); } static struct nvmf_transport_ops nvme_tcp_transport = { @@ -2238,17 +3017,36 @@ static struct nvmf_transport_ops nvme_tcp_transport = { .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | - NVMF_OPT_NR_WRITE_QUEUES, + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | + NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS | + NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY | NVMF_OPT_CONCAT, .create_ctrl = nvme_tcp_create_ctrl, }; static int __init nvme_tcp_init_module(void) { - nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", - WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS; + int cpu; + + BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8); + BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72); + BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24); + BUILD_BUG_ON(sizeof(struct nvme_tcp_rsp_pdu) != 24); + BUILD_BUG_ON(sizeof(struct nvme_tcp_r2t_pdu) != 24); + BUILD_BUG_ON(sizeof(struct nvme_tcp_icreq_pdu) != 128); + BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128); + BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24); + + if (wq_unbound) + wq_flags |= WQ_UNBOUND; + + nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0); if (!nvme_tcp_wq) return -ENOMEM; + for_each_possible_cpu(cpu) + atomic_set(&nvme_tcp_cpu_queues[cpu], 0); + nvmf_register_transport(&nvme_tcp_transport); return 0; } @@ -2271,4 +3069,5 @@ static void __exit nvme_tcp_cleanup_module(void) module_init(nvme_tcp_init_module); module_exit(nvme_tcp_cleanup_module); +MODULE_DESCRIPTION("NVMe host TCP transport driver"); MODULE_LICENSE("GPL v2"); |
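
The module-init hunk above adds BUILD_BUG_ON() checks that pin the on-wire sizes of the NVMe/TCP PDUs (8-byte common header, 24-byte data/rsp/r2t/term PDUs, 128-byte ICReq/ICResp). As a hedged illustration of why such compile-time checks are useful for a wire protocol, the sketch below defines only the 8-byte common header per the NVMe/TCP specification and asserts its size with _Static_assert; the struct name is invented for the example and is not the kernel's struct nvme_tcp_hdr.

/* Minimal sketch: compile-time check of the NVMe/TCP common PDU header.
 * Field layout follows the NVMe/TCP spec (type, flags, hlen, pdo, plen);
 * the struct name is illustrative, not the kernel definition. */
#include <stdint.h>

struct tcp_pdu_hdr_example {
	uint8_t  type;   /* PDU type: ICReq, CapsuleCmd, C2HData, R2T, ... */
	uint8_t  flags;  /* header/data digest flags, LAST_PDU, etc. */
	uint8_t  hlen;   /* PDU header length */
	uint8_t  pdo;    /* PDU data offset */
	uint32_t plen;   /* total PDU length, including digests */
};

/* Mirrors the BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8) idea:
 * if padding ever changes the wire size, the build fails early. */
_Static_assert(sizeof(struct tcp_pdu_hdr_example) == 8,
	       "NVMe/TCP common header must be 8 bytes on the wire");

int main(void) { return 0; }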
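
nvme_tcp_poll() in the hunk above calls sk_busy_loop() when the socket receive queue is empty, so completions can be reaped by polling instead of waiting for an interrupt. That in-kernel API has no direct userspace equivalent, but the same NAPI busy-poll mechanism can be requested per socket with SO_BUSY_POLL. The helper below is a hedged userspace analogue; the function name and the 50 µs budget are arbitrary example choices, not values taken from the driver.

/* Userspace analogue of the driver's sk_busy_loop() usage: ask the kernel
 * to busy-poll the device queue for up to 'usec' microseconds before
 * sleeping on this socket. Needs a NIC driver with busy-poll support and,
 * on older kernels, CAP_NET_ADMIN. */
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/socket.h>

#ifndef SO_BUSY_POLL
#define SO_BUSY_POLL 46		/* asm-generic value, for older headers */
#endif

static int enable_busy_poll(int fd, int usec)
{
	if (setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usec, sizeof(usec)) < 0) {
		fprintf(stderr, "SO_BUSY_POLL: %s\n", strerror(errno));
		return -1;
	}
	return 0;
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	return enable_busy_poll(fd, 50) ? 1 : 0;	/* 50 µs polling budget */
}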
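
nvme_tcp_reconnect_or_remove() and nvme_tcp_reconnect_ctrl_work() above implement a simple policy: requeue the connect work after reconnect_delay seconds and delete the controller once nvmf_should_reconnect() reports the attempt budget is spent. Below is a compressed, hedged sketch of that policy; the names are invented and a synchronous loop stands in for the delayed work item the driver actually uses.

/* Sketch of the reconnect-or-remove policy: retry with a fixed delay until
 * the attempt budget is exhausted, then give up. Illustrative only. */
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

struct fake_ctrl {
	int nr_reconnects;
	int max_reconnects;
	unsigned int reconnect_delay;	/* seconds */
};

static bool try_connect(struct fake_ctrl *ctrl)
{
	/* stand-in for nvme_tcp_setup_ctrl(ctrl, false) */
	(void)ctrl;
	return false;
}

int main(void)
{
	struct fake_ctrl ctrl = { .max_reconnects = 3, .reconnect_delay = 1 };

	while (++ctrl.nr_reconnects <= ctrl.max_reconnects) {
		if (try_connect(&ctrl)) {
			printf("reconnected after %d attempt(s)\n",
			       ctrl.nr_reconnects);
			return 0;
		}
		printf("attempt %d/%d failed, retrying in %us\n",
		       ctrl.nr_reconnects, ctrl.max_reconnects,
		       ctrl.reconnect_delay);
		sleep(ctrl.reconnect_delay);
	}
	printf("removing controller\n");
	return 1;
}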
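
nvme_tcp_get_address() above extends the generic fabrics address string with the connection's source address, fetched with kernel_getsockname() under queue_lock on the admin queue's socket. The userspace counterpart of that lookup is getsockname() plus inet_ntop(); the sketch below is a hedged illustration of the pattern, not the driver's formatting code.

/* Print the local (source) address of a socket, the userspace counterpart
 * of the kernel_getsockname() call behind the src_addr= attribute. */
#include <stdio.h>
#include <arpa/inet.h>
#include <sys/socket.h>

static int print_src_addr(int fd)
{
	struct sockaddr_storage ss;
	socklen_t len = sizeof(ss);
	char buf[INET6_ADDRSTRLEN];

	if (getsockname(fd, (struct sockaddr *)&ss, &len) < 0)
		return -1;

	if (ss.ss_family == AF_INET) {
		struct sockaddr_in *sin = (struct sockaddr_in *)&ss;

		inet_ntop(AF_INET, &sin->sin_addr, buf, sizeof(buf));
	} else {
		struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&ss;

		inet_ntop(AF_INET6, &sin6->sin6_addr, buf, sizeof(buf));
	}
	printf("src_addr=%s\n", buf);
	return 0;
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	return print_src_addr(fd) ? 1 : 0;	/* prints 0.0.0.0 until connected */
}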
