diff options
Diffstat (limited to 'net/9p/trans_rdma.c')
| -rw-r--r-- | net/9p/trans_rdma.c | 387 |
1 files changed, 150 insertions, 237 deletions
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index 928f2bb9bf8d..4d406479f83b 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -1,6 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * linux/fs/9p/trans_rdma.c - * * RDMA transport layer based on the trans_fd.c implementation. * * Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com> @@ -8,22 +7,6 @@ * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net> * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -38,25 +21,21 @@ #include <linux/un.h> #include <linux/uaccess.h> #include <linux/inet.h> -#include <linux/idr.h> #include <linux/file.h> -#include <linux/parser.h> +#include <linux/fs_context.h> #include <linux/semaphore.h> #include <linux/slab.h> +#include <linux/seq_file.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> -#define P9_PORT 5640 -#define P9_RDMA_SQ_DEPTH 32 -#define P9_RDMA_RQ_DEPTH 32 #define P9_RDMA_SEND_SGE 4 #define P9_RDMA_RECV_SGE 4 #define P9_RDMA_IRD 0 #define P9_RDMA_ORD 0 -#define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ #define P9_RDMA_MAXSIZE (1024*1024) /* 1MB */ /** @@ -67,9 +46,9 @@ * @pd: Protection Domain pointer * @qp: Queue Pair pointer * @cq: Completion Queue pointer - * @dm_mr: DMA Memory Region pointer - * @lkey: The local access only memory region key * @timeout: Number of uSecs to wait for connection management events + * @privport: Whether a privileged port may be used + * @port: The port to use * @sq_depth: The depth of the Send Queue * @sq_sem: Semaphore for the SQ * @rq_depth: The depth of the Receive Queue. @@ -94,9 +73,9 @@ struct p9_trans_rdma { struct ib_pd *pd; struct ib_qp *qp; struct ib_cq *cq; - struct ib_mr *dma_mr; - u32 lkey; long timeout; + bool privport; + u16 port; int sq_depth; struct semaphore sq_sem; int rq_depth; @@ -108,117 +87,39 @@ struct p9_trans_rdma { struct completion cm_done; }; +struct p9_rdma_req; + /** - * p9_rdma_context - Keeps track of in-process WR + * struct p9_rdma_context - Keeps track of in-process WR * - * @wc_op: The original WR op for when the CQE completes in error. + * @cqe: completion queue entry * @busa: Bus address to unmap when the WR completes * @req: Keeps track of requests (send) * @rc: Keepts track of replies (receive) */ -struct p9_rdma_req; struct p9_rdma_context { - enum ib_wc_opcode wc_op; + struct ib_cqe cqe; dma_addr_t busa; union { struct p9_req_t *req; - struct p9_fcall *rc; + struct p9_fcall rc; }; }; -/** - * p9_rdma_opts - Collection of mount options - * @port: port of connection - * @sq_depth: The requested depth of the SQ. This really doesn't need - * to be any deeper than the number of threads used in the client - * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth - * @timeout: Time to wait in msecs for CM events - */ -struct p9_rdma_opts { - short port; - int sq_depth; - int rq_depth; - long timeout; -}; - -/* - * Option Parsing (code inspired by NFS code) - */ -enum { - /* Options that take integer arguments */ - Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, Opt_err, -}; - -static match_table_t tokens = { - {Opt_port, "port=%u"}, - {Opt_sq_depth, "sq=%u"}, - {Opt_rq_depth, "rq=%u"}, - {Opt_timeout, "timeout=%u"}, - {Opt_err, NULL}, -}; - -/** - * parse_opts - parse mount options into rdma options structure - * @params: options string passed from mount - * @opts: rdma transport-specific structure to parse options into - * - * Returns 0 upon success, -ERRNO upon failure - */ -static int parse_opts(char *params, struct p9_rdma_opts *opts) +static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - char *options, *tmp_options; - - opts->port = P9_PORT; - opts->sq_depth = P9_RDMA_SQ_DEPTH; - opts->rq_depth = P9_RDMA_RQ_DEPTH; - opts->timeout = P9_RDMA_TIMEOUT; - - if (!params) - return 0; - - tmp_options = kstrdup(params, GFP_KERNEL); - if (!tmp_options) { - p9_debug(P9_DEBUG_ERROR, - "failed to allocate copy of option string\n"); - return -ENOMEM; - } - options = tmp_options; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - int r; - if (!*p) - continue; - token = match_token(p, tokens, args); - r = match_int(&args[0], &option); - if (r < 0) { - p9_debug(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); - continue; - } - switch (token) { - case Opt_port: - opts->port = option; - break; - case Opt_sq_depth: - opts->sq_depth = option; - break; - case Opt_rq_depth: - opts->rq_depth = option; - break; - case Opt_timeout: - opts->timeout = option; - break; - default: - continue; - } - } - /* RQ must be at least as large as the SQ */ - opts->rq_depth = max(opts->rq_depth, opts->sq_depth); - kfree(tmp_options); + struct p9_trans_rdma *rdma = clnt->trans; + + if (rdma->port != P9_RDMA_PORT) + seq_printf(m, ",port=%u", rdma->port); + if (rdma->sq_depth != P9_RDMA_SQ_DEPTH) + seq_printf(m, ",sq=%u", rdma->sq_depth); + if (rdma->rq_depth != P9_RDMA_RQ_DEPTH) + seq_printf(m, ",rq=%u", rdma->rq_depth); + if (rdma->timeout != P9_RDMA_TIMEOUT) + seq_printf(m, ",timeout=%lu", rdma->timeout); + if (rdma->privport) + seq_puts(m, ",privport"); return 0; } @@ -246,8 +147,7 @@ p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DISCONNECTED: if (rdma) rdma->state = P9_RDMA_CLOSED; - if (c) - c->status = Disconnected; + c->status = Disconnected; break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: @@ -275,9 +175,12 @@ p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) } static void -handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, - struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) +recv_done(struct ib_cq *cq, struct ib_wc *wc) { + struct p9_client *client = cq->cq_context; + struct p9_trans_rdma *rdma = client->trans; + struct p9_rdma_context *c = + container_of(wc->wr_cqe, struct p9_rdma_context, cqe); struct p9_req_t *req; int err = 0; int16_t tag; @@ -286,10 +189,11 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize, DMA_FROM_DEVICE); - if (status != IB_WC_SUCCESS) + if (wc->status != IB_WC_SUCCESS) goto err_out; - err = p9_parse_header(c->rc, NULL, NULL, &tag, 1); + c->rc.size = wc->byte_len; + err = p9_parse_header(&c->rc, NULL, NULL, &tag, 1); if (err) goto err_out; @@ -299,30 +203,42 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma, /* Check that we have not yet received a reply for this request. */ - if (unlikely(req->rc)) { + if (unlikely(req->rc.sdata)) { pr_err("Duplicate reply for request %d", tag); goto err_out; } - req->rc = c->rc; - req->status = REQ_STATUS_RCVD; - p9_client_cb(client, req); + req->rc.size = c->rc.size; + req->rc.sdata = c->rc.sdata; + p9_client_cb(client, req, REQ_STATUS_RCVD); + out: + up(&rdma->rq_sem); + kfree(c); return; err_out: - p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", req, err, status); + p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", + req, err, wc->status); rdma->state = P9_RDMA_FLUSHING; client->status = Disconnected; + goto out; } static void -handle_send(struct p9_client *client, struct p9_trans_rdma *rdma, - struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len) +send_done(struct ib_cq *cq, struct ib_wc *wc) { + struct p9_client *client = cq->cq_context; + struct p9_trans_rdma *rdma = client->trans; + struct p9_rdma_context *c = + container_of(wc->wr_cqe, struct p9_rdma_context, cqe); + ib_dma_unmap_single(rdma->cm_id->device, - c->busa, c->req->tc->size, + c->busa, c->req->tc.size, DMA_TO_DEVICE); + up(&rdma->sq_sem); + p9_req_put(client, c->req); + kfree(c); } static void qp_event_handler(struct ib_event *event, void *context) @@ -331,50 +247,11 @@ static void qp_event_handler(struct ib_event *event, void *context) event->event, context); } -static void cq_comp_handler(struct ib_cq *cq, void *cq_context) -{ - struct p9_client *client = cq_context; - struct p9_trans_rdma *rdma = client->trans; - int ret; - struct ib_wc wc; - - ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); - while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) { - struct p9_rdma_context *c = (void *) (unsigned long) wc.wr_id; - - switch (c->wc_op) { - case IB_WC_RECV: - handle_recv(client, rdma, c, wc.status, wc.byte_len); - up(&rdma->rq_sem); - break; - - case IB_WC_SEND: - handle_send(client, rdma, c, wc.status, wc.byte_len); - up(&rdma->sq_sem); - break; - - default: - pr_err("unexpected completion type, c->wc_op=%d, wc.opcode=%d, status=%d\n", - c->wc_op, wc.opcode, wc.status); - break; - } - kfree(c); - } -} - -static void cq_event_handler(struct ib_event *e, void *v) -{ - p9_debug(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v); -} - static void rdma_destroy_trans(struct p9_trans_rdma *rdma) { if (!rdma) return; - if (rdma->dma_mr && !IS_ERR(rdma->dma_mr)) - ib_dereg_mr(rdma->dma_mr); - if (rdma->qp && !IS_ERR(rdma->qp)) ib_destroy_qp(rdma->qp); @@ -382,7 +259,7 @@ static void rdma_destroy_trans(struct p9_trans_rdma *rdma) ib_dealloc_pd(rdma->pd); if (rdma->cq && !IS_ERR(rdma->cq)) - ib_destroy_cq(rdma->cq); + ib_free_cq(rdma->cq); if (rdma->cm_id && !IS_ERR(rdma->cm_id)) rdma_destroy_id(rdma->cm_id); @@ -394,25 +271,32 @@ static int post_recv(struct p9_client *client, struct p9_rdma_context *c) { struct p9_trans_rdma *rdma = client->trans; - struct ib_recv_wr wr, *bad_wr; + struct ib_recv_wr wr; struct ib_sge sge; + int ret; c->busa = ib_dma_map_single(rdma->cm_id->device, - c->rc->sdata, client->msize, + c->rc.sdata, client->msize, DMA_FROM_DEVICE); if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) goto error; + c->cqe.done = recv_done; + sge.addr = c->busa; sge.length = client->msize; - sge.lkey = rdma->lkey; + sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; - c->wc_op = IB_WC_RECV; - wr.wr_id = (unsigned long) c; + wr.wr_cqe = &c->cqe; wr.sg_list = &sge; wr.num_sge = 1; - return ib_post_recv(rdma->qp, &wr, &bad_wr); + + ret = ib_post_recv(rdma->qp, &wr, NULL); + if (ret) + ib_dma_unmap_single(rdma->cm_id->device, c->busa, + client->msize, DMA_FROM_DEVICE); + return ret; error: p9_debug(P9_DEBUG_ERROR, "EIO\n"); @@ -422,7 +306,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c) static int rdma_request(struct p9_client *client, struct p9_req_t *req) { struct p9_trans_rdma *rdma = client->trans; - struct ib_send_wr wr, *bad_wr; + struct ib_send_wr wr; struct ib_sge sge; int err = 0; unsigned long flags; @@ -439,9 +323,9 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) **/ if (unlikely(atomic_read(&rdma->excess_rc) > 0)) { if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) { - /* Got one ! */ - kfree(req->rc); - req->rc = NULL; + /* Got one! */ + p9_fcall_fini(&req->rc); + req->rc.sdata = NULL; goto dont_need_post_recv; } else { /* We raced and lost. */ @@ -455,7 +339,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) err = -ENOMEM; goto recv_error; } - rpl_context->rc = req->rc; + rpl_context->rc.sdata = req->rc.sdata; /* * Post a receive buffer for this request. We need to ensure @@ -471,11 +355,11 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) err = post_recv(client, rpl_context); if (err) { - p9_debug(P9_DEBUG_FCALL, "POST RECV failed\n"); + p9_debug(P9_DEBUG_ERROR, "POST RECV failed: %d\n", err); goto recv_error; } /* remove posted receive buffer from request structure */ - req->rc = NULL; + req->rc.sdata = NULL; dont_need_post_recv: /* Post the request */ @@ -487,20 +371,21 @@ dont_need_post_recv: c->req = req; c->busa = ib_dma_map_single(rdma->cm_id->device, - c->req->tc->sdata, c->req->tc->size, + c->req->tc.sdata, c->req->tc.size, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) { err = -EIO; goto send_error; } + c->cqe.done = send_done; + sge.addr = c->busa; - sge.length = c->req->tc->size; - sge.lkey = rdma->lkey; + sge.length = c->req->tc.size; + sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; - c->wc_op = IB_WC_SEND; - wr.wr_id = (unsigned long) c; + wr.wr_cqe = &c->cqe; wr.opcode = IB_WR_SEND; wr.send_flags = IB_SEND_SIGNALED; wr.sg_list = &sge; @@ -508,18 +393,27 @@ dont_need_post_recv: if (down_interruptible(&rdma->sq_sem)) { err = -EINTR; - goto send_error; + goto dma_unmap; } - err = ib_post_send(rdma->qp, &wr, &bad_wr); + /* Mark request as `sent' *before* we actually send it, + * because doing if after could erase the REQ_STATUS_RCVD + * status in case of a very fast reply. + */ + WRITE_ONCE(req->status, REQ_STATUS_SENT); + err = ib_post_send(rdma->qp, &wr, NULL); if (err) - goto send_error; + goto dma_unmap; /* Success */ return 0; +dma_unmap: + ib_dma_unmap_single(rdma->cm_id->device, c->busa, + c->req->tc.size, DMA_TO_DEVICE); /* Handle errors that happened during or while preparing the send: */ send_error: + WRITE_ONCE(req->status, REQ_STATUS_ERROR); kfree(c); p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err); @@ -533,7 +427,7 @@ dont_need_post_recv: recv_error: kfree(rpl_context); spin_lock_irqsave(&rdma->req_lock, flags); - if (rdma->state < P9_RDMA_CLOSING) { + if (err != -EINTR && rdma->state < P9_RDMA_CLOSING) { rdma->state = P9_RDMA_CLOSING; spin_unlock_irqrestore(&rdma->req_lock, flags); rdma_disconnect(rdma->cm_id); @@ -570,6 +464,8 @@ static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) if (!rdma) return NULL; + rdma->port = opts->port; + rdma->privport = opts->privport; rdma->sq_depth = opts->sq_depth; rdma->rq_depth = opts->rq_depth; rdma->timeout = opts->timeout; @@ -582,9 +478,11 @@ static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts) return rdma; } -/* its not clear to me we can do anything after send has been posted */ static int rdma_cancel(struct p9_client *client, struct p9_req_t *req) { + /* Nothing to do here. + * We will take care of it (if we have to) in rdma_cancelled() + */ return 1; } @@ -594,31 +492,48 @@ static int rdma_cancel(struct p9_client *client, struct p9_req_t *req) static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req) { struct p9_trans_rdma *rdma = client->trans; - atomic_inc(&rdma->excess_rc); return 0; } +static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma) +{ + struct sockaddr_in cl = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_ANY), + }; + int port, err = -EINVAL; + + for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) { + cl.sin_port = htons((ushort)port); + err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl); + if (err != -EADDRINUSE) + break; + } + return err; +} + /** - * trans_create_rdma - Transport method for creating atransport instance + * rdma_create_trans - Transport method for creating a transport instance * @client: client instance - * @addr: IP address string - * @args: Mount options string + * @fc: The filesystem context */ static int -rdma_create_trans(struct p9_client *client, const char *addr, char *args) +rdma_create_trans(struct p9_client *client, struct fs_context *fc) { + const char *addr = fc->source; + struct v9fs_context *ctx = fc->fs_private; + struct p9_rdma_opts opts = ctx->rdma_opts; int err; - struct p9_rdma_opts opts; struct p9_trans_rdma *rdma; struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; - struct ib_device_attr devattr; - /* Parse the transport specific mount options */ - err = parse_opts(args, &opts); - if (err < 0) - return err; + if (addr == NULL) + return -EINVAL; + + /* options are already parsed, in the fs context */ + opts = ctx->rdma_opts; /* Create and initialize the RDMA transport structure */ rdma = alloc_rdma(&opts); @@ -626,14 +541,24 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) return -ENOMEM; /* Create the RDMA CM ID */ - rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP, - IB_QPT_RC); + rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client, + RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(rdma->cm_id)) goto error; /* Associate the client with the transport */ client->trans = rdma; + /* Bind to a privileged port if we need to */ + if (opts.privport) { + err = p9_rdma_bind_privport(rdma); + if (err < 0) { + pr_err("%s (%d): problem binding to privport: %d\n", + __func__, task_pid_nr(current), -err); + goto error; + } + } + /* Resolve the server's address */ rdma->addr.sin_family = AF_INET; rdma->addr.sin_addr.s_addr = in_aton(addr); @@ -655,35 +580,18 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED)) goto error; - /* Query the device attributes */ - err = ib_query_device(rdma->cm_id->device, &devattr); - if (err) - goto error; - /* Create the Completion Queue */ - rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler, - cq_event_handler, client, - opts.sq_depth + opts.rq_depth + 1, 0); + rdma->cq = ib_alloc_cq_any(rdma->cm_id->device, client, + opts.sq_depth + opts.rq_depth + 1, + IB_POLL_SOFTIRQ); if (IS_ERR(rdma->cq)) goto error; - ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP); /* Create the Protection Domain */ - rdma->pd = ib_alloc_pd(rdma->cm_id->device); + rdma->pd = ib_alloc_pd(rdma->cm_id->device, 0); if (IS_ERR(rdma->pd)) goto error; - /* Cache the DMA lkey in the transport */ - rdma->dma_mr = NULL; - if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) - rdma->lkey = rdma->cm_id->device->local_dma_lkey; - else { - rdma->dma_mr = ib_get_dma_mr(rdma->pd, IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(rdma->dma_mr)) - goto error; - rdma->lkey = rdma->dma_mr->lkey; - } - /* Create the Queue Pair */ memset(&qp_attr, 0, sizeof qp_attr); qp_attr.event_handler = qp_event_handler; @@ -726,12 +634,16 @@ error: static struct p9_trans_module p9_rdma_trans = { .name = "rdma", .maxsize = P9_RDMA_MAXSIZE, - .def = 0, + .pooled_rbuffers = true, + .def = false, + .supports_vmalloc = false, .owner = THIS_MODULE, .create = rdma_create_trans, .close = rdma_close, .request = rdma_request, .cancel = rdma_cancel, + .cancelled = rdma_cancelled, + .show_options = p9_rdma_show_options, }; /** @@ -750,6 +662,7 @@ static void __exit p9_trans_rdma_exit(void) module_init(p9_trans_rdma_init); module_exit(p9_trans_rdma_exit); +MODULE_ALIAS_9P("rdma"); MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>"); MODULE_DESCRIPTION("RDMA Transport for 9P"); |
