Diffstat (limited to 'net')
624 files changed, 18328 insertions, 12285 deletions
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index fda3a80e9340..2b74ed56eb16 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -193,6 +193,8 @@ int register_vlan_dev(struct net_device *dev, struct netlink_ext_ack *extack) vlan_group_set_device(grp, vlan->vlan_proto, vlan_id, dev); grp->nr_vlan_devs++; + netdev_update_features(dev); + return 0; out_unregister_netdev: diff --git a/net/9p/client.c b/net/9p/client.c index 5c1ca57ccd28..f60d1d041adb 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -20,8 +20,8 @@ #include <linux/uio.h> #include <linux/netfs.h> #include <net/9p/9p.h> -#include <linux/parser.h> #include <linux/seq_file.h> +#include <linux/fs_context.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include "protocol.h" @@ -29,32 +29,10 @@ #define CREATE_TRACE_POINTS #include <trace/events/9p.h> -/* DEFAULT MSIZE = 32 pages worth of payload + P9_HDRSZ + - * room for write (16 extra) or read (11 extra) operands. - */ - -#define DEFAULT_MSIZE ((128 * 1024) + P9_IOHDRSZ) - /* Client Option Parsing (code inspired by NFS code) * - a little lazy - parse all client options */ -enum { - Opt_msize, - Opt_trans, - Opt_legacy, - Opt_version, - Opt_err, -}; - -static const match_table_t tokens = { - {Opt_msize, "msize=%u"}, - {Opt_legacy, "noextend"}, - {Opt_trans, "trans=%s"}, - {Opt_version, "version=%s"}, - {Opt_err, NULL}, -}; - inline int p9_is_proto_dotl(struct p9_client *clnt) { return clnt->proto_version == p9_proto_2000L; @@ -103,124 +81,16 @@ static int safe_errno(int err) return err; } -/* Interpret mount option for protocol version */ -static int get_protocol_version(char *s) +static int apply_client_options(struct p9_client *clnt, struct fs_context *fc) { - int version = -EINVAL; - - if (!strcmp(s, "9p2000")) { - version = p9_proto_legacy; - p9_debug(P9_DEBUG_9P, "Protocol version: Legacy\n"); - } else if (!strcmp(s, "9p2000.u")) { - version = p9_proto_2000u; - p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.u\n"); - } else if (!strcmp(s, "9p2000.L")) { - version = p9_proto_2000L; - p9_debug(P9_DEBUG_9P, "Protocol version: 9P2000.L\n"); - } else { - pr_info("Unknown protocol version %s\n", s); - } + struct v9fs_context *ctx = fc->fs_private; - return version; -} + clnt->msize = ctx->client_opts.msize; + clnt->trans_mod = ctx->client_opts.trans_mod; + ctx->client_opts.trans_mod = NULL; + clnt->proto_version = ctx->client_opts.proto_version; -/** - * parse_opts - parse mount options into client structure - * @opts: options string passed from mount - * @clnt: existing v9fs client information - * - * Return 0 upon success, -ERRNO upon failure - */ - -static int parse_opts(char *opts, struct p9_client *clnt) -{ - char *options, *tmp_options; - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - char *s; - int ret = 0; - - clnt->proto_version = p9_proto_2000L; - clnt->msize = DEFAULT_MSIZE; - - if (!opts) - return 0; - - tmp_options = kstrdup(opts, GFP_KERNEL); - if (!tmp_options) - return -ENOMEM; - options = tmp_options; - - while ((p = strsep(&options, ",")) != NULL) { - int token, r; - - if (!*p) - continue; - token = match_token(p, tokens, args); - switch (token) { - case Opt_msize: - r = match_int(&args[0], &option); - if (r < 0) { - p9_debug(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); - ret = r; - continue; - } - if (option < 4096) { - p9_debug(P9_DEBUG_ERROR, - "msize should be at least 4k\n"); - ret = -EINVAL; - continue; - } - clnt->msize = option; - break; - case Opt_trans: - s = match_strdup(&args[0]); - if (!s) { - ret = -ENOMEM; 
- p9_debug(P9_DEBUG_ERROR, - "problem allocating copy of trans arg\n"); - goto free_and_return; - } - - v9fs_put_trans(clnt->trans_mod); - clnt->trans_mod = v9fs_get_trans_by_name(s); - if (!clnt->trans_mod) { - pr_info("Could not find request transport: %s\n", - s); - ret = -EINVAL; - } - kfree(s); - break; - case Opt_legacy: - clnt->proto_version = p9_proto_legacy; - break; - case Opt_version: - s = match_strdup(&args[0]); - if (!s) { - ret = -ENOMEM; - p9_debug(P9_DEBUG_ERROR, - "problem allocating copy of version arg\n"); - goto free_and_return; - } - r = get_protocol_version(s); - if (r < 0) - ret = r; - else - clnt->proto_version = r; - kfree(s); - break; - default: - continue; - } - } - -free_and_return: - if (ret) - v9fs_put_trans(clnt->trans_mod); - kfree(tmp_options); - return ret; + return 0; } static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, @@ -229,8 +99,15 @@ static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, if (likely(c->fcall_cache) && alloc_msize == c->msize) { fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); fc->cache = c->fcall_cache; + if (!fc->sdata && c->trans_mod->supports_vmalloc) { + fc->sdata = kvmalloc(alloc_msize, GFP_NOFS); + fc->cache = NULL; + } } else { - fc->sdata = kmalloc(alloc_msize, GFP_NOFS); + if (c->trans_mod->supports_vmalloc) + fc->sdata = kvmalloc(alloc_msize, GFP_NOFS); + else + fc->sdata = kmalloc(alloc_msize, GFP_NOFS); fc->cache = NULL; } if (!fc->sdata) @@ -252,7 +129,7 @@ void p9_fcall_fini(struct p9_fcall *fc) if (fc->cache) kmem_cache_free(fc->cache, fc->sdata); else - kfree(fc->sdata); + kvfree(fc->sdata); } EXPORT_SYMBOL(p9_fcall_fini); @@ -974,7 +851,7 @@ error: return err; } -struct p9_client *p9_client_create(const char *dev_name, char *options) +struct p9_client *p9_client_create(struct fs_context *fc) { int err; static atomic_t seqno = ATOMIC_INIT(0); @@ -997,8 +874,8 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) idr_init(&clnt->fids); idr_init(&clnt->reqs); - err = parse_opts(options, clnt); - if (err < 0) + err = apply_client_options(clnt, fc); + if (err) goto free_client; if (!clnt->trans_mod) @@ -1014,7 +891,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n", clnt, clnt->trans_mod, clnt->msize, clnt->proto_version); - err = clnt->trans_mod->create(clnt, dev_name, options); + err = clnt->trans_mod->create(clnt, fc); if (err) goto put_trans; diff --git a/net/9p/mod.c b/net/9p/mod.c index 55576c1866fa..85160b52da55 100644 --- a/net/9p/mod.c +++ b/net/9p/mod.c @@ -16,7 +16,6 @@ #include <linux/moduleparam.h> #include <net/9p/9p.h> #include <linux/fs.h> -#include <linux/parser.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include <linux/list.h> @@ -171,6 +170,7 @@ void v9fs_put_trans(struct p9_trans_module *m) if (m) module_put(m->owner); } +EXPORT_SYMBOL(v9fs_put_trans); /** * init_p9 - Initialize module diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 339ec4e54778..0e331c1b2112 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -22,7 +22,7 @@ #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/file.h> -#include <linux/parser.h> +#include <linux/fs_context.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <net/9p/9p.h> @@ -31,48 +31,12 @@ #include <linux/syscalls.h> /* killme */ -#define P9_PORT 564 #define MAX_SOCK_BUF (1024*1024) #define MAXPOLLWADDR 2 static struct p9_trans_module p9_tcp_trans; static struct 
p9_trans_module p9_fd_trans; -/** - * struct p9_fd_opts - per-transport options - * @rfd: file descriptor for reading (trans=fd) - * @wfd: file descriptor for writing (trans=fd) - * @port: port to connect to (trans=tcp) - * @privport: port is privileged - */ - -struct p9_fd_opts { - int rfd; - int wfd; - u16 port; - bool privport; -}; - -/* - * Option Parsing (code inspired by NFS code) - * - a little lazy - parse all fd-transport options - */ - -enum { - /* Options that take integer arguments */ - Opt_port, Opt_rfdno, Opt_wfdno, Opt_err, - /* Options that take no arguments */ - Opt_privport, -}; - -static const match_table_t tokens = { - {Opt_port, "port=%u"}, - {Opt_rfdno, "rfdno=%u"}, - {Opt_wfdno, "wfdno=%u"}, - {Opt_privport, "privport"}, - {Opt_err, NULL}, -}; - enum { Rworksched = 1, /* read work scheduled or running */ Rpending = 2, /* can read */ @@ -666,7 +630,6 @@ static void p9_poll_mux(struct p9_conn *m) static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) { - __poll_t n; int err; struct p9_trans_fd *ts = client->trans; struct p9_conn *m = &ts->conn; @@ -686,13 +649,7 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) list_add_tail(&req->req_list, &m->unsent_req_list); spin_unlock(&m->req_lock); - if (test_and_clear_bit(Wpending, &m->wsched)) - n = EPOLLOUT; - else - n = p9_fd_poll(m->client, NULL, NULL); - - if (n & EPOLLOUT && !test_and_set_bit(Wworksched, &m->wsched)) - schedule_work(&m->wq); + p9_poll_mux(m); return 0; } @@ -726,10 +683,10 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req); spin_lock(&m->req_lock); - /* Ignore cancelled request if message has been received - * before lock. - */ - if (req->status == REQ_STATUS_RCVD) { + /* Ignore cancelled request if status changed since the request was + * processed in p9_client_flush() + */ + if (req->status != REQ_STATUS_SENT) { spin_unlock(&m->req_lock); return 0; } @@ -749,7 +706,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) static int p9_fd_show_options(struct seq_file *m, struct p9_client *clnt) { if (clnt->trans_mod == &p9_tcp_trans) { - if (clnt->trans_opts.tcp.port != P9_PORT) + if (clnt->trans_opts.tcp.port != P9_FD_PORT) seq_printf(m, ",port=%u", clnt->trans_opts.tcp.port); } else if (clnt->trans_mod == &p9_fd_trans) { if (clnt->trans_opts.fd.rfd != ~0) @@ -760,73 +717,6 @@ static int p9_fd_show_options(struct seq_file *m, struct p9_client *clnt) return 0; } -/** - * parse_opts - parse mount options into p9_fd_opts structure - * @params: options string passed from mount - * @opts: fd transport-specific structure to parse options into - * - * Returns 0 upon success, -ERRNO upon failure - */ - -static int parse_opts(char *params, struct p9_fd_opts *opts) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - char *options, *tmp_options; - - opts->port = P9_PORT; - opts->rfd = ~0; - opts->wfd = ~0; - opts->privport = false; - - if (!params) - return 0; - - tmp_options = kstrdup(params, GFP_KERNEL); - if (!tmp_options) { - p9_debug(P9_DEBUG_ERROR, - "failed to allocate copy of option string\n"); - return -ENOMEM; - } - options = tmp_options; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - int r; - if (!*p) - continue; - token = match_token(p, tokens, args); - if ((token != Opt_err) && (token != Opt_privport)) { - r = match_int(&args[0], &option); - if (r < 0) { - p9_debug(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); - 
continue; - } - } - switch (token) { - case Opt_port: - opts->port = option; - break; - case Opt_rfdno: - opts->rfd = option; - break; - case Opt_wfdno: - opts->wfd = option; - break; - case Opt_privport: - opts->privport = true; - break; - default: - continue; - } - } - - kfree(tmp_options); - return 0; -} - static int p9_fd_open(struct p9_client *client, int rfd, int wfd) { struct p9_trans_fd *ts = kzalloc(sizeof(struct p9_trans_fd), @@ -973,7 +863,7 @@ static int p9_bind_privport(struct socket *sock) ((struct sockaddr_in *)&stor)->sin_port = htons((ushort)port); else ((struct sockaddr_in6 *)&stor)->sin6_port = htons((ushort)port); - err = kernel_bind(sock, (struct sockaddr *)&stor, sizeof(stor)); + err = kernel_bind(sock, (struct sockaddr_unsized *)&stor, sizeof(stor)); if (err != -EADDRINUSE) break; } @@ -981,17 +871,18 @@ static int p9_bind_privport(struct socket *sock) } static int -p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) +p9_fd_create_tcp(struct p9_client *client, struct fs_context *fc) { + const char *addr = fc->source; + struct v9fs_context *ctx = fc->fs_private; int err; char port_str[6]; struct socket *csocket; struct sockaddr_storage stor = { 0 }; struct p9_fd_opts opts; - err = parse_opts(args, &opts); - if (err < 0) - return err; + /* opts are already parsed in context */ + opts = ctx->fd_opts; if (!addr) return -EINVAL; @@ -1025,7 +916,7 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) } err = READ_ONCE(csocket->ops)->connect(csocket, - (struct sockaddr *)&stor, + (struct sockaddr_unsized *)&stor, sizeof(stor), 0); if (err < 0) { pr_err("%s (%d): problem connecting socket to %s\n", @@ -1038,8 +929,9 @@ p9_fd_create_tcp(struct p9_client *client, const char *addr, char *args) } static int -p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) +p9_fd_create_unix(struct p9_client *client, struct fs_context *fc) { + const char *addr = fc->source; int err; struct socket *csocket; struct sockaddr_un sun_server; @@ -1065,8 +957,8 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) return err; } - err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr *)&sun_server, - sizeof(struct sockaddr_un) - 1, 0); + err = READ_ONCE(csocket->ops)->connect(csocket, (struct sockaddr_unsized *)&sun_server, + sizeof(struct sockaddr_un) - 1, 0); if (err < 0) { pr_err("%s (%d): problem connecting socket: %s: %d\n", __func__, task_pid_nr(current), addr, err); @@ -1078,14 +970,12 @@ p9_fd_create_unix(struct p9_client *client, const char *addr, char *args) } static int -p9_fd_create(struct p9_client *client, const char *addr, char *args) +p9_fd_create(struct p9_client *client, struct fs_context *fc) { + struct v9fs_context *ctx = fc->fs_private; + struct p9_fd_opts opts = ctx->fd_opts; int err; - struct p9_fd_opts opts; - err = parse_opts(args, &opts); - if (err < 0) - return err; client->trans_opts.fd.rfd = opts.rfd; client->trans_opts.fd.wfd = opts.wfd; @@ -1107,7 +997,8 @@ static struct p9_trans_module p9_tcp_trans = { .name = "tcp", .maxsize = MAX_SOCK_BUF, .pooled_rbuffers = false, - .def = 0, + .def = false, + .supports_vmalloc = true, .create = p9_fd_create_tcp, .close = p9_fd_close, .request = p9_fd_request, @@ -1121,7 +1012,8 @@ MODULE_ALIAS_9P("tcp"); static struct p9_trans_module p9_unix_trans = { .name = "unix", .maxsize = MAX_SOCK_BUF, - .def = 0, + .def = false, + .supports_vmalloc = true, .create = p9_fd_create_unix, .close = p9_fd_close, .request = p9_fd_request, @@ -1135,7 
+1027,8 @@ MODULE_ALIAS_9P("unix"); static struct p9_trans_module p9_fd_trans = { .name = "fd", .maxsize = MAX_SOCK_BUF, - .def = 0, + .def = false, + .supports_vmalloc = true, .create = p9_fd_create, .close = p9_fd_close, .request = p9_fd_request, diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index b84748baf9cb..4d406479f83b 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -22,7 +22,7 @@ #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/file.h> -#include <linux/parser.h> +#include <linux/fs_context.h> #include <linux/semaphore.h> #include <linux/slab.h> #include <linux/seq_file.h> @@ -32,14 +32,10 @@ #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> -#define P9_PORT 5640 -#define P9_RDMA_SQ_DEPTH 32 -#define P9_RDMA_RQ_DEPTH 32 #define P9_RDMA_SEND_SGE 4 #define P9_RDMA_RECV_SGE 4 #define P9_RDMA_IRD 0 #define P9_RDMA_ORD 0 -#define P9_RDMA_TIMEOUT 30000 /* 30 seconds */ #define P9_RDMA_MAXSIZE (1024*1024) /* 1MB */ /** @@ -110,48 +106,11 @@ struct p9_rdma_context { }; }; -/** - * struct p9_rdma_opts - Collection of mount options - * @port: port of connection - * @privport: Whether a privileged port may be used - * @sq_depth: The requested depth of the SQ. This really doesn't need - * to be any deeper than the number of threads used in the client - * @rq_depth: The depth of the RQ. Should be greater than or equal to SQ depth - * @timeout: Time to wait in msecs for CM events - */ -struct p9_rdma_opts { - short port; - bool privport; - int sq_depth; - int rq_depth; - long timeout; -}; - -/* - * Option Parsing (code inspired by NFS code) - */ -enum { - /* Options that take integer arguments */ - Opt_port, Opt_rq_depth, Opt_sq_depth, Opt_timeout, - /* Options that take no argument */ - Opt_privport, - Opt_err, -}; - -static match_table_t tokens = { - {Opt_port, "port=%u"}, - {Opt_sq_depth, "sq=%u"}, - {Opt_rq_depth, "rq=%u"}, - {Opt_timeout, "timeout=%u"}, - {Opt_privport, "privport"}, - {Opt_err, NULL}, -}; - static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt) { struct p9_trans_rdma *rdma = clnt->trans; - if (rdma->port != P9_PORT) + if (rdma->port != P9_RDMA_PORT) seq_printf(m, ",port=%u", rdma->port); if (rdma->sq_depth != P9_RDMA_SQ_DEPTH) seq_printf(m, ",sq=%u", rdma->sq_depth); @@ -164,77 +123,6 @@ static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt) return 0; } -/** - * parse_opts - parse mount options into rdma options structure - * @params: options string passed from mount - * @opts: rdma transport-specific structure to parse options into - * - * Returns 0 upon success, -ERRNO upon failure - */ -static int parse_opts(char *params, struct p9_rdma_opts *opts) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - char *options, *tmp_options; - - opts->port = P9_PORT; - opts->sq_depth = P9_RDMA_SQ_DEPTH; - opts->rq_depth = P9_RDMA_RQ_DEPTH; - opts->timeout = P9_RDMA_TIMEOUT; - opts->privport = false; - - if (!params) - return 0; - - tmp_options = kstrdup(params, GFP_KERNEL); - if (!tmp_options) { - p9_debug(P9_DEBUG_ERROR, - "failed to allocate copy of option string\n"); - return -ENOMEM; - } - options = tmp_options; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - int r; - if (!*p) - continue; - token = match_token(p, tokens, args); - if ((token != Opt_err) && (token != Opt_privport)) { - r = match_int(&args[0], &option); - if (r < 0) { - p9_debug(P9_DEBUG_ERROR, - "integer field, but no integer?\n"); - continue; - } - } - switch (token) { - case Opt_port: - opts->port = 
option; - break; - case Opt_sq_depth: - opts->sq_depth = option; - break; - case Opt_rq_depth: - opts->rq_depth = option; - break; - case Opt_timeout: - opts->timeout = option; - break; - case Opt_privport: - opts->privport = true; - break; - default: - continue; - } - } - /* RQ must be at least as large as the SQ */ - opts->rq_depth = max(opts->rq_depth, opts->sq_depth); - kfree(tmp_options); - return 0; -} - static int p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { @@ -628,14 +516,15 @@ static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma) /** * rdma_create_trans - Transport method for creating a transport instance * @client: client instance - * @addr: IP address string - * @args: Mount options string + * @fc: The filesystem context */ static int -rdma_create_trans(struct p9_client *client, const char *addr, char *args) +rdma_create_trans(struct p9_client *client, struct fs_context *fc) { + const char *addr = fc->source; + struct v9fs_context *ctx = fc->fs_private; + struct p9_rdma_opts opts = ctx->rdma_opts; int err; - struct p9_rdma_opts opts; struct p9_trans_rdma *rdma; struct rdma_conn_param conn_param; struct ib_qp_init_attr qp_attr; @@ -643,10 +532,8 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args) if (addr == NULL) return -EINVAL; - /* Parse the transport specific mount options */ - err = parse_opts(args, &opts); - if (err < 0) - return err; + /* options are already parsed, in the fs context */ + opts = ctx->rdma_opts; /* Create and initialize the RDMA transport structure */ rdma = alloc_rdma(&opts); @@ -748,7 +635,8 @@ static struct p9_trans_module p9_rdma_trans = { .name = "rdma", .maxsize = P9_RDMA_MAXSIZE, .pooled_rbuffers = true, - .def = 0, + .def = false, + .supports_vmalloc = false, .owner = THIS_MODULE, .create = rdma_create_trans, .close = rdma_close, diff --git a/net/9p/trans_usbg.c b/net/9p/trans_usbg.c index 6b694f117aef..93547637deae 100644 --- a/net/9p/trans_usbg.c +++ b/net/9p/trans_usbg.c @@ -27,6 +27,7 @@ #include <linux/cleanup.h> #include <linux/kernel.h> #include <linux/module.h> +#include <linux/fs_context.h> #include <linux/usb/composite.h> #include <linux/usb/func_utils.h> @@ -231,6 +232,8 @@ static void usb9pfs_rx_complete(struct usb_ep *ep, struct usb_request *req) struct f_usb9pfs *usb9pfs = ep->driver_data; struct usb_composite_dev *cdev = usb9pfs->function.config->cdev; struct p9_req_t *p9_rx_req; + unsigned int req_size = req->actual; + int status = REQ_STATUS_RCVD; if (req->status) { dev_err(&cdev->gadget->dev, "%s usb9pfs complete --> %d, %d/%d\n", @@ -242,11 +245,19 @@ static void usb9pfs_rx_complete(struct usb_ep *ep, struct usb_request *req) if (!p9_rx_req) return; - memcpy(p9_rx_req->rc.sdata, req->buf, req->actual); + if (req_size > p9_rx_req->rc.capacity) { + dev_err(&cdev->gadget->dev, + "%s received data size %u exceeds buffer capacity %zu\n", + ep->name, req_size, p9_rx_req->rc.capacity); + req_size = 0; + status = REQ_STATUS_ERROR; + } + + memcpy(p9_rx_req->rc.sdata, req->buf, req_size); - p9_rx_req->rc.size = req->actual; + p9_rx_req->rc.size = req_size; - p9_client_cb(usb9pfs->client, p9_rx_req, REQ_STATUS_RCVD); + p9_client_cb(usb9pfs->client, p9_rx_req, status); p9_req_put(usb9pfs->client, p9_rx_req); complete(&usb9pfs->received); @@ -366,8 +377,9 @@ out: return ret; } -static int p9_usbg_create(struct p9_client *client, const char *devname, char *args) +static int p9_usbg_create(struct p9_client *client, struct fs_context *fc) { + const char *devname = fc->source; struct 
f_usb9pfs_dev *dev; struct f_usb9pfs *usb9pfs; int ret = -ENOENT; @@ -504,6 +516,7 @@ static struct p9_trans_module p9_usbg_trans = { .close = p9_usbg_close, .request = p9_usbg_request, .cancel = p9_usbg_cancel, + .supports_vmalloc = false, .owner = THIS_MODULE, }; diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 0b8086f58ad5..10c2dd486438 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -26,7 +26,7 @@ #include <linux/highmem.h> #include <linux/slab.h> #include <net/9p/9p.h> -#include <linux/parser.h> +#include <linux/fs_context.h> #include <net/9p/client.h> #include <net/9p/transport.h> #include <linux/scatterlist.h> @@ -679,8 +679,7 @@ fail: /** * p9_virtio_create - allocate a new virtio channel * @client: client instance invoking this transport - * @devname: string identifying the channel to connect to (unused) - * @args: args passed from sys_mount() for per-transport options (unused) + * @fc: the filesystem context * * This sets up a transport channel for 9p communication. Right now * we only match the first available channel, but eventually we could look up @@ -691,8 +690,9 @@ fail: */ static int -p9_virtio_create(struct p9_client *client, const char *devname, char *args) +p9_virtio_create(struct p9_client *client, struct fs_context *fc) { + const char *devname = fc->source; struct virtio_chan *chan; int ret = -ENOENT; int found = 0; @@ -802,7 +802,8 @@ static struct p9_trans_module p9_virtio_trans = { */ .maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3), .pooled_rbuffers = false, - .def = 1, + .def = true, + .supports_vmalloc = false, .owner = THIS_MODULE, }; diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index b9ff69c7522a..12f752a92332 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/spinlock.h> +#include <linux/fs_context.h> #include <net/9p/9p.h> #include <net/9p/client.h> #include <net/9p/transport.h> @@ -66,8 +67,9 @@ static int p9_xen_cancel(struct p9_client *client, struct p9_req_t *req) return 1; } -static int p9_xen_create(struct p9_client *client, const char *addr, char *args) +static int p9_xen_create(struct p9_client *client, struct fs_context *fc) { + const char *addr = fc->source; struct xen_9pfs_front_priv *priv; if (addr == NULL) @@ -257,7 +259,8 @@ static struct p9_trans_module p9_xen_trans = { .name = "xen", .maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT - 2), .pooled_rbuffers = false, - .def = 1, + .def = true, + .supports_vmalloc = false, .create = p9_xen_create, .close = p9_xen_close, .request = p9_xen_request, diff --git a/net/Kconfig b/net/Kconfig index d5865cf19799..62266eaf0e95 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -82,11 +82,13 @@ config NET_CRC32C menu "Networking options" source "net/packet/Kconfig" +source "net/psp/Kconfig" source "net/unix/Kconfig" source "net/tls/Kconfig" source "net/xfrm/Kconfig" source "net/iucv/Kconfig" source "net/smc/Kconfig" +source "drivers/dibs/Kconfig" source "net/xdp/Kconfig" config NET_HANDSHAKE @@ -398,15 +400,15 @@ config NET_PKTGEN module will be called pktgen. config NET_DROP_MONITOR - tristate "Network packet drop alerting service" + tristate "Legacy network packet drop alerting service" depends on INET && TRACEPOINTS help This feature provides an alerting service to userspace in the event that packets are discarded in the network stack. Alerts are broadcast via netlink socket to any listening user space - process. 
If you don't need network drop alerts, or if you are ok - just checking the various proc files and other utilities for - drop statistics, say N here. + process. This feature is NOT related to "perf" based drop monitoring. + Say N here unless you need to support older userspace tools like + "dropwatch". endmenu # Network testing diff --git a/net/Makefile b/net/Makefile index aac960c41db6..90e3d72bf58b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ +obj-$(CONFIG_INET_PSP) += psp/ obj-y += ipv6/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 30242fe10341..2a01fff46c9d 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1149,7 +1149,7 @@ out: } /* Set the address 'our end' of the connection */ -static int atalk_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int atalk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_at *addr = (struct sockaddr_at *)uaddr; struct sock *sk = sock->sk; @@ -1204,7 +1204,7 @@ out: } /* Set the address we talk to */ -static int atalk_connect(struct socket *sock, struct sockaddr *uaddr, +static int atalk_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; diff --git a/net/atm/clip.c b/net/atm/clip.c index f7a5565e794e..8f152e5fa659 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -168,10 +168,10 @@ static int neigh_check_cb(struct neighbour *n) static void idle_timer_check(struct timer_list *unused) { - write_lock(&arp_tbl.lock); + spin_lock(&arp_tbl.lock); __neigh_for_each_release(&arp_tbl, neigh_check_cb); mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ); - write_unlock(&arp_tbl.lock); + spin_unlock(&arp_tbl.lock); } static int clip_arp_rcv(struct sk_buff *skb) diff --git a/net/atm/common.c b/net/atm/common.c index d7f7976ea13a..fe77f51f6ce1 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -157,7 +157,7 @@ int vcc_create(struct net *net, struct socket *sock, int protocol, int family, i memset(&vcc->local, 0, sizeof(struct sockaddr_atmsvc)); memset(&vcc->remote, 0, sizeof(struct sockaddr_atmsvc)); vcc->qos.txtp.max_sdu = 1 << 16; /* for meta VCs */ - refcount_set(&sk->sk_wmem_alloc, 1); + refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); atomic_set(&sk->sk_rmem_alloc, 0); vcc->push = NULL; vcc->pop = NULL; @@ -635,18 +635,27 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size) skb->dev = NULL; /* for paths shared with net_device interfaces */ if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) { - atm_return_tx(vcc, skb); - kfree_skb(skb); error = -EFAULT; - goto out; + goto free_skb; } if (eff != size) memset(skb->data + size, 0, eff-size); + + if (vcc->dev->ops->pre_send) { + error = vcc->dev->ops->pre_send(vcc, skb); + if (error) + goto free_skb; + } + error = vcc->dev->ops->send(vcc, skb); error = error ? 
error : size; out: release_sock(sk); return error; +free_skb: + atm_return_tx(vcc, skb); + kfree_skb(skb); + goto out; } __poll_t vcc_poll(struct file *file, struct socket *sock, poll_table *wait) @@ -872,7 +881,7 @@ out_atmproc_exit: out_atmsvc_exit: atmsvc_exit(); out_atmpvc_exit: - atmsvc_exit(); + atmpvc_exit(); out_unregister_vcc_proto: proto_unregister(&vcc_proto); goto out; diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 66d9a9bd5896..8f5e76f5dd9e 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -24,7 +24,7 @@ static int pvc_shutdown(struct socket *sock, int how) return 0; } -static int pvc_bind(struct socket *sock, struct sockaddr *sockaddr, +static int pvc_bind(struct socket *sock, struct sockaddr_unsized *sockaddr, int sockaddr_len) { struct sock *sk = sock->sk; @@ -56,7 +56,7 @@ out: return error; } -static int pvc_connect(struct socket *sock, struct sockaddr *sockaddr, +static int pvc_connect(struct socket *sock, struct sockaddr_unsized *sockaddr, int sockaddr_len, int flags) { return pvc_bind(sock, sockaddr, sockaddr_len); diff --git a/net/atm/resources.c b/net/atm/resources.c index b19d851e1f44..7c6fdedbcf4e 100644 --- a/net/atm/resources.c +++ b/net/atm/resources.c @@ -112,7 +112,9 @@ struct atm_dev *atm_dev_register(const char *type, struct device *parent, if (atm_proc_dev_register(dev) < 0) { pr_err("atm_proc_dev_register failed for dev %s\n", type); - goto out_fail; + mutex_unlock(&atm_dev_mutex); + kfree(dev); + return NULL; } if (atm_register_sysfs(dev, parent) < 0) { @@ -128,7 +130,7 @@ out: return dev; out_fail: - kfree(dev); + put_device(&dev->class_dev); dev = NULL; goto out; } diff --git a/net/atm/svc.c b/net/atm/svc.c index f8137ae693b0..005964250ecd 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -97,7 +97,7 @@ static int svc_release(struct socket *sock) return 0; } -static int svc_bind(struct socket *sock, struct sockaddr *sockaddr, +static int svc_bind(struct socket *sock, struct sockaddr_unsized *sockaddr, int sockaddr_len) { DEFINE_WAIT(wait); @@ -153,7 +153,7 @@ out: return error; } -static int svc_connect(struct socket *sock, struct sockaddr *sockaddr, +static int svc_connect(struct socket *sock, struct sockaddr_unsized *sockaddr, int sockaddr_len, int flags) { DEFINE_WAIT(wait); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 6ef8b2a57a9b..7ebbff2f0020 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1094,7 +1094,7 @@ static int ax25_release(struct socket *sock) * that we've implemented support for SO_BINDTODEVICE. It is however small * and trivially backward compatible. */ -static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int ax25_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; @@ -1175,7 +1175,7 @@ out: * FIXME: nonblock behaviour looks like it may have a bug. 
*/ static int __must_check ax25_connect(struct socket *sock, - struct sockaddr *uaddr, int addr_len, int flags) + struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; ax25_cb *ax25 = sk_to_ax25(sk), *ax25t; diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c index 1cac25aca637..f2d66af86359 100644 --- a/net/ax25/ax25_in.c +++ b/net/ax25/ax25_in.c @@ -433,6 +433,10 @@ free: int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev) { + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return NET_RX_DROP; + skb_orphan(skb); if (!net_eq(dev_net(dev), &init_net)) { diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index 20b316207f9a..58c408b7a7d9 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -35,6 +35,7 @@ config BATMAN_ADV_BLA bool "Bridge Loop Avoidance" depends on BATMAN_ADV && INET select CRC16 + select NET_CRC32C default y help This option enables BLA (Bridge Loop Avoidance), a mechanism @@ -53,19 +54,6 @@ config BATMAN_ADV_DAT mesh networks. If you think that your network does not need this option you can safely remove it and save some space. -config BATMAN_ADV_NC - bool "Network Coding" - depends on BATMAN_ADV - help - This option enables network coding, a mechanism that aims to - increase the overall network throughput by fusing multiple - packets in one transmission. - Note that interfaces controlled by batman-adv must be manually - configured to have promiscuous mode enabled in order to make - network coding work. - If you think that your network does not need this feature you - can safely disable it and save some space. - config BATMAN_ADV_MCAST bool "Multicast optimisation" depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y) diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 1cc9be6de456..d3c4d4143c14 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -23,7 +23,6 @@ batman-adv-y += mesh-interface.o batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast_forw.o batman-adv-y += netlink.o -batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o batman-adv-y += originator.o batman-adv-y += routing.o batman-adv-y += send.o diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 54fe38b3b2fd..b75c2228e69a 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -52,7 +52,6 @@ #include "hash.h" #include "log.h" #include "netlink.h" -#include "network-coding.h" #include "originator.h" #include "routing.h" #include "send.h" @@ -1406,10 +1405,6 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, if (!orig_neigh_node) goto out; - /* Update nc_nodes of the originator */ - batadv_nc_update_nc_node(bat_priv, orig_node, orig_neigh_node, - ogm_packet, is_single_hop_neigh); - orig_neigh_router = batadv_orig_router_get(orig_neigh_node, if_outgoing); diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 747755647c6a..3dc791c15bf7 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1588,8 +1588,7 @@ int batadv_bla_init(struct batadv_priv *bat_priv) * batadv_bla_check_duplist() - Check if a frame is in the broadcast dup. 
* @bat_priv: the bat priv with all the mesh interface information * @skb: contains the multicast packet to be checked - * @payload_ptr: pointer to position inside the head buffer of the skb - * marking the start of the data to be CRC'ed + * @payload_offset: offset in the skb, marking the start of the data to be CRC'ed * @orig: originator mac address, NULL if unknown * * Check if it is on our broadcast list. Another gateway might have sent the @@ -1604,16 +1603,18 @@ int batadv_bla_init(struct batadv_priv *bat_priv) * Return: true if a packet is in the duplicate list, false otherwise. */ static bool batadv_bla_check_duplist(struct batadv_priv *bat_priv, - struct sk_buff *skb, u8 *payload_ptr, + struct sk_buff *skb, int payload_offset, const u8 *orig) { struct batadv_bcast_duplist_entry *entry; bool ret = false; + int payload_len; int i, curr; - __be32 crc; + u32 crc; /* calculate the crc ... */ - crc = batadv_skb_crc32(skb, payload_ptr); + payload_len = skb->len - payload_offset; + crc = skb_crc32c(skb, payload_offset, payload_len, 0); spin_lock_bh(&bat_priv->bla.bcast_duplist_lock); @@ -1693,7 +1694,7 @@ out: static bool batadv_bla_check_ucast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) { - return batadv_bla_check_duplist(bat_priv, skb, (u8 *)skb->data, NULL); + return batadv_bla_check_duplist(bat_priv, skb, 0, NULL); } /** @@ -1711,12 +1712,10 @@ bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_bcast_packet *bcast_packet; - u8 *payload_ptr; bcast_packet = (struct batadv_bcast_packet *)skb->data; - payload_ptr = (u8 *)(bcast_packet + 1); - return batadv_bla_check_duplist(bat_priv, skb, payload_ptr, + return batadv_bla_check_duplist(bat_priv, skb, sizeof(*bcast_packet), bcast_packet->orig); } diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index bace57e4f9a5..5113f879736b 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -22,6 +22,7 @@ #include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> +#include <linux/notifier.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rtnetlink.h> diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 262a78364742..9db8a310961e 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -12,7 +12,6 @@ #include <linux/compiler.h> #include <linux/kref.h> #include <linux/netdevice.h> -#include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/stddef.h> #include <linux/types.h> diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 567afaa8df99..225b747a2048 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -51,9 +51,6 @@ enum batadv_dbg_level { /** @BATADV_DBG_DAT: ARP snooping and DAT related messages */ BATADV_DBG_DAT = BIT(4), - /** @BATADV_DBG_NC: network coding related messages */ - BATADV_DBG_NC = BIT(5), - /** @BATADV_DBG_MCAST: multicast related messages */ BATADV_DBG_MCAST = BIT(6), diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 20346d7b6b69..3a35aadd8b41 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -11,7 +11,6 @@ #include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> -#include <linux/crc32.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/gfp.h> @@ -53,7 +52,6 @@ #include "mesh-interface.h" #include "multicast.h" #include "netlink.h" -#include "network-coding.h" #include "originator.h" 
#include "routing.h" #include "send.h" @@ -103,7 +101,6 @@ static int __init batadv_init(void) batadv_v_init(); batadv_iv_init(); - batadv_nc_init(); batadv_tp_meter_init(); batadv_event_workqueue = create_singlethread_workqueue("bat_events"); @@ -218,12 +215,6 @@ int batadv_mesh_init(struct net_device *mesh_iface) goto err_dat; } - ret = batadv_nc_mesh_init(bat_priv); - if (ret < 0) { - atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); - goto err_nc; - } - batadv_gw_init(bat_priv); batadv_mcast_init(bat_priv); @@ -232,8 +223,6 @@ int batadv_mesh_init(struct net_device *mesh_iface) return 0; -err_nc: - batadv_dat_free(bat_priv); err_dat: batadv_bla_free(bat_priv); err_bla: @@ -264,7 +253,6 @@ void batadv_mesh_free(struct net_device *mesh_iface) batadv_gw_node_free(bat_priv); batadv_v_mesh_free(bat_priv); - batadv_nc_mesh_free(bat_priv); batadv_dat_free(bat_priv); batadv_bla_free(bat_priv); @@ -336,11 +324,6 @@ int batadv_max_header_len(void) header_len = max_t(int, header_len, sizeof(struct batadv_bcast_packet)); -#ifdef CONFIG_BATMAN_ADV_NC - header_len = max_t(int, header_len, - sizeof(struct batadv_coded_packet)); -#endif - return header_len + ETH_HLEN; } @@ -578,39 +561,6 @@ void batadv_recv_handler_unregister(u8 packet_type) } /** - * batadv_skb_crc32() - calculate CRC32 of the whole packet and skip bytes in - * the header - * @skb: skb pointing to fragmented socket buffers - * @payload_ptr: Pointer to position inside the head buffer of the skb - * marking the start of the data to be CRC'ed - * - * payload_ptr must always point to an address in the skb head buffer and not to - * a fragment. - * - * Return: big endian crc32c of the checksummed data - */ -__be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) -{ - u32 crc = 0; - unsigned int from; - unsigned int to = skb->len; - struct skb_seq_state st; - const u8 *data; - unsigned int len; - unsigned int consumed = 0; - - from = (unsigned int)(payload_ptr - skb->data); - - skb_prepare_seq_read(skb, from, to, &st); - while ((len = skb_seq_read(consumed, &data, &st)) != 0) { - crc = crc32c(crc, data, len); - consumed += len; - } - - return htonl(crc); -} - -/** * batadv_get_vid() - extract the VLAN identifier from skb if any * @skb: the buffer containing the packet * @header_len: length of the batman header preceding the ethernet header diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 1481eb2bacee..af230b017bc1 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2025.3" +#define BATADV_SOURCE_VERSION "2025.5" #endif /* B.A.T.M.A.N. 
parameters */ @@ -121,8 +121,6 @@ #define BATADV_RESET_PROTECTION_MS 30000 #define BATADV_EXPECTED_SEQNO_RANGE 65536 -#define BATADV_NC_NODE_TIMEOUT 10000 /* Milliseconds */ - /** * BATADV_TP_MAX_NUM - maximum number of simultaneously active tp sessions */ @@ -250,7 +248,6 @@ batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)); void batadv_recv_handler_unregister(u8 packet_type); -__be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr); /** * batadv_compare_eth() - Compare two not u16 aligned Ethernet addresses diff --git a/net/batman-adv/mesh-interface.c b/net/batman-adv/mesh-interface.c index de2c2d9c6e4d..df7e95811ef5 100644 --- a/net/batman-adv/mesh-interface.c +++ b/net/batman-adv/mesh-interface.c @@ -37,6 +37,7 @@ #include <linux/string.h> #include <linux/types.h> #include <net/netlink.h> +#include <net/rtnetlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> @@ -46,7 +47,6 @@ #include "gateway_client.h" #include "hard-interface.h" #include "multicast.h" -#include "network-coding.h" #include "send.h" #include "translation-table.h" @@ -802,8 +802,6 @@ static int batadv_meshif_init_late(struct net_device *dev) bat_priv->primary_if = NULL; - batadv_nc_init_bat_priv(bat_priv); - if (!bat_priv->algo_ops) { ret = batadv_algo_select(bat_priv, batadv_routing_algo); if (ret < 0) @@ -947,17 +945,6 @@ static const struct { { "dat_put_rx" }, { "dat_cached_reply_tx" }, #endif -#ifdef CONFIG_BATMAN_ADV_NC - { "nc_code" }, - { "nc_code_bytes" }, - { "nc_recode" }, - { "nc_recode_bytes" }, - { "nc_buffer" }, - { "nc_decode" }, - { "nc_decode_bytes" }, - { "nc_decode_failed" }, - { "nc_sniffed" }, -#endif }; static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data) diff --git a/net/batman-adv/mesh-interface.h b/net/batman-adv/mesh-interface.h index 7ba055b2bc26..53756c5a45e0 100644 --- a/net/batman-adv/mesh-interface.h +++ b/net/batman-adv/mesh-interface.h @@ -13,7 +13,6 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> -#include <net/rtnetlink.h> int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *mesh_iface, diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index beb181b3a7d8..78c651f634cd 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -44,7 +44,6 @@ #include "log.h" #include "mesh-interface.h" #include "multicast.h" -#include "network-coding.h" #include "originator.h" #include "tp_meter.h" #include "translation-table.h" @@ -144,7 +143,6 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_LOG_LEVEL] = { .type = NLA_U32 }, [BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_MULTICAST_FANOUT] = { .type = NLA_U32 }, - [BATADV_ATTR_NETWORK_CODING_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_ORIG_INTERVAL] = { .type = NLA_U32 }, [BATADV_ATTR_ELP_INTERVAL] = { .type = NLA_U32 }, [BATADV_ATTR_THROUGHPUT_OVERRIDE] = { .type = NLA_U32 }, @@ -345,12 +343,6 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_MCAST */ -#ifdef CONFIG_BATMAN_ADV_NC - if (nla_put_u8(msg, BATADV_ATTR_NETWORK_CODING_ENABLED, - !!atomic_read(&bat_priv->network_coding))) - goto nla_put_failure; -#endif /* CONFIG_BATMAN_ADV_NC */ - if (nla_put_u32(msg, BATADV_ATTR_ORIG_INTERVAL, atomic_read(&bat_priv->orig_interval))) goto nla_put_failure; @@ -588,15 +580,6 @@ static int 
batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) } #endif /* CONFIG_BATMAN_ADV_MCAST */ -#ifdef CONFIG_BATMAN_ADV_NC - if (info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED]) { - attr = info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED]; - - atomic_set(&bat_priv->network_coding, !!nla_get_u8(attr)); - batadv_nc_status_update(bat_priv->mesh_iface); - } -#endif /* CONFIG_BATMAN_ADV_NC */ - if (info->attrs[BATADV_ATTR_ORIG_INTERVAL]) { u32 orig_interval; diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index fe4548b974bb..4eae9e5ff135 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -11,7 +11,6 @@ #include <linux/netlink.h> #include <linux/types.h> -#include <net/genetlink.h> void batadv_netlink_register(void); void batadv_netlink_unregister(void); diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c deleted file mode 100644 index 9f56308779cc..000000000000 --- a/net/batman-adv/network-coding.c +++ /dev/null @@ -1,1873 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) B.A.T.M.A.N. contributors: - * - * Martin Hundebøll, Jeppe Ledet-Pedersen - */ - -#include "network-coding.h" -#include "main.h" - -#include <linux/atomic.h> -#include <linux/bitops.h> -#include <linux/byteorder/generic.h> -#include <linux/compiler.h> -#include <linux/container_of.h> -#include <linux/errno.h> -#include <linux/etherdevice.h> -#include <linux/gfp.h> -#include <linux/if_ether.h> -#include <linux/if_packet.h> -#include <linux/init.h> -#include <linux/jhash.h> -#include <linux/jiffies.h> -#include <linux/kref.h> -#include <linux/list.h> -#include <linux/lockdep.h> -#include <linux/net.h> -#include <linux/netdevice.h> -#include <linux/printk.h> -#include <linux/random.h> -#include <linux/rculist.h> -#include <linux/rcupdate.h> -#include <linux/skbuff.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/stddef.h> -#include <linux/string.h> -#include <linux/workqueue.h> -#include <uapi/linux/batadv_packet.h> - -#include "hash.h" -#include "log.h" -#include "originator.h" -#include "routing.h" -#include "send.h" -#include "tvlv.h" - -static struct lock_class_key batadv_nc_coding_hash_lock_class_key; -static struct lock_class_key batadv_nc_decoding_hash_lock_class_key; - -static void batadv_nc_worker(struct work_struct *work); -static int batadv_nc_recv_coded_packet(struct sk_buff *skb, - struct batadv_hard_iface *recv_if); - -/** - * batadv_nc_init() - one-time initialization for network coding - * - * Return: 0 on success or negative error number in case of failure - */ -int __init batadv_nc_init(void) -{ - /* Register our packet type */ - return batadv_recv_handler_register(BATADV_CODED, - batadv_nc_recv_coded_packet); -} - -/** - * batadv_nc_start_timer() - initialise the nc periodic worker - * @bat_priv: the bat priv with all the mesh interface information - */ -static void batadv_nc_start_timer(struct batadv_priv *bat_priv) -{ - queue_delayed_work(batadv_event_workqueue, &bat_priv->nc.work, - msecs_to_jiffies(10)); -} - -/** - * batadv_nc_tvlv_container_update() - update the network coding tvlv container - * after network coding setting change - * @bat_priv: the bat priv with all the mesh interface information - */ -static void batadv_nc_tvlv_container_update(struct batadv_priv *bat_priv) -{ - char nc_mode; - - nc_mode = atomic_read(&bat_priv->network_coding); - - switch (nc_mode) { - case 0: - batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1); - break; - case 1: - 
batadv_tvlv_container_register(bat_priv, BATADV_TVLV_NC, 1, - NULL, 0); - break; - } -} - -/** - * batadv_nc_status_update() - update the network coding tvlv container after - * network coding setting change - * @net_dev: the mesh interface net device - */ -void batadv_nc_status_update(struct net_device *net_dev) -{ - struct batadv_priv *bat_priv = netdev_priv(net_dev); - - batadv_nc_tvlv_container_update(bat_priv); -} - -/** - * batadv_nc_tvlv_ogm_handler_v1() - process incoming nc tvlv container - * @bat_priv: the bat priv with all the mesh interface information - * @orig: the orig_node of the ogm - * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) - * @tvlv_value: tvlv buffer containing the gateway data - * @tvlv_value_len: tvlv buffer length - */ -static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig, - u8 flags, - void *tvlv_value, u16 tvlv_value_len) -{ - if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) - clear_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); - else - set_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); -} - -/** - * batadv_nc_mesh_init() - initialise coding hash table and start housekeeping - * @bat_priv: the bat priv with all the mesh interface information - * - * Return: 0 on success or negative error number in case of failure - */ -int batadv_nc_mesh_init(struct batadv_priv *bat_priv) -{ - bat_priv->nc.timestamp_fwd_flush = jiffies; - bat_priv->nc.timestamp_sniffed_purge = jiffies; - - if (bat_priv->nc.coding_hash || bat_priv->nc.decoding_hash) - return 0; - - bat_priv->nc.coding_hash = batadv_hash_new(128); - if (!bat_priv->nc.coding_hash) - goto err; - - batadv_hash_set_lock_class(bat_priv->nc.coding_hash, - &batadv_nc_coding_hash_lock_class_key); - - bat_priv->nc.decoding_hash = batadv_hash_new(128); - if (!bat_priv->nc.decoding_hash) { - batadv_hash_destroy(bat_priv->nc.coding_hash); - goto err; - } - - batadv_hash_set_lock_class(bat_priv->nc.decoding_hash, - &batadv_nc_decoding_hash_lock_class_key); - - INIT_DELAYED_WORK(&bat_priv->nc.work, batadv_nc_worker); - batadv_nc_start_timer(bat_priv); - - batadv_tvlv_handler_register(bat_priv, batadv_nc_tvlv_ogm_handler_v1, - NULL, NULL, BATADV_TVLV_NC, 1, - BATADV_TVLV_HANDLER_OGM_CIFNOTFND); - batadv_nc_tvlv_container_update(bat_priv); - return 0; - -err: - return -ENOMEM; -} - -/** - * batadv_nc_init_bat_priv() - initialise the nc specific bat_priv variables - * @bat_priv: the bat priv with all the mesh interface information - */ -void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) -{ - atomic_set(&bat_priv->network_coding, 0); - bat_priv->nc.min_tq = 200; - bat_priv->nc.max_fwd_delay = 10; - bat_priv->nc.max_buffer_time = 200; -} - -/** - * batadv_nc_init_orig() - initialise the nc fields of an orig_node - * @orig_node: the orig_node which is going to be initialised - */ -void batadv_nc_init_orig(struct batadv_orig_node *orig_node) -{ - INIT_LIST_HEAD(&orig_node->in_coding_list); - INIT_LIST_HEAD(&orig_node->out_coding_list); - spin_lock_init(&orig_node->in_coding_list_lock); - spin_lock_init(&orig_node->out_coding_list_lock); -} - -/** - * batadv_nc_node_release() - release nc_node from lists and queue for free - * after rcu grace period - * @ref: kref pointer of the nc_node - */ -static void batadv_nc_node_release(struct kref *ref) -{ - struct batadv_nc_node *nc_node; - - nc_node = container_of(ref, struct batadv_nc_node, refcount); - - batadv_orig_node_put(nc_node->orig_node); - kfree_rcu(nc_node, rcu); -} - -/** - * 
batadv_nc_node_put() - decrement the nc_node refcounter and possibly - * release it - * @nc_node: nc_node to be free'd - */ -static void batadv_nc_node_put(struct batadv_nc_node *nc_node) -{ - if (!nc_node) - return; - - kref_put(&nc_node->refcount, batadv_nc_node_release); -} - -/** - * batadv_nc_path_release() - release nc_path from lists and queue for free - * after rcu grace period - * @ref: kref pointer of the nc_path - */ -static void batadv_nc_path_release(struct kref *ref) -{ - struct batadv_nc_path *nc_path; - - nc_path = container_of(ref, struct batadv_nc_path, refcount); - - kfree_rcu(nc_path, rcu); -} - -/** - * batadv_nc_path_put() - decrement the nc_path refcounter and possibly - * release it - * @nc_path: nc_path to be free'd - */ -static void batadv_nc_path_put(struct batadv_nc_path *nc_path) -{ - if (!nc_path) - return; - - kref_put(&nc_path->refcount, batadv_nc_path_release); -} - -/** - * batadv_nc_packet_free() - frees nc packet - * @nc_packet: the nc packet to free - * @dropped: whether the packet is freed because is dropped - */ -static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet, - bool dropped) -{ - if (dropped) - kfree_skb(nc_packet->skb); - else - consume_skb(nc_packet->skb); - - batadv_nc_path_put(nc_packet->nc_path); - kfree(nc_packet); -} - -/** - * batadv_nc_to_purge_nc_node() - checks whether an nc node has to be purged - * @bat_priv: the bat priv with all the mesh interface information - * @nc_node: the nc node to check - * - * Return: true if the entry has to be purged now, false otherwise - */ -static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv, - struct batadv_nc_node *nc_node) -{ - if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) - return true; - - return batadv_has_timed_out(nc_node->last_seen, BATADV_NC_NODE_TIMEOUT); -} - -/** - * batadv_nc_to_purge_nc_path_coding() - checks whether an nc path has timed out - * @bat_priv: the bat priv with all the mesh interface information - * @nc_path: the nc path to check - * - * Return: true if the entry has to be purged now, false otherwise - */ -static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv, - struct batadv_nc_path *nc_path) -{ - if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) - return true; - - /* purge the path when no packets has been added for 10 times the - * max_fwd_delay time - */ - return batadv_has_timed_out(nc_path->last_valid, - bat_priv->nc.max_fwd_delay * 10); -} - -/** - * batadv_nc_to_purge_nc_path_decoding() - checks whether an nc path has timed - * out - * @bat_priv: the bat priv with all the mesh interface information - * @nc_path: the nc path to check - * - * Return: true if the entry has to be purged now, false otherwise - */ -static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv, - struct batadv_nc_path *nc_path) -{ - if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) - return true; - - /* purge the path when no packets has been added for 10 times the - * max_buffer time - */ - return batadv_has_timed_out(nc_path->last_valid, - bat_priv->nc.max_buffer_time * 10); -} - -/** - * batadv_nc_purge_orig_nc_nodes() - go through list of nc nodes and purge stale - * entries - * @bat_priv: the bat priv with all the mesh interface information - * @list: list of nc nodes - * @lock: nc node list lock - * @to_purge: function in charge to decide whether an entry has to be purged or - * not. 
This function takes the nc node as argument and has to return - * a boolean value: true if the entry has to be deleted, false - * otherwise - */ -static void -batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv, - struct list_head *list, - spinlock_t *lock, - bool (*to_purge)(struct batadv_priv *, - struct batadv_nc_node *)) -{ - struct batadv_nc_node *nc_node, *nc_node_tmp; - - /* For each nc_node in list */ - spin_lock_bh(lock); - list_for_each_entry_safe(nc_node, nc_node_tmp, list, list) { - /* if an helper function has been passed as parameter, - * ask it if the entry has to be purged or not - */ - if (to_purge && !to_purge(bat_priv, nc_node)) - continue; - - batadv_dbg(BATADV_DBG_NC, bat_priv, - "Removing nc_node %pM -> %pM\n", - nc_node->addr, nc_node->orig_node->orig); - list_del_rcu(&nc_node->list); - batadv_nc_node_put(nc_node); - } - spin_unlock_bh(lock); -} - -/** - * batadv_nc_purge_orig() - purges all nc node data attached of the given - * originator - * @bat_priv: the bat priv with all the mesh interface information - * @orig_node: orig_node with the nc node entries to be purged - * @to_purge: function in charge to decide whether an entry has to be purged or - * not. This function takes the nc node as argument and has to return - * a boolean value: true is the entry has to be deleted, false - * otherwise - */ -void batadv_nc_purge_orig(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - bool (*to_purge)(struct batadv_priv *, - struct batadv_nc_node *)) -{ - /* Check ingoing nc_node's of this orig_node */ - batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->in_coding_list, - &orig_node->in_coding_list_lock, - to_purge); - - /* Check outgoing nc_node's of this orig_node */ - batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->out_coding_list, - &orig_node->out_coding_list_lock, - to_purge); -} - -/** - * batadv_nc_purge_orig_hash() - traverse entire originator hash to check if - * they have timed out nc nodes - * @bat_priv: the bat priv with all the mesh interface information - */ -static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv) -{ - struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_head *head; - struct batadv_orig_node *orig_node; - u32 i; - - if (!hash) - return; - - /* For each orig_node */ - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) - batadv_nc_purge_orig(bat_priv, orig_node, - batadv_nc_to_purge_nc_node); - rcu_read_unlock(); - } -} - -/** - * batadv_nc_purge_paths() - traverse all nc paths part of the hash and remove - * unused ones - * @bat_priv: the bat priv with all the mesh interface information - * @hash: hash table containing the nc paths to check - * @to_purge: function in charge to decide whether an entry has to be purged or - * not. 
This function takes the nc node as argument and has to return - * a boolean value: true is the entry has to be deleted, false - * otherwise - */ -static void batadv_nc_purge_paths(struct batadv_priv *bat_priv, - struct batadv_hashtable *hash, - bool (*to_purge)(struct batadv_priv *, - struct batadv_nc_path *)) -{ - struct hlist_head *head; - struct hlist_node *node_tmp; - struct batadv_nc_path *nc_path; - spinlock_t *lock; /* Protects lists in hash */ - u32 i; - - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - lock = &hash->list_locks[i]; - - /* For each nc_path in this bin */ - spin_lock_bh(lock); - hlist_for_each_entry_safe(nc_path, node_tmp, head, hash_entry) { - /* if an helper function has been passed as parameter, - * ask it if the entry has to be purged or not - */ - if (to_purge && !to_purge(bat_priv, nc_path)) - continue; - - /* purging an non-empty nc_path should never happen, but - * is observed under high CPU load. Delay the purging - * until next iteration to allow the packet_list to be - * emptied first. - */ - if (!unlikely(list_empty(&nc_path->packet_list))) { - net_ratelimited_function(printk, - KERN_WARNING - "Skipping free of non-empty nc_path (%pM -> %pM)!\n", - nc_path->prev_hop, - nc_path->next_hop); - continue; - } - - /* nc_path is unused, so remove it */ - batadv_dbg(BATADV_DBG_NC, bat_priv, - "Remove nc_path %pM -> %pM\n", - nc_path->prev_hop, nc_path->next_hop); - hlist_del_rcu(&nc_path->hash_entry); - batadv_nc_path_put(nc_path); - } - spin_unlock_bh(lock); - } -} - -/** - * batadv_nc_hash_key_gen() - computes the nc_path hash key - * @key: buffer to hold the final hash key - * @src: source ethernet mac address going into the hash key - * @dst: destination ethernet mac address going into the hash key - */ -static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src, - const char *dst) -{ - memcpy(key->prev_hop, src, sizeof(key->prev_hop)); - memcpy(key->next_hop, dst, sizeof(key->next_hop)); -} - -/** - * batadv_nc_hash_choose() - compute the hash value for an nc path - * @data: data to hash - * @size: size of the hash table - * - * Return: the selected index in the hash table for the given data. - */ -static u32 batadv_nc_hash_choose(const void *data, u32 size) -{ - const struct batadv_nc_path *nc_path = data; - u32 hash = 0; - - hash = jhash(&nc_path->prev_hop, sizeof(nc_path->prev_hop), hash); - hash = jhash(&nc_path->next_hop, sizeof(nc_path->next_hop), hash); - - return hash % size; -} - -/** - * batadv_nc_hash_compare() - comparing function used in the network coding hash - * tables - * @node: node in the local table - * @data2: second object to compare the node to - * - * Return: true if the two entry are the same, false otherwise - */ -static bool batadv_nc_hash_compare(const struct hlist_node *node, - const void *data2) -{ - const struct batadv_nc_path *nc_path1, *nc_path2; - - nc_path1 = container_of(node, struct batadv_nc_path, hash_entry); - nc_path2 = data2; - - /* Return 1 if the two keys are identical */ - if (!batadv_compare_eth(nc_path1->prev_hop, nc_path2->prev_hop)) - return false; - - if (!batadv_compare_eth(nc_path1->next_hop, nc_path2->next_hop)) - return false; - - return true; -} - -/** - * batadv_nc_hash_find() - search for an existing nc path and return it - * @hash: hash table containing the nc path - * @data: search key - * - * Return: the nc_path if found, NULL otherwise. 
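As an illustrative aside (standalone C, not part of this patch): the helpers above key an nc_path purely by its (prev_hop, next_hop) MAC pair — batadv_nc_hash_key_gen() copies the two addresses into a stub batadv_nc_path and batadv_nc_hash_choose() folds them through jhash modulo the table size. A sketch of the same idea; the mixing function here is only a stand-in, since jhash is kernel-internal:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define ETH_ALEN 6

struct path_key {
	uint8_t prev_hop[ETH_ALEN];
	uint8_t next_hop[ETH_ALEN];
};

/* toy FNV-1a mix, used only as a stand-in for the kernel's jhash() */
static uint32_t mix(const uint8_t *data, size_t len, uint32_t seed)
{
	uint32_t h = seed ^ 2166136261u;

	while (len--)
		h = (h ^ *data++) * 16777619u;
	return h;
}

static uint32_t path_bucket(const uint8_t *src, const uint8_t *dst, uint32_t size)
{
	struct path_key key;
	uint32_t h;

	memcpy(key.prev_hop, src, ETH_ALEN);	/* batadv_nc_hash_key_gen() */
	memcpy(key.next_hop, dst, ETH_ALEN);

	h = mix(key.prev_hop, ETH_ALEN, 0);	/* batadv_nc_hash_choose() */
	h = mix(key.next_hop, ETH_ALEN, h);
	return h % size;			/* same pair, same bin */
}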
- */ -static struct batadv_nc_path * -batadv_nc_hash_find(struct batadv_hashtable *hash, - void *data) -{ - struct hlist_head *head; - struct batadv_nc_path *nc_path, *nc_path_tmp = NULL; - int index; - - if (!hash) - return NULL; - - index = batadv_nc_hash_choose(data, hash->size); - head = &hash->table[index]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(nc_path, head, hash_entry) { - if (!batadv_nc_hash_compare(&nc_path->hash_entry, data)) - continue; - - if (!kref_get_unless_zero(&nc_path->refcount)) - continue; - - nc_path_tmp = nc_path; - break; - } - rcu_read_unlock(); - - return nc_path_tmp; -} - -/** - * batadv_nc_send_packet() - send non-coded packet and free nc_packet struct - * @nc_packet: the nc packet to send - */ -static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet) -{ - batadv_send_unicast_skb(nc_packet->skb, nc_packet->neigh_node); - nc_packet->skb = NULL; - batadv_nc_packet_free(nc_packet, false); -} - -/** - * batadv_nc_sniffed_purge() - Checks timestamp of given sniffed nc_packet. - * @bat_priv: the bat priv with all the mesh interface information - * @nc_path: the nc path the packet belongs to - * @nc_packet: the nc packet to be checked - * - * Checks whether the given sniffed (overheard) nc_packet has hit its buffering - * timeout. If so, the packet is no longer kept and the entry deleted from the - * queue. Has to be called with the appropriate locks. - * - * Return: false as soon as the entry in the fifo queue has not been timed out - * yet and true otherwise. - */ -static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv, - struct batadv_nc_path *nc_path, - struct batadv_nc_packet *nc_packet) -{ - unsigned long timeout = bat_priv->nc.max_buffer_time; - bool res = false; - - lockdep_assert_held(&nc_path->packet_list_lock); - - /* Packets are added to tail, so the remaining packets did not time - * out and we can stop processing the current queue - */ - if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE && - !batadv_has_timed_out(nc_packet->timestamp, timeout)) - goto out; - - /* purge nc packet */ - list_del(&nc_packet->list); - batadv_nc_packet_free(nc_packet, true); - - res = true; - -out: - return res; -} - -/** - * batadv_nc_fwd_flush() - Checks the timestamp of the given nc packet. - * @bat_priv: the bat priv with all the mesh interface information - * @nc_path: the nc path the packet belongs to - * @nc_packet: the nc packet to be checked - * - * Checks whether the given nc packet has hit its forward timeout. If so, the - * packet is no longer delayed, immediately sent and the entry deleted from the - * queue. Has to be called with the appropriate locks. - * - * Return: false as soon as the entry in the fifo queue has not been timed out - * yet and true otherwise. 
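As an illustrative aside (standalone C, hypothetical types, not part of this patch): both batadv_nc_sniffed_purge() and batadv_nc_fwd_flush() rely on the per-path packet list being a FIFO — packets are appended at the tail, so the first entry that has not yet expired means every later entry is younger and the walk can stop. A minimal sketch of that pattern, with list handling and locking omitted:

#include <stdbool.h>

struct queued_pkt {
	struct queued_pkt *next;
	unsigned long timestamp;	/* time the packet was buffered */
};

/* handle expired packets from the head; stop at the first fresh one */
static void flush_expired(struct queued_pkt **head, unsigned long timeout,
			  unsigned long now)
{
	while (*head && now - (*head)->timestamp > timeout) {
		struct queued_pkt *expired = *head;

		*head = expired->next;
		/* ... send (forward flush) or free (sniffed purge) expired ... */
	}
}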
- */ -static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv, - struct batadv_nc_path *nc_path, - struct batadv_nc_packet *nc_packet) -{ - unsigned long timeout = bat_priv->nc.max_fwd_delay; - - lockdep_assert_held(&nc_path->packet_list_lock); - - /* Packets are added to tail, so the remaining packets did not time - * out and we can stop processing the current queue - */ - if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE && - !batadv_has_timed_out(nc_packet->timestamp, timeout)) - return false; - - /* Send packet */ - batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD); - batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES, - nc_packet->skb->len + ETH_HLEN); - list_del(&nc_packet->list); - batadv_nc_send_packet(nc_packet); - - return true; -} - -/** - * batadv_nc_process_nc_paths() - traverse given nc packet pool and free timed - * out nc packets - * @bat_priv: the bat priv with all the mesh interface information - * @hash: to be processed hash table - * @process_fn: Function called to process given nc packet. Should return true - * to encourage this function to proceed with the next packet. - * Otherwise the rest of the current queue is skipped. - */ -static void -batadv_nc_process_nc_paths(struct batadv_priv *bat_priv, - struct batadv_hashtable *hash, - bool (*process_fn)(struct batadv_priv *, - struct batadv_nc_path *, - struct batadv_nc_packet *)) -{ - struct hlist_head *head; - struct batadv_nc_packet *nc_packet, *nc_packet_tmp; - struct batadv_nc_path *nc_path; - bool ret; - int i; - - if (!hash) - return; - - /* Loop hash table bins */ - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - /* Loop coding paths */ - rcu_read_lock(); - hlist_for_each_entry_rcu(nc_path, head, hash_entry) { - /* Loop packets */ - spin_lock_bh(&nc_path->packet_list_lock); - list_for_each_entry_safe(nc_packet, nc_packet_tmp, - &nc_path->packet_list, list) { - ret = process_fn(bat_priv, nc_path, nc_packet); - if (!ret) - break; - } - spin_unlock_bh(&nc_path->packet_list_lock); - } - rcu_read_unlock(); - } -} - -/** - * batadv_nc_worker() - periodic task for housekeeping related to network - * coding - * @work: kernel work struct - */ -static void batadv_nc_worker(struct work_struct *work) -{ - struct delayed_work *delayed_work; - struct batadv_priv_nc *priv_nc; - struct batadv_priv *bat_priv; - unsigned long timeout; - - delayed_work = to_delayed_work(work); - priv_nc = container_of(delayed_work, struct batadv_priv_nc, work); - bat_priv = container_of(priv_nc, struct batadv_priv, nc); - - batadv_nc_purge_orig_hash(bat_priv); - batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, - batadv_nc_to_purge_nc_path_coding); - batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, - batadv_nc_to_purge_nc_path_decoding); - - timeout = bat_priv->nc.max_fwd_delay; - - if (batadv_has_timed_out(bat_priv->nc.timestamp_fwd_flush, timeout)) { - batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.coding_hash, - batadv_nc_fwd_flush); - bat_priv->nc.timestamp_fwd_flush = jiffies; - } - - if (batadv_has_timed_out(bat_priv->nc.timestamp_sniffed_purge, - bat_priv->nc.max_buffer_time)) { - batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.decoding_hash, - batadv_nc_sniffed_purge); - bat_priv->nc.timestamp_sniffed_purge = jiffies; - } - - /* Schedule a new check */ - batadv_nc_start_timer(bat_priv); -} - -/** - * batadv_can_nc_with_orig() - checks whether the given orig node is suitable - * for coding or not - * @bat_priv: the bat priv with all the mesh interface information - * @orig_node: 
neighboring orig node which may be used as nc candidate - * @ogm_packet: incoming ogm packet also used for the checks - * - * Return: true if: - * 1) The OGM must have the most recent sequence number. - * 2) The TTL must be decremented by one and only one. - * 3) The OGM must be received from the first hop from orig_node. - * 4) The TQ value of the OGM must be above bat_priv->nc.min_tq. - */ -static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_ogm_packet *ogm_packet) -{ - struct batadv_orig_ifinfo *orig_ifinfo; - u32 last_real_seqno; - u8 last_ttl; - - orig_ifinfo = batadv_orig_ifinfo_get(orig_node, BATADV_IF_DEFAULT); - if (!orig_ifinfo) - return false; - - last_ttl = orig_ifinfo->last_ttl; - last_real_seqno = orig_ifinfo->last_real_seqno; - batadv_orig_ifinfo_put(orig_ifinfo); - - if (last_real_seqno != ntohl(ogm_packet->seqno)) - return false; - if (last_ttl != ogm_packet->ttl + 1) - return false; - if (!batadv_compare_eth(ogm_packet->orig, ogm_packet->prev_sender)) - return false; - if (ogm_packet->tq < bat_priv->nc.min_tq) - return false; - - return true; -} - -/** - * batadv_nc_find_nc_node() - search for an existing nc node and return it - * @orig_node: orig node originating the ogm packet - * @orig_neigh_node: neighboring orig node from which we received the ogm packet - * (can be equal to orig_node) - * @in_coding: traverse incoming or outgoing network coding list - * - * Return: the nc_node if found, NULL otherwise. - */ -static struct batadv_nc_node * -batadv_nc_find_nc_node(struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - bool in_coding) -{ - struct batadv_nc_node *nc_node, *nc_node_out = NULL; - struct list_head *list; - - if (in_coding) - list = &orig_neigh_node->in_coding_list; - else - list = &orig_neigh_node->out_coding_list; - - /* Traverse list of nc_nodes to orig_node */ - rcu_read_lock(); - list_for_each_entry_rcu(nc_node, list, list) { - if (!batadv_compare_eth(nc_node->addr, orig_node->orig)) - continue; - - if (!kref_get_unless_zero(&nc_node->refcount)) - continue; - - /* Found a match */ - nc_node_out = nc_node; - break; - } - rcu_read_unlock(); - - return nc_node_out; -} - -/** - * batadv_nc_get_nc_node() - retrieves an nc node or creates the entry if it was - * not found - * @bat_priv: the bat priv with all the mesh interface information - * @orig_node: orig node originating the ogm packet - * @orig_neigh_node: neighboring orig node from which we received the ogm packet - * (can be equal to orig_node) - * @in_coding: traverse incoming or outgoing network coding list - * - * Return: the nc_node if found or created, NULL in case of an error. 
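As an illustrative aside (standalone C, hypothetical names, not part of this patch): batadv_can_nc_with_orig() above distills to four checks on the incoming OGM — it must carry the newest known sequence number, its TTL must be exactly one below the stored value, it must have been heard directly from its originator (orig equals prev_sender), and its TQ must clear the configured minimum. Restated as a standalone predicate with plain parameters:

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

static bool can_code_with(uint32_t last_seqno, uint32_t ogm_seqno,
			  uint8_t last_ttl, uint8_t ogm_ttl,
			  const uint8_t *ogm_orig, const uint8_t *ogm_prev_sender,
			  uint8_t ogm_tq, uint8_t min_tq)
{
	if (ogm_seqno != last_seqno)			/* 1) freshest OGM only */
		return false;
	if (last_ttl != ogm_ttl + 1)			/* 2) exactly one hop consumed */
		return false;
	if (memcmp(ogm_orig, ogm_prev_sender, 6))	/* 3) heard directly from the originator */
		return false;
	if (ogm_tq < min_tq)				/* 4) link quality above threshold */
		return false;
	return true;
}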
- */ -static struct batadv_nc_node * -batadv_nc_get_nc_node(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - bool in_coding) -{ - struct batadv_nc_node *nc_node; - spinlock_t *lock; /* Used to lock list selected by "int in_coding" */ - struct list_head *list; - - /* Select ingoing or outgoing coding node */ - if (in_coding) { - lock = &orig_neigh_node->in_coding_list_lock; - list = &orig_neigh_node->in_coding_list; - } else { - lock = &orig_neigh_node->out_coding_list_lock; - list = &orig_neigh_node->out_coding_list; - } - - spin_lock_bh(lock); - - /* Check if nc_node is already added */ - nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding); - - /* Node found */ - if (nc_node) - goto unlock; - - nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC); - if (!nc_node) - goto unlock; - - /* Initialize nc_node */ - INIT_LIST_HEAD(&nc_node->list); - kref_init(&nc_node->refcount); - ether_addr_copy(nc_node->addr, orig_node->orig); - kref_get(&orig_neigh_node->refcount); - nc_node->orig_node = orig_neigh_node; - - batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_node %pM -> %pM\n", - nc_node->addr, nc_node->orig_node->orig); - - /* Add nc_node to orig_node */ - kref_get(&nc_node->refcount); - list_add_tail_rcu(&nc_node->list, list); - -unlock: - spin_unlock_bh(lock); - - return nc_node; -} - -/** - * batadv_nc_update_nc_node() - updates stored incoming and outgoing nc node - * structs (best called on incoming OGMs) - * @bat_priv: the bat priv with all the mesh interface information - * @orig_node: orig node originating the ogm packet - * @orig_neigh_node: neighboring orig node from which we received the ogm packet - * (can be equal to orig_node) - * @ogm_packet: incoming ogm packet - * @is_single_hop_neigh: orig_node is a single hop neighbor - */ -void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - struct batadv_ogm_packet *ogm_packet, - int is_single_hop_neigh) -{ - struct batadv_nc_node *in_nc_node = NULL; - struct batadv_nc_node *out_nc_node = NULL; - - /* Check if network coding is enabled */ - if (!atomic_read(&bat_priv->network_coding)) - goto out; - - /* check if orig node is network coding enabled */ - if (!test_bit(BATADV_ORIG_CAPA_HAS_NC, &orig_node->capabilities)) - goto out; - - /* accept ogms from 'good' neighbors and single hop neighbors */ - if (!batadv_can_nc_with_orig(bat_priv, orig_node, ogm_packet) && - !is_single_hop_neigh) - goto out; - - /* Add orig_node as in_nc_node on hop */ - in_nc_node = batadv_nc_get_nc_node(bat_priv, orig_node, - orig_neigh_node, true); - if (!in_nc_node) - goto out; - - in_nc_node->last_seen = jiffies; - - /* Add hop as out_nc_node on orig_node */ - out_nc_node = batadv_nc_get_nc_node(bat_priv, orig_neigh_node, - orig_node, false); - if (!out_nc_node) - goto out; - - out_nc_node->last_seen = jiffies; - -out: - batadv_nc_node_put(in_nc_node); - batadv_nc_node_put(out_nc_node); -} - -/** - * batadv_nc_get_path() - get existing nc_path or allocate a new one - * @bat_priv: the bat priv with all the mesh interface information - * @hash: hash table containing the nc path - * @src: ethernet source address - first half of the nc path search key - * @dst: ethernet destination address - second half of the nc path search key - * - * Return: pointer to nc_path if the path was found or created, returns NULL - * on error. 
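As an illustrative aside (standalone C, hypothetical names, not part of this patch): batadv_nc_get_nc_node() above is the usual find-or-create pattern under the list spinlock, and it hands out two references when it allocates — kref_init() creates the reference returned to the caller, and the extra kref_get() is owned by the RCU list the node is added to. A simplified userspace sketch of that reference discipline, with locking and RCU omitted and a plain counter instead of struct kref:

#include <stdlib.h>
#include <string.h>

struct nc_node_sketch {
	struct nc_node_sketch *next;
	unsigned char addr[6];
	unsigned int refcount;
};

static struct nc_node_sketch *get_or_create(struct nc_node_sketch **list,
					     const unsigned char *addr)
{
	struct nc_node_sketch *n;

	for (n = *list; n; n = n->next)
		if (!memcmp(n->addr, addr, 6)) {
			n->refcount++;		/* reference handed to the caller */
			return n;
		}

	n = calloc(1, sizeof(*n));
	if (!n)
		return NULL;

	memcpy(n->addr, addr, 6);
	n->refcount = 1;			/* caller's reference (kref_init) */
	n->refcount++;				/* list's reference (kref_get) */
	n->next = *list;
	*list = n;
	return n;
}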
- */ -static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, - struct batadv_hashtable *hash, - u8 *src, - u8 *dst) -{ - int hash_added; - struct batadv_nc_path *nc_path, nc_path_key; - - batadv_nc_hash_key_gen(&nc_path_key, src, dst); - - /* Search for existing nc_path */ - nc_path = batadv_nc_hash_find(hash, (void *)&nc_path_key); - - if (nc_path) { - /* Set timestamp to delay removal of nc_path */ - nc_path->last_valid = jiffies; - return nc_path; - } - - /* No existing nc_path was found; create a new */ - nc_path = kzalloc(sizeof(*nc_path), GFP_ATOMIC); - - if (!nc_path) - return NULL; - - /* Initialize nc_path */ - INIT_LIST_HEAD(&nc_path->packet_list); - spin_lock_init(&nc_path->packet_list_lock); - kref_init(&nc_path->refcount); - nc_path->last_valid = jiffies; - ether_addr_copy(nc_path->next_hop, dst); - ether_addr_copy(nc_path->prev_hop, src); - - batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_path %pM -> %pM\n", - nc_path->prev_hop, - nc_path->next_hop); - - /* Add nc_path to hash table */ - kref_get(&nc_path->refcount); - hash_added = batadv_hash_add(hash, batadv_nc_hash_compare, - batadv_nc_hash_choose, &nc_path_key, - &nc_path->hash_entry); - - if (hash_added < 0) { - kfree(nc_path); - return NULL; - } - - return nc_path; -} - -/** - * batadv_nc_random_weight_tq() - scale the receivers TQ-value to avoid unfair - * selection of a receiver with slightly lower TQ than the other - * @tq: to be weighted tq value - * - * Return: scaled tq value - */ -static u8 batadv_nc_random_weight_tq(u8 tq) -{ - /* randomize the estimated packet loss (max TQ - estimated TQ) */ - u8 rand_tq = get_random_u32_below(BATADV_TQ_MAX_VALUE + 1 - tq); - - /* convert to (randomized) estimated tq again */ - return BATADV_TQ_MAX_VALUE - rand_tq; -} - -/** - * batadv_nc_memxor() - XOR destination with source - * @dst: byte array to XOR into - * @src: byte array to XOR from - * @len: length of destination array - */ -static void batadv_nc_memxor(char *dst, const char *src, unsigned int len) -{ - unsigned int i; - - for (i = 0; i < len; ++i) - dst[i] ^= src[i]; -} - -/** - * batadv_nc_code_packets() - code a received unicast_packet with an nc packet - * into a coded_packet and send it - * @bat_priv: the bat priv with all the mesh interface information - * @skb: data skb to forward - * @ethhdr: pointer to the ethernet header inside the skb - * @nc_packet: structure containing the packet to the skb can be coded with - * @neigh_node: next hop to forward packet to - * - * Return: true if both packets are consumed, false otherwise. 
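As an illustrative aside (standalone C, not part of this patch): batadv_nc_memxor() above is the entire coding primitive — XOR is its own inverse, so a relay can transmit A ^ B once and each receiver recovers the other packet by XOR-ing in the copy it already holds. Because XOR with zero is the identity, the shorter payload is effectively zero-padded: it is coded into the front of the longer one and the tail bytes pass through unchanged, which is what the "code into the largest" comment below refers to. Likewise, batadv_nc_random_weight_tq() above simply draws the weighted TQ uniformly from [tq, BATADV_TQ_MAX_VALUE], so a slightly weaker receiver still wins the MAC-destination slot occasionally. A standalone round-trip demonstration of the XOR step:

#include <assert.h>
#include <stdint.h>
#include <string.h>

static void memxor(uint8_t *dst, const uint8_t *src, size_t len)
{
	for (size_t i = 0; i < len; i++)
		dst[i] ^= src[i];
}

int main(void)
{
	uint8_t a[5] = { 0x11, 0x22, 0x33, 0x44, 0x55 };	/* longer payload */
	uint8_t b[3] = { 0xaa, 0xbb, 0xcc };			/* shorter payload */
	uint8_t coded[5], got_a[5], got_b[5];

	/* relay: code the shorter packet into the longer one */
	memcpy(coded, a, sizeof(a));
	memxor(coded, b, sizeof(b));

	/* receiver that already buffered b: XOR b back in to recover a */
	memcpy(got_a, coded, sizeof(coded));
	memxor(got_a, b, sizeof(b));
	assert(!memcmp(got_a, a, sizeof(a)));

	/* receiver that already buffered a: XOR a in, keep the coded_len prefix */
	memcpy(got_b, coded, sizeof(coded));
	memxor(got_b, a, sizeof(b));
	assert(!memcmp(got_b, b, sizeof(b)));

	return 0;
}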
- */ -static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, - struct sk_buff *skb, - struct ethhdr *ethhdr, - struct batadv_nc_packet *nc_packet, - struct batadv_neigh_node *neigh_node) -{ - u8 tq_weighted_neigh, tq_weighted_coding, tq_tmp; - struct sk_buff *skb_dest, *skb_src; - struct batadv_unicast_packet *packet1; - struct batadv_unicast_packet *packet2; - struct batadv_coded_packet *coded_packet; - struct batadv_neigh_node *neigh_tmp, *router_neigh, *first_dest; - struct batadv_neigh_node *router_coding = NULL, *second_dest; - struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL; - struct batadv_neigh_ifinfo *router_coding_ifinfo = NULL; - u8 *first_source, *second_source; - __be32 packet_id1, packet_id2; - size_t count; - bool res = false; - int coding_len; - int unicast_size = sizeof(*packet1); - int coded_size = sizeof(*coded_packet); - int header_add = coded_size - unicast_size; - - /* TODO: do we need to consider the outgoing interface for - * coded packets? - */ - router_neigh = batadv_orig_router_get(neigh_node->orig_node, - BATADV_IF_DEFAULT); - if (!router_neigh) - goto out; - - router_neigh_ifinfo = batadv_neigh_ifinfo_get(router_neigh, - BATADV_IF_DEFAULT); - if (!router_neigh_ifinfo) - goto out; - - neigh_tmp = nc_packet->neigh_node; - router_coding = batadv_orig_router_get(neigh_tmp->orig_node, - BATADV_IF_DEFAULT); - if (!router_coding) - goto out; - - router_coding_ifinfo = batadv_neigh_ifinfo_get(router_coding, - BATADV_IF_DEFAULT); - if (!router_coding_ifinfo) - goto out; - - tq_tmp = router_neigh_ifinfo->bat_iv.tq_avg; - tq_weighted_neigh = batadv_nc_random_weight_tq(tq_tmp); - tq_tmp = router_coding_ifinfo->bat_iv.tq_avg; - tq_weighted_coding = batadv_nc_random_weight_tq(tq_tmp); - - /* Select one destination for the MAC-header dst-field based on - * weighted TQ-values. - */ - if (tq_weighted_neigh >= tq_weighted_coding) { - /* Destination from nc_packet is selected for MAC-header */ - first_dest = nc_packet->neigh_node; - first_source = nc_packet->nc_path->prev_hop; - second_dest = neigh_node; - second_source = ethhdr->h_source; - packet1 = (struct batadv_unicast_packet *)nc_packet->skb->data; - packet2 = (struct batadv_unicast_packet *)skb->data; - packet_id1 = nc_packet->packet_id; - packet_id2 = batadv_skb_crc32(skb, - skb->data + sizeof(*packet2)); - } else { - /* Destination for skb is selected for MAC-header */ - first_dest = neigh_node; - first_source = ethhdr->h_source; - second_dest = nc_packet->neigh_node; - second_source = nc_packet->nc_path->prev_hop; - packet1 = (struct batadv_unicast_packet *)skb->data; - packet2 = (struct batadv_unicast_packet *)nc_packet->skb->data; - packet_id1 = batadv_skb_crc32(skb, - skb->data + sizeof(*packet1)); - packet_id2 = nc_packet->packet_id; - } - - /* Instead of zero padding the smallest data buffer, we - * code into the largest. 
- */ - if (skb->len <= nc_packet->skb->len) { - skb_dest = nc_packet->skb; - skb_src = skb; - } else { - skb_dest = skb; - skb_src = nc_packet->skb; - } - - /* coding_len is used when decoding the packet shorter packet */ - coding_len = skb_src->len - unicast_size; - - if (skb_linearize(skb_dest) < 0 || skb_linearize(skb_src) < 0) - goto out; - - skb_push(skb_dest, header_add); - - coded_packet = (struct batadv_coded_packet *)skb_dest->data; - skb_reset_mac_header(skb_dest); - - coded_packet->packet_type = BATADV_CODED; - coded_packet->version = BATADV_COMPAT_VERSION; - coded_packet->ttl = packet1->ttl; - - /* Info about first unicast packet */ - ether_addr_copy(coded_packet->first_source, first_source); - ether_addr_copy(coded_packet->first_orig_dest, packet1->dest); - coded_packet->first_crc = packet_id1; - coded_packet->first_ttvn = packet1->ttvn; - - /* Info about second unicast packet */ - ether_addr_copy(coded_packet->second_dest, second_dest->addr); - ether_addr_copy(coded_packet->second_source, second_source); - ether_addr_copy(coded_packet->second_orig_dest, packet2->dest); - coded_packet->second_crc = packet_id2; - coded_packet->second_ttl = packet2->ttl; - coded_packet->second_ttvn = packet2->ttvn; - coded_packet->coded_len = htons(coding_len); - - /* This is where the magic happens: Code skb_src into skb_dest */ - batadv_nc_memxor(skb_dest->data + coded_size, - skb_src->data + unicast_size, coding_len); - - /* Update counters accordingly */ - if (BATADV_SKB_CB(skb_src)->decoded && - BATADV_SKB_CB(skb_dest)->decoded) { - /* Both packets are recoded */ - count = skb_src->len + ETH_HLEN; - count += skb_dest->len + ETH_HLEN; - batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE, 2); - batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, count); - } else if (!BATADV_SKB_CB(skb_src)->decoded && - !BATADV_SKB_CB(skb_dest)->decoded) { - /* Both packets are newly coded */ - count = skb_src->len + ETH_HLEN; - count += skb_dest->len + ETH_HLEN; - batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE, 2); - batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, count); - } else if (BATADV_SKB_CB(skb_src)->decoded && - !BATADV_SKB_CB(skb_dest)->decoded) { - /* skb_src recoded and skb_dest is newly coded */ - batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE); - batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, - skb_src->len + ETH_HLEN); - batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE); - batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, - skb_dest->len + ETH_HLEN); - } else if (!BATADV_SKB_CB(skb_src)->decoded && - BATADV_SKB_CB(skb_dest)->decoded) { - /* skb_src is newly coded and skb_dest is recoded */ - batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE); - batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, - skb_src->len + ETH_HLEN); - batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE); - batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, - skb_dest->len + ETH_HLEN); - } - - /* skb_src is now coded into skb_dest, so free it */ - consume_skb(skb_src); - - /* avoid duplicate free of skb from nc_packet */ - nc_packet->skb = NULL; - batadv_nc_packet_free(nc_packet, false); - - /* Send the coded packet and return true */ - batadv_send_unicast_skb(skb_dest, first_dest); - res = true; -out: - batadv_neigh_node_put(router_neigh); - batadv_neigh_node_put(router_coding); - batadv_neigh_ifinfo_put(router_neigh_ifinfo); - batadv_neigh_ifinfo_put(router_coding_ifinfo); - return res; -} - -/** - * batadv_nc_skb_coding_possible() - true if a decoded skb is available at dst. 
- * @skb: data skb to forward - * @dst: destination mac address of the other skb to code with - * @src: source mac address of skb - * - * Whenever we network code a packet we have to check whether we received it in - * a network coded form. If so, we may not be able to use it for coding because - * some neighbors may also have received (overheard) the packet in the network - * coded form without being able to decode it. It is hard to know which of the - * neighboring nodes was able to decode the packet, therefore we can only - * re-code the packet if the source of the previous encoded packet is involved. - * Since the source encoded the packet we can be certain it has all necessary - * decode information. - * - * Return: true if coding of a decoded packet is allowed. - */ -static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src) -{ - if (BATADV_SKB_CB(skb)->decoded && !batadv_compare_eth(dst, src)) - return false; - return true; -} - -/** - * batadv_nc_path_search() - Find the coding path matching in_nc_node and - * out_nc_node to retrieve a buffered packet that can be used for coding. - * @bat_priv: the bat priv with all the mesh interface information - * @in_nc_node: pointer to skb next hop's neighbor nc node - * @out_nc_node: pointer to skb source's neighbor nc node - * @skb: data skb to forward - * @eth_dst: next hop mac address of skb - * - * Return: true if coding of a decoded skb is allowed. - */ -static struct batadv_nc_packet * -batadv_nc_path_search(struct batadv_priv *bat_priv, - struct batadv_nc_node *in_nc_node, - struct batadv_nc_node *out_nc_node, - struct sk_buff *skb, - u8 *eth_dst) -{ - struct batadv_nc_path *nc_path, nc_path_key; - struct batadv_nc_packet *nc_packet_out = NULL; - struct batadv_nc_packet *nc_packet, *nc_packet_tmp; - struct batadv_hashtable *hash = bat_priv->nc.coding_hash; - int idx; - - if (!hash) - return NULL; - - /* Create almost path key */ - batadv_nc_hash_key_gen(&nc_path_key, in_nc_node->addr, - out_nc_node->addr); - idx = batadv_nc_hash_choose(&nc_path_key, hash->size); - - /* Check for coding opportunities in this nc_path */ - rcu_read_lock(); - hlist_for_each_entry_rcu(nc_path, &hash->table[idx], hash_entry) { - if (!batadv_compare_eth(nc_path->prev_hop, in_nc_node->addr)) - continue; - - if (!batadv_compare_eth(nc_path->next_hop, out_nc_node->addr)) - continue; - - spin_lock_bh(&nc_path->packet_list_lock); - if (list_empty(&nc_path->packet_list)) { - spin_unlock_bh(&nc_path->packet_list_lock); - continue; - } - - list_for_each_entry_safe(nc_packet, nc_packet_tmp, - &nc_path->packet_list, list) { - if (!batadv_nc_skb_coding_possible(nc_packet->skb, - eth_dst, - in_nc_node->addr)) - continue; - - /* Coding opportunity is found! */ - list_del(&nc_packet->list); - nc_packet_out = nc_packet; - break; - } - - spin_unlock_bh(&nc_path->packet_list_lock); - break; - } - rcu_read_unlock(); - - return nc_packet_out; -} - -/** - * batadv_nc_skb_src_search() - Loops through the list of neighboring nodes of - * the skb's sender (may be equal to the originator). - * @bat_priv: the bat priv with all the mesh interface information - * @skb: data skb to forward - * @eth_dst: next hop mac address of skb - * @eth_src: source mac address of skb - * @in_nc_node: pointer to skb next hop's neighbor nc node - * - * Return: an nc packet if a suitable coding packet was found, NULL otherwise. 
- */ -static struct batadv_nc_packet * -batadv_nc_skb_src_search(struct batadv_priv *bat_priv, - struct sk_buff *skb, - u8 *eth_dst, - u8 *eth_src, - struct batadv_nc_node *in_nc_node) -{ - struct batadv_orig_node *orig_node; - struct batadv_nc_node *out_nc_node; - struct batadv_nc_packet *nc_packet = NULL; - - orig_node = batadv_orig_hash_find(bat_priv, eth_src); - if (!orig_node) - return NULL; - - rcu_read_lock(); - list_for_each_entry_rcu(out_nc_node, - &orig_node->out_coding_list, list) { - /* Check if the skb is decoded and if recoding is possible */ - if (!batadv_nc_skb_coding_possible(skb, - out_nc_node->addr, eth_src)) - continue; - - /* Search for an opportunity in this nc_path */ - nc_packet = batadv_nc_path_search(bat_priv, in_nc_node, - out_nc_node, skb, eth_dst); - if (nc_packet) - break; - } - rcu_read_unlock(); - - batadv_orig_node_put(orig_node); - return nc_packet; -} - -/** - * batadv_nc_skb_store_before_coding() - set the ethernet src and dst of the - * unicast skb before it is stored for use in later decoding - * @bat_priv: the bat priv with all the mesh interface information - * @skb: data skb to store - * @eth_dst_new: new destination mac address of skb - */ -static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, - struct sk_buff *skb, - u8 *eth_dst_new) -{ - struct ethhdr *ethhdr; - - /* Copy skb header to change the mac header */ - skb = pskb_copy_for_clone(skb, GFP_ATOMIC); - if (!skb) - return; - - /* Set the mac header as if we actually sent the packet uncoded */ - ethhdr = eth_hdr(skb); - ether_addr_copy(ethhdr->h_source, ethhdr->h_dest); - ether_addr_copy(ethhdr->h_dest, eth_dst_new); - - /* Set data pointer to MAC header to mimic packets from our tx path */ - skb_push(skb, ETH_HLEN); - - /* Add the packet to the decoding packet pool */ - batadv_nc_skb_store_for_decoding(bat_priv, skb); - - /* batadv_nc_skb_store_for_decoding() clones the skb, so we must free - * our ref - */ - consume_skb(skb); -} - -/** - * batadv_nc_skb_dst_search() - Loops through list of neighboring nodes to dst. - * @skb: data skb to forward - * @neigh_node: next hop to forward packet to - * @ethhdr: pointer to the ethernet header inside the skb - * - * Loops through the list of neighboring nodes the next hop has a good - * connection to (receives OGMs with a sufficient quality). We need to find a - * neighbor of our next hop that potentially sent a packet which our next hop - * also received (overheard) and has stored for later decoding. 
- * - * Return: true if the skb was consumed (encoded packet sent) or false otherwise - */ -static bool batadv_nc_skb_dst_search(struct sk_buff *skb, - struct batadv_neigh_node *neigh_node, - struct ethhdr *ethhdr) -{ - struct net_device *netdev = neigh_node->if_incoming->mesh_iface; - struct batadv_priv *bat_priv = netdev_priv(netdev); - struct batadv_orig_node *orig_node = neigh_node->orig_node; - struct batadv_nc_node *nc_node; - struct batadv_nc_packet *nc_packet = NULL; - - rcu_read_lock(); - list_for_each_entry_rcu(nc_node, &orig_node->in_coding_list, list) { - /* Search for coding opportunity with this in_nc_node */ - nc_packet = batadv_nc_skb_src_search(bat_priv, skb, - neigh_node->addr, - ethhdr->h_source, nc_node); - - /* Opportunity was found, so stop searching */ - if (nc_packet) - break; - } - rcu_read_unlock(); - - if (!nc_packet) - return false; - - /* Save packets for later decoding */ - batadv_nc_skb_store_before_coding(bat_priv, skb, - neigh_node->addr); - batadv_nc_skb_store_before_coding(bat_priv, nc_packet->skb, - nc_packet->neigh_node->addr); - - /* Code and send packets */ - if (batadv_nc_code_packets(bat_priv, skb, ethhdr, nc_packet, - neigh_node)) - return true; - - /* out of mem ? Coding failed - we have to free the buffered packet - * to avoid memleaks. The skb passed as argument will be dealt with - * by the calling function. - */ - batadv_nc_send_packet(nc_packet); - return false; -} - -/** - * batadv_nc_skb_add_to_path() - buffer skb for later encoding / decoding - * @skb: skb to add to path - * @nc_path: path to add skb to - * @neigh_node: next hop to forward packet to - * @packet_id: checksum to identify packet - * - * Return: true if the packet was buffered or false in case of an error. - */ -static bool batadv_nc_skb_add_to_path(struct sk_buff *skb, - struct batadv_nc_path *nc_path, - struct batadv_neigh_node *neigh_node, - __be32 packet_id) -{ - struct batadv_nc_packet *nc_packet; - - nc_packet = kzalloc(sizeof(*nc_packet), GFP_ATOMIC); - if (!nc_packet) - return false; - - /* Initialize nc_packet */ - nc_packet->timestamp = jiffies; - nc_packet->packet_id = packet_id; - nc_packet->skb = skb; - nc_packet->neigh_node = neigh_node; - nc_packet->nc_path = nc_path; - - /* Add coding packet to list */ - spin_lock_bh(&nc_path->packet_list_lock); - list_add_tail(&nc_packet->list, &nc_path->packet_list); - spin_unlock_bh(&nc_path->packet_list_lock); - - return true; -} - -/** - * batadv_nc_skb_forward() - try to code a packet or add it to the coding packet - * buffer - * @skb: data skb to forward - * @neigh_node: next hop to forward packet to - * - * Return: true if the skb was consumed (encoded packet sent) or false otherwise - */ -bool batadv_nc_skb_forward(struct sk_buff *skb, - struct batadv_neigh_node *neigh_node) -{ - const struct net_device *netdev = neigh_node->if_incoming->mesh_iface; - struct batadv_priv *bat_priv = netdev_priv(netdev); - struct batadv_unicast_packet *packet; - struct batadv_nc_path *nc_path; - struct ethhdr *ethhdr = eth_hdr(skb); - __be32 packet_id; - u8 *payload; - - /* Check if network coding is enabled */ - if (!atomic_read(&bat_priv->network_coding)) - goto out; - - /* We only handle unicast packets */ - payload = skb_network_header(skb); - packet = (struct batadv_unicast_packet *)payload; - if (packet->packet_type != BATADV_UNICAST) - goto out; - - /* Try to find a coding opportunity and send the skb if one is found */ - if (batadv_nc_skb_dst_search(skb, neigh_node, ethhdr)) - return true; - - /* Find or create a nc_path for this 
src-dst pair */ - nc_path = batadv_nc_get_path(bat_priv, - bat_priv->nc.coding_hash, - ethhdr->h_source, - neigh_node->addr); - - if (!nc_path) - goto out; - - /* Add skb to nc_path */ - packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet)); - if (!batadv_nc_skb_add_to_path(skb, nc_path, neigh_node, packet_id)) - goto free_nc_path; - - /* Packet is consumed */ - return true; - -free_nc_path: - batadv_nc_path_put(nc_path); -out: - /* Packet is not consumed */ - return false; -} - -/** - * batadv_nc_skb_store_for_decoding() - save a clone of the skb which can be - * used when decoding coded packets - * @bat_priv: the bat priv with all the mesh interface information - * @skb: data skb to store - */ -void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, - struct sk_buff *skb) -{ - struct batadv_unicast_packet *packet; - struct batadv_nc_path *nc_path; - struct ethhdr *ethhdr = eth_hdr(skb); - __be32 packet_id; - u8 *payload; - - /* Check if network coding is enabled */ - if (!atomic_read(&bat_priv->network_coding)) - goto out; - - /* Check for supported packet type */ - payload = skb_network_header(skb); - packet = (struct batadv_unicast_packet *)payload; - if (packet->packet_type != BATADV_UNICAST) - goto out; - - /* Find existing nc_path or create a new */ - nc_path = batadv_nc_get_path(bat_priv, - bat_priv->nc.decoding_hash, - ethhdr->h_source, - ethhdr->h_dest); - - if (!nc_path) - goto out; - - /* Clone skb and adjust skb->data to point at batman header */ - skb = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!skb)) - goto free_nc_path; - - if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) - goto free_skb; - - if (unlikely(!skb_pull_rcsum(skb, ETH_HLEN))) - goto free_skb; - - /* Add skb to nc_path */ - packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet)); - if (!batadv_nc_skb_add_to_path(skb, nc_path, NULL, packet_id)) - goto free_skb; - - batadv_inc_counter(bat_priv, BATADV_CNT_NC_BUFFER); - return; - -free_skb: - kfree_skb(skb); -free_nc_path: - batadv_nc_path_put(nc_path); -out: - return; -} - -/** - * batadv_nc_skb_store_sniffed_unicast() - check if a received unicast packet - * should be saved in the decoding buffer and, if so, store it there - * @bat_priv: the bat priv with all the mesh interface information - * @skb: unicast skb to store - */ -void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, - struct sk_buff *skb) -{ - struct ethhdr *ethhdr = eth_hdr(skb); - - if (batadv_is_my_mac(bat_priv, ethhdr->h_dest)) - return; - - /* Set data pointer to MAC header to mimic packets from our tx path */ - skb_push(skb, ETH_HLEN); - - batadv_nc_skb_store_for_decoding(bat_priv, skb); -} - -/** - * batadv_nc_skb_decode_packet() - decode given skb using the decode data stored - * in nc_packet - * @bat_priv: the bat priv with all the mesh interface information - * @skb: unicast skb to decode - * @nc_packet: decode data needed to decode the skb - * - * Return: pointer to decoded unicast packet if the packet was decoded or NULL - * in case of an error. 
- */ -static struct batadv_unicast_packet * -batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, - struct batadv_nc_packet *nc_packet) -{ - const int h_size = sizeof(struct batadv_unicast_packet); - const int h_diff = sizeof(struct batadv_coded_packet) - h_size; - struct batadv_unicast_packet *unicast_packet; - struct batadv_coded_packet coded_packet_tmp; - struct ethhdr *ethhdr, ethhdr_tmp; - u8 *orig_dest, ttl, ttvn; - unsigned int coding_len; - int err; - - /* Save headers temporarily */ - memcpy(&coded_packet_tmp, skb->data, sizeof(coded_packet_tmp)); - memcpy(ðhdr_tmp, skb_mac_header(skb), sizeof(ethhdr_tmp)); - - if (skb_cow(skb, 0) < 0) - return NULL; - - if (unlikely(!skb_pull_rcsum(skb, h_diff))) - return NULL; - - /* Data points to batman header, so set mac header 14 bytes before - * and network to data - */ - skb_set_mac_header(skb, -ETH_HLEN); - skb_reset_network_header(skb); - - /* Reconstruct original mac header */ - ethhdr = eth_hdr(skb); - *ethhdr = ethhdr_tmp; - - /* Select the correct unicast header information based on the location - * of our mac address in the coded_packet header - */ - if (batadv_is_my_mac(bat_priv, coded_packet_tmp.second_dest)) { - /* If we are the second destination the packet was overheard, - * so the Ethernet address must be copied to h_dest and - * pkt_type changed from PACKET_OTHERHOST to PACKET_HOST - */ - ether_addr_copy(ethhdr->h_dest, coded_packet_tmp.second_dest); - skb->pkt_type = PACKET_HOST; - - orig_dest = coded_packet_tmp.second_orig_dest; - ttl = coded_packet_tmp.second_ttl; - ttvn = coded_packet_tmp.second_ttvn; - } else { - orig_dest = coded_packet_tmp.first_orig_dest; - ttl = coded_packet_tmp.ttl; - ttvn = coded_packet_tmp.first_ttvn; - } - - coding_len = ntohs(coded_packet_tmp.coded_len); - - if (coding_len > skb->len) - return NULL; - - /* Here the magic is reversed: - * extract the missing packet from the received coded packet - */ - batadv_nc_memxor(skb->data + h_size, - nc_packet->skb->data + h_size, - coding_len); - - /* Resize decoded skb if decoded with larger packet */ - if (nc_packet->skb->len > coding_len + h_size) { - err = pskb_trim_rcsum(skb, coding_len + h_size); - if (err) - return NULL; - } - - /* Create decoded unicast packet */ - unicast_packet = (struct batadv_unicast_packet *)skb->data; - unicast_packet->packet_type = BATADV_UNICAST; - unicast_packet->version = BATADV_COMPAT_VERSION; - unicast_packet->ttl = ttl; - ether_addr_copy(unicast_packet->dest, orig_dest); - unicast_packet->ttvn = ttvn; - - batadv_nc_packet_free(nc_packet, false); - return unicast_packet; -} - -/** - * batadv_nc_find_decoding_packet() - search through buffered decoding data to - * find the data needed to decode the coded packet - * @bat_priv: the bat priv with all the mesh interface information - * @ethhdr: pointer to the ethernet header inside the coded packet - * @coded: coded packet we try to find decode data for - * - * Return: pointer to nc packet if the needed data was found or NULL otherwise. 
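As an illustrative aside (standalone C, hypothetical types, not part of this patch): the lookup performed by batadv_nc_find_decoding_packet() works because each coded packet carries the CRC32 of both original payloads — the receiver selects the CRC of the half it should already hold in its decoding buffer (the half destined to the other node) and scans the buffered packets of the matching path for that id. A minimal sketch of the final lookup step:

#include <stddef.h>
#include <stdint.h>

struct buffered_pkt {
	struct buffered_pkt *next;
	uint32_t packet_id;	/* crc32 of the buffered payload */
};

static struct buffered_pkt *find_decode_data(struct buffered_pkt *list,
					      uint32_t wanted_id)
{
	struct buffered_pkt *p;

	for (p = list; p; p = p->next)
		if (p->packet_id == wanted_id)
			return p;	/* XOR-ing this copy in recovers our packet */
	return NULL;
}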
- */ -static struct batadv_nc_packet * -batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, - struct ethhdr *ethhdr, - struct batadv_coded_packet *coded) -{ - struct batadv_hashtable *hash = bat_priv->nc.decoding_hash; - struct batadv_nc_packet *tmp_nc_packet, *nc_packet = NULL; - struct batadv_nc_path *nc_path, nc_path_key; - u8 *dest, *source; - __be32 packet_id; - int index; - - if (!hash) - return NULL; - - /* Select the correct packet id based on the location of our mac-addr */ - dest = ethhdr->h_source; - if (!batadv_is_my_mac(bat_priv, coded->second_dest)) { - source = coded->second_source; - packet_id = coded->second_crc; - } else { - source = coded->first_source; - packet_id = coded->first_crc; - } - - batadv_nc_hash_key_gen(&nc_path_key, source, dest); - index = batadv_nc_hash_choose(&nc_path_key, hash->size); - - /* Search for matching coding path */ - rcu_read_lock(); - hlist_for_each_entry_rcu(nc_path, &hash->table[index], hash_entry) { - /* Find matching nc_packet */ - spin_lock_bh(&nc_path->packet_list_lock); - list_for_each_entry(tmp_nc_packet, - &nc_path->packet_list, list) { - if (packet_id == tmp_nc_packet->packet_id) { - list_del(&tmp_nc_packet->list); - - nc_packet = tmp_nc_packet; - break; - } - } - spin_unlock_bh(&nc_path->packet_list_lock); - - if (nc_packet) - break; - } - rcu_read_unlock(); - - if (!nc_packet) - batadv_dbg(BATADV_DBG_NC, bat_priv, - "No decoding packet found for %u\n", packet_id); - - return nc_packet; -} - -/** - * batadv_nc_recv_coded_packet() - try to decode coded packet and enqueue the - * resulting unicast packet - * @skb: incoming coded packet - * @recv_if: pointer to interface this packet was received on - * - * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP - * otherwise. 
- */ -static int batadv_nc_recv_coded_packet(struct sk_buff *skb, - struct batadv_hard_iface *recv_if) -{ - struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface); - struct batadv_unicast_packet *unicast_packet; - struct batadv_coded_packet *coded_packet; - struct batadv_nc_packet *nc_packet; - struct ethhdr *ethhdr; - int hdr_size = sizeof(*coded_packet); - - /* Check if network coding is enabled */ - if (!atomic_read(&bat_priv->network_coding)) - goto free_skb; - - /* Make sure we can access (and remove) header */ - if (unlikely(!pskb_may_pull(skb, hdr_size))) - goto free_skb; - - coded_packet = (struct batadv_coded_packet *)skb->data; - ethhdr = eth_hdr(skb); - - /* Verify frame is destined for us */ - if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest) && - !batadv_is_my_mac(bat_priv, coded_packet->second_dest)) - goto free_skb; - - /* Update stat counter */ - if (batadv_is_my_mac(bat_priv, coded_packet->second_dest)) - batadv_inc_counter(bat_priv, BATADV_CNT_NC_SNIFFED); - - nc_packet = batadv_nc_find_decoding_packet(bat_priv, ethhdr, - coded_packet); - if (!nc_packet) { - batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED); - goto free_skb; - } - - /* Make skb's linear, because decoding accesses the entire buffer */ - if (skb_linearize(skb) < 0) - goto free_nc_packet; - - if (skb_linearize(nc_packet->skb) < 0) - goto free_nc_packet; - - /* Decode the packet */ - unicast_packet = batadv_nc_skb_decode_packet(bat_priv, skb, nc_packet); - if (!unicast_packet) { - batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED); - goto free_nc_packet; - } - - /* Mark packet as decoded to do correct recoding when forwarding */ - BATADV_SKB_CB(skb)->decoded = true; - batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE); - batadv_add_counter(bat_priv, BATADV_CNT_NC_DECODE_BYTES, - skb->len + ETH_HLEN); - return batadv_recv_unicast_packet(skb, recv_if); - -free_nc_packet: - batadv_nc_packet_free(nc_packet, true); -free_skb: - kfree_skb(skb); - - return NET_RX_DROP; -} - -/** - * batadv_nc_mesh_free() - clean up network coding memory - * @bat_priv: the bat priv with all the mesh interface information - */ -void batadv_nc_mesh_free(struct batadv_priv *bat_priv) -{ - batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1); - batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_NC, 1); - cancel_delayed_work_sync(&bat_priv->nc.work); - - batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, NULL); - batadv_hash_destroy(bat_priv->nc.coding_hash); - batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, NULL); - batadv_hash_destroy(bat_priv->nc.decoding_hash); -} diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h deleted file mode 100644 index 368cc3130e4c..000000000000 --- a/net/batman-adv/network-coding.h +++ /dev/null @@ -1,106 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) B.A.T.M.A.N. 
contributors: - * - * Martin Hundebøll, Jeppe Ledet-Pedersen - */ - -#ifndef _NET_BATMAN_ADV_NETWORK_CODING_H_ -#define _NET_BATMAN_ADV_NETWORK_CODING_H_ - -#include "main.h" - -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/types.h> -#include <uapi/linux/batadv_packet.h> - -#ifdef CONFIG_BATMAN_ADV_NC - -void batadv_nc_status_update(struct net_device *net_dev); -int batadv_nc_init(void); -int batadv_nc_mesh_init(struct batadv_priv *bat_priv); -void batadv_nc_mesh_free(struct batadv_priv *bat_priv); -void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - struct batadv_ogm_packet *ogm_packet, - int is_single_hop_neigh); -void batadv_nc_purge_orig(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - bool (*to_purge)(struct batadv_priv *, - struct batadv_nc_node *)); -void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv); -void batadv_nc_init_orig(struct batadv_orig_node *orig_node); -bool batadv_nc_skb_forward(struct sk_buff *skb, - struct batadv_neigh_node *neigh_node); -void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, - struct sk_buff *skb); -void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, - struct sk_buff *skb); - -#else /* ifdef CONFIG_BATMAN_ADV_NC */ - -static inline void batadv_nc_status_update(struct net_device *net_dev) -{ -} - -static inline int batadv_nc_init(void) -{ - return 0; -} - -static inline int batadv_nc_mesh_init(struct batadv_priv *bat_priv) -{ - return 0; -} - -static inline void batadv_nc_mesh_free(struct batadv_priv *bat_priv) -{ -} - -static inline void -batadv_nc_update_nc_node(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - struct batadv_ogm_packet *ogm_packet, - int is_single_hop_neigh) -{ -} - -static inline void -batadv_nc_purge_orig(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - bool (*to_purge)(struct batadv_priv *, - struct batadv_nc_node *)) -{ -} - -static inline void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) -{ -} - -static inline void batadv_nc_init_orig(struct batadv_orig_node *orig_node) -{ -} - -static inline bool batadv_nc_skb_forward(struct sk_buff *skb, - struct batadv_neigh_node *neigh_node) -{ - return false; -} - -static inline void -batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, - struct sk_buff *skb) -{ -} - -static inline void -batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, - struct sk_buff *skb) -{ -} - -#endif /* ifdef CONFIG_BATMAN_ADV_NC */ - -#endif /* _NET_BATMAN_ADV_NETWORK_CODING_H_ */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index a464ff96b929..a662408ad867 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -37,7 +37,6 @@ #include "log.h" #include "multicast.h" #include "netlink.h" -#include "network-coding.h" #include "routing.h" #include "translation-table.h" @@ -764,11 +763,16 @@ int batadv_hardif_neigh_dump(struct sk_buff *msg, struct netlink_callback *cb) bat_priv = netdev_priv(mesh_iface); primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + if (!primary_if) { ret = -ENOENT; goto out_put_mesh_iface; } + if (primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out_put_primary_if; + } + hard_iface = batadv_netlink_get_hardif(bat_priv, cb); if (IS_ERR(hard_iface) && 
PTR_ERR(hard_iface) != -ENONET) { ret = PTR_ERR(hard_iface); @@ -883,9 +887,6 @@ void batadv_orig_node_release(struct kref *ref) } spin_unlock_bh(&orig_node->vlan_list_lock); - /* Free nc_nodes */ - batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL); - call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); } @@ -959,8 +960,6 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, spin_lock_init(&orig_node->tt_lock); spin_lock_init(&orig_node->vlan_list_lock); - batadv_nc_init_orig(orig_node); - /* extra reference for return */ kref_init(&orig_node->refcount); @@ -1333,11 +1332,16 @@ int batadv_orig_dump(struct sk_buff *msg, struct netlink_callback *cb) bat_priv = netdev_priv(mesh_iface); primary_if = batadv_primary_if_get_selected(bat_priv); - if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { + if (!primary_if) { ret = -ENOENT; goto out_put_mesh_iface; } + if (primary_if->if_status != BATADV_IF_ACTIVE) { + ret = -ENOENT; + goto out_put_primary_if; + } + hard_iface = batadv_netlink_get_hardif(bat_priv, cb); if (IS_ERR(hard_iface) && PTR_ERR(hard_iface) != -ENONET) { ret = PTR_ERR(hard_iface); diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 35d8c5783999..12c16f81cc51 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -31,7 +31,6 @@ #include "hard-interface.h" #include "log.h" #include "mesh-interface.h" -#include "network-coding.h" #include "originator.h" #include "send.h" #include "tp_meter.h" @@ -956,15 +955,9 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, /* function returns -EREMOTE for promiscuous packets */ check = batadv_check_unicast_packet(bat_priv, skb, hdr_size); - - /* Even though the packet is not for us, we might save it to use for - * decoding a later received coded packet - */ - if (check == -EREMOTE) - batadv_nc_skb_store_sniffed_unicast(bat_priv, skb); - if (check < 0) goto free_skb; + if (!batadv_check_unicast_ttvn(bat_priv, skb, hdr_size)) goto free_skb; diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 95849ba004e7..20d85c681064 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -34,7 +34,6 @@ #include "hard-interface.h" #include "log.h" #include "mesh-interface.h" -#include "network-coding.h" #include "originator.h" #include "routing.h" #include "translation-table.h" @@ -63,12 +62,9 @@ int batadv_send_skb_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, const u8 *dst_addr) { - struct batadv_priv *bat_priv; struct ethhdr *ethhdr; int ret; - bat_priv = netdev_priv(hard_iface->mesh_iface); - if (hard_iface->if_status != BATADV_IF_ACTIVE) goto send_skb_err; @@ -97,9 +93,6 @@ int batadv_send_skb_packet(struct sk_buff *skb, skb->dev = hard_iface->net_dev; - /* Save a clone of the skb to use when decoding coded packets */ - batadv_nc_skb_store_for_decoding(bat_priv, skb); - /* dev_queue_xmit() returns a negative result on error. However on * congestion and traffic shaping, it drops and returns NET_XMIT_DROP * (which is > 0). This will not be treated as an error. @@ -202,14 +195,7 @@ int batadv_send_skb_to_orig(struct sk_buff *skb, goto put_neigh_node; } - /* try to network code the packet, if it is received on an interface - * (i.e. being forwarded). If the packet originates from this node or if - * network coding fails, then send the packet as usual. 
- */ - if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) - ret = -EINPROGRESS; - else - ret = batadv_send_unicast_skb(skb, neigh_node); + ret = batadv_send_unicast_skb(skb, neigh_node); /* skb was consumed */ skb = NULL; diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 8d0e04e770cb..6e95e883c2bf 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -212,7 +212,7 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, /** * batadv_tt_local_entry_release() - release tt_local_entry from lists and queue * for free after rcu grace period - * @ref: kref pointer of the nc_node + * @ref: kref pointer of the batadv_tt_local_entry */ static void batadv_tt_local_entry_release(struct kref *ref) { @@ -244,7 +244,7 @@ batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) /** * batadv_tt_global_entry_release() - release tt_global_entry from lists and * queue for free after rcu grace period - * @ref: kref pointer of the nc_node + * @ref: kref pointer of the batadv_tt_global_entry */ void batadv_tt_global_entry_release(struct kref *ref) { diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 0ca0fc072fc9..8fc5fe0e9b05 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -505,20 +505,6 @@ struct batadv_orig_node { /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; -#ifdef CONFIG_BATMAN_ADV_NC - /** @in_coding_list: list of nodes this orig can hear */ - struct list_head in_coding_list; - - /** @out_coding_list: list of nodes that can hear this orig */ - struct list_head out_coding_list; - - /** @in_coding_list_lock: protects in_coding_list */ - spinlock_t in_coding_list_lock; - - /** @out_coding_list_lock: protects out_coding_list */ - spinlock_t out_coding_list_lock; -#endif - /** @fragments: array with heads for fragment chains */ struct batadv_frag_table_entry fragments[BATADV_FRAG_BUFFER_COUNT]; @@ -545,9 +531,6 @@ enum batadv_orig_capabilities { */ BATADV_ORIG_CAPA_HAS_DAT, - /** @BATADV_ORIG_CAPA_HAS_NC: orig node has network coding enabled */ - BATADV_ORIG_CAPA_HAS_NC, - /** @BATADV_ORIG_CAPA_HAS_TT: orig node has tt capability */ BATADV_ORIG_CAPA_HAS_TT, @@ -751,7 +734,7 @@ struct batadv_bcast_duplist_entry { u8 orig[ETH_ALEN]; /** @crc: crc32 checksum of broadcast payload */ - __be32 crc; + u32 crc; /** @entrytime: time when the broadcast packet was received */ unsigned long entrytime; @@ -953,60 +936,6 @@ enum batadv_counters { BATADV_CNT_DAT_CACHED_REPLY_TX, #endif -#ifdef CONFIG_BATMAN_ADV_NC - /** - * @BATADV_CNT_NC_CODE: transmitted nc-combined traffic packet counter - */ - BATADV_CNT_NC_CODE, - - /** - * @BATADV_CNT_NC_CODE_BYTES: transmitted nc-combined traffic bytes - * counter - */ - BATADV_CNT_NC_CODE_BYTES, - - /** - * @BATADV_CNT_NC_RECODE: transmitted nc-recombined traffic packet - * counter - */ - BATADV_CNT_NC_RECODE, - - /** - * @BATADV_CNT_NC_RECODE_BYTES: transmitted nc-recombined traffic bytes - * counter - */ - BATADV_CNT_NC_RECODE_BYTES, - - /** - * @BATADV_CNT_NC_BUFFER: counter for packets buffered for later nc - * decoding - */ - BATADV_CNT_NC_BUFFER, - - /** - * @BATADV_CNT_NC_DECODE: received and nc-decoded traffic packet counter - */ - BATADV_CNT_NC_DECODE, - - /** - * @BATADV_CNT_NC_DECODE_BYTES: received and nc-decoded traffic bytes - * counter - */ - BATADV_CNT_NC_DECODE_BYTES, - - /** - * @BATADV_CNT_NC_DECODE_FAILED: received and decode-failed traffic - * packet counter - */ - 
BATADV_CNT_NC_DECODE_FAILED, - - /** - * @BATADV_CNT_NC_SNIFFED: counter for nc-decoded packets received in - * promisc mode. - */ - BATADV_CNT_NC_SNIFFED, -#endif - /** @BATADV_CNT_NUM: number of traffic counters */ BATADV_CNT_NUM, }; @@ -1340,56 +1269,6 @@ struct batadv_priv_mcast { #endif /** - * struct batadv_priv_nc - per mesh interface network coding private data - */ -struct batadv_priv_nc { - /** @work: work queue callback item for cleanup */ - struct delayed_work work; - - /** - * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq - */ - u8 min_tq; - - /** - * @max_fwd_delay: maximum packet forward delay to allow coding of - * packets - */ - u32 max_fwd_delay; - - /** - * @max_buffer_time: buffer time for sniffed packets used to decoding - */ - u32 max_buffer_time; - - /** - * @timestamp_fwd_flush: timestamp of last forward packet queue flush - */ - unsigned long timestamp_fwd_flush; - - /** - * @timestamp_sniffed_purge: timestamp of last sniffed packet queue - * purge - */ - unsigned long timestamp_sniffed_purge; - - /** - * @coding_hash: Hash table used to buffer skbs while waiting for - * another incoming skb to code it with. Skbs are added to the buffer - * just before being forwarded in routing.c - */ - struct batadv_hashtable *coding_hash; - - /** - * @decoding_hash: Hash table used to buffer skbs that might be needed - * to decode a received coded skb. The buffer is used for 1) skbs - * arriving on the mesh-interface; 2) skbs overheard on the - * hard-interface; and 3) skbs forwarded by batman-adv. - */ - struct batadv_hashtable *decoding_hash; -}; - -/** * struct batadv_tp_unacked - unacked packet meta-information * * This struct is supposed to represent a buffer unacked packet. However, since @@ -1775,16 +1654,6 @@ struct batadv_priv { struct batadv_priv_mcast mcast; #endif -#ifdef CONFIG_BATMAN_ADV_NC - /** - * @network_coding: bool indicating whether network coding is enabled - */ - atomic_t network_coding; - - /** @nc: network coding data */ - struct batadv_priv_nc nc; -#endif /* CONFIG_BATMAN_ADV_NC */ - #ifdef CONFIG_BATMAN_ADV_BATMAN_V /** @bat_v: B.A.T.M.A.N. 
V per mesh-interface private data */ struct batadv_priv_bat_v bat_v; @@ -2017,95 +1886,10 @@ struct batadv_tt_roam_node { }; /** - * struct batadv_nc_node - network coding node - */ -struct batadv_nc_node { - /** @list: next and prev pointer for the list handling */ - struct list_head list; - - /** @addr: the node's mac address */ - u8 addr[ETH_ALEN]; - - /** @refcount: number of contexts the object is used by */ - struct kref refcount; - - /** @rcu: struct used for freeing in an RCU-safe manner */ - struct rcu_head rcu; - - /** @orig_node: pointer to corresponding orig node struct */ - struct batadv_orig_node *orig_node; - - /** @last_seen: timestamp of last ogm received from this node */ - unsigned long last_seen; -}; - -/** - * struct batadv_nc_path - network coding path - */ -struct batadv_nc_path { - /** @hash_entry: next and prev pointer for the list handling */ - struct hlist_node hash_entry; - - /** @rcu: struct used for freeing in an RCU-safe manner */ - struct rcu_head rcu; - - /** @refcount: number of contexts the object is used by */ - struct kref refcount; - - /** @packet_list: list of buffered packets for this path */ - struct list_head packet_list; - - /** @packet_list_lock: access lock for packet list */ - spinlock_t packet_list_lock; - - /** @next_hop: next hop (destination) of path */ - u8 next_hop[ETH_ALEN]; - - /** @prev_hop: previous hop (source) of path */ - u8 prev_hop[ETH_ALEN]; - - /** @last_valid: timestamp for last validation of path */ - unsigned long last_valid; -}; - -/** - * struct batadv_nc_packet - network coding packet used when coding and - * decoding packets - */ -struct batadv_nc_packet { - /** @list: next and prev pointer for the list handling */ - struct list_head list; - - /** @packet_id: crc32 checksum of skb data */ - __be32 packet_id; - - /** - * @timestamp: field containing the info when the packet was added to - * path - */ - unsigned long timestamp; - - /** @neigh_node: pointer to original next hop neighbor of skb */ - struct batadv_neigh_node *neigh_node; - - /** @skb: skb which can be encoded or used for decoding */ - struct sk_buff *skb; - - /** @nc_path: pointer to path this nc packet is attached to */ - struct batadv_nc_path *nc_path; -}; - -/** * struct batadv_skb_cb - control buffer structure used to store private data * relevant to batman-adv in the skb->cb buffer in skbs. 
*/ struct batadv_skb_cb { - /** - * @decoded: Marks a skb as decoded, which is checked when searching for - * coding opportunities in network-coding.c - */ - unsigned char decoded:1; - /** @num_bcasts: Counter for broadcast packet retransmissions */ unsigned char num_bcasts; }; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index f0c862091bff..2c21ae8abadc 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -53,6 +53,11 @@ static bool enable_6lowpan; static struct l2cap_chan *listen_chan; static DEFINE_MUTEX(set_lock); +enum { + LOWPAN_PEER_CLOSING, + LOWPAN_PEER_MAXBITS +}; + struct lowpan_peer { struct list_head list; struct rcu_head rcu; @@ -61,6 +66,8 @@ struct lowpan_peer { /* peer addresses in various formats */ unsigned char lladdr[ETH_ALEN]; struct in6_addr peer_addr; + + DECLARE_BITMAP(flags, LOWPAN_PEER_MAXBITS); }; struct lowpan_btle_dev { @@ -289,6 +296,7 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, local_skb->pkt_type = PACKET_HOST; local_skb->dev = dev; + skb_reset_mac_header(local_skb); skb_set_transport_header(local_skb, sizeof(struct ipv6hdr)); if (give_skb_to_upper(local_skb, dev) != NET_RX_SUCCESS) { @@ -919,7 +927,9 @@ static int bt_6lowpan_disconnect(struct l2cap_conn *conn, u8 dst_type) BT_DBG("peer %p chan %p", peer, peer->chan); + l2cap_chan_lock(peer->chan); l2cap_chan_close(peer->chan, ENOENT); + l2cap_chan_unlock(peer->chan); return 0; } @@ -956,10 +966,11 @@ static struct l2cap_chan *bt_6lowpan_listen(void) } static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, - struct l2cap_conn **conn) + struct l2cap_conn **conn, bool disconnect) { struct hci_conn *hcon; struct hci_dev *hdev; + int le_addr_type; int n; n = sscanf(buf, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx %hhu", @@ -970,13 +981,32 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, if (n < 7) return -EINVAL; + if (disconnect) { + /* The "disconnect" debugfs command has used different address + * type constants than "connect" since 2015. Let's retain that + * for now even though it's obviously buggy... + */ + *addr_type += 1; + } + + switch (*addr_type) { + case BDADDR_LE_PUBLIC: + le_addr_type = ADDR_LE_DEV_PUBLIC; + break; + case BDADDR_LE_RANDOM: + le_addr_type = ADDR_LE_DEV_RANDOM; + break; + default: + return -EINVAL; + } + /* The LE_PUBLIC address type is ignored because of BDADDR_ANY */ hdev = hci_get_route(addr, BDADDR_ANY, BDADDR_LE_PUBLIC); if (!hdev) return -ENOENT; hci_dev_lock(hdev); - hcon = hci_conn_hash_lookup_le(hdev, addr, *addr_type); + hcon = hci_conn_hash_lookup_le(hdev, addr, le_addr_type); hci_dev_unlock(hdev); hci_dev_put(hdev); @@ -993,41 +1023,52 @@ static int get_l2cap_conn(char *buf, bdaddr_t *addr, u8 *addr_type, static void disconnect_all_peers(void) { struct lowpan_btle_dev *entry; - struct lowpan_peer *peer, *tmp_peer, *new_peer; - struct list_head peers; - - INIT_LIST_HEAD(&peers); + struct lowpan_peer *peer; + int nchans; - /* We make a separate list of peers as the close_cb() will - * modify the device peers list so it is better not to mess - * with the same list at the same time. + /* l2cap_chan_close() cannot be called from RCU, and lock ordering + * chan->lock > devices_lock prevents taking write side lock, so copy + * then close. 
*/ rcu_read_lock(); + list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) + list_for_each_entry_rcu(peer, &entry->peers, list) + clear_bit(LOWPAN_PEER_CLOSING, peer->flags); + rcu_read_unlock(); - list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { - list_for_each_entry_rcu(peer, &entry->peers, list) { - new_peer = kmalloc(sizeof(*new_peer), GFP_ATOMIC); - if (!new_peer) - break; + do { + struct l2cap_chan *chans[32]; + int i; - new_peer->chan = peer->chan; - INIT_LIST_HEAD(&new_peer->list); + nchans = 0; - list_add(&new_peer->list, &peers); - } - } + spin_lock(&devices_lock); - rcu_read_unlock(); + list_for_each_entry_rcu(entry, &bt_6lowpan_devices, list) { + list_for_each_entry_rcu(peer, &entry->peers, list) { + if (test_and_set_bit(LOWPAN_PEER_CLOSING, + peer->flags)) + continue; - spin_lock(&devices_lock); - list_for_each_entry_safe(peer, tmp_peer, &peers, list) { - l2cap_chan_close(peer->chan, ENOENT); + l2cap_chan_hold(peer->chan); + chans[nchans++] = peer->chan; - list_del_rcu(&peer->list); - kfree_rcu(peer, rcu); - } - spin_unlock(&devices_lock); + if (nchans >= ARRAY_SIZE(chans)) + goto done; + } + } + +done: + spin_unlock(&devices_lock); + + for (i = 0; i < nchans; ++i) { + l2cap_chan_lock(chans[i]); + l2cap_chan_close(chans[i], ENOENT); + l2cap_chan_unlock(chans[i]); + l2cap_chan_put(chans[i]); + } + } while (nchans); } struct set_enable { @@ -1050,7 +1091,9 @@ static void do_enable_set(struct work_struct *work) mutex_lock(&set_lock); if (listen_chan) { + l2cap_chan_lock(listen_chan); l2cap_chan_close(listen_chan, 0); + l2cap_chan_unlock(listen_chan); l2cap_chan_put(listen_chan); } @@ -1103,13 +1146,15 @@ static ssize_t lowpan_control_write(struct file *fp, buf[buf_size] = '\0'; if (memcmp(buf, "connect ", 8) == 0) { - ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn); + ret = get_l2cap_conn(&buf[8], &addr, &addr_type, &conn, false); if (ret == -EINVAL) return ret; mutex_lock(&set_lock); if (listen_chan) { + l2cap_chan_lock(listen_chan); l2cap_chan_close(listen_chan, 0); + l2cap_chan_unlock(listen_chan); l2cap_chan_put(listen_chan); listen_chan = NULL; } @@ -1140,7 +1185,7 @@ static ssize_t lowpan_control_write(struct file *fp, } if (memcmp(buf, "disconnect ", 11) == 0) { - ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn); + ret = get_l2cap_conn(&buf[11], &addr, &addr_type, &conn, true); if (ret < 0) return ret; @@ -1271,7 +1316,9 @@ static void __exit bt_6lowpan_exit(void) debugfs_remove(lowpan_control_debugfs); if (listen_chan) { + l2cap_chan_lock(listen_chan); l2cap_chan_close(listen_chan, 0); + l2cap_chan_unlock(listen_chan); l2cap_chan_put(listen_chan); } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 7d1e79f69cd1..c3f7828bf9d5 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -149,8 +149,6 @@ static void hci_conn_cleanup(struct hci_conn *conn) hci_chan_list_flush(conn); - hci_conn_hash_del(hdev, conn); - if (HCI_CONN_HANDLE_UNSET(conn->handle)) ida_free(&hdev->unset_handle_ida, conn->handle); @@ -339,7 +337,8 @@ static int hci_enhanced_setup_sync(struct hci_dev *hdev, void *data) case BT_CODEC_TRANSPARENT: if (!find_next_esco_param(conn, esco_param_msbc, ARRAY_SIZE(esco_param_msbc))) - return false; + return -EINVAL; + param = &esco_param_msbc[conn->attempt - 1]; cp.tx_coding_format.id = 0x03; cp.rx_coding_format.id = 0x03; @@ -770,21 +769,23 @@ static void find_bis(struct hci_conn *conn, void *data) d->count++; } -static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *conn) 
+static int hci_le_big_terminate(struct hci_dev *hdev, struct hci_conn *conn) { struct iso_list_data *d; int ret; - bt_dev_dbg(hdev, "big 0x%2.2x sync_handle 0x%4.4x", big, conn->sync_handle); + bt_dev_dbg(hdev, "hcon %p big 0x%2.2x sync_handle 0x%4.4x", conn, + conn->iso_qos.bcast.big, conn->sync_handle); d = kzalloc(sizeof(*d), GFP_KERNEL); if (!d) return -ENOMEM; - d->big = big; + d->big = conn->iso_qos.bcast.big; d->sync_handle = conn->sync_handle; - if (test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) { + if (conn->type == PA_LINK && + test_and_clear_bit(HCI_CONN_PA_SYNC, &conn->flags)) { hci_conn_hash_list_flag(hdev, find_bis, PA_LINK, HCI_CONN_PA_SYNC, d); @@ -802,6 +803,9 @@ static int hci_le_big_terminate(struct hci_dev *hdev, u8 big, struct hci_conn *c d->big_sync_term = true; } + if (!d->pa_sync_term && !d->big_sync_term) + return 0; + ret = hci_cmd_sync_queue(hdev, big_terminate_sync, d, terminate_big_destroy); if (ret) @@ -830,14 +834,30 @@ static void bis_cleanup(struct hci_conn *conn) /* Check if ISO connection is a BIS and terminate advertising * set and BIG if there are no other connections using it. */ - bis = hci_conn_hash_lookup_big(hdev, conn->iso_qos.bcast.big); + bis = hci_conn_hash_lookup_big_state(hdev, + conn->iso_qos.bcast.big, + BT_CONNECTED, + HCI_ROLE_MASTER); + if (bis) + return; + + bis = hci_conn_hash_lookup_big_state(hdev, + conn->iso_qos.bcast.big, + BT_CONNECT, + HCI_ROLE_MASTER); + if (bis) + return; + + bis = hci_conn_hash_lookup_big_state(hdev, + conn->iso_qos.bcast.big, + BT_OPEN, + HCI_ROLE_MASTER); if (bis) return; hci_le_terminate_big(hdev, conn); } else { - hci_le_big_terminate(hdev, conn->iso_qos.bcast.big, - conn); + hci_le_big_terminate(hdev, conn); } } @@ -902,10 +922,12 @@ static int hci_conn_hash_alloc_unset(struct hci_dev *hdev) U16_MAX, GFP_ATOMIC); } -static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, +static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, + bdaddr_t *dst, u8 dst_type, u8 role, u16 handle) { struct hci_conn *conn; + struct smp_irk *irk = NULL; switch (type) { case ACL_LINK: @@ -915,15 +937,16 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t case CIS_LINK: case BIS_LINK: case PA_LINK: - if (hdev->iso_mtu) - /* Dedicated ISO Buffer exists */ - break; - fallthrough; + if (!hdev->iso_mtu) + return ERR_PTR(-ECONNREFUSED); + irk = hci_get_irk(hdev, dst, dst_type); + break; case LE_LINK: if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU) return ERR_PTR(-ECONNREFUSED); if (!hdev->le_mtu && hdev->acl_mtu < HCI_MIN_LE_MTU) return ERR_PTR(-ECONNREFUSED); + irk = hci_get_irk(hdev, dst, dst_type); break; case SCO_LINK: case ESCO_LINK: @@ -941,7 +964,15 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t if (!conn) return ERR_PTR(-ENOMEM); - bacpy(&conn->dst, dst); + /* If an IRK exists, use its identity address */ + if (!irk) { + bacpy(&conn->dst, dst); + conn->dst_type = dst_type; + } else { + bacpy(&conn->dst, &irk->bdaddr); + conn->dst_type = irk->addr_type; + } + bacpy(&conn->src, &hdev->bdaddr); conn->handle = handle; conn->hdev = hdev; @@ -979,19 +1010,20 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t conn->mtu = hdev->le_mtu ?
hdev->le_mtu : hdev->acl_mtu; break; case CIS_LINK: - case BIS_LINK: - case PA_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); - /* set proper cleanup function */ - if (!bacmp(dst, BDADDR_ANY)) - conn->cleanup = bis_cleanup; - else if (conn->role == HCI_ROLE_MASTER) + if (conn->role == HCI_ROLE_MASTER) conn->cleanup = cis_cleanup; - conn->mtu = hdev->iso_mtu ? hdev->iso_mtu : - hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu; + conn->mtu = hdev->iso_mtu; + break; + case PA_LINK: + case BIS_LINK: + /* conn->src should reflect the local identity address */ + hci_copy_identity_address(hdev, &conn->src, &conn->src_type); + conn->cleanup = bis_cleanup; + conn->mtu = hdev->iso_mtu; break; case SCO_LINK: if (lmp_esco_capable(hdev)) @@ -1039,7 +1071,7 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t } struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type, - bdaddr_t *dst, u8 role) + bdaddr_t *dst, u8 dst_type, u8 role) { int handle; @@ -1049,16 +1081,16 @@ struct hci_conn *hci_conn_add_unset(struct hci_dev *hdev, int type, if (unlikely(handle < 0)) return ERR_PTR(-ECONNREFUSED); - return __hci_conn_add(hdev, type, dst, role, handle); + return __hci_conn_add(hdev, type, dst, dst_type, role, handle); } struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst, - u8 role, u16 handle) + u8 dst_type, u8 role, u16 handle) { if (handle > HCI_CONN_HANDLE_MAX) return ERR_PTR(-EINVAL); - return __hci_conn_add(hdev, type, dst, role, handle); + return __hci_conn_add(hdev, type, dst, dst_type, role, handle); } static void hci_conn_cleanup_child(struct hci_conn *conn, u8 reason) @@ -1141,28 +1173,54 @@ void hci_conn_del(struct hci_conn *conn) disable_delayed_work_sync(&conn->auto_accept_work); disable_delayed_work_sync(&conn->idle_work); - if (conn->type == ACL_LINK) { - /* Unacked frames */ - hdev->acl_cnt += conn->sent; - } else if (conn->type == LE_LINK) { - cancel_delayed_work(&conn->le_conn_timeout); + /* Remove the connection from the list so unacked logic can detect when + * a certain pool is not being utilized. + */ + hci_conn_hash_del(hdev, conn); - if (hdev->le_pkts) - hdev->le_cnt += conn->sent; + /* Handle unacked frames: + * + * - In case there are no connections, or if restoring the buffers + * considered in transit would overflow, restore all buffers to the + * pool.
+ * - Otherwise restore just the buffers considered in transit for the + * hci_conn + */ + switch (conn->type) { + case ACL_LINK: + if (!hci_conn_num(hdev, ACL_LINK) || + hdev->acl_cnt + conn->sent > hdev->acl_pkts) + hdev->acl_cnt = hdev->acl_pkts; else hdev->acl_cnt += conn->sent; - } else { - /* Unacked ISO frames */ - if (conn->type == CIS_LINK || - conn->type == BIS_LINK || - conn->type == PA_LINK) { - if (hdev->iso_pkts) - hdev->iso_cnt += conn->sent; - else if (hdev->le_pkts) + break; + case LE_LINK: + cancel_delayed_work(&conn->le_conn_timeout); + + if (hdev->le_pkts) { + if (!hci_conn_num(hdev, LE_LINK) || + hdev->le_cnt + conn->sent > hdev->le_pkts) + hdev->le_cnt = hdev->le_pkts; + else hdev->le_cnt += conn->sent; + } else { + if ((!hci_conn_num(hdev, LE_LINK) && + !hci_conn_num(hdev, ACL_LINK)) || + hdev->acl_cnt + conn->sent > hdev->acl_pkts) + hdev->acl_cnt = hdev->acl_pkts; else hdev->acl_cnt += conn->sent; } + break; + case CIS_LINK: + case BIS_LINK: + case PA_LINK: + if (!hci_iso_count(hdev) || + hdev->iso_cnt + conn->sent > hdev->iso_pkts) + hdev->iso_cnt = hdev->iso_pkts; + else + hdev->iso_cnt += conn->sent; + break; } skb_queue_purge(&conn->data_q); @@ -1364,14 +1422,13 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst, if (conn) { bacpy(&conn->dst, dst); } else { - conn = hci_conn_add_unset(hdev, LE_LINK, dst, role); + conn = hci_conn_add_unset(hdev, LE_LINK, dst, dst_type, role); if (IS_ERR(conn)) return conn; hci_conn_hold(conn); conn->pending_sec_level = sec_level; } - conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; conn->conn_timeout = conn_timeout; conn->le_adv_phy = phy; @@ -1505,7 +1562,7 @@ static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos) /* This function requires the caller holds hdev->lock */ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) + __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; int err; @@ -1541,12 +1598,13 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, memcmp(conn->le_per_adv_data, base, base_len))) return ERR_PTR(-EADDRINUSE); - conn = hci_conn_add_unset(hdev, BIS_LINK, dst, HCI_ROLE_MASTER); + conn = hci_conn_add_unset(hdev, BIS_LINK, dst, 0, HCI_ROLE_MASTER); if (IS_ERR(conn)) return conn; conn->state = BT_CONNECT; conn->sid = sid; + conn->conn_timeout = timeout; hci_conn_hold(conn); return conn; @@ -1586,7 +1644,8 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, BT_DBG("requesting refresh of dst_addr"); - conn = hci_conn_add_unset(hdev, LE_LINK, dst, HCI_ROLE_MASTER); + conn = hci_conn_add_unset(hdev, LE_LINK, dst, dst_type, + HCI_ROLE_MASTER); if (IS_ERR(conn)) return conn; @@ -1597,7 +1656,6 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst, conn->state = BT_CONNECT; set_bit(HCI_CONN_SCANNING, &conn->flags); - conn->dst_type = dst_type; conn->sec_level = BT_SECURITY_LOW; conn->pending_sec_level = sec_level; conn->conn_timeout = conn_timeout; @@ -1634,7 +1692,8 @@ struct hci_conn *hci_connect_acl(struct hci_dev *hdev, bdaddr_t *dst, acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst); if (!acl) { - acl = hci_conn_add_unset(hdev, ACL_LINK, dst, HCI_ROLE_MASTER); + acl = hci_conn_add_unset(hdev, ACL_LINK, dst, 0, + HCI_ROLE_MASTER); if (IS_ERR(acl)) return acl; } @@ -1703,7 +1762,7 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, sco = hci_conn_hash_lookup_ba(hdev, type, 
dst); if (!sco) { - sco = hci_conn_add_unset(hdev, type, dst, HCI_ROLE_MASTER); + sco = hci_conn_add_unset(hdev, type, dst, 0, HCI_ROLE_MASTER); if (IS_ERR(sco)) { hci_conn_drop(acl); return sco; @@ -1887,14 +1946,15 @@ done: } struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos) + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout) { struct hci_conn *cis; cis = hci_conn_hash_lookup_cis(hdev, dst, dst_type, qos->ucast.cig, qos->ucast.cis); if (!cis) { - cis = hci_conn_add_unset(hdev, CIS_LINK, dst, + cis = hci_conn_add_unset(hdev, CIS_LINK, dst, dst_type, HCI_ROLE_MASTER); if (IS_ERR(cis)) return cis; @@ -1902,6 +1962,7 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, cis->dst_type = dst_type; cis->iso_qos.ucast.cig = BT_ISO_QOS_CIG_UNSET; cis->iso_qos.ucast.cis = BT_ISO_QOS_CIS_UNSET; + cis->conn_timeout = timeout; } if (cis->state == BT_CONNECTED) @@ -2084,12 +2145,11 @@ struct hci_conn *hci_pa_create_sync(struct hci_dev *hdev, bdaddr_t *dst, bt_dev_dbg(hdev, "dst %pMR type %d sid %d", dst, dst_type, sid); - conn = hci_conn_add_unset(hdev, PA_LINK, dst, HCI_ROLE_SLAVE); + conn = hci_conn_add_unset(hdev, PA_LINK, dst, dst_type, HCI_ROLE_SLAVE); if (IS_ERR(conn)) return conn; conn->iso_qos = *qos; - conn->dst_type = dst_type; conn->sid = sid; conn->state = BT_LISTEN; conn->conn_timeout = msecs_to_jiffies(qos->bcast.sync_timeout * 10); @@ -2141,7 +2201,7 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err) struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) + __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; struct hci_conn *parent; @@ -2162,7 +2222,7 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, base, base_len); /* We need hci_conn object using the BDADDR_ANY as dst */ - conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir); + conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir, timeout); if (IS_ERR(conn)) return conn; @@ -2196,6 +2256,18 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, return conn; } +int hci_past_bis(struct hci_conn *conn, bdaddr_t *dst, __u8 dst_type) +{ + struct hci_conn *le; + + /* Lookup existing LE connection to rebind to */ + le = hci_conn_hash_lookup_le(conn->hdev, dst, dst_type); + if (!le) + return -EINVAL; + + return hci_past_sync(conn, le); +} + static void bis_mark_per_adv(struct hci_conn *conn, void *data) { struct iso_list_data *d = data; @@ -2215,13 +2287,13 @@ static void bis_mark_per_adv(struct hci_conn *conn, void *data) struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) + __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; int err; struct iso_list_data data; - conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base); + conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base, timeout); if (IS_ERR(conn)) return conn; @@ -2249,7 +2321,7 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, * the start periodic advertising and create BIG commands have * been queued */ - hci_conn_hash_list_state(hdev, bis_mark_per_adv, PA_LINK, + hci_conn_hash_list_state(hdev, bis_mark_per_adv, BIS_LINK, BT_BOUND, &data); /* Queue start periodic advertising and create BIG */ @@ -2264,7 +2336,8 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, } struct hci_conn 
*hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos) + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout) { struct hci_conn *le; struct hci_conn *cis; @@ -2288,7 +2361,7 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, hci_iso_qos_setup(hdev, le, &qos->ucast.in, le->le_rx_phy ? le->le_rx_phy : hdev->le_rx_def_phys); - cis = hci_bind_cis(hdev, dst, dst_type, qos); + cis = hci_bind_cis(hdev, dst, dst_type, qos, timeout); if (IS_ERR(cis)) { hci_conn_drop(le); return cis; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 55e0722fd066..8ccec73dce45 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3267,6 +3267,8 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, spin_unlock_bh(&queue->lock); } + + bt_dev_dbg(hdev, "chan %p queued %d", chan, skb_queue_len(queue)); } void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags) @@ -3298,6 +3300,10 @@ void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb) hci_skb_pkt_type(skb) = HCI_SCODATA_PKT; skb_queue_tail(&conn->data_q, skb); + + bt_dev_dbg(hdev, "hcon %p queued %d", conn, + skb_queue_len(&conn->data_q)); + queue_work(hdev->workqueue, &hdev->tx_work); } @@ -3357,6 +3363,8 @@ static void hci_queue_iso(struct hci_conn *conn, struct sk_buff_head *queue, __skb_queue_tail(queue, skb); } while (list); } + + bt_dev_dbg(hdev, "hcon %p queued %d", conn, skb_queue_len(queue)); } void hci_send_iso(struct hci_conn *conn, struct sk_buff *skb) @@ -3399,8 +3407,7 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) case CIS_LINK: case BIS_LINK: case PA_LINK: - cnt = hdev->iso_mtu ? hdev->iso_cnt : - hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; + cnt = hdev->iso_cnt; break; default: cnt = 0; @@ -3428,6 +3435,10 @@ static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, skb_queue_empty(&c->data_q)) continue; + bt_dev_dbg(hdev, "hcon %p state %s queued %d", c, + state_to_string(c->state), + skb_queue_len(&c->data_q)); + if (c->state != BT_CONNECTED && c->state != BT_CONFIG) continue; @@ -3586,24 +3597,37 @@ static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type) static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type) { - unsigned long last_tx; + unsigned long timeout; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return; switch (type) { + case ACL_LINK: + /* tx timeout must be longer than maximum link supervision + * timeout (40.9 seconds) + */ + timeout = hdev->acl_last_tx + HCI_ACL_TX_TIMEOUT; + break; case LE_LINK: - last_tx = hdev->le_last_tx; + /* tx timeout must be longer than maximum link supervision + * timeout (40.9 seconds) + */ + timeout = hdev->le_last_tx + HCI_ACL_TX_TIMEOUT; break; - default: - last_tx = hdev->acl_last_tx; + case CIS_LINK: + case BIS_LINK: + case PA_LINK: + /* tx timeout must be longer than the maximum transport latency + * (8.388607 seconds) + */ + timeout = hdev->iso_last_tx + HCI_ISO_TX_TIMEOUT; break; + default: + return; } - /* tx timeout must be longer than maximum link supervision timeout - * (40.9 seconds) - */ - if (!cnt && time_after(jiffies, last_tx + HCI_ACL_TX_TIMEOUT)) + if (!cnt && time_after(jiffies, timeout)) hci_link_tx_to(hdev, type); } @@ -3759,12 +3783,16 @@ static void hci_sched_iso(struct hci_dev *hdev, __u8 type) if (!hci_conn_num(hdev, type)) return; - cnt = hdev->iso_pkts ? &hdev->iso_cnt : - hdev->le_pkts ? 
&hdev->le_cnt : &hdev->acl_cnt; + cnt = &hdev->iso_cnt; + + __check_timeout(hdev, *cnt, type); + while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); + hci_send_conn_frame(hdev, conn, skb); + hdev->iso_last_tx = jiffies; conn->sent++; if (conn->sent == ~0) @@ -3804,13 +3832,14 @@ static void hci_tx_work(struct work_struct *work) static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_acl_hdr *hdr; - struct hci_conn *conn; __u16 handle, flags; + int err; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "ACL packet too small"); - goto drop; + kfree_skb(skb); + return; } handle = __le16_to_cpu(hdr->handle); @@ -3822,36 +3851,27 @@ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) hdev->stat.acl_rx++; - hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_handle(hdev, handle); - hci_dev_unlock(hdev); - - if (conn) { - hci_conn_enter_active_mode(conn, BT_POWER_FORCE_ACTIVE_OFF); - - /* Send to upper protocol */ - l2cap_recv_acldata(conn, skb, flags); - return; - } else { + err = l2cap_recv_acldata(hdev, handle, skb, flags); + if (err == -ENOENT) bt_dev_err(hdev, "ACL packet for unknown connection handle %d", handle); - } - -drop: - kfree_skb(skb); + else if (err) + bt_dev_dbg(hdev, "ACL packet recv for handle %d failed: %d", + handle, err); } /* SCO data packet */ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_sco_hdr *hdr; - struct hci_conn *conn; __u16 handle, flags; + int err; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "SCO packet too small"); - goto drop; + kfree_skb(skb); + return; } handle = __le16_to_cpu(hdr->handle); @@ -3863,34 +3883,28 @@ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) hdev->stat.sco_rx++; - hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_handle(hdev, handle); - hci_dev_unlock(hdev); + hci_skb_pkt_status(skb) = flags & 0x03; - if (conn) { - /* Send to upper protocol */ - hci_skb_pkt_status(skb) = flags & 0x03; - sco_recv_scodata(conn, skb); - return; - } else { + err = sco_recv_scodata(hdev, handle, skb); + if (err == -ENOENT) bt_dev_err_ratelimited(hdev, "SCO packet for unknown connection handle %d", handle); - } - -drop: - kfree_skb(skb); + else if (err) + bt_dev_dbg(hdev, "SCO packet recv for handle %d failed: %d", + handle, err); } static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_iso_hdr *hdr; - struct hci_conn *conn; __u16 handle, flags; + int err; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "ISO packet too small"); - goto drop; + kfree_skb(skb); + return; } handle = __le16_to_cpu(hdr->handle); @@ -3900,22 +3914,13 @@ static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb) bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); - hci_dev_lock(hdev); - conn = hci_conn_hash_lookup_handle(hdev, handle); - hci_dev_unlock(hdev); - - if (!conn) { + err = iso_recv(hdev, handle, skb, flags); + if (err == -ENOENT) bt_dev_err(hdev, "ISO packet for unknown connection handle %d", handle); - goto drop; - } - - /* Send to upper protocol */ - iso_recv(conn, skb, flags); - return; - -drop: - kfree_skb(skb); + else if (err) + bt_dev_dbg(hdev, "ISO packet recv for handle %d failed: %d", + handle, err); } static bool hci_req_is_complete(struct hci_dev *hdev) @@ -4093,7 +4098,7 @@ static void hci_rx_work(struct
work_struct *work) } } -static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) +static int hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) { int err; @@ -4105,16 +4110,19 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) if (!hdev->sent_cmd) { skb_queue_head(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); - return; + return -EINVAL; } if (hci_skb_opcode(skb) != HCI_OP_NOP) { err = hci_send_frame(hdev, skb); if (err < 0) { hci_cmd_sync_cancel_sync(hdev, -err); - return; + return err; } atomic_dec(&hdev->cmd_cnt); + } else { + err = -ENODATA; + kfree_skb(skb); } if (hdev->req_status == HCI_REQ_PEND && @@ -4122,12 +4130,15 @@ static void hci_send_cmd_sync(struct hci_dev *hdev, struct sk_buff *skb) kfree_skb(hdev->req_skb); hdev->req_skb = skb_clone(hdev->sent_cmd, GFP_KERNEL); } + + return err; } static void hci_cmd_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_work); struct sk_buff *skb; + int err; BT_DBG("%s cmd_cnt %d cmd queued %d", hdev->name, atomic_read(&hdev->cmd_cnt), skb_queue_len(&hdev->cmd_q)); @@ -4138,7 +4149,9 @@ static void hci_cmd_work(struct work_struct *work) if (!skb) return; - hci_send_cmd_sync(hdev, skb); + err = hci_send_cmd_sync(hdev, skb); + if (err) + return; rcu_read_lock(); if (test_bit(HCI_RESET, &hdev->flags) || diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 8aa5039b975a..a9868f17ef40 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1607,8 +1607,10 @@ static u8 hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, void *data, hci_dev_set_flag(hdev, HCI_LE_ADV); - if (adv && !adv->periodic) + if (adv) adv->enabled = true; + else if (!set->handle) + hci_dev_set_flag(hdev, HCI_LE_ADV_0); conn = hci_lookup_le_connect(hdev); if (conn) @@ -1619,6 +1621,8 @@ static u8 hci_cc_le_set_ext_adv_enable(struct hci_dev *hdev, void *data, if (cp->num_of_sets) { if (adv) adv->enabled = false; + else if (!set->handle) + hci_dev_clear_flag(hdev, HCI_LE_ADV_0); /* If just one instance was disabled check if there are * any other instance enabled before clearing HCI_LE_ADV @@ -2263,7 +2267,7 @@ static void hci_cs_create_conn(struct hci_dev *hdev, __u8 status) } else { if (!conn) { conn = hci_conn_add_unset(hdev, ACL_LINK, &cp->bdaddr, - HCI_ROLE_MASTER); + 0, HCI_ROLE_MASTER); if (IS_ERR(conn)) bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn)); } @@ -2703,7 +2707,7 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) if (!conn) goto unlock; - if (status) { + if (status && status != HCI_ERROR_UNKNOWN_CONN_ID) { mgmt_disconnect_failed(hdev, &conn->dst, conn->type, conn->dst_type, status); @@ -2718,6 +2722,12 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) goto done; } + /* During suspend, mark connection as closed immediately + * since we might not receive HCI_EV_DISCONN_COMPLETE + */ + if (hdev->suspended) + conn->state = BT_CLOSED; + mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, &conn->flags); if (conn->type == ACL_LINK) { @@ -2876,12 +2886,8 @@ static void hci_cs_le_read_remote_features(struct hci_dev *hdev, u8 status) hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); - if (conn) { - if (conn->state == BT_CONFIG) { - hci_connect_cfm(conn, status); - hci_conn_drop(conn); - } - } + if (conn && conn->state == BT_CONFIG) + hci_connect_cfm(conn, status); hci_dev_unlock(hdev); } @@ -3081,8 +3087,18 @@ static void 
hci_conn_complete_evt(struct hci_dev *hdev, void *data, hci_dev_lock(hdev); + /* Check for existing connection: + * + * 1. If it doesn't exist then it must be receiver/slave role. + * 2. If it does exist confirm that it is connecting/BT_CONNECT in case + * of initiator/master role since there could be a collision where + * either side is attempting to connect or something like a fuzzing + * testing is trying to play tricks to destroy the hcon object before + * it even attempts to connect (e.g. hcon->state == BT_OPEN). + */ conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); - if (!conn) { + if (!conn || + (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) { /* In case of error status and there is no connection pending * just unlock as there is nothing to cleanup. */ @@ -3103,7 +3119,8 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, void *data, &ev->bdaddr, BDADDR_BREDR)) { conn = hci_conn_add_unset(hdev, ev->link_type, - &ev->bdaddr, HCI_ROLE_SLAVE); + &ev->bdaddr, 0, + HCI_ROLE_SLAVE); if (IS_ERR(conn)) { bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn)); goto unlock; @@ -3279,7 +3296,7 @@ static void hci_conn_request_evt(struct hci_dev *hdev, void *data, conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); if (!conn) { - conn = hci_conn_add_unset(hdev, ev->link_type, &ev->bdaddr, + conn = hci_conn_add_unset(hdev, ev->link_type, &ev->bdaddr, 0, HCI_ROLE_SLAVE); if (IS_ERR(conn)) { bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn)); @@ -3894,11 +3911,49 @@ unlock: return rp->status; } +static u8 hci_cc_le_read_all_local_features(struct hci_dev *hdev, void *data, + struct sk_buff *skb) +{ + struct hci_rp_le_read_all_local_features *rp = data; + + bt_dev_dbg(hdev, "status 0x%2.2x", rp->status); + + if (rp->status) + return rp->status; + + memcpy(hdev->le_features, rp->features, 248); + + return rp->status; +} + static void hci_cs_le_create_big(struct hci_dev *hdev, u8 status) { bt_dev_dbg(hdev, "status 0x%2.2x", status); } +static void hci_cs_le_read_all_remote_features(struct hci_dev *hdev, u8 status) +{ + struct hci_cp_le_read_remote_features *cp; + struct hci_conn *conn; + + bt_dev_dbg(hdev, "status 0x%2.2x", status); + + if (!status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_READ_ALL_REMOTE_FEATURES); + if (!cp) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); + if (conn && conn->state == BT_CONFIG) + hci_connect_cfm(conn, status); + + hci_dev_unlock(hdev); +} + static u8 hci_cc_set_per_adv_param(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -3943,8 +3998,11 @@ static u8 hci_cc_le_set_per_adv_enable(struct hci_dev *hdev, void *data, hci_dev_set_flag(hdev, HCI_LE_PER_ADV); if (adv) - adv->enabled = true; + adv->periodic_enabled = true; } else { + if (adv) + adv->periodic_enabled = false; + /* If just one instance was disabled check if there are * any other instance enabled before clearing HCI_LE_PER_ADV. 
* The current periodic adv instance will be marked as @@ -4147,6 +4205,9 @@ static const struct hci_cc { sizeof(struct hci_rp_le_set_cig_params), HCI_MAX_EVENT_SIZE), HCI_CC(HCI_OP_LE_SETUP_ISO_PATH, hci_cc_le_setup_iso_path, sizeof(struct hci_rp_le_setup_iso_path)), + HCI_CC(HCI_OP_LE_READ_ALL_LOCAL_FEATURES, + hci_cc_le_read_all_local_features, + sizeof(struct hci_rp_le_read_all_local_features)), }; static u8 hci_cc_func(struct hci_dev *hdev, const struct hci_cc *cc, @@ -4195,6 +4256,13 @@ static void hci_cmd_complete_evt(struct hci_dev *hdev, void *data, } if (i == ARRAY_SIZE(hci_cc_table)) { + if (!skb->len) { + bt_dev_err(hdev, "Unexpected cc 0x%4.4x with no status", + *opcode); + *status = HCI_ERROR_UNSPECIFIED; + return; + } + /* Unknown opcode, assume byte 0 contains the status, so * that e.g. __hci_cmd_sync() properly returns errors * for vendor specific commands send by HCI drivers. @@ -4294,6 +4362,8 @@ static const struct hci_cs { HCI_CS(HCI_OP_LE_EXT_CREATE_CONN, hci_cs_le_ext_create_conn), HCI_CS(HCI_OP_LE_CREATE_CIS, hci_cs_le_create_cis), HCI_CS(HCI_OP_LE_CREATE_BIG, hci_cs_le_create_big), + HCI_CS(HCI_OP_LE_READ_ALL_REMOTE_FEATURES, + hci_cs_le_read_all_remote_features), }; static void hci_cmd_status_evt(struct hci_dev *hdev, void *data, @@ -4385,6 +4455,8 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "num %d", ev->num); + hci_dev_lock(hdev); + for (i = 0; i < ev->num; i++) { struct hci_comp_pkts_info *info = &ev->handles[i]; struct hci_conn *conn; @@ -4398,7 +4470,17 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, if (!conn) continue; - conn->sent -= count; + /* Check if there is really enough packets outstanding before + * attempting to decrease the sent counter otherwise it could + * underflow.. + */ + if (conn->sent >= count) { + conn->sent -= count; + } else { + bt_dev_warn(hdev, "hcon %p sent %u < count %u", + conn, conn->sent, count); + conn->sent = 0; + } for (i = 0; i < count; ++i) hci_conn_tx_dequeue(conn); @@ -4433,19 +4515,9 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, case CIS_LINK: case BIS_LINK: case PA_LINK: - if (hdev->iso_pkts) { - hdev->iso_cnt += count; - if (hdev->iso_cnt > hdev->iso_pkts) - hdev->iso_cnt = hdev->iso_pkts; - } else if (hdev->le_pkts) { - hdev->le_cnt += count; - if (hdev->le_cnt > hdev->le_pkts) - hdev->le_cnt = hdev->le_pkts; - } else { - hdev->acl_cnt += count; - if (hdev->acl_cnt > hdev->acl_pkts) - hdev->acl_cnt = hdev->acl_pkts; - } + hdev->iso_cnt += count; + if (hdev->iso_cnt > hdev->iso_pkts) + hdev->iso_cnt = hdev->iso_pkts; break; default: @@ -4456,6 +4528,8 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, } queue_work(hdev->workqueue, &hdev->tx_work); + + hci_dev_unlock(hdev); } static void hci_mode_change_evt(struct hci_dev *hdev, void *data, @@ -5610,6 +5684,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, struct hci_conn *conn; struct smp_irk *irk; u8 addr_type; + int err; hci_dev_lock(hdev); @@ -5618,22 +5693,31 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, */ hci_dev_clear_flag(hdev, HCI_LE_ADV); - conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, bdaddr); - if (!conn) { + /* Check for existing connection: + * + * 1. If it doesn't exist then use the role to create a new object. + * 2. 
If it does exist confirm that it is connecting/BT_CONNECT in case + * of initiator/master role since there could be a collision where + * either side is attempting to connect or something like a fuzzing + * testing is trying to play tricks to destroy the hcon object before + * it even attempts to connect (e.g. hcon->state == BT_OPEN). + */ + conn = hci_conn_hash_lookup_role(hdev, LE_LINK, role, bdaddr); + if (!conn || + (conn->role == HCI_ROLE_MASTER && conn->state != BT_CONNECT)) { /* In case of error status and there is no connection pending * just unlock as there is nothing to cleanup. */ if (status) goto unlock; - conn = hci_conn_add_unset(hdev, LE_LINK, bdaddr, role); + conn = hci_conn_add_unset(hdev, LE_LINK, bdaddr, bdaddr_type, + role); if (IS_ERR(conn)) { bt_dev_err(hdev, "connection err: %ld", PTR_ERR(conn)); goto unlock; } - conn->dst_type = bdaddr_type; - /* If we didn't have a hci_conn object previously * but we're in central role this must be something * initiated using an accept list. Since accept list based @@ -5731,26 +5815,8 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, hci_debugfs_create_conn(conn); hci_conn_add_sysfs(conn); - /* The remote features procedure is defined for central - * role only. So only in case of an initiated connection - * request the remote features. - * - * If the local controller supports peripheral-initiated features - * exchange, then requesting the remote features in peripheral - * role is possible. Otherwise just transition into the - * connected state without requesting the remote features. - */ - if (conn->out || - (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) { - struct hci_cp_le_read_remote_features cp; - - cp.handle = __cpu_to_le16(conn->handle); - - hci_send_cmd(hdev, HCI_OP_LE_READ_REMOTE_FEATURES, - sizeof(cp), &cp); - - hci_conn_hold(conn); - } else { + err = hci_le_read_remote_features(conn); + if (err) { conn->state = BT_CONNECTED; hci_connect_cfm(conn, status); } @@ -5799,6 +5865,29 @@ static void hci_le_enh_conn_complete_evt(struct hci_dev *hdev, void *data, le16_to_cpu(ev->supervision_timeout)); } +static void hci_le_pa_sync_lost_evt(struct hci_dev *hdev, void *data, + struct sk_buff *skb) +{ + struct hci_ev_le_pa_sync_lost *ev = data; + u16 handle = le16_to_cpu(ev->handle); + struct hci_conn *conn; + + bt_dev_dbg(hdev, "sync handle 0x%4.4x", handle); + + hci_dev_lock(hdev); + + /* Delete the pa sync connection */ + conn = hci_conn_hash_lookup_pa_sync_handle(hdev, handle); + if (conn) { + clear_bit(HCI_CONN_BIG_SYNC, &conn->flags); + clear_bit(HCI_CONN_PA_SYNC, &conn->flags); + hci_disconn_cfm(conn, HCI_ERROR_REMOTE_USER_TERM); + hci_conn_del(conn); + } + + hci_dev_unlock(hdev); +} + static void hci_le_ext_adv_term_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -5869,6 +5958,71 @@ unlock: hci_dev_unlock(hdev); } +static int hci_le_pa_term_sync(struct hci_dev *hdev, __le16 handle) +{ + struct hci_cp_le_pa_term_sync cp; + + memset(&cp, 0, sizeof(cp)); + cp.handle = handle; + + return hci_send_cmd(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp); +} + +static void hci_le_past_received_evt(struct hci_dev *hdev, void *data, + struct sk_buff *skb) +{ + struct hci_ev_le_past_received *ev = data; + int mask = hdev->link_mode; + __u8 flags = 0; + struct hci_conn *pa_sync, *conn; + + bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); + + hci_dev_lock(hdev); + + hci_dev_clear_flag(hdev, HCI_PA_SYNC); + + conn = hci_conn_hash_lookup_create_pa_sync(hdev); + if (!conn) { + bt_dev_err(hdev, + "Unable to 
find connection for dst %pMR sid 0x%2.2x", + &ev->bdaddr, ev->sid); + goto unlock; + } + + conn->sync_handle = le16_to_cpu(ev->sync_handle); + conn->sid = HCI_SID_INVALID; + + mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, PA_LINK, + &flags); + if (!(mask & HCI_LM_ACCEPT)) { + hci_le_pa_term_sync(hdev, ev->sync_handle); + goto unlock; + } + + if (!(flags & HCI_PROTO_DEFER)) + goto unlock; + + /* Add connection to indicate PA sync event */ + pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, 0, + HCI_ROLE_SLAVE); + + if (IS_ERR(pa_sync)) + goto unlock; + + pa_sync->sync_handle = le16_to_cpu(ev->sync_handle); + + if (ev->status) { + set_bit(HCI_CONN_PA_SYNC_FAILED, &pa_sync->flags); + + /* Notify iso layer */ + hci_connect_cfm(pa_sync, ev->status); + } + +unlock: + hci_dev_unlock(hdev); +} + static void hci_le_conn_update_complete_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -6345,16 +6499,6 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data, hci_dev_unlock(hdev); } -static int hci_le_pa_term_sync(struct hci_dev *hdev, __le16 handle) -{ - struct hci_cp_le_pa_term_sync cp; - - memset(&cp, 0, sizeof(cp)); - cp.handle = handle; - - return hci_send_cmd(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp); -} - static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -6393,7 +6537,7 @@ static void hci_le_pa_sync_established_evt(struct hci_dev *hdev, void *data, goto unlock; /* Add connection to indicate PA sync event */ - pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, + pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, 0, HCI_ROLE_SLAVE); if (IS_ERR(pa_sync)) @@ -6486,7 +6630,6 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, void *data, conn->state = BT_CONNECTED; hci_connect_cfm(conn, status); - hci_conn_drop(conn); } } @@ -6745,8 +6888,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data, qos->ucast.out.latency = DIV_ROUND_CLOSEST(get_unaligned_le24(ev->p_latency), 1000); - qos->ucast.in.sdu = le16_to_cpu(ev->c_mtu); - qos->ucast.out.sdu = le16_to_cpu(ev->p_mtu); + qos->ucast.in.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0; + qos->ucast.out.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0; qos->ucast.in.phy = ev->c_phy; qos->ucast.out.phy = ev->p_phy; break; @@ -6760,8 +6903,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data, qos->ucast.in.latency = DIV_ROUND_CLOSEST(get_unaligned_le24(ev->p_latency), 1000); - qos->ucast.out.sdu = le16_to_cpu(ev->c_mtu); - qos->ucast.in.sdu = le16_to_cpu(ev->p_mtu); + qos->ucast.out.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0; + qos->ucast.in.sdu = ev->p_bn ? 
le16_to_cpu(ev->p_mtu) : 0; qos->ucast.out.phy = ev->c_phy; qos->ucast.in.phy = ev->p_phy; break; @@ -6834,7 +6977,7 @@ static void hci_le_cis_req_evt(struct hci_dev *hdev, void *data, cis = hci_conn_hash_lookup_handle(hdev, cis_handle); if (!cis) { - cis = hci_conn_add(hdev, CIS_LINK, &acl->dst, + cis = hci_conn_add(hdev, CIS_LINK, &acl->dst, acl->dst_type, HCI_ROLE_SLAVE, cis_handle); if (IS_ERR(cis)) { hci_le_reject_cis(hdev, ev->cis_handle); @@ -6951,7 +7094,7 @@ static void hci_le_big_sync_established_evt(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "ignore too large handle %u", handle); continue; } - bis = hci_conn_add(hdev, BIS_LINK, BDADDR_ANY, + bis = hci_conn_add(hdev, BIS_LINK, BDADDR_ANY, 0, HCI_ROLE_SLAVE, handle); if (IS_ERR(bis)) continue; @@ -7002,25 +7145,25 @@ static void hci_le_big_sync_lost_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct hci_evt_le_big_sync_lost *ev = data; - struct hci_conn *bis, *conn; + struct hci_conn *bis; + bool mgmt_conn = false; bt_dev_dbg(hdev, "big handle 0x%2.2x", ev->handle); hci_dev_lock(hdev); - /* Delete the pa sync connection */ - bis = hci_conn_hash_lookup_pa_sync_big_handle(hdev, ev->handle); - if (bis) { - conn = hci_conn_hash_lookup_pa_sync_handle(hdev, - bis->sync_handle); - if (conn) - hci_conn_del(conn); - } - /* Delete each bis connection */ while ((bis = hci_conn_hash_lookup_big_state(hdev, ev->handle, BT_CONNECTED, HCI_ROLE_SLAVE))) { + if (!mgmt_conn) { + mgmt_conn = test_and_clear_bit(HCI_CONN_MGMT_CONNECTED, + &bis->flags); + mgmt_device_disconnected(hdev, &bis->dst, bis->type, + bis->dst_type, ev->reason, + mgmt_conn); + } + clear_bit(HCI_CONN_BIG_SYNC, &bis->flags); hci_disconn_cfm(bis, ev->reason); hci_conn_del(bis); @@ -7064,6 +7207,50 @@ unlock: hci_dev_unlock(hdev); } +static void hci_le_read_all_remote_features_evt(struct hci_dev *hdev, + void *data, struct sk_buff *skb) +{ + struct hci_evt_le_read_all_remote_features_complete *ev = data; + struct hci_conn *conn; + + bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); + if (!conn) + goto unlock; + + if (!ev->status) + memcpy(conn->le_features, ev->features, 248); + + if (conn->state == BT_CONFIG) { + __u8 status; + + /* If the local controller supports peripheral-initiated + * features exchange, but the remote controller does + * not, then it is possible that the error code 0x1a + * for unsupported remote feature gets returned. + * + * In this specific case, allow the connection to + * transition into connected state and mark it as + * successful. 
+ */ + if (!conn->out && + ev->status == HCI_ERROR_UNSUPPORTED_REMOTE_FEATURE && + (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) + status = 0x00; + else + status = ev->status; + + conn->state = BT_CONNECTED; + hci_connect_cfm(conn, status); + } + +unlock: + hci_dev_unlock(hdev); +} + #define HCI_LE_EV_VL(_op, _func, _min_len, _max_len) \ [_op] = { \ .func = _func, \ @@ -7133,9 +7320,16 @@ static const struct hci_le_ev { hci_le_per_adv_report_evt, sizeof(struct hci_ev_le_per_adv_report), HCI_MAX_EVENT_SIZE), + /* [0x10 = HCI_EV_LE_PA_SYNC_LOST] */ + HCI_LE_EV(HCI_EV_LE_PA_SYNC_LOST, hci_le_pa_sync_lost_evt, + sizeof(struct hci_ev_le_pa_sync_lost)), /* [0x12 = HCI_EV_LE_EXT_ADV_SET_TERM] */ HCI_LE_EV(HCI_EV_LE_EXT_ADV_SET_TERM, hci_le_ext_adv_term_evt, sizeof(struct hci_evt_le_ext_adv_set_term)), + /* [0x18 = HCI_EVT_LE_PAST_RECEIVED] */ + HCI_LE_EV(HCI_EV_LE_PAST_RECEIVED, + hci_le_past_received_evt, + sizeof(struct hci_ev_le_past_received)), /* [0x19 = HCI_EVT_LE_CIS_ESTABLISHED] */ HCI_LE_EV(HCI_EVT_LE_CIS_ESTABLISHED, hci_le_cis_established_evt, sizeof(struct hci_evt_le_cis_established)), @@ -7162,6 +7356,12 @@ static const struct hci_le_ev { hci_le_big_info_adv_report_evt, sizeof(struct hci_evt_le_big_info_adv_report), HCI_MAX_EVENT_SIZE), + /* [0x2b = HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE] */ + HCI_LE_EV_VL(HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE, + hci_le_read_all_remote_features_evt, + sizeof(struct + hci_evt_le_read_all_remote_features_complete), + HCI_MAX_EVENT_SIZE), }; static void hci_le_meta_evt(struct hci_dev *hdev, void *data, diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index fc866759910d..4e7bf63af9c5 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1185,7 +1185,7 @@ static int hci_sock_compat_ioctl(struct socket *sock, unsigned int cmd, } #endif -static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, +static int hci_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_hci haddr; @@ -1311,7 +1311,9 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, goto done; } + hci_dev_lock(hdev); mgmt_index_removed(hdev); + hci_dev_unlock(hdev); err = hci_dev_open(hdev->id); if (err) { diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 2b4f21fbf9c1..a9f5b1a68356 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -863,11 +863,17 @@ bool hci_cmd_sync_dequeue_once(struct hci_dev *hdev, { struct hci_cmd_sync_work_entry *entry; - entry = hci_cmd_sync_lookup_entry(hdev, func, data, destroy); - if (!entry) + mutex_lock(&hdev->cmd_sync_work_lock); + + entry = _hci_cmd_sync_lookup_entry(hdev, func, data, destroy); + if (!entry) { + mutex_unlock(&hdev->cmd_sync_work_lock); return false; + } - hci_cmd_sync_cancel_entry(hdev, entry); + _hci_cmd_sync_cancel_entry(hdev, entry, -ECANCELED); + + mutex_unlock(&hdev->cmd_sync_work_lock); return true; } @@ -1325,7 +1331,7 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_ext_adv_params cp; struct hci_rp_le_set_ext_adv_params rp; - bool connectable; + bool connectable, require_privacy; u32 flags; bdaddr_t random_addr; u8 own_addr_type; @@ -1363,10 +1369,12 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) return -EPERM; /* Set require_privacy to true only when non-connectable - * advertising is used. In that case it is fine to use a - * non-resolvable private address. + * advertising is used and it is not periodic. 
+ * In that case it is fine to use a non-resolvable private address. */ - err = hci_get_random_address(hdev, !connectable, + require_privacy = !connectable && !(adv && adv->periodic); + + err = hci_get_random_address(hdev, require_privacy, adv_use_rpa(hdev, flags), adv, &own_addr_type, &random_addr); if (err < 0) @@ -1599,7 +1607,7 @@ int hci_disable_per_advertising_sync(struct hci_dev *hdev, u8 instance) /* If periodic advertising already disabled there is nothing to do. */ adv = hci_find_adv_instance(hdev, instance); - if (!adv || !adv->periodic || !adv->enabled) + if (!adv || !adv->periodic_enabled) return 0; memset(&cp, 0, sizeof(cp)); @@ -1664,7 +1672,7 @@ static int hci_enable_per_advertising_sync(struct hci_dev *hdev, u8 instance) /* If periodic advertising already enabled there is nothing to do. */ adv = hci_find_adv_instance(hdev, instance); - if (adv && adv->periodic && adv->enabled) + if (adv && adv->periodic_enabled) return 0; memset(&cp, 0, sizeof(cp)); @@ -2594,6 +2602,12 @@ static int hci_resume_advertising_sync(struct hci_dev *hdev) hci_remove_ext_adv_instance_sync(hdev, adv->instance, NULL); } + + /* If current advertising instance is set to instance 0x00 + * then we need to re-enable it. + */ + if (hci_dev_test_and_clear_flag(hdev, HCI_LE_ADV_0)) + err = hci_enable_ext_advertising_sync(hdev, 0x00); } else { /* Schedule for most recent instance to be restarted and begin * the software rotation loop @@ -3344,7 +3358,7 @@ static int hci_powered_update_adv_sync(struct hci_dev *hdev) * advertising data. This also applies to the case * where BR/EDR was toggled during the AUTO_OFF phase. */ - if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || + if (hci_dev_test_flag(hdev, HCI_ADVERTISING) && list_empty(&hdev->adv_instances)) { if (ext_adv_capable(hdev)) { err = hci_setup_ext_adv_instance_sync(hdev, 0x00); @@ -3481,13 +3495,13 @@ int hci_update_scan_sync(struct hci_dev *hdev) return hci_write_scan_enable_sync(hdev, scan); } -int hci_update_name_sync(struct hci_dev *hdev) +int hci_update_name_sync(struct hci_dev *hdev, const u8 *name) { struct hci_cp_write_local_name cp; memset(&cp, 0, sizeof(cp)); - memcpy(cp.name, hdev->dev_name, sizeof(cp.name)); + memcpy(cp.name, name, sizeof(cp.name)); return __hci_cmd_sync_status(hdev, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp, @@ -3540,7 +3554,7 @@ int hci_powered_update_sync(struct hci_dev *hdev) hci_write_fast_connectable_sync(hdev, false); hci_update_scan_sync(hdev); hci_update_class_sync(hdev); - hci_update_name_sync(hdev); + hci_update_name_sync(hdev, hdev->dev_name); hci_update_eir_sync(hdev); } @@ -3997,8 +4011,19 @@ static int hci_le_read_buffer_size_sync(struct hci_dev *hdev) /* Read LE Local Supported Features */ static int hci_le_read_local_features_sync(struct hci_dev *hdev) { - return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_LOCAL_FEATURES, - 0, NULL, HCI_CMD_TIMEOUT); + int err; + + err = __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_LOCAL_FEATURES, + 0, NULL, HCI_CMD_TIMEOUT); + if (err) + return err; + + if (ll_ext_feature_capable(hdev) && hdev->commands[47] & BIT(2)) + return __hci_cmd_sync_status(hdev, + HCI_OP_LE_READ_ALL_LOCAL_FEATURES, + 0, NULL, HCI_CMD_TIMEOUT); + + return err; } /* Read LE Supported States */ @@ -4310,6 +4335,10 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) if (ll_privacy_capable(hdev)) hdev->conn_flags |= HCI_CONN_FLAG_ADDRESS_RESOLUTION; + /* Mark PAST if supported */ + if (past_capable(hdev)) + hdev->conn_flags |= HCI_CONN_FLAG_PAST; + /* If the controller supports Extended Scanner 
Filter * Policies, enable the corresponding event. */ @@ -4379,6 +4408,9 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) if (ext_adv_capable(hdev)) events[2] |= 0x02; /* LE Advertising Set Terminated */ + if (past_receiver_capable(hdev)) + events[2] |= 0x80; /* LE PAST Received */ + if (cis_capable(hdev)) { events[3] |= 0x01; /* LE CIS Established */ if (cis_peripheral_capable(hdev)) @@ -4531,14 +4563,14 @@ static int hci_le_set_host_feature_sync(struct hci_dev *hdev) { struct hci_cp_le_set_host_feature cp; - if (!cis_capable(hdev)) + if (!iso_capable(hdev)) return 0; memset(&cp, 0, sizeof(cp)); /* Connected Isochronous Channels (Host Support) */ cp.bit_number = 32; - cp.bit_value = 1; + cp.bit_value = iso_enabled(hdev) ? 0x01 : 0x00; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_HOST_FEATURE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); @@ -6985,16 +7017,14 @@ static void create_pa_complete(struct hci_dev *hdev, void *data, int err) hci_dev_lock(hdev); - hci_dev_clear_flag(hdev, HCI_PA_SYNC); - - if (!hci_conn_valid(hdev, conn)) + if (hci_conn_valid(hdev, conn)) clear_bit(HCI_CONN_CREATE_PA_SYNC, &conn->flags); if (!err) goto unlock; /* Add connection to indicate PA sync error */ - pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, + pa_sync = hci_conn_add_unset(hdev, PA_LINK, BDADDR_ANY, 0, HCI_ROLE_SLAVE); if (IS_ERR(pa_sync)) @@ -7009,10 +7039,41 @@ unlock: hci_dev_unlock(hdev); } +static int hci_le_past_params_sync(struct hci_dev *hdev, struct hci_conn *conn, + struct hci_conn *acl, struct bt_iso_qos *qos) +{ + struct hci_cp_le_past_params cp; + int err; + + memset(&cp, 0, sizeof(cp)); + cp.handle = cpu_to_le16(acl->handle); + /* An HCI_LE_Periodic_Advertising_Sync_Transfer_Received event is sent + * to the Host. HCI_LE_Periodic_Advertising_Report events will be + * enabled with duplicate filtering enabled. + */ + cp.mode = 0x03; + cp.skip = cpu_to_le16(qos->bcast.skip); + cp.sync_timeout = cpu_to_le16(qos->bcast.sync_timeout); + cp.cte_type = qos->bcast.sync_cte_type; + + /* HCI_LE_PAST_PARAMS command returns a command complete event so it + * cannot wait for HCI_EV_LE_PAST_RECEIVED. + */ + err = __hci_cmd_sync_status(hdev, HCI_OP_LE_PAST_PARAMS, + sizeof(cp), &cp, HCI_CMD_TIMEOUT); + if (err) + return err; + + /* Wait for HCI_EV_LE_PAST_RECEIVED event */ + return __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL, + HCI_EV_LE_PAST_RECEIVED, + conn->conn_timeout, NULL); +} + static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) { struct hci_cp_le_pa_create_sync cp; - struct hci_conn *conn = data; + struct hci_conn *conn = data, *le; struct bt_iso_qos *qos = &conn->iso_qos; int err; @@ -7044,13 +7105,34 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) hci_update_passive_scan_sync(hdev); + /* Check if PAST is possible: + * + * 1. Check if an ACL connection with the destination address exists + * 2. Check if that HCI_CONN_FLAG_PAST has been set which indicates that + * user really intended to use PAST. + */ + le = hci_conn_hash_lookup_le(hdev, &conn->dst, conn->dst_type); + if (le) { + struct hci_conn_params *params; + + params = hci_conn_params_lookup(hdev, &le->dst, le->dst_type); + if (params && params->flags & HCI_CONN_FLAG_PAST) { + err = hci_le_past_params_sync(hdev, conn, le, qos); + if (!err) + goto done; + } + } + /* SID has not been set listen for HCI_EV_LE_EXT_ADV_REPORT to update * it. 
*/ - if (conn->sid == HCI_SID_INVALID) - __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL, - HCI_EV_LE_EXT_ADV_REPORT, - conn->conn_timeout, NULL); + if (conn->sid == HCI_SID_INVALID) { + err = __hci_cmd_sync_status_sk(hdev, HCI_OP_NOP, 0, NULL, + HCI_EV_LE_EXT_ADV_REPORT, + conn->conn_timeout, NULL); + if (err == -ETIMEDOUT) + goto done; + } memset(&cp, 0, sizeof(cp)); cp.options = qos->bcast.options; @@ -7080,6 +7162,12 @@ static int hci_le_pa_create_sync(struct hci_dev *hdev, void *data) __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_CREATE_SYNC_CANCEL, 0, NULL, HCI_CMD_TIMEOUT); +done: + hci_dev_clear_flag(hdev, HCI_PA_SYNC); + + /* Update passive scan since HCI_PA_SYNC flag has been cleared */ + hci_update_passive_scan_sync(hdev); + return err; } @@ -7151,3 +7239,182 @@ int hci_connect_big_sync(struct hci_dev *hdev, struct hci_conn *conn) return hci_cmd_sync_queue_once(hdev, hci_le_big_create_sync, conn, create_big_complete); } + +struct past_data { + struct hci_conn *conn; + struct hci_conn *le; +}; + +static void past_complete(struct hci_dev *hdev, void *data, int err) +{ + struct past_data *past = data; + + bt_dev_dbg(hdev, "err %d", err); + + kfree(past); +} + +static int hci_le_past_set_info_sync(struct hci_dev *hdev, void *data) +{ + struct past_data *past = data; + struct hci_cp_le_past_set_info cp; + + hci_dev_lock(hdev); + + if (!hci_conn_valid(hdev, past->conn) || + !hci_conn_valid(hdev, past->le)) { + hci_dev_unlock(hdev); + return -ECANCELED; + } + + memset(&cp, 0, sizeof(cp)); + cp.handle = cpu_to_le16(past->le->handle); + cp.adv_handle = past->conn->iso_qos.bcast.bis; + + hci_dev_unlock(hdev); + + return __hci_cmd_sync_status(hdev, HCI_OP_LE_PAST_SET_INFO, + sizeof(cp), &cp, HCI_CMD_TIMEOUT); +} + +static int hci_le_past_sync(struct hci_dev *hdev, void *data) +{ + struct past_data *past = data; + struct hci_cp_le_past cp; + + hci_dev_lock(hdev); + + if (!hci_conn_valid(hdev, past->conn) || + !hci_conn_valid(hdev, past->le)) { + hci_dev_unlock(hdev); + return -ECANCELED; + } + + memset(&cp, 0, sizeof(cp)); + cp.handle = cpu_to_le16(past->le->handle); + cp.sync_handle = cpu_to_le16(past->conn->sync_handle); + + hci_dev_unlock(hdev); + + return __hci_cmd_sync_status(hdev, HCI_OP_LE_PAST, + sizeof(cp), &cp, HCI_CMD_TIMEOUT); +} + +int hci_past_sync(struct hci_conn *conn, struct hci_conn *le) +{ + struct past_data *data; + int err; + + if (conn->type != BIS_LINK && conn->type != PA_LINK) + return -EINVAL; + + if (!past_sender_capable(conn->hdev)) + return -EOPNOTSUPP; + + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->conn = conn; + data->le = le; + + if (conn->role == HCI_ROLE_MASTER) + err = hci_cmd_sync_queue_once(conn->hdev, + hci_le_past_set_info_sync, data, + past_complete); + else + err = hci_cmd_sync_queue_once(conn->hdev, hci_le_past_sync, + data, past_complete); + + if (err) + kfree(data); + + return err; +} + +static void le_read_features_complete(struct hci_dev *hdev, void *data, int err) +{ + struct hci_conn *conn = data; + + bt_dev_dbg(hdev, "err %d", err); + + if (err == -ECANCELED) + return; + + hci_conn_drop(conn); +} + +static int hci_le_read_all_remote_features_sync(struct hci_dev *hdev, + void *data) +{ + struct hci_conn *conn = data; + struct hci_cp_le_read_all_remote_features cp; + + memset(&cp, 0, sizeof(cp)); + cp.handle = cpu_to_le16(conn->handle); + cp.pages = 10; /* Attempt to read all pages */ + + /* Wait for HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE event otherwise + * hci_conn_drop may run prematurely causing a 
disconnection. + */ + return __hci_cmd_sync_status_sk(hdev, + HCI_OP_LE_READ_ALL_REMOTE_FEATURES, + sizeof(cp), &cp, + HCI_EVT_LE_ALL_REMOTE_FEATURES_COMPLETE, + HCI_CMD_TIMEOUT, NULL); + + return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_ALL_REMOTE_FEATURES, + sizeof(cp), &cp, HCI_CMD_TIMEOUT); +} + +static int hci_le_read_remote_features_sync(struct hci_dev *hdev, void *data) +{ + struct hci_conn *conn = data; + struct hci_cp_le_read_remote_features cp; + + if (!hci_conn_valid(hdev, conn)) + return -ECANCELED; + + /* Check if LL Extended Feature Set is supported and + * HCI_OP_LE_READ_ALL_REMOTE_FEATURES is supported then use that to read + * all features. + */ + if (ll_ext_feature_capable(hdev) && hdev->commands[47] & BIT(3)) + return hci_le_read_all_remote_features_sync(hdev, data); + + memset(&cp, 0, sizeof(cp)); + cp.handle = cpu_to_le16(conn->handle); + + /* Wait for HCI_EV_LE_REMOTE_FEAT_COMPLETE event otherwise + * hci_conn_drop may run prematurely causing a disconnection. + */ + return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_READ_REMOTE_FEATURES, + sizeof(cp), &cp, + HCI_EV_LE_REMOTE_FEAT_COMPLETE, + HCI_CMD_TIMEOUT, NULL); +} + +int hci_le_read_remote_features(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + int err; + + /* The remote features procedure is defined for central + * role only. So only in case of an initiated connection + * request the remote features. + * + * If the local controller supports peripheral-initiated features + * exchange, then requesting the remote features in peripheral + * role is possible. Otherwise just transition into the + * connected state without requesting the remote features. + */ + if (conn->out || (hdev->le_features[0] & HCI_LE_PERIPHERAL_FEATURES)) + err = hci_cmd_sync_queue_once(hdev, + hci_le_read_remote_features_sync, + hci_conn_hold(conn), + le_read_features_complete); + else + err = -EOPNOTSUPP; + + return err; +} diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 7bd3aa0a6db9..e36d24a9098b 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -80,19 +80,20 @@ static struct bt_iso_qos default_qos; static bool check_ucast_qos(struct bt_iso_qos *qos); static bool check_bcast_qos(struct bt_iso_qos *qos); static bool iso_match_sid(struct sock *sk, void *data); +static bool iso_match_sid_past(struct sock *sk, void *data); static bool iso_match_sync_handle(struct sock *sk, void *data); static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data); static void iso_sock_disconn(struct sock *sk); typedef bool (*iso_sock_match_t)(struct sock *sk, void *data); -static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, - enum bt_sock_state state, +static struct sock *iso_get_sock(struct hci_dev *hdev, bdaddr_t *src, + bdaddr_t *dst, enum bt_sock_state state, iso_sock_match_t match, void *data); /* ---- ISO timers ---- */ -#define ISO_CONN_TIMEOUT (HZ * 40) -#define ISO_DISCONN_TIMEOUT (HZ * 2) +#define ISO_CONN_TIMEOUT secs_to_jiffies(20) +#define ISO_DISCONN_TIMEOUT secs_to_jiffies(2) static void iso_conn_free(struct kref *ref) { @@ -111,6 +112,8 @@ static void iso_conn_free(struct kref *ref) /* Ensure no more work items will run since hci_conn has been dropped */ disable_delayed_work_sync(&conn->timeout_work); + kfree_skb(conn->rx_skb); + kfree(conn); } @@ -367,7 +370,8 @@ static int iso_connect_bis(struct sock *sk) if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, iso_pi(sk)->base_len, - iso_pi(sk)->base); + 
iso_pi(sk)->base, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -376,7 +380,8 @@ static int iso_connect_bis(struct sock *sk) hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, - iso_pi(sk)->base_len, iso_pi(sk)->base); + iso_pi(sk)->base_len, iso_pi(sk)->base, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -458,11 +463,19 @@ static int iso_connect_cis(struct sock *sk) goto unlock; } + /* Check if there are available buffers for output/TX. */ + if (iso_pi(sk)->qos.ucast.out.sdu && !hci_iso_count(hdev) && + (hdev->iso_pkts && !hdev->iso_cnt)) { + err = -ENOBUFS; + goto unlock; + } + /* Just bind if DEFER_SETUP has been set */ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), - &iso_pi(sk)->qos); + &iso_pi(sk)->qos, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -470,7 +483,8 @@ static int iso_connect_cis(struct sock *sk) } else { hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), - &iso_pi(sk)->qos); + &iso_pi(sk)->qos, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -624,8 +638,8 @@ static struct sock *__iso_get_sock_listen_by_sid(bdaddr_t *ba, bdaddr_t *bc, * match func data - pass -1 to ignore * Returns closest match. */ -static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, - enum bt_sock_state state, +static struct sock *iso_get_sock(struct hci_dev *hdev, bdaddr_t *src, + bdaddr_t *dst, enum bt_sock_state state, iso_sock_match_t match, void *data) { struct sock *sk = NULL, *sk1 = NULL; @@ -637,8 +651,25 @@ static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, continue; /* Match Broadcast destination */ - if (bacmp(dst, BDADDR_ANY) && bacmp(&iso_pi(sk)->dst, dst)) - continue; + if (bacmp(dst, BDADDR_ANY) && bacmp(&iso_pi(sk)->dst, dst)) { + struct smp_irk *irk1, *irk2; + + /* Check if destination is an RPA that we can resolve */ + irk1 = hci_find_irk_by_rpa(hdev, dst); + if (!irk1) + continue; + + /* Match with identity address */ + if (bacmp(&iso_pi(sk)->dst, &irk1->bdaddr)) { + /* Check if socket destination address is also + * an RPA and if the IRK matches. + */ + irk2 = hci_find_irk_by_rpa(hdev, + &iso_pi(sk)->dst); + if (!irk2 || irk1 != irk2) + continue; + } + } /* Use Match function if provided */ if (match && !match(sk, data)) @@ -750,6 +781,13 @@ static void iso_sock_kill(struct sock *sk) BT_DBG("sk %p state %d", sk, sk->sk_state); + /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */ + if (iso_pi(sk)->conn) { + iso_conn_lock(iso_pi(sk)->conn); + iso_pi(sk)->conn->sk = NULL; + iso_conn_unlock(iso_pi(sk)->conn); + } + /* Kill poor orphan */ bt_sock_unlink(&iso_sk_list, sk); sock_set_flag(sk, SOCK_DEAD); @@ -924,7 +962,7 @@ static int iso_sock_create(struct net *net, struct socket *sock, int protocol, return 0; } -static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr, +static int iso_sock_bind_bc(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; @@ -966,20 +1004,14 @@ static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr, return 0; } -static int iso_sock_bind_pa_sk(struct sock *sk, struct sockaddr_iso *sa, +/* Must be called on the locked socket. 
*/ +static int iso_sock_rebind_bis(struct sock *sk, struct sockaddr_iso *sa, int addr_len) { int err = 0; - if (sk->sk_type != SOCK_SEQPACKET) { - err = -EINVAL; - goto done; - } - - if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc)) { - err = -EINVAL; - goto done; - } + if (!test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) + return -EBADFD; if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS) { err = -EINVAL; @@ -1002,7 +1034,78 @@ done: return err; } -static int iso_sock_bind(struct socket *sock, struct sockaddr *addr, +static struct hci_dev *iso_conn_get_hdev(struct iso_conn *conn) +{ + struct hci_dev *hdev = NULL; + + iso_conn_lock(conn); + if (conn->hcon) + hdev = hci_dev_hold(conn->hcon->hdev); + iso_conn_unlock(conn); + + return hdev; +} + +/* Must be called on the locked socket. */ +static int iso_sock_rebind_bc(struct sock *sk, struct sockaddr_iso *sa, + int addr_len) +{ + struct hci_dev *hdev; + struct hci_conn *bis; + int err; + + if (sk->sk_type != SOCK_SEQPACKET || !iso_pi(sk)->conn) + return -EINVAL; + + /* Check if it is really a Broadcast address being requested */ + if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc)) + return -EINVAL; + + /* Check if the address hasn't changed then perhaps only the number of + * bis has changed. + */ + if (!bacmp(&iso_pi(sk)->dst, &sa->iso_bc->bc_bdaddr) || + !bacmp(&sa->iso_bc->bc_bdaddr, BDADDR_ANY)) + return iso_sock_rebind_bis(sk, sa, addr_len); + + /* Check if the address type is of LE type */ + if (!bdaddr_type_is_le(sa->iso_bc->bc_bdaddr_type)) + return -EINVAL; + + hdev = iso_conn_get_hdev(iso_pi(sk)->conn); + if (!hdev) + return -EINVAL; + + bis = iso_pi(sk)->conn->hcon; + + /* Release the socket before lookups since that requires hci_dev_lock + * which shall not be acquired while holding sock_lock for proper + * ordering. + */ + release_sock(sk); + hci_dev_lock(bis->hdev); + lock_sock(sk); + + if (!iso_pi(sk)->conn || iso_pi(sk)->conn->hcon != bis) { + /* raced with iso_conn_del() or iso_disconn_sock() */ + err = -ENOTCONN; + goto unlock; + } + + BT_DBG("sk %p %pMR type %u", sk, &sa->iso_bc->bc_bdaddr, + sa->iso_bc->bc_bdaddr_type); + + err = hci_past_bis(bis, &sa->iso_bc->bc_bdaddr, + le_addr_type(sa->iso_bc->bc_bdaddr_type)); + +unlock: + hci_dev_unlock(hdev); + hci_dev_put(hdev); + + return err; +} + +static int iso_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; @@ -1017,13 +1120,12 @@ static int iso_sock_bind(struct socket *sock, struct sockaddr *addr, lock_sock(sk); - /* Allow the user to bind a PA sync socket to a number - * of BISes to sync to. - */ - if ((sk->sk_state == BT_CONNECT2 || - sk->sk_state == BT_CONNECTED) && - test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { - err = iso_sock_bind_pa_sk(sk, sa, addr_len); + if ((sk->sk_state == BT_CONNECT2 || sk->sk_state == BT_CONNECTED) && + addr_len > sizeof(*sa)) { + /* Allow the user to rebind to a different address using + * PAST procedures. 
+ */ + err = iso_sock_rebind_bc(sk, sa, addr_len); goto done; } @@ -1060,7 +1162,7 @@ done: return err; } -static int iso_sock_connect(struct socket *sock, struct sockaddr *addr, +static int iso_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; @@ -1347,7 +1449,7 @@ static int iso_sock_getname(struct socket *sock, struct sockaddr *addr, bacpy(&sa->iso_bdaddr, &iso_pi(sk)->dst); sa->iso_bdaddr_type = iso_pi(sk)->dst_type; - if (hcon && hcon->type == BIS_LINK) { + if (hcon && (hcon->type == BIS_LINK || hcon->type == PA_LINK)) { sa->iso_bc->bc_sid = iso_pi(sk)->bc_sid; sa->iso_bc->bc_num_bis = iso_pi(sk)->bc_num_bis; memcpy(sa->iso_bc->bc_bis, iso_pi(sk)->bc_bis, @@ -1919,6 +2021,11 @@ static bool iso_match_pa_sync_flag(struct sock *sk, void *data) return test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags); } +static bool iso_match_dst(struct sock *sk, void *data) +{ + return !bacmp(&iso_pi(sk)->dst, (bdaddr_t *)data); +} + static void iso_conn_ready(struct iso_conn *conn) { struct sock *parent = NULL; @@ -1927,23 +2034,45 @@ static void iso_conn_ready(struct iso_conn *conn) struct hci_ev_le_pa_sync_established *ev2 = NULL; struct hci_ev_le_per_adv_report *ev3 = NULL; struct hci_conn *hcon; + struct hci_dev *hdev; BT_DBG("conn %p", conn); if (sk) { + /* Attempt to update source address in case of BIS Sender if + * the advertisement is using a random address. + */ + if (conn->hcon->type == BIS_LINK && + conn->hcon->role == HCI_ROLE_MASTER && + !bacmp(&conn->hcon->dst, BDADDR_ANY)) { + struct hci_conn *bis = conn->hcon; + struct adv_info *adv; + + adv = hci_find_adv_instance(bis->hdev, + bis->iso_qos.bcast.bis); + if (adv && bacmp(&adv->random_addr, BDADDR_ANY)) { + lock_sock(sk); + iso_pi(sk)->src_type = BDADDR_LE_RANDOM; + bacpy(&iso_pi(sk)->src, &adv->random_addr); + release_sock(sk); + } + } + iso_sock_ready(conn->sk); } else { hcon = conn->hcon; if (!hcon) return; + hdev = hcon->hdev; + if (test_bit(HCI_CONN_BIG_SYNC, &hcon->flags)) { /* A BIS slave hcon is notified to the ISO layer * after the Command Complete for the LE Setup * ISO Data Path command is received. Get the * parent socket that matches the hcon BIG handle. 
*/ - parent = iso_get_sock(&hcon->src, &hcon->dst, + parent = iso_get_sock(hdev, &hcon->src, &hcon->dst, BT_LISTEN, iso_match_big_hcon, hcon); } else if (test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) { @@ -1951,12 +2080,12 @@ static void iso_conn_ready(struct iso_conn *conn) HCI_EVT_LE_BIG_SYNC_ESTABLISHED); /* Get reference to PA sync parent socket, if it exists */ - parent = iso_get_sock(&hcon->src, &hcon->dst, + parent = iso_get_sock(hdev, &hcon->src, &hcon->dst, BT_LISTEN, iso_match_pa_sync_flag, NULL); if (!parent && ev) - parent = iso_get_sock(&hcon->src, + parent = iso_get_sock(hdev, &hcon->src, &hcon->dst, BT_LISTEN, iso_match_big, ev); @@ -1964,7 +2093,7 @@ static void iso_conn_ready(struct iso_conn *conn) ev2 = hci_recv_event_data(hcon->hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED); if (ev2) - parent = iso_get_sock(&hcon->src, + parent = iso_get_sock(hdev, &hcon->src, &hcon->dst, BT_LISTEN, iso_match_sid, ev2); @@ -1972,7 +2101,7 @@ static void iso_conn_ready(struct iso_conn *conn) ev3 = hci_recv_event_data(hcon->hdev, HCI_EV_LE_PER_ADV_REPORT); if (ev3) - parent = iso_get_sock(&hcon->src, + parent = iso_get_sock(hdev, &hcon->src, &hcon->dst, BT_LISTEN, iso_match_sync_handle_pa_report, @@ -1980,8 +2109,8 @@ static void iso_conn_ready(struct iso_conn *conn) } if (!parent) - parent = iso_get_sock(&hcon->src, BDADDR_ANY, - BT_LISTEN, NULL, NULL); + parent = iso_get_sock(hdev, &hcon->src, BDADDR_ANY, + BT_LISTEN, iso_match_dst, BDADDR_ANY); if (!parent) return; @@ -2012,7 +2141,7 @@ static void iso_conn_ready(struct iso_conn *conn) */ if (!bacmp(&hcon->dst, BDADDR_ANY)) { bacpy(&hcon->dst, &iso_pi(parent)->dst); - hcon->dst_type = iso_pi(parent)->dst_type; + hcon->dst_type = le_addr_type(iso_pi(parent)->dst_type); } if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) { @@ -2026,7 +2155,13 @@ static void iso_conn_ready(struct iso_conn *conn) } bacpy(&iso_pi(sk)->dst, &hcon->dst); - iso_pi(sk)->dst_type = hcon->dst_type; + + /* Convert from HCI to three-value type */ + if (hcon->dst_type == ADDR_LE_DEV_PUBLIC) + iso_pi(sk)->dst_type = BDADDR_LE_PUBLIC; + else + iso_pi(sk)->dst_type = BDADDR_LE_RANDOM; + iso_pi(sk)->sync_handle = iso_pi(parent)->sync_handle; memcpy(iso_pi(sk)->base, iso_pi(parent)->base, iso_pi(parent)->base_len); iso_pi(sk)->base_len = iso_pi(parent)->base_len; @@ -2064,6 +2199,16 @@ static bool iso_match_sid(struct sock *sk, void *data) return ev->sid == iso_pi(sk)->bc_sid; } +static bool iso_match_sid_past(struct sock *sk, void *data) +{ + struct hci_ev_le_past_received *ev = data; + + if (iso_pi(sk)->bc_sid == HCI_SID_INVALID) + return true; + + return ev->sid == iso_pi(sk)->bc_sid; +} + static bool iso_match_sync_handle(struct sock *sk, void *data) { struct hci_evt_le_big_info_adv_report *ev = data; @@ -2083,6 +2228,7 @@ static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data) int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { struct hci_ev_le_pa_sync_established *ev1; + struct hci_ev_le_past_received *ev1a; struct hci_evt_le_big_info_adv_report *ev2; struct hci_ev_le_per_adv_report *ev3; struct sock *sk; @@ -2096,6 +2242,7 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) * SID to listen to and once sync is established its handle needs to * be stored in iso_pi(sk)->sync_handle so it can be matched once * receiving the BIG Info. + * 1a. HCI_EV_LE_PAST_RECEIVED: alternative to 1. * 2. 
HCI_EVT_LE_BIG_INFO_ADV_REPORT: When connect_ind is triggered by a * a BIG Info it attempts to check if there any listening socket with * the same sync_handle and if it does then attempt to create a sync. @@ -2105,7 +2252,7 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) */ ev1 = hci_recv_event_data(hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED); if (ev1) { - sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN, + sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sid, ev1); if (sk && !ev1->status) { iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle); @@ -2115,10 +2262,22 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) goto done; } + ev1a = hci_recv_event_data(hdev, HCI_EV_LE_PAST_RECEIVED); + if (ev1a) { + sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, + iso_match_sid_past, ev1a); + if (sk && !ev1a->status) { + iso_pi(sk)->sync_handle = le16_to_cpu(ev1a->sync_handle); + iso_pi(sk)->bc_sid = ev1a->sid; + } + + goto done; + } + ev2 = hci_recv_event_data(hdev, HCI_EVT_LE_BIG_INFO_ADV_REPORT); if (ev2) { /* Check if BIGInfo report has already been handled */ - sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_CONNECTED, + sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_CONNECTED, iso_match_sync_handle, ev2); if (sk) { sock_put(sk); @@ -2127,10 +2286,10 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) } /* Try to get PA sync socket, if it exists */ - sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_CONNECT2, + sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_CONNECT2, iso_match_sync_handle, ev2); if (!sk) - sk = iso_get_sock(&hdev->bdaddr, bdaddr, + sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sync_handle, ev2); @@ -2169,7 +2328,7 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) u8 *base; struct hci_conn *hcon; - sk = iso_get_sock(&hdev->bdaddr, bdaddr, BT_LISTEN, + sk = iso_get_sock(hdev, &hdev->bdaddr, bdaddr, BT_LISTEN, iso_match_sync_handle_pa_report, ev3); if (!sk) goto done; @@ -2219,8 +2378,8 @@ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) hcon->le_per_adv_data_len = 0; } } else { - sk = iso_get_sock(&hdev->bdaddr, BDADDR_ANY, - BT_LISTEN, NULL, NULL); + sk = iso_get_sock(hdev, &hdev->bdaddr, BDADDR_ANY, + BT_LISTEN, iso_match_dst, BDADDR_ANY); } done: @@ -2288,14 +2447,31 @@ static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason) iso_conn_del(hcon, bt_to_errno(reason)); } -void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) +int iso_recv(struct hci_dev *hdev, u16 handle, struct sk_buff *skb, u16 flags) { - struct iso_conn *conn = hcon->iso_data; + struct hci_conn *hcon; + struct iso_conn *conn; struct skb_shared_hwtstamps *hwts; __u16 pb, ts, len, sn; - if (!conn) - goto drop; + hci_dev_lock(hdev); + + hcon = hci_conn_hash_lookup_handle(hdev, handle); + if (!hcon) { + hci_dev_unlock(hdev); + kfree_skb(skb); + return -ENOENT; + } + + conn = iso_conn_hold_unless_zero(hcon->iso_data); + hcon = NULL; + + hci_dev_unlock(hdev); + + if (!conn) { + kfree_skb(skb); + return -EINVAL; + } pb = hci_iso_flags_pb(flags); ts = hci_iso_flags_ts(flags); @@ -2351,7 +2527,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) hci_skb_pkt_status(skb) = flags & 0x03; hci_skb_pkt_seqnum(skb) = sn; iso_recv_frame(conn, skb); - return; + goto done; } if (pb == ISO_SINGLE) { @@ -2407,7 +2583,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) skb_copy_from_linear_data(skb, 
skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len -= skb->len; - return; + break; case ISO_END: skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), @@ -2429,6 +2605,9 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) drop: kfree_skb(skb); +done: + iso_conn_put(conn); + return 0; } static struct hci_cb iso_cb = { @@ -2483,11 +2662,11 @@ static const struct net_proto_family iso_sock_family_ops = { .create = iso_sock_create, }; -static bool iso_inited; +static bool inited; -bool iso_enabled(void) +bool iso_inited(void) { - return iso_inited; + return inited; } int iso_init(void) @@ -2496,7 +2675,7 @@ int iso_init(void) BUILD_BUG_ON(sizeof(struct sockaddr_iso) > sizeof(struct sockaddr)); - if (iso_inited) + if (inited) return -EALREADY; err = proto_register(&iso_proto, 0); @@ -2524,7 +2703,7 @@ int iso_init(void) iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs, NULL, &iso_debugfs_fops); - iso_inited = true; + inited = true; return 0; @@ -2535,7 +2714,7 @@ error: int iso_exit(void) { - if (!iso_inited) + if (!inited) return -EALREADY; bt_procfs_cleanup(&init_net, "iso"); @@ -2549,7 +2728,7 @@ int iso_exit(void) proto_unregister(&iso_proto); - iso_inited = false; + inited = false; return 0; } diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 805c752ac0a9..07b493331fd7 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -282,7 +282,7 @@ static void __set_retrans_timer(struct l2cap_chan *chan) if (!delayed_work_pending(&chan->monitor_timer) && chan->retrans_timeout) { l2cap_set_timer(chan, &chan->retrans_timer, - secs_to_jiffies(chan->retrans_timeout)); + msecs_to_jiffies(chan->retrans_timeout)); } } @@ -291,7 +291,7 @@ static void __set_monitor_timer(struct l2cap_chan *chan) __clear_retrans_timer(chan); if (chan->monitor_timeout) { l2cap_set_timer(chan, &chan->monitor_timer, - secs_to_jiffies(chan->monitor_timeout)); + msecs_to_jiffies(chan->monitor_timeout)); } } @@ -497,6 +497,7 @@ void l2cap_chan_hold(struct l2cap_chan *c) kref_get(&c->kref); } +EXPORT_SYMBOL_GPL(l2cap_chan_hold); struct l2cap_chan *l2cap_chan_hold_unless_zero(struct l2cap_chan *c) { @@ -7509,13 +7510,24 @@ struct l2cap_conn *l2cap_conn_hold_unless_zero(struct l2cap_conn *c) return c; } -void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) +int l2cap_recv_acldata(struct hci_dev *hdev, u16 handle, + struct sk_buff *skb, u16 flags) { + struct hci_conn *hcon; struct l2cap_conn *conn; int len; - /* Lock hdev to access l2cap_data to avoid race with l2cap_conn_del */ - hci_dev_lock(hcon->hdev); + /* Lock hdev for hci_conn, and race on l2cap_data vs. 
l2cap_conn_del */ + hci_dev_lock(hdev); + + hcon = hci_conn_hash_lookup_handle(hdev, handle); + if (!hcon) { + hci_dev_unlock(hdev); + kfree_skb(skb); + return -ENOENT; + } + + hci_conn_enter_active_mode(hcon, BT_POWER_FORCE_ACTIVE_OFF); conn = hcon->l2cap_data; @@ -7523,12 +7535,13 @@ void l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) conn = l2cap_conn_add(hcon); conn = l2cap_conn_hold_unless_zero(conn); + hcon = NULL; - hci_dev_unlock(hcon->hdev); + hci_dev_unlock(hdev); if (!conn) { kfree_skb(skb); - return; + return -EINVAL; } BT_DBG("conn %p len %u flags 0x%x", conn, skb->len, flags); @@ -7642,6 +7655,7 @@ drop: unlock: mutex_unlock(&conn->lock); l2cap_conn_put(conn); + return 0; } static struct hci_cb l2cap_cb = { diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index f4257c4d3052..9ee189c815d4 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -80,7 +80,7 @@ static int l2cap_validate_le_psm(u16 psm) return 0; } -static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) +static int l2cap_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int alen) { struct sock *sk = sock->sk; struct l2cap_chan *chan = l2cap_pi(sk)->chan; @@ -178,7 +178,7 @@ done: return err; } -static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, +static int l2cap_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sock *sk = sock->sk; @@ -1422,7 +1422,10 @@ static int l2cap_sock_release(struct socket *sock) if (!sk) return 0; + lock_sock_nested(sk, L2CAP_NESTING_PARENT); l2cap_sock_cleanup_listen(sk); + release_sock(sk); + bt_sock_unlink(&l2cap_sk_list, sk); err = l2cap_sock_shutdown(sock, SHUT_RDWR); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 1ce682038b51..c11cdef42b6f 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -852,6 +852,12 @@ static u32 get_supported_settings(struct hci_dev *hdev) if (ll_privacy_capable(hdev)) settings |= MGMT_SETTING_LL_PRIVACY; + if (past_sender_capable(hdev)) + settings |= MGMT_SETTING_PAST_SENDER; + + if (past_receiver_capable(hdev)) + settings |= MGMT_SETTING_PAST_RECEIVER; + settings |= MGMT_SETTING_PHY_CONFIGURATION; return settings; @@ -922,21 +928,27 @@ static u32 get_current_settings(struct hci_dev *hdev) if (hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED)) settings |= MGMT_SETTING_WIDEBAND_SPEECH; - if (cis_central_capable(hdev)) + if (cis_central_enabled(hdev)) settings |= MGMT_SETTING_CIS_CENTRAL; - if (cis_peripheral_capable(hdev)) + if (cis_peripheral_enabled(hdev)) settings |= MGMT_SETTING_CIS_PERIPHERAL; - if (bis_capable(hdev)) + if (bis_enabled(hdev)) settings |= MGMT_SETTING_ISO_BROADCASTER; - if (sync_recv_capable(hdev)) + if (sync_recv_enabled(hdev)) settings |= MGMT_SETTING_ISO_SYNC_RECEIVER; - if (ll_privacy_capable(hdev)) + if (ll_privacy_enabled(hdev)) settings |= MGMT_SETTING_LL_PRIVACY; + if (past_sender_enabled(hdev)) + settings |= MGMT_SETTING_PAST_SENDER; + + if (past_receiver_enabled(hdev)) + settings |= MGMT_SETTING_PAST_RECEIVER; + return settings; } @@ -1323,8 +1335,7 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) struct mgmt_mode *cp; /* Make sure cmd still outstanding. 
*/ - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; cp = cmd->param; @@ -1351,23 +1362,29 @@ static void mgmt_set_powered_complete(struct hci_dev *hdev, void *data, int err) mgmt_status(err)); } - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int set_powered_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp; + struct mgmt_mode cp; + + mutex_lock(&hdev->mgmt_pending_lock); /* Make sure cmd still outstanding. */ - if (cmd != pending_find(MGMT_OP_SET_POWERED, hdev)) + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); return -ECANCELED; + } - cp = cmd->param; + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); BT_DBG("%s", hdev->name); - return hci_set_powered_sync(hdev, cp->val); + return hci_set_powered_sync(hdev, cp.val); } static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data, @@ -1516,8 +1533,7 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); /* Make sure cmd still outstanding. */ - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_DISCOVERABLE, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; hci_dev_lock(hdev); @@ -1539,12 +1555,15 @@ static void mgmt_set_discoverable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); hci_dev_unlock(hdev); } static int set_discoverable_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + BT_DBG("%s", hdev->name); return hci_update_discoverable_sync(hdev); @@ -1691,8 +1710,7 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, bt_dev_dbg(hdev, "err %d", err); /* Make sure cmd still outstanding. */ - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_CONNECTABLE, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; hci_dev_lock(hdev); @@ -1707,7 +1725,7 @@ static void mgmt_set_connectable_complete(struct hci_dev *hdev, void *data, new_settings(hdev, cmd->sk); done: - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); hci_dev_unlock(hdev); } @@ -1743,6 +1761,9 @@ static int set_connectable_update_settings(struct hci_dev *hdev, static int set_connectable_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + BT_DBG("%s", hdev->name); return hci_update_connectable_sync(hdev); @@ -1919,14 +1940,17 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) { struct cmd_lookup match = { NULL, hdev }; struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; - u8 enable = cp->val; + struct mgmt_mode *cp; + u8 enable; bool changed; /* Make sure cmd still outstanding. 
*/ - if (err == -ECANCELED || cmd != pending_find(MGMT_OP_SET_SSP, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; + cp = cmd->param; + enable = cp->val; + if (err) { u8 mgmt_err = mgmt_status(err); @@ -1935,8 +1959,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) new_settings(hdev, NULL); } - mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true, - cmd_status_rsp, &mgmt_err); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_err); return; } @@ -1946,7 +1969,7 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) changed = hci_dev_test_and_clear_flag(hdev, HCI_SSP_ENABLED); } - mgmt_pending_foreach(MGMT_OP_SET_SSP, hdev, true, settings_rsp, &match); + settings_rsp(cmd, &match); if (changed) new_settings(hdev, match.sk); @@ -1960,14 +1983,25 @@ static void set_ssp_complete(struct hci_dev *hdev, void *data, int err) static int set_ssp_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; + struct mgmt_mode cp; bool changed = false; int err; - if (cp->val) + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); + + if (cp.val) changed = !hci_dev_test_and_set_flag(hdev, HCI_SSP_ENABLED); - err = hci_write_ssp_mode_sync(hdev, cp->val); + err = hci_write_ssp_mode_sync(hdev, cp.val); if (!err && changed) hci_dev_clear_flag(hdev, HCI_SSP_ENABLED); @@ -2060,32 +2094,50 @@ static int set_hs(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) static void set_le_complete(struct hci_dev *hdev, void *data, int err) { + struct mgmt_pending_cmd *cmd = data; struct cmd_lookup match = { NULL, hdev }; u8 status = mgmt_status(err); bt_dev_dbg(hdev, "err %d", err); - if (status) { - mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, cmd_status_rsp, - &status); + if (err == -ECANCELED || !mgmt_pending_valid(hdev, data)) return; + + if (status) { + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, status); + goto done; } - mgmt_pending_foreach(MGMT_OP_SET_LE, hdev, true, settings_rsp, &match); + settings_rsp(cmd, &match); new_settings(hdev, match.sk); if (match.sk) sock_put(match.sk); + +done: + mgmt_pending_free(cmd); } static int set_le_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; - u8 val = !!cp->val; + struct mgmt_mode cp; + u8 val; int err; + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + val = !!cp.val; + + mutex_unlock(&hdev->mgmt_pending_lock); + if (!val) { hci_clear_adv_instance_sync(hdev, NULL, 0x00, true); @@ -2127,23 +2179,45 @@ static void set_mesh_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; u8 status = mgmt_status(err); - struct sock *sk = cmd->sk; + struct sock *sk; + + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) + return; + + sk = cmd->sk; if (status) { + mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, + status); mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, true, cmd_status_rsp, &status); - return; + goto done; } - mgmt_pending_remove(cmd); mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, 0, NULL, 0); + +done: + mgmt_pending_free(cmd); } static int 
set_mesh_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_cp_set_mesh *cp = cmd->param; - size_t len = cmd->param_len; + DEFINE_FLEX(struct mgmt_cp_set_mesh, cp, ad_types, num_ad_types, + sizeof(hdev->mesh_ad_types)); + size_t len; + + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + len = cmd->param_len; + memcpy(cp, cmd->param, min(__struct_size(cp), len)); + + mutex_unlock(&hdev->mgmt_pending_lock); memset(hdev->mesh_ad_types, 0, sizeof(hdev->mesh_ad_types)); @@ -2155,7 +2229,7 @@ static int set_mesh_sync(struct hci_dev *hdev, void *data) hdev->le_scan_interval = __le16_to_cpu(cp->period); hdev->le_scan_window = __le16_to_cpu(cp->window); - len -= sizeof(*cp); + len -= sizeof(struct mgmt_cp_set_mesh); /* If filters don't fit, forward all adv pkts */ if (len <= sizeof(hdev->mesh_ad_types)) @@ -3867,15 +3941,16 @@ static int name_changed_sync(struct hci_dev *hdev, void *data) static void set_name_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_cp_set_local_name *cp = cmd->param; + struct mgmt_cp_set_local_name *cp; u8 status = mgmt_status(err); bt_dev_dbg(hdev, "err %d", err); - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_LOCAL_NAME, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; + cp = cmd->param; + if (status) { mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_SET_LOCAL_NAME, status); @@ -3887,13 +3962,27 @@ static void set_name_complete(struct hci_dev *hdev, void *data, int err) hci_cmd_sync_queue(hdev, name_changed_sync, NULL, NULL); } - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int set_name_sync(struct hci_dev *hdev, void *data) { + struct mgmt_pending_cmd *cmd = data; + struct mgmt_cp_set_local_name cp; + + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); + if (lmp_bredr_capable(hdev)) { - hci_update_name_sync(hdev); + hci_update_name_sync(hdev, cp.name); hci_update_eir_sync(hdev); } @@ -4045,12 +4134,10 @@ int mgmt_phy_configuration_changed(struct hci_dev *hdev, struct sock *skip) static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - struct sk_buff *skb = cmd->skb; + struct sk_buff *skb; u8 status = mgmt_status(err); - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_SET_PHY_CONFIGURATION, hdev)) - return; + skb = cmd->skb; if (!status) { if (!skb) @@ -4077,7 +4164,7 @@ static void set_default_phy_complete(struct hci_dev *hdev, void *data, int err) if (skb && !IS_ERR(skb)) kfree_skb(skb); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int set_default_phy_sync(struct hci_dev *hdev, void *data) @@ -4085,7 +4172,9 @@ static int set_default_phy_sync(struct hci_dev *hdev, void *data) struct mgmt_pending_cmd *cmd = data; struct mgmt_cp_set_phy_configuration *cp = cmd->param; struct hci_cp_le_set_default_phy cp_phy; - u32 selected_phys = __le32_to_cpu(cp->selected_phys); + u32 selected_phys; + + selected_phys = __le32_to_cpu(cp->selected_phys); memset(&cp_phy, 0, sizeof(cp_phy)); @@ -4225,7 +4314,7 @@ static int set_phy_configuration(struct sock *sk, struct hci_dev *hdev, goto unlock; } - cmd = mgmt_pending_add(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data, + cmd 
= mgmt_pending_new(sk, MGMT_OP_SET_PHY_CONFIGURATION, hdev, data, len); if (!cmd) err = -ENOMEM; @@ -4469,13 +4558,11 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, return -ENOMEM; #ifdef CONFIG_BT_FEATURE_DEBUG - if (!hdev) { - flags = bt_dbg_get() ? BIT(0) : 0; + flags = bt_dbg_get() ? BIT(0) : 0; - memcpy(rp->features[idx].uuid, debug_uuid, 16); - rp->features[idx].flags = cpu_to_le32(flags); - idx++; - } + memcpy(rp->features[idx].uuid, debug_uuid, 16); + rp->features[idx].flags = cpu_to_le32(flags); + idx++; #endif if (hdev && hci_dev_le_state_simultaneous(hdev)) { @@ -4513,7 +4600,7 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, } if (IS_ENABLED(CONFIG_BT_LE)) { - flags = iso_enabled() ? BIT(0) : 0; + flags = iso_inited() ? BIT(0) : 0; memcpy(rp->features[idx].uuid, iso_socket_uuid, 16); rp->features[idx].flags = cpu_to_le32(flags); idx++; @@ -5035,6 +5122,69 @@ static void device_flags_changed(struct sock *sk, struct hci_dev *hdev, mgmt_event(MGMT_EV_DEVICE_FLAGS_CHANGED, hdev, &ev, sizeof(ev), sk); } +static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type) +{ + struct hci_conn *conn; + + conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr); + if (!conn) + return false; + + if (conn->dst_type != type) + return false; + + if (conn->state != BT_CONNECTED) + return false; + + return true; +} + +/* This function requires the caller holds hdev->lock */ +static struct hci_conn_params *hci_conn_params_set(struct hci_dev *hdev, + bdaddr_t *addr, u8 addr_type, + u8 auto_connect) +{ + struct hci_conn_params *params; + + params = hci_conn_params_add(hdev, addr, addr_type); + if (!params) + return NULL; + + if (params->auto_connect == auto_connect) + return params; + + hci_pend_le_list_del_init(params); + + switch (auto_connect) { + case HCI_AUTO_CONN_DISABLED: + case HCI_AUTO_CONN_LINK_LOSS: + /* If auto connect is being disabled when we're trying to + * connect to device, keep connecting. 
+ */ + if (params->explicit_connect) + hci_pend_le_list_add(params, &hdev->pend_le_conns); + break; + case HCI_AUTO_CONN_REPORT: + if (params->explicit_connect) + hci_pend_le_list_add(params, &hdev->pend_le_conns); + else + hci_pend_le_list_add(params, &hdev->pend_le_reports); + break; + case HCI_AUTO_CONN_DIRECT: + case HCI_AUTO_CONN_ALWAYS: + if (!is_connected(hdev, addr, addr_type)) + hci_pend_le_list_add(params, &hdev->pend_le_conns); + break; + } + + params->auto_connect = auto_connect; + + bt_dev_dbg(hdev, "addr %pMR (type %u) auto_connect %u", + addr, addr_type, auto_connect); + + return params; +} + static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { @@ -5078,9 +5228,16 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, le_addr_type(cp->addr.type)); if (!params) { - bt_dev_warn(hdev, "No such LE device %pMR (0x%x)", - &cp->addr.bdaddr, le_addr_type(cp->addr.type)); - goto unlock; + /* Create a new hci_conn_params if it doesn't exist */ + params = hci_conn_params_set(hdev, &cp->addr.bdaddr, + le_addr_type(cp->addr.type), + HCI_AUTO_CONN_DISABLED); + if (!params) { + bt_dev_warn(hdev, "No such LE device %pMR (0x%x)", + &cp->addr.bdaddr, + le_addr_type(cp->addr.type)); + goto unlock; + } } supported_flags = hdev->conn_flags; @@ -5186,7 +5343,17 @@ static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, { struct mgmt_rp_add_adv_patterns_monitor rp; struct mgmt_pending_cmd *cmd = data; - struct adv_monitor *monitor = cmd->user_data; + struct adv_monitor *monitor; + + /* This is likely the result of hdev being closed and mgmt_index_removed + * is attempting to clean up any pending command so + * hci_adv_monitors_clear is about to be called which will take care of + * freeing the adv_monitor instances. 
+ */ + if (status == -ECANCELED && !mgmt_pending_valid(hdev, cmd)) + return; + + monitor = cmd->user_data; hci_dev_lock(hdev); @@ -5212,9 +5379,20 @@ static void mgmt_add_adv_patterns_monitor_complete(struct hci_dev *hdev, static int mgmt_add_adv_patterns_monitor_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct adv_monitor *monitor = cmd->user_data; + struct adv_monitor *mon; + + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } - return hci_add_adv_monitor(hdev, monitor); + mon = cmd->user_data; + + mutex_unlock(&hdev->mgmt_pending_lock); + + return hci_add_adv_monitor(hdev, mon); } static int __add_adv_patterns_monitor(struct sock *sk, struct hci_dev *hdev, @@ -5299,9 +5477,9 @@ static u8 parse_adv_monitor_pattern(struct adv_monitor *m, u8 pattern_count, for (i = 0; i < pattern_count; i++) { offset = patterns[i].offset; length = patterns[i].length; - if (offset >= HCI_MAX_EXT_AD_LENGTH || - length > HCI_MAX_EXT_AD_LENGTH || - (offset + length) > HCI_MAX_EXT_AD_LENGTH) + if (offset >= HCI_MAX_AD_LENGTH || + length > HCI_MAX_AD_LENGTH || + (offset + length) > HCI_MAX_AD_LENGTH) return MGMT_STATUS_INVALID_PARAMS; p = kmalloc(sizeof(*p), GFP_KERNEL); @@ -5481,7 +5659,8 @@ unlock: status); } -static void read_local_oob_data_complete(struct hci_dev *hdev, void *data, int err) +static void read_local_oob_data_complete(struct hci_dev *hdev, void *data, + int err) { struct mgmt_rp_read_local_oob_data mgmt_rp; size_t rp_size = sizeof(mgmt_rp); @@ -5501,7 +5680,8 @@ static void read_local_oob_data_complete(struct hci_dev *hdev, void *data, int e bt_dev_dbg(hdev, "status %d", status); if (status) { - mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, status); + mgmt_cmd_status(cmd->sk, hdev->id, MGMT_OP_READ_LOCAL_OOB_DATA, + status); goto remove; } @@ -5783,17 +5963,12 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); - if (err == -ECANCELED) - return; - - if (cmd != pending_find(MGMT_OP_START_DISCOVERY, hdev) && - cmd != pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev) && - cmd != pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); hci_discovery_set_state(hdev, err ? 
DISCOVERY_STOPPED: DISCOVERY_FINDING); @@ -5801,6 +5976,9 @@ static void start_discovery_complete(struct hci_dev *hdev, void *data, int err) static int start_discovery_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + return hci_start_discovery_sync(hdev); } @@ -6006,15 +6184,14 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) { struct mgmt_pending_cmd *cmd = data; - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_STOP_DISCOVERY, hdev)) + if (err == -ECANCELED || !mgmt_pending_valid(hdev, cmd)) return; bt_dev_dbg(hdev, "err %d", err); mgmt_cmd_complete(cmd->sk, cmd->hdev->id, cmd->opcode, mgmt_status(err), cmd->param, 1); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); if (!err) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); @@ -6022,6 +6199,9 @@ static void stop_discovery_complete(struct hci_dev *hdev, void *data, int err) static int stop_discovery_sync(struct hci_dev *hdev, void *data) { + if (!mgmt_pending_listed(hdev, data)) + return -ECANCELED; + return hci_stop_discovery_sync(hdev); } @@ -6231,14 +6411,18 @@ static void enable_advertising_instance(struct hci_dev *hdev, int err) static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) { + struct mgmt_pending_cmd *cmd = data; struct cmd_lookup match = { NULL, hdev }; u8 instance; struct adv_info *adv_instance; u8 status = mgmt_status(err); + if (err == -ECANCELED || !mgmt_pending_valid(hdev, data)) + return; + if (status) { - mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true, - cmd_status_rsp, &status); + mgmt_cmd_status(cmd->sk, cmd->hdev->id, cmd->opcode, status); + mgmt_pending_free(cmd); return; } @@ -6247,8 +6431,7 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) else hci_dev_clear_flag(hdev, HCI_ADVERTISING); - mgmt_pending_foreach(MGMT_OP_SET_ADVERTISING, hdev, true, settings_rsp, - &match); + settings_rsp(cmd, &match); new_settings(hdev, match.sk); @@ -6280,10 +6463,23 @@ static void set_advertising_complete(struct hci_dev *hdev, void *data, int err) static int set_adv_sync(struct hci_dev *hdev, void *data) { struct mgmt_pending_cmd *cmd = data; - struct mgmt_mode *cp = cmd->param; - u8 val = !!cp->val; + struct mgmt_mode cp; + u8 val; - if (cp->val == 0x02) + mutex_lock(&hdev->mgmt_pending_lock); + + if (!__mgmt_pending_listed(hdev, cmd)) { + mutex_unlock(&hdev->mgmt_pending_lock); + return -ECANCELED; + } + + memcpy(&cp, cmd->param, sizeof(cp)); + + mutex_unlock(&hdev->mgmt_pending_lock); + + val = !!cp.val; + + if (cp.val == 0x02) hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE); else hci_dev_clear_flag(hdev, HCI_ADVERTISING_CONNECTABLE); @@ -7428,68 +7624,6 @@ unlock: return err; } -static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type) -{ - struct hci_conn *conn; - - conn = hci_conn_hash_lookup_ba(hdev, LE_LINK, addr); - if (!conn) - return false; - - if (conn->dst_type != type) - return false; - - if (conn->state != BT_CONNECTED) - return false; - - return true; -} - -/* This function requires the caller holds hdev->lock */ -static int hci_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, - u8 addr_type, u8 auto_connect) -{ - struct hci_conn_params *params; - - params = hci_conn_params_add(hdev, addr, addr_type); - if (!params) - return -EIO; - - if (params->auto_connect == auto_connect) - return 0; - - hci_pend_le_list_del_init(params); - - switch (auto_connect) { - case HCI_AUTO_CONN_DISABLED: - case HCI_AUTO_CONN_LINK_LOSS: - /* If 
auto connect is being disabled when we're trying to - * connect to device, keep connecting. - */ - if (params->explicit_connect) - hci_pend_le_list_add(params, &hdev->pend_le_conns); - break; - case HCI_AUTO_CONN_REPORT: - if (params->explicit_connect) - hci_pend_le_list_add(params, &hdev->pend_le_conns); - else - hci_pend_le_list_add(params, &hdev->pend_le_reports); - break; - case HCI_AUTO_CONN_DIRECT: - case HCI_AUTO_CONN_ALWAYS: - if (!is_connected(hdev, addr, addr_type)) - hci_pend_le_list_add(params, &hdev->pend_le_conns); - break; - } - - params->auto_connect = auto_connect; - - bt_dev_dbg(hdev, "addr %pMR (type %u) auto_connect %u", - addr, addr_type, auto_connect); - - return 0; -} - static void device_added(struct sock *sk, struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type, u8 action) { @@ -7601,17 +7735,13 @@ static int add_device(struct sock *sk, struct hci_dev *hdev, /* If the connection parameters don't exist for this device, * they will be created and configured with defaults. */ - if (hci_conn_params_set(hdev, &cp->addr.bdaddr, addr_type, - auto_conn) < 0) { + params = hci_conn_params_set(hdev, &cp->addr.bdaddr, addr_type, + auto_conn); + if (!params) { err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE, MGMT_STATUS_FAILED, &cp->addr, sizeof(cp->addr)); goto unlock; - } else { - params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr, - addr_type); - if (params) - current_flags = params->flags; } cmd = mgmt_pending_new(sk, MGMT_OP_ADD_DEVICE, hdev, data, len); @@ -8036,10 +8166,6 @@ static void read_local_oob_ext_data_complete(struct hci_dev *hdev, void *data, u8 status = mgmt_status(err); u16 eir_len; - if (err == -ECANCELED || - cmd != pending_find(MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev)) - return; - if (!status) { if (!skb) status = MGMT_STATUS_FAILED; @@ -8146,7 +8272,7 @@ done: kfree_skb(skb); kfree(mgmt_rp); - mgmt_pending_remove(cmd); + mgmt_pending_free(cmd); } static int read_local_ssp_oob_req(struct hci_dev *hdev, struct sock *sk, @@ -8155,7 +8281,7 @@ static int read_local_ssp_oob_req(struct hci_dev *hdev, struct sock *sk, struct mgmt_pending_cmd *cmd; int err; - cmd = mgmt_pending_add(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev, + cmd = mgmt_pending_new(sk, MGMT_OP_READ_LOCAL_OOB_EXT_DATA, hdev, cp, sizeof(*cp)); if (!cmd) return -ENOMEM; @@ -9387,6 +9513,7 @@ void mgmt_index_removed(struct hci_dev *hdev) cancel_delayed_work_sync(&hdev->discov_off); cancel_delayed_work_sync(&hdev->service_cache); cancel_delayed_work_sync(&hdev->rpa_expired); + cancel_delayed_work_sync(&hdev->mesh_send_done); } void mgmt_power_on(struct hci_dev *hdev, int err) @@ -9705,7 +9832,9 @@ void mgmt_device_disconnected(struct hci_dev *hdev, bdaddr_t *bdaddr, if (!mgmt_connected) return; - if (link_type != ACL_LINK && link_type != LE_LINK) + if (link_type != ACL_LINK && + link_type != LE_LINK && + link_type != BIS_LINK) return; bacpy(&ev.addr.bdaddr, bdaddr); diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c index 6ef701c27da4..c4063d200c0a 100644 --- a/net/bluetooth/mgmt_config.c +++ b/net/bluetooth/mgmt_config.c @@ -13,13 +13,13 @@ #define HDEV_PARAM_U16(_param_name_) \ struct {\ - struct mgmt_tlv entry; \ + struct mgmt_tlv_hdr entry; \ __le16 value; \ } __packed _param_name_ #define HDEV_PARAM_U8(_param_name_) \ struct {\ - struct mgmt_tlv entry; \ + struct mgmt_tlv_hdr entry; \ __u8 value; \ } __packed _param_name_ diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index a88a07da3947..aa7b5585cb26 100644 --- a/net/bluetooth/mgmt_util.c +++ 
b/net/bluetooth/mgmt_util.c @@ -320,6 +320,52 @@ void mgmt_pending_remove(struct mgmt_pending_cmd *cmd) mgmt_pending_free(cmd); } +bool __mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd) +{ + struct mgmt_pending_cmd *tmp; + + lockdep_assert_held(&hdev->mgmt_pending_lock); + + if (!cmd) + return false; + + list_for_each_entry(tmp, &hdev->mgmt_pending, list) { + if (cmd == tmp) + return true; + } + + return false; +} + +bool mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd) +{ + bool listed; + + mutex_lock(&hdev->mgmt_pending_lock); + listed = __mgmt_pending_listed(hdev, cmd); + mutex_unlock(&hdev->mgmt_pending_lock); + + return listed; +} + +bool mgmt_pending_valid(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd) +{ + bool listed; + + if (!cmd) + return false; + + mutex_lock(&hdev->mgmt_pending_lock); + + listed = __mgmt_pending_listed(hdev, cmd); + if (listed) + list_del(&cmd->list); + + mutex_unlock(&hdev->mgmt_pending_lock); + + return listed; +} + void mgmt_mesh_foreach(struct hci_dev *hdev, void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data), void *data, struct sock *sk) diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h index 024e51dd6937..bcba8c9d8952 100644 --- a/net/bluetooth/mgmt_util.h +++ b/net/bluetooth/mgmt_util.h @@ -65,6 +65,9 @@ struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode, void *data, u16 len); void mgmt_pending_free(struct mgmt_pending_cmd *cmd); void mgmt_pending_remove(struct mgmt_pending_cmd *cmd); +bool __mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd); +bool mgmt_pending_listed(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd); +bool mgmt_pending_valid(struct hci_dev *hdev, struct mgmt_pending_cmd *cmd); void mgmt_mesh_foreach(struct hci_dev *hdev, void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data), void *data, struct sock *sk); diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 96250807b32b..57b1dca8141f 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -781,7 +781,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, addr.l2_psm = 0; addr.l2_cid = 0; addr.l2_bdaddr_type = BDADDR_BREDR; - *err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); + *err = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr)); if (*err < 0) goto failed; @@ -808,7 +808,7 @@ static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, addr.l2_psm = cpu_to_le16(L2CAP_PSM_RFCOMM); addr.l2_cid = 0; addr.l2_bdaddr_type = BDADDR_BREDR; - *err = kernel_connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK); + *err = kernel_connect(sock, (struct sockaddr_unsized *)&addr, sizeof(addr), O_NONBLOCK); if (*err == 0 || *err == -EINPROGRESS) return s; @@ -2068,7 +2068,7 @@ static int rfcomm_add_listener(bdaddr_t *ba) addr.l2_psm = cpu_to_le16(L2CAP_PSM_RFCOMM); addr.l2_cid = 0; addr.l2_bdaddr_type = BDADDR_BREDR; - err = kernel_bind(sock, (struct sockaddr *) &addr, sizeof(addr)); + err = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr)); if (err < 0) { BT_ERR("Bind failed %d", err); goto failed; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 913402806fa0..be6639cd6f59 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -324,7 +324,7 @@ static int rfcomm_sock_create(struct net *net, struct socket *sock, return 0; } -static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) 
+static int rfcomm_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_rc sa; struct sock *sk = sock->sk; @@ -371,7 +371,8 @@ done: return err; } -static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) +static int rfcomm_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, + int alen, int flags) { struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; struct sock *sk = sock->sk; diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index 376ce6de84be..b783526ab588 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -643,8 +643,8 @@ static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig) tty_port_tty_hangup(&dev->port, true); dev->modem_status = - ((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) | - ((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) | + ((v24_sig & RFCOMM_V24_RTC) ? TIOCM_DSR : 0) | + ((v24_sig & RFCOMM_V24_RTR) ? TIOCM_CTS : 0) | ((v24_sig & RFCOMM_V24_IC) ? TIOCM_RI : 0) | ((v24_sig & RFCOMM_V24_DV) ? TIOCM_CD : 0); } @@ -1055,10 +1055,14 @@ static void rfcomm_tty_hangup(struct tty_struct *tty) static int rfcomm_tty_tiocmget(struct tty_struct *tty) { struct rfcomm_dev *dev = tty->driver_data; + struct rfcomm_dlc *dlc = dev->dlc; + u8 v24_sig; BT_DBG("tty %p dev %p", tty, dev); - return dev->modem_status; + rfcomm_dlc_get_modem_status(dlc, &v24_sig); + + return (v24_sig & (TIOCM_DTR | TIOCM_RTS)) | dev->modem_status; } static int rfcomm_tty_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) @@ -1071,23 +1075,15 @@ static int rfcomm_tty_tiocmset(struct tty_struct *tty, unsigned int set, unsigne rfcomm_dlc_get_modem_status(dlc, &v24_sig); - if (set & TIOCM_DSR || set & TIOCM_DTR) + if (set & TIOCM_DTR) v24_sig |= RFCOMM_V24_RTC; - if (set & TIOCM_RTS || set & TIOCM_CTS) + if (set & TIOCM_RTS) v24_sig |= RFCOMM_V24_RTR; - if (set & TIOCM_RI) - v24_sig |= RFCOMM_V24_IC; - if (set & TIOCM_CD) - v24_sig |= RFCOMM_V24_DV; - if (clear & TIOCM_DSR || clear & TIOCM_DTR) + if (clear & TIOCM_DTR) v24_sig &= ~RFCOMM_V24_RTC; - if (clear & TIOCM_RTS || clear & TIOCM_CTS) + if (clear & TIOCM_RTS) v24_sig &= ~RFCOMM_V24_RTR; - if (clear & TIOCM_RI) - v24_sig &= ~RFCOMM_V24_IC; - if (clear & TIOCM_CD) - v24_sig &= ~RFCOMM_V24_DV; rfcomm_dlc_set_modem_status(dlc, v24_sig); diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index d382d980fd9a..87ba90336e80 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -498,6 +498,13 @@ static void sco_sock_kill(struct sock *sk) BT_DBG("sk %p state %d", sk, sk->sk_state); + /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */ + if (sco_pi(sk)->conn) { + sco_conn_lock(sco_pi(sk)->conn); + sco_pi(sk)->conn->sk = NULL; + sco_conn_unlock(sco_pi(sk)->conn); + } + /* Kill poor orphan */ bt_sock_unlink(&sco_sk_list, sk); sock_set_flag(sk, SOCK_DEAD); @@ -598,7 +605,7 @@ static int sco_sock_create(struct net *net, struct socket *sock, int protocol, return 0; } -static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, +static int sco_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; @@ -632,7 +639,7 @@ done: return err; } -static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) +static int sco_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sockaddr_sco *sa = (struct 
sockaddr_sco *) addr; struct sock *sk = sock->sk; @@ -1451,22 +1458,39 @@ static void sco_disconn_cfm(struct hci_conn *hcon, __u8 reason) sco_conn_del(hcon, bt_to_errno(reason)); } -void sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) +int sco_recv_scodata(struct hci_dev *hdev, u16 handle, struct sk_buff *skb) { - struct sco_conn *conn = hcon->sco_data; + struct hci_conn *hcon; + struct sco_conn *conn; - if (!conn) - goto drop; + hci_dev_lock(hdev); + + hcon = hci_conn_hash_lookup_handle(hdev, handle); + if (!hcon) { + hci_dev_unlock(hdev); + kfree_skb(skb); + return -ENOENT; + } + + conn = sco_conn_hold_unless_zero(hcon->sco_data); + hcon = NULL; + + hci_dev_unlock(hdev); + + if (!conn) { + kfree_skb(skb); + return -EINVAL; + } BT_DBG("conn %p len %u", conn, skb->len); - if (skb->len) { + if (skb->len) sco_recv_frame(conn, skb); - return; - } + else + kfree_skb(skb); -drop: - kfree_skb(skb); + sco_conn_put(conn); + return 0; } static struct hci_cb sco_cb = { diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index 45512b2ba951..3a1ce04a7a53 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -2136,7 +2136,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) struct smp_chan *smp = chan->data; struct hci_conn *hcon = conn->hcon; u8 *pkax, *pkbx, *na, *nb, confirm_hint; - u32 passkey; + u32 passkey = 0; int err; bt_dev_dbg(hcon->hdev, "conn %p", conn); @@ -2188,24 +2188,6 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb) smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd), smp->prnd); SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK); - - /* Only Just-Works pairing requires extra checks */ - if (smp->method != JUST_WORKS) - goto mackey_and_ltk; - - /* If there already exists long term key in local host, leave - * the decision to user space since the remote device could - * be legitimate or malicious. - */ - if (hci_find_ltk(hcon->hdev, &hcon->dst, hcon->dst_type, - hcon->role)) { - /* Set passkey to 0. The value can be any number since - * it'll be ignored anyway. - */ - passkey = 0; - confirm_hint = 1; - goto confirm; - } } mackey_and_ltk: @@ -2226,11 +2208,12 @@ mackey_and_ltk: if (err) return SMP_UNSPECIFIED; - confirm_hint = 0; - -confirm: - if (smp->method == JUST_WORKS) - confirm_hint = 1; + /* Always require user confirmation for Just-Works pairing to prevent + * impersonation attacks, or in case of a legitimate device that is + * repairing use the confirmation as acknowledgment to proceed with the + * creation of new keys. + */ + confirm_hint = smp->method == JUST_WORKS ? 
1 : 0; err = mgmt_user_confirm_request(hcon->hdev, &hcon->dst, hcon->type, hcon->dst_type, passkey, confirm_hint); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 9728dbd4c66c..655efac6f133 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -29,7 +29,6 @@ #include <trace/events/bpf_test_run.h> struct bpf_test_timer { - enum { NO_PREEMPT, NO_MIGRATE } mode; u32 i; u64 time_start, time_spent; }; @@ -37,12 +36,7 @@ struct bpf_test_timer { static void bpf_test_timer_enter(struct bpf_test_timer *t) __acquires(rcu) { - rcu_read_lock(); - if (t->mode == NO_PREEMPT) - preempt_disable(); - else - migrate_disable(); - + rcu_read_lock_dont_migrate(); t->time_start = ktime_get_ns(); } @@ -50,12 +44,7 @@ static void bpf_test_timer_leave(struct bpf_test_timer *t) __releases(rcu) { t->time_start = 0; - - if (t->mode == NO_PREEMPT) - preempt_enable(); - else - migrate_enable(); - rcu_read_unlock(); + rcu_read_unlock_migrate(); } static bool bpf_test_timer_continue(struct bpf_test_timer *t, int iterations, @@ -374,7 +363,7 @@ static int bpf_test_run_xdp_live(struct bpf_prog *prog, struct xdp_buff *ctx, { struct xdp_test_data xdp = { .batch_size = batch_size }; - struct bpf_test_timer t = { .mode = NO_MIGRATE }; + struct bpf_test_timer t = {}; int ret; if (!repeat) @@ -404,7 +393,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, struct bpf_prog_array_item item = {.prog = prog}; struct bpf_run_ctx *old_ctx; struct bpf_cg_run_ctx run_ctx; - struct bpf_test_timer t = { NO_MIGRATE }; + struct bpf_test_timer t = {}; enum bpf_cgroup_storage_type stype; int ret; @@ -447,7 +436,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, static int bpf_test_finish(const union bpf_attr *kattr, union bpf_attr __user *uattr, const void *data, - struct skb_shared_info *sinfo, u32 size, + struct skb_shared_info *sinfo, u32 size, u32 frag_size, u32 retval, u32 duration) { void __user *data_out = u64_to_user_ptr(kattr->test.data_out); @@ -464,7 +453,7 @@ static int bpf_test_finish(const union bpf_attr *kattr, } if (data_out) { - int len = sinfo ? copy_size - sinfo->xdp_frags_size : copy_size; + int len = sinfo ? 
copy_size - frag_size : copy_size; if (len < 0) { err = -ENOSPC; @@ -524,27 +513,27 @@ __bpf_kfunc int bpf_fentry_test1(int a) } EXPORT_SYMBOL_GPL(bpf_fentry_test1); -int noinline bpf_fentry_test2(int a, u64 b) +noinline int bpf_fentry_test2(int a, u64 b) { return a + b; } -int noinline bpf_fentry_test3(char a, int b, u64 c) +noinline int bpf_fentry_test3(char a, int b, u64 c) { return a + b + c; } -int noinline bpf_fentry_test4(void *a, char b, int c, u64 d) +noinline int bpf_fentry_test4(void *a, char b, int c, u64 d) { return (long)a + b + c + d; } -int noinline bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) +noinline int bpf_fentry_test5(u64 a, void *b, short c, int d, u64 e) { return a + (long)b + c + d + e; } -int noinline bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) +noinline int bpf_fentry_test6(u64 a, void *b, short c, int d, void *e, u64 f) { return a + (long)b + c + d + (long)e + f; } @@ -553,13 +542,13 @@ struct bpf_fentry_test_t { struct bpf_fentry_test_t *a; }; -int noinline bpf_fentry_test7(struct bpf_fentry_test_t *arg) +noinline int bpf_fentry_test7(struct bpf_fentry_test_t *arg) { - asm volatile ("": "+r"(arg)); + asm volatile ("" : "+r"(arg)); return (long)arg; } -int noinline bpf_fentry_test8(struct bpf_fentry_test_t *arg) +noinline int bpf_fentry_test8(struct bpf_fentry_test_t *arg) { return (long)arg->a; } @@ -569,12 +558,12 @@ __bpf_kfunc u32 bpf_fentry_test9(u32 *a) return *a; } -int noinline bpf_fentry_test10(const void *a) +noinline int bpf_fentry_test10(const void *a) { return (long)a; } -void noinline bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) +noinline void bpf_fentry_test_sinfo(struct skb_shared_info *sinfo) { } @@ -598,7 +587,7 @@ __bpf_kfunc int bpf_modify_return_test_tp(int nonce) return nonce; } -int noinline bpf_fentry_shadow_test(int a) +noinline int bpf_fentry_shadow_test(int a) { return a + 1; } @@ -665,7 +654,7 @@ static void *bpf_test_init(const union bpf_attr *kattr, u32 user_size, void __user *data_in = u64_to_user_ptr(kattr->test.data_in); void *data; - if (user_size < ETH_HLEN || user_size > PAGE_SIZE - headroom - tailroom) + if (user_size > PAGE_SIZE - headroom - tailroom) return ERR_PTR(-EINVAL); size = SKB_DATA_ALIGN(size); @@ -910,6 +899,12 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) /* cb is allowed */ if (!range_is_zero(__skb, offsetofend(struct __sk_buff, cb), + offsetof(struct __sk_buff, data_end))) + return -EINVAL; + + /* data_end is allowed, but not copied to skb */ + + if (!range_is_zero(__skb, offsetofend(struct __sk_buff, data_end), offsetof(struct __sk_buff, tstamp))) return -EINVAL; @@ -950,6 +945,11 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb) if (__skb->gso_segs > GSO_MAX_SEGS) return -EINVAL; + + /* Currently GSO type is zero/unset. If this gets extended with + * a small list of accepted GSO types in future, the filter for + * an unset GSO type in bpf_clone_redirect() can be lifted. 
+ */ skb_shinfo(skb)->gso_segs = __skb->gso_segs; skb_shinfo(skb)->gso_size = __skb->gso_size; skb_shinfo(skb)->hwtstamps.hwtstamp = __skb->hwtstamp; @@ -984,43 +984,39 @@ static struct proto bpf_dummy_proto = { int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { - bool is_l2 = false, is_direct_pkt_access = false; + bool is_l2 = false, is_direct_pkt_access = false, is_lwt = false; + u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); struct net *net = current->nsproxy->net_ns; struct net_device *dev = net->loopback_dev; - u32 size = kattr->test.data_size_in; + u32 headroom = NET_SKB_PAD + NET_IP_ALIGN; + u32 linear_sz = kattr->test.data_size_in; u32 repeat = kattr->test.repeat; struct __sk_buff *ctx = NULL; + struct sk_buff *skb = NULL; + struct sock *sk = NULL; u32 retval, duration; int hh_len = ETH_HLEN; - struct sk_buff *skb; - struct sock *sk; - void *data; + void *data = NULL; int ret; if ((kattr->test.flags & ~BPF_F_TEST_SKB_CHECKSUM_COMPLETE) || kattr->test.cpu || kattr->test.batch_size) return -EINVAL; - data = bpf_test_init(kattr, kattr->test.data_size_in, - size, NET_SKB_PAD + NET_IP_ALIGN, - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); - if (IS_ERR(data)) - return PTR_ERR(data); - - ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); - if (IS_ERR(ctx)) { - kfree(data); - return PTR_ERR(ctx); - } + if (kattr->test.data_size_in < ETH_HLEN) + return -EINVAL; switch (prog->type) { case BPF_PROG_TYPE_SCHED_CLS: case BPF_PROG_TYPE_SCHED_ACT: + is_direct_pkt_access = true; is_l2 = true; - fallthrough; + break; case BPF_PROG_TYPE_LWT_IN: case BPF_PROG_TYPE_LWT_OUT: case BPF_PROG_TYPE_LWT_XMIT: + is_lwt = true; + fallthrough; case BPF_PROG_TYPE_CGROUP_SKB: is_direct_pkt_access = true; break; @@ -1028,25 +1024,88 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, break; } + ctx = bpf_ctx_init(kattr, sizeof(struct __sk_buff)); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + if (ctx) { + if (ctx->data_end > kattr->test.data_size_in || ctx->data || ctx->data_meta) { + ret = -EINVAL; + goto out; + } + if (ctx->data_end) { + /* Non-linear LWT test_run is unsupported for now. 
*/ + if (is_lwt) { + ret = -EINVAL; + goto out; + } + linear_sz = max(ETH_HLEN, ctx->data_end); + } + } + + linear_sz = min_t(u32, linear_sz, PAGE_SIZE - headroom - tailroom); + + data = bpf_test_init(kattr, linear_sz, linear_sz, headroom, tailroom); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + data = NULL; + goto out; + } + sk = sk_alloc(net, AF_UNSPEC, GFP_USER, &bpf_dummy_proto, 1); if (!sk) { - kfree(data); - kfree(ctx); - return -ENOMEM; + ret = -ENOMEM; + goto out; } sock_init_data(NULL, sk); skb = slab_build_skb(data); if (!skb) { - kfree(data); - kfree(ctx); - sk_free(sk); - return -ENOMEM; + ret = -ENOMEM; + goto out; } skb->sk = sk; + data = NULL; /* data released via kfree_skb */ + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); - __skb_put(skb, size); + __skb_put(skb, linear_sz); + + if (unlikely(kattr->test.data_size_in > linear_sz)) { + void __user *data_in = u64_to_user_ptr(kattr->test.data_in); + struct skb_shared_info *sinfo = skb_shinfo(skb); + u32 copied = linear_sz; + + while (copied < kattr->test.data_size_in) { + struct page *page; + u32 data_len; + + if (sinfo->nr_frags == MAX_SKB_FRAGS) { + ret = -ENOMEM; + goto out; + } + + page = alloc_page(GFP_KERNEL); + if (!page) { + ret = -ENOMEM; + goto out; + } + + data_len = min_t(u32, kattr->test.data_size_in - copied, + PAGE_SIZE); + skb_fill_page_desc(skb, sinfo->nr_frags, page, 0, data_len); + + if (copy_from_user(page_address(page), data_in + copied, + data_len)) { + ret = -EFAULT; + goto out; + } + skb->data_len += data_len; + skb->truesize += PAGE_SIZE; + skb->len += data_len; + copied += data_len; + } + } if (ctx && ctx->ifindex > 1) { dev = dev_get_by_index(net, ctx->ifindex); @@ -1126,12 +1185,11 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, convert_skb_to___skb(skb, ctx); - size = skb->len; - /* bpf program can never convert linear skb to non-linear */ - if (WARN_ON_ONCE(skb_is_nonlinear(skb))) - size = skb_headlen(skb); - ret = bpf_test_finish(kattr, uattr, skb->data, NULL, size, retval, - duration); + if (skb_is_nonlinear(skb)) + /* bpf program can never convert linear skb to non-linear */ + WARN_ON_ONCE(linear_sz == kattr->test.data_size_in); + ret = bpf_test_finish(kattr, uattr, skb->data, skb_shinfo(skb), skb->len, + skb->data_len, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, sizeof(struct __sk_buff)); @@ -1139,7 +1197,9 @@ out: if (dev && dev != net->loopback_dev) dev_put(dev); kfree_skb(skb); - sk_free(sk); + kfree(data); + if (sk) + sk_free(sk); kfree(ctx); return ret; } @@ -1207,9 +1267,9 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, { bool do_live = (kattr->test.flags & BPF_F_TEST_XDP_LIVE_FRAMES); u32 tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + u32 retval = 0, meta_sz = 0, duration, max_linear_sz, size; + u32 linear_sz = kattr->test.data_size_in; u32 batch_size = kattr->test.batch_size; - u32 retval = 0, duration, max_data_sz; - u32 size = kattr->test.data_size_in; u32 headroom = XDP_PACKET_HEADROOM; u32 repeat = kattr->test.repeat; struct netdev_rx_queue *rxqueue; @@ -1246,39 +1306,45 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, if (ctx) { /* There can't be user provided data before the meta data */ - if (ctx->data_meta || ctx->data_end != size || + if (ctx->data_meta || ctx->data_end > kattr->test.data_size_in || ctx->data > ctx->data_end || unlikely(xdp_metalen_invalid(ctx->data)) || (do_live && (kattr->test.data_out || kattr->test.ctx_out))) goto 
free_ctx; /* Meta data is allocated from the headroom */ headroom -= ctx->data; - } - max_data_sz = PAGE_SIZE - headroom - tailroom; - if (size > max_data_sz) { - /* disallow live data mode for jumbo frames */ - if (do_live) - goto free_ctx; - size = max_data_sz; + meta_sz = ctx->data; + linear_sz = ctx->data_end; } - data = bpf_test_init(kattr, size, max_data_sz, headroom, tailroom); + max_linear_sz = PAGE_SIZE - headroom - tailroom; + linear_sz = min_t(u32, linear_sz, max_linear_sz); + + /* disallow live data mode for jumbo frames */ + if (do_live && kattr->test.data_size_in > linear_sz) + goto free_ctx; + + if (kattr->test.data_size_in - meta_sz < ETH_HLEN) + goto free_ctx; + + data = bpf_test_init(kattr, linear_sz, max_linear_sz, headroom, tailroom); if (IS_ERR(data)) { ret = PTR_ERR(data); goto free_ctx; } rxqueue = __netif_get_rx_queue(current->nsproxy->net_ns->loopback_dev, 0); - rxqueue->xdp_rxq.frag_size = headroom + max_data_sz + tailroom; + rxqueue->xdp_rxq.frag_size = PAGE_SIZE; xdp_init_buff(&xdp, rxqueue->xdp_rxq.frag_size, &rxqueue->xdp_rxq); - xdp_prepare_buff(&xdp, data, headroom, size, true); + xdp_prepare_buff(&xdp, data, headroom, linear_sz, true); sinfo = xdp_get_shared_info_from_buff(&xdp); ret = xdp_convert_md_to_buff(ctx, &xdp); if (ret) goto free_data; + size = linear_sz; if (unlikely(kattr->test.data_size_in > size)) { void __user *data_in = u64_to_user_ptr(kattr->test.data_in); @@ -1331,7 +1397,7 @@ int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, goto out; size = xdp.data_end - xdp.data_meta + sinfo->xdp_frags_size; - ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size, + ret = bpf_test_finish(kattr, uattr, xdp.data_meta, sinfo, size, sinfo->xdp_frags_size, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, ctx, @@ -1368,7 +1434,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { - struct bpf_test_timer t = { NO_PREEMPT }; + struct bpf_test_timer t = {}; u32 size = kattr->test.data_size_in; struct bpf_flow_dissector ctx = {}; u32 repeat = kattr->test.repeat; @@ -1422,7 +1488,7 @@ int bpf_prog_test_run_flow_dissector(struct bpf_prog *prog, goto out; ret = bpf_test_finish(kattr, uattr, &flow_keys, NULL, - sizeof(flow_keys), retval, duration); + sizeof(flow_keys), 0, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(struct bpf_flow_keys)); @@ -1436,7 +1502,7 @@ out: int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr) { - struct bpf_test_timer t = { NO_PREEMPT }; + struct bpf_test_timer t = {}; struct bpf_prog_array *progs = NULL; struct bpf_sk_lookup_kern ctx = {}; u32 repeat = kattr->test.repeat; @@ -1523,7 +1589,7 @@ int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog, const union bpf_attr *kat user_ctx->cookie = sock_gen_cookie(ctx.selected_sk); } - ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration); + ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, 0, retval, duration); if (!ret) ret = bpf_ctx_finish(kattr, uattr, user_ctx, sizeof(*user_ctx)); @@ -1723,7 +1789,7 @@ int bpf_prog_test_run_nf(struct bpf_prog *prog, if (ret) goto out; - ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, retval, duration); + ret = bpf_test_finish(kattr, uattr, NULL, NULL, 0, 0, retval, duration); out: kfree(user_ctx); diff --git a/net/bridge/br.c b/net/bridge/br.c index 1885d0c315f0..c37e52e2f29a 100644 --- a/net/bridge/br.c +++ 
b/net/bridge/br.c @@ -37,6 +37,11 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v int err; if (netif_is_bridge_master(dev)) { + struct net_bridge *br = netdev_priv(dev); + + if (event == NETDEV_REGISTER) + br_fdb_change_mac_address(br, dev->dev_addr); + err = br_vlan_bridge_event(dev, event, ptr); if (err) return notifier_from_errno(err); @@ -259,6 +264,23 @@ static struct notifier_block br_switchdev_blocking_notifier = { .notifier_call = br_switchdev_blocking_event, }; +static int +br_toggle_fdb_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + int err; + + if (br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0) == on) + return 0; + + err = br_fdb_toggle_local_vlan_0(br, on, extack); + if (err) + return err; + + br_opt_toggle(br, BROPT_FDB_LOCAL_VLAN_0, on); + return 0; +} + /* br_boolopt_toggle - change user-controlled boolean option * * @br: bridge device @@ -287,6 +309,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: br_opt_toggle(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on); break; + case BR_BOOLOPT_FDB_LOCAL_VLAN_0: + err = br_toggle_fdb_local_vlan_0(br, on, extack); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -307,6 +332,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) return br_opt_get(br, BROPT_MST_ENABLED); case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION); + case BR_BOOLOPT_FDB_LOCAL_VLAN_0: + return br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0); default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -324,6 +351,13 @@ int br_boolopt_multi_toggle(struct net_bridge *br, int err = 0; int opt_id; + opt_id = find_next_bit(&bitmap, BITS_PER_LONG, BR_BOOLOPT_MAX); + if (opt_id != BITS_PER_LONG) { + NL_SET_ERR_MSG_FMT_MOD(extack, "Unknown boolean option %d", + opt_id); + return -EINVAL; + } + for_each_set_bit(opt_id, &bitmap, BR_BOOLOPT_MAX) { bool on = !!(bm->optval & BIT(opt_id)); diff --git a/net/bridge/br_cfm.c b/net/bridge/br_cfm.c index a3c755d0a09d..c2c1c7d44c61 100644 --- a/net/bridge/br_cfm.c +++ b/net/bridge/br_cfm.c @@ -134,7 +134,7 @@ static void ccm_rx_timer_start(struct br_cfm_peer_mep *peer_mep) * of the configured CC 'expected_interval' * in order to detect CCM defect after 3.25 interval. 
*/ - queue_delayed_work(system_wq, &peer_mep->ccm_rx_dwork, + queue_delayed_work(system_percpu_wq, &peer_mep->ccm_rx_dwork, usecs_to_jiffies(interval_us / 4)); } @@ -285,7 +285,7 @@ static void ccm_tx_work_expired(struct work_struct *work) ccm_frame_tx(skb); interval_us = interval_to_us(mep->cc_config.exp_interval); - queue_delayed_work(system_wq, &mep->ccm_tx_dwork, + queue_delayed_work(system_percpu_wq, &mep->ccm_tx_dwork, usecs_to_jiffies(interval_us)); } @@ -809,7 +809,7 @@ int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance, * to send first frame immediately */ mep->ccm_tx_end = jiffies + usecs_to_jiffies(tx_info->period * 1000000); - queue_delayed_work(system_wq, &mep->ccm_tx_dwork, 0); + queue_delayed_work(system_percpu_wq, &mep->ccm_tx_dwork, 0); save: mep->cc_ccm_tx_info = *tx_info; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 902694c0ce64..58d22e2b85fc 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -459,6 +459,9 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) struct net_bridge_fdb_entry *f; struct net_bridge *br = p->br; struct net_bridge_vlan *v; + bool local_vlan_0; + + local_vlan_0 = br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0); spin_lock_bh(&br->hash_lock); vg = nbp_vlan_group(p); @@ -468,11 +471,11 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) /* delete old one */ fdb_delete_local(br, p, f); - /* if this port has no vlan information - * configured, we can safely be done at - * this point. + /* if this port has no vlan information configured, or + * local entries are only kept on VLAN 0, we can safely + * be done at this point. */ - if (!vg || !vg->num_vlans) + if (!vg || !vg->num_vlans || local_vlan_0) goto insert; } } @@ -481,7 +484,7 @@ insert: /* insert new address, may fail if invalid address or dup. */ fdb_add_local(br, p, newaddr, 0); - if (!vg || !vg->num_vlans) + if (!vg || !vg->num_vlans || local_vlan_0) goto done; /* Now add entries for every VLAN configured on the port. @@ -500,6 +503,9 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) struct net_bridge_vlan_group *vg; struct net_bridge_fdb_entry *f; struct net_bridge_vlan *v; + bool local_vlan_0; + + local_vlan_0 = br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0); spin_lock_bh(&br->hash_lock); @@ -511,7 +517,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) fdb_add_local(br, NULL, newaddr, 0); vg = br_vlan_group(br); - if (!vg || !vg->num_vlans) + if (!vg || !vg->num_vlans || local_vlan_0) goto out; /* Now remove and add entries for every VLAN configured on the * bridge. 
This function runs under RTNL so the bitmap will not @@ -576,6 +582,102 @@ void br_fdb_cleanup(struct work_struct *work) mod_delayed_work(system_long_wq, &br->gc_work, work_delay); } +static void br_fdb_delete_locals_per_vlan_port(struct net_bridge *br, + struct net_bridge_port *p) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_device *dev; + + if (p) { + vg = nbp_vlan_group(p); + dev = p->dev; + } else { + vg = br_vlan_group(br); + dev = br->dev; + } + + list_for_each_entry(v, &vg->vlan_list, vlist) + br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); +} + +static void br_fdb_delete_locals_per_vlan(struct net_bridge *br) +{ + struct net_bridge_port *p; + + ASSERT_RTNL(); + + list_for_each_entry(p, &br->port_list, list) + br_fdb_delete_locals_per_vlan_port(br, p); + + br_fdb_delete_locals_per_vlan_port(br, NULL); +} + +static int br_fdb_insert_locals_per_vlan_port(struct net_bridge *br, + struct net_bridge_port *p, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_device *dev; + int err; + + if (p) { + vg = nbp_vlan_group(p); + dev = p->dev; + } else { + vg = br_vlan_group(br); + dev = br->dev; + } + + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; + + err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); + if (err) + return err; + } + + return 0; +} + +static int br_fdb_insert_locals_per_vlan(struct net_bridge *br, + struct netlink_ext_ack *extack) +{ + struct net_bridge_port *p; + int err; + + ASSERT_RTNL(); + + list_for_each_entry(p, &br->port_list, list) { + err = br_fdb_insert_locals_per_vlan_port(br, p, extack); + if (err) + goto rollback; + } + + err = br_fdb_insert_locals_per_vlan_port(br, NULL, extack); + if (err) + goto rollback; + + return 0; + +rollback: + NL_SET_ERR_MSG_MOD(extack, "fdb_local_vlan_0 toggle: FDB entry insertion failed"); + br_fdb_delete_locals_per_vlan(br); + return err; +} + +int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + if (!on) + return br_fdb_insert_locals_per_vlan(br, extack); + + br_fdb_delete_locals_per_vlan(br); + return 0; +} + static bool __fdb_flush_matches(const struct net_bridge *br, const struct net_bridge_fdb_entry *f, const struct net_bridge_fdb_flush_desc *desc) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 29097e984b4f..dea09096ad0f 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -25,7 +25,7 @@ static inline int should_deliver(const struct net_bridge_port *p, vg = nbp_vlan_group_rcu(p); return ((p->flags & BR_HAIRPIN_MODE) || skb->dev != p->dev) && - (br_mst_is_enabled(p->br) || p->state == BR_STATE_FORWARDING) && + (br_mst_is_enabled(p) || p->state == BR_STATE_FORWARDING) && br_allowed_egress(vg, skb) && nbp_switchdev_allowed_egress(p, skb) && !br_skb_isolated(p, skb); } @@ -148,7 +148,8 @@ void br_forward(const struct net_bridge_port *to, goto out; /* redirect to backup link if the destination port is down */ - if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) { + if (rcu_access_pointer(to->backup_port) && + (!netif_carrier_ok(to->dev) || !netif_running(to->dev))) { struct net_bridge_port *backup_port; backup_port = rcu_dereference(to->backup_port); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 98c5b9c3145f..4c67a32745f6 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -386,6 +386,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head) 
del_nbp(p); } + br_mst_uninit(br); br_recalculate_neigh_suppress_enabled(br); br_fdb_delete_by_port(br, NULL, 0, 1); @@ -525,20 +526,6 @@ void br_mtu_auto_adjust(struct net_bridge *br) br_opt_toggle(br, BROPT_MTU_SET_BY_USER, false); } -static void br_set_gso_limits(struct net_bridge *br) -{ - unsigned int tso_max_size = TSO_MAX_SIZE; - const struct net_bridge_port *p; - u16 tso_max_segs = TSO_MAX_SEGS; - - list_for_each_entry(p, &br->port_list, list) { - tso_max_size = min(tso_max_size, p->dev->tso_max_size); - tso_max_segs = min(tso_max_segs, p->dev->tso_max_segs); - } - netif_set_tso_max_size(br->dev, tso_max_size); - netif_set_tso_max_segs(br->dev, tso_max_segs); -} - /* * Recomputes features using slave's features */ @@ -652,8 +639,6 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, netdev_err(dev, "failed to sync bridge static fdb addresses to this port\n"); } - netdev_update_features(br->dev); - br_hr = br->dev->needed_headroom; dev_hr = netdev_get_fwd_headroom(dev); if (br_hr < dev_hr) @@ -694,7 +679,8 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); br_mtu_auto_adjust(br); - br_set_gso_limits(br); + + netdev_compute_master_upper_features(br->dev, false); kobject_uevent(&p->kobj, KOBJ_ADD); @@ -740,7 +726,6 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) del_nbp(p); br_mtu_auto_adjust(br); - br_set_gso_limits(br); spin_lock_bh(&br->lock); changed_addr = br_stp_recalculate_bridge_id(br); @@ -749,7 +734,7 @@ int br_del_if(struct net_bridge *br, struct net_device *dev) if (changed_addr) call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev); - netdev_update_features(br->dev); + netdev_compute_master_upper_features(br->dev, false); return 0; } diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 5f6ac9bf1527..777fa869c1a1 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -94,7 +94,7 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb br = p->br; - if (br_mst_is_enabled(br)) { + if (br_mst_is_enabled(p)) { state = BR_STATE_FORWARDING; } else { if (p->state == BR_STATE_DISABLED) { @@ -202,6 +202,14 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb break; case BR_PKT_UNICAST: dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid); + if (unlikely(!dst && vid && + br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0))) { + dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, 0); + if (dst && + (!test_bit(BR_FDB_LOCAL, &dst->flags) || + test_bit(BR_FDB_ADDED_BY_USER, &dst->flags))) + dst = NULL; + } break; default: break; @@ -421,7 +429,7 @@ static rx_handler_result_t br_handle_frame(struct sk_buff **pskb) return RX_HANDLER_PASS; forward: - if (br_mst_is_enabled(p->br)) + if (br_mst_is_enabled(p)) goto defer_stp_filtering; switch (p->state) { diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index fd2de35ffb3c..3c36fa24bc05 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -341,7 +341,7 @@ static void br_mrp_test_work_expired(struct work_struct *work) out: rcu_read_unlock(); - queue_delayed_work(system_wq, &mrp->test_work, + queue_delayed_work(system_percpu_wq, &mrp->test_work, usecs_to_jiffies(mrp->test_interval)); } @@ -418,7 +418,7 @@ static void br_mrp_in_test_work_expired(struct work_struct *work) out: rcu_read_unlock(); - queue_delayed_work(system_wq, &mrp->in_test_work, + queue_delayed_work(system_percpu_wq, &mrp->in_test_work, usecs_to_jiffies(mrp->in_test_interval)); } @@ -725,7 +725,7 @@ int 
br_mrp_start_test(struct net_bridge *br, mrp->test_max_miss = test->max_miss; mrp->test_monitor = test->monitor; mrp->test_count_miss = 0; - queue_delayed_work(system_wq, &mrp->test_work, + queue_delayed_work(system_percpu_wq, &mrp->test_work, usecs_to_jiffies(test->interval)); return 0; @@ -865,7 +865,7 @@ int br_mrp_start_in_test(struct net_bridge *br, mrp->in_test_end = jiffies + usecs_to_jiffies(in_test->period); mrp->in_test_max_miss = in_test->max_miss; mrp->in_test_count_miss = 0; - queue_delayed_work(system_wq, &mrp->in_test_work, + queue_delayed_work(system_percpu_wq, &mrp->in_test_work, usecs_to_jiffies(in_test->interval)); return 0; diff --git a/net/bridge/br_mst.c b/net/bridge/br_mst.c index 3f24b4ee49c2..43a300ae6bfa 100644 --- a/net/bridge/br_mst.c +++ b/net/bridge/br_mst.c @@ -22,6 +22,12 @@ bool br_mst_enabled(const struct net_device *dev) } EXPORT_SYMBOL_GPL(br_mst_enabled); +void br_mst_uninit(struct net_bridge *br) +{ + if (br_opt_get(br, BROPT_MST_ENABLED)) + static_branch_dec(&br_mst_used); +} + int br_mst_get_info(const struct net_device *dev, u16 msti, unsigned long *vids) { const struct net_bridge_vlan_group *vg; @@ -225,9 +231,9 @@ int br_mst_set_enabled(struct net_bridge *br, bool on, return err; if (on) - static_branch_enable(&br_mst_used); + static_branch_inc(&br_mst_used); else - static_branch_disable(&br_mst_used); + static_branch_dec(&br_mst_used); br_opt_toggle(br, BROPT_MST_ENABLED, on); return 0; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 1377f31b719c..d55a4ab87837 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4049,8 +4049,7 @@ int br_multicast_rcv(struct net_bridge_mcast **brmctx, } static void br_multicast_query_expired(struct net_bridge_mcast *brmctx, - struct bridge_mcast_own_query *query, - struct bridge_mcast_querier *querier) + struct bridge_mcast_own_query *query) { spin_lock(&brmctx->br->multicast_lock); if (br_multicast_ctx_vlan_disabled(brmctx)) @@ -4069,8 +4068,7 @@ static void br_ip4_multicast_query_expired(struct timer_list *t) struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, ip4_own_query.timer); - br_multicast_query_expired(brmctx, &brmctx->ip4_own_query, - &brmctx->ip4_querier); + br_multicast_query_expired(brmctx, &brmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) @@ -4079,8 +4077,7 @@ static void br_ip6_multicast_query_expired(struct timer_list *t) struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, ip6_own_query.timer); - br_multicast_query_expired(brmctx, &brmctx->ip6_own_query, - &brmctx->ip6_querier); + br_multicast_query_expired(brmctx, &brmctx->ip6_own_query); } #endif @@ -4652,6 +4649,14 @@ static void br_multicast_start_querier(struct net_bridge_mcast *brmctx, rcu_read_unlock(); } +static void br_multicast_del_grps(struct net_bridge *br) +{ + struct net_bridge_port *port; + + list_for_each_entry(port, &br->port_list, list) + __br_multicast_disable_port_ctx(&port->multicast_ctx); +} + int br_multicast_toggle(struct net_bridge *br, unsigned long val, struct netlink_ext_ack *extack) { @@ -4672,6 +4677,7 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val, br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val); if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) { change_snoopers = true; + br_multicast_del_grps(br); goto unlock; } @@ -4818,6 +4824,14 @@ void br_multicast_set_query_intvl(struct net_bridge_mcast *brmctx, intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MIN; } + if (intvl_jiffies > BR_MULTICAST_QUERY_INTVL_MAX) { + br_info(brmctx->br, 
+ "trying to set multicast query interval above maximum, setting to %lu (%ums)\n", + jiffies_to_clock_t(BR_MULTICAST_QUERY_INTVL_MAX), + jiffies_to_msecs(BR_MULTICAST_QUERY_INTVL_MAX)); + intvl_jiffies = BR_MULTICAST_QUERY_INTVL_MAX; + } + brmctx->multicast_query_interval = intvl_jiffies; } @@ -4834,6 +4848,14 @@ void br_multicast_set_startup_query_intvl(struct net_bridge_mcast *brmctx, intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MIN; } + if (intvl_jiffies > BR_MULTICAST_STARTUP_QUERY_INTVL_MAX) { + br_info(brmctx->br, + "trying to set multicast startup query interval above maximum, setting to %lu (%ums)\n", + jiffies_to_clock_t(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX), + jiffies_to_msecs(BR_MULTICAST_STARTUP_QUERY_INTVL_MAX)); + intvl_jiffies = BR_MULTICAST_STARTUP_QUERY_INTVL_MAX; + } + brmctx->multicast_startup_query_interval = intvl_jiffies; } diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 94cbe967d1c1..083e2fe96441 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -626,9 +626,6 @@ static unsigned int br_nf_local_in(void *priv, break; } - ct = container_of(nfct, struct nf_conn, ct_general); - WARN_ON_ONCE(!nf_ct_is_confirmed(ct)); - return ret; } #endif diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 4e2d53b27221..0264730938f4 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -467,7 +467,7 @@ static int br_fill_ifinfo(struct sk_buff *skb, else br = netdev_priv(dev); - br_debug(br, "br_fill_info event %d port %s master %s\n", + br_debug(br, "br_fill_ifinfo event %d port %s master %s\n", event, dev->name, br->dev->name); nlh = nlmsg_put(skb, pid, seq, event, sizeof(*hdr), flags); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index b159aae594c0..7280c4e9305f 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -31,6 +31,8 @@ #define BR_MULTICAST_DEFAULT_HASH_MAX 4096 #define BR_MULTICAST_QUERY_INTVL_MIN msecs_to_jiffies(1000) #define BR_MULTICAST_STARTUP_QUERY_INTVL_MIN BR_MULTICAST_QUERY_INTVL_MIN +#define BR_MULTICAST_QUERY_INTVL_MAX msecs_to_jiffies(86400000) /* 24 hours */ +#define BR_MULTICAST_STARTUP_QUERY_INTVL_MAX BR_MULTICAST_QUERY_INTVL_MAX #define BR_HWDOM_MAX BITS_PER_LONG @@ -485,6 +487,7 @@ enum net_bridge_opts { BROPT_MCAST_VLAN_SNOOPING_ENABLED, BROPT_MST_ENABLED, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, + BROPT_FDB_LOCAL_VLAN_0, }; struct net_bridge { @@ -841,6 +844,8 @@ void br_fdb_find_delete_local(struct net_bridge *br, void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); void br_fdb_cleanup(struct work_struct *work); +int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack); void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, u16 vid, int do_all); struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, @@ -1930,10 +1935,12 @@ static inline bool br_vlan_state_allowed(u8 state, bool learn_allow) /* br_mst.c */ #ifdef CONFIG_BRIDGE_VLAN_FILTERING DECLARE_STATIC_KEY_FALSE(br_mst_used); -static inline bool br_mst_is_enabled(struct net_bridge *br) +static inline bool br_mst_is_enabled(const struct net_bridge_port *p) { + /* check the port's vlan group to avoid racing with port deletion */ return static_branch_unlikely(&br_mst_used) && - br_opt_get(br, BROPT_MST_ENABLED); + br_opt_get(p->br, BROPT_MST_ENABLED) && + rcu_access_pointer(p->vlgrp); } 
int br_mst_set_state(struct net_bridge_port *p, u16 msti, u8 state, @@ -1947,8 +1954,9 @@ int br_mst_fill_info(struct sk_buff *skb, const struct net_bridge_vlan_group *vg); int br_mst_process(struct net_bridge_port *p, const struct nlattr *mst_attr, struct netlink_ext_ack *extack); +void br_mst_uninit(struct net_bridge *br); #else -static inline bool br_mst_is_enabled(struct net_bridge *br) +static inline bool br_mst_is_enabled(const struct net_bridge_port *p) { return false; } @@ -1982,6 +1990,10 @@ static inline int br_mst_process(struct net_bridge_port *p, { return -EOPNOTSUPP; } + +static inline void br_mst_uninit(struct net_bridge *br) +{ +} #endif struct nf_br_ops { diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 939a3aa78d5c..ce72b837ff8e 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -331,10 +331,12 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, /* Add the dev mac and count the vlan only if it's usable */ if (br_vlan_should_use(v)) { - err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); - if (err) { - br_err(br, "failed insert local address into bridge forwarding table\n"); - goto out_filt; + if (!br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0)) { + err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); + if (err) { + br_err(br, "failed insert local address into bridge forwarding table\n"); + goto out_filt; + } } vg->num_vlans++; } @@ -1455,7 +1457,7 @@ void br_vlan_fill_forward_path_pvid(struct net_bridge *br, if (!br_opt_get(br, BROPT_VLAN_ENABLED)) return; - vg = br_vlan_group(br); + vg = br_vlan_group_rcu(br); if (idx >= 0 && ctx->vlan[idx].proto == br->vlan_proto) { diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 60f28e4fb5c0..4fd5a6ea26b4 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -43,6 +43,7 @@ config NF_CONNTRACK_BRIDGE config BRIDGE_NF_EBTABLES_LEGACY tristate "Legacy EBTABLES support" depends on BRIDGE && NETFILTER_XTABLES_LEGACY + depends on NETFILTER_XTABLES default n help Legacy ebtables packet/frame classifier. 
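The br_mst.c hunk above moves the br_mst_used jump label from static_branch_enable()/static_branch_disable() to static_branch_inc()/static_branch_dec() and adds br_mst_uninit(), so each bridge that enables MST holds its own reference and the key only turns off once the last MST-enabled bridge is gone. A minimal, self-contained sketch of that refcounted static-key pattern follows; the names (feature_used and the helpers) are illustrative, not bridge or kernel symbols.

/*
 * Illustrative sketch of a refcounted static key, not bridge code.
 * Each instance that enables the feature bumps the key; the patched
 * branch only goes back to "off" when the count drops to zero.
 */
#include <linux/jump_label.h>
#include <linux/types.h>

static DEFINE_STATIC_KEY_FALSE(feature_used);

/* Called per instance; enables may nest across independent instances. */
static void feature_set_enabled(bool on)
{
	if (on)
		static_branch_inc(&feature_used);	/* +1 user */
	else
		static_branch_dec(&feature_used);	/* -1 user, off at zero */
}

/* Hot path: a patched jump, near-free while no instance has it enabled. */
static bool feature_fast_path_enabled(void)
{
	return static_branch_unlikely(&feature_used);
}

With plain enable()/disable(), the first bridge turning MST off would have disabled the branch for every other bridge still using it; inc()/dec() keeps it live until the count reaches zero, and br_mst_uninit() drops the bridge's reference when the device is deleted.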
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 3e67d4aff419..5697e3949a36 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -920,8 +920,8 @@ static int translate_table(struct net *net, const char *name, * if an error occurs */ newinfo->chainstack = - vmalloc(array_size(nr_cpu_ids, - sizeof(*(newinfo->chainstack)))); + vmalloc_array(nr_cpu_ids, + sizeof(*(newinfo->chainstack))); if (!newinfo->chainstack) return -ENOMEM; for_each_possible_cpu(i) { @@ -938,7 +938,7 @@ static int translate_table(struct net *net, const char *name, } } - cl_s = vmalloc(array_size(udc_cnt, sizeof(*cl_s))); + cl_s = vmalloc_array(udc_cnt, sizeof(*cl_s)); if (!cl_s) return -ENOMEM; i = 0; /* the i'th udc */ @@ -1018,8 +1018,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, * the check on the size is done later, when we have the lock */ if (repl->num_counters) { - unsigned long size = repl->num_counters * sizeof(*counterstmp); - counterstmp = vmalloc(size); + counterstmp = vmalloc_array(repl->num_counters, + sizeof(*counterstmp)); if (!counterstmp) return -ENOMEM; } @@ -1386,7 +1386,7 @@ static int do_update_counters(struct net *net, const char *name, if (num_counters == 0) return -EINVAL; - tmp = vmalloc(array_size(num_counters, sizeof(*tmp))); + tmp = vmalloc_array(num_counters, sizeof(*tmp)); if (!tmp) return -ENOMEM; @@ -1526,7 +1526,7 @@ static int copy_counters_to_user(struct ebt_table *t, if (num_counters != nentries) return -EINVAL; - counterstmp = vmalloc(array_size(nentries, sizeof(*counterstmp))); + counterstmp = vmalloc_array(nentries, sizeof(*counterstmp)); if (!counterstmp) return -ENOMEM; diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 5adced1e7d0c..b7af36bbd306 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -59,6 +59,13 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr, nft_reg_store_be16(dest, htons(p_proto)); return; } + case NFT_META_BRI_IIFHWADDR: + br_dev = nft_meta_get_bridge(in); + if (!br_dev) + goto err; + + memcpy(dest, br_dev->dev_addr, ETH_ALEN); + return; default: return nft_meta_get_eval(expr, regs, pkt); } @@ -86,6 +93,9 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, case NFT_META_BRI_IIFVPROTO: len = sizeof(u16); break; + case NFT_META_BRI_IIFHWADDR: + len = ETH_ALEN; + break; default: return nft_meta_get_init(ctx, expr, tb); } @@ -175,6 +185,7 @@ static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx, switch (priv->key) { case NFT_META_BRI_BROUTE: + case NFT_META_BRI_IIFHWADDR: hooks = 1 << NF_BR_PRE_ROUTING; break; default: diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 039dfbd367c9..af218742af5a 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -734,7 +734,7 @@ bad_sol: * o sock->state: holds the SS_* socket state and is updated by connect and * disconnect. 
*/ -static int caif_connect(struct socket *sock, struct sockaddr *uaddr, +static int caif_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index 06b604cf9d58..2aa1e7d46eb2 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -257,9 +257,7 @@ int cfctrl_linkup_request(struct cflayer *layer, cfpkt_add_body(pkt, &tmp16, 2); tmp16 = cpu_to_le16(param->u.utility.fifosize_bufs); cfpkt_add_body(pkt, &tmp16, 2); - memset(utility_name, 0, sizeof(utility_name)); - strscpy(utility_name, param->u.utility.name, - UTILITY_NAME_LENGTH); + strscpy_pad(utility_name, param->u.utility.name); cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH); tmp8 = param->u.utility.paramlen; cfpkt_add_body(pkt, &tmp8, 1); diff --git a/net/can/Kconfig b/net/can/Kconfig index af64a6f76458..e4ccf731a24c 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -5,6 +5,7 @@ menuconfig CAN tristate "CAN bus subsystem support" + select CAN_DEV help Controller Area Network (CAN) is a slow (up to 1Mbit/s) serial communications protocol. Development of the CAN bus started in diff --git a/net/can/af_can.c b/net/can/af_can.c index b2387a46794a..770173d8db42 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -221,7 +221,7 @@ int can_send(struct sk_buff *skb, int loop) } /* Make sure the CAN frame can pass the selected CAN netdevice. */ - if (unlikely(skb->len > skb->dev->mtu)) { + if (unlikely(skb->len > READ_ONCE(skb->dev->mtu))) { err = -EMSGSIZE; goto inval_skb; } diff --git a/net/can/bcm.c b/net/can/bcm.c index 5e690a2377e4..7eba8ae01a5b 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1657,7 +1657,7 @@ static int bcm_release(struct socket *sock) return 0; } -static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, +static int bcm_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; diff --git a/net/can/isotp.c b/net/can/isotp.c index dee1412b3c9c..ce588b85665a 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1246,7 +1246,7 @@ static int isotp_release(struct socket *sock) return 0; } -static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) +static int isotp_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; @@ -1313,7 +1313,7 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) err = -ENODEV; goto out; } - if (dev->mtu < so->ll.mtu) { + if (READ_ONCE(dev->mtu) < so->ll.mtu) { dev_put(dev); err = -EINVAL; goto out; diff --git a/net/can/j1939/bus.c b/net/can/j1939/bus.c index 39844f14eed8..797719cb227e 100644 --- a/net/can/j1939/bus.c +++ b/net/can/j1939/bus.c @@ -290,8 +290,11 @@ int j1939_local_ecu_get(struct j1939_priv *priv, name_t name, u8 sa) if (!ecu) ecu = j1939_ecu_create_locked(priv, name); err = PTR_ERR_OR_ZERO(ecu); - if (err) + if (err) { + if (j1939_address_is_unicast(sa)) + priv->ents[sa].nusers--; goto done; + } ecu->nusers++; /* TODO: do we care if ecu->addr != sa? 
*/ diff --git a/net/can/j1939/j1939-priv.h b/net/can/j1939/j1939-priv.h index 31a93cae5111..81f58924b4ac 100644 --- a/net/can/j1939/j1939-priv.h +++ b/net/can/j1939/j1939-priv.h @@ -212,6 +212,7 @@ void j1939_priv_get(struct j1939_priv *priv); /* notify/alert all j1939 sockets bound to ifindex */ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv); +void j1939_sk_netdev_event_unregister(struct j1939_priv *priv); int j1939_cancel_active_session(struct j1939_priv *priv, struct sock *sk); void j1939_tp_init(struct j1939_priv *priv); diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 7e8a20f2fc42..a93af55df5fd 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -377,6 +377,11 @@ static int j1939_netdev_notify(struct notifier_block *nb, j1939_sk_netdev_event_netdown(priv); j1939_ecu_unmap_all(priv); break; + case NETDEV_UNREGISTER: + j1939_cancel_active_session(priv, NULL); + j1939_sk_netdev_event_netdown(priv); + j1939_sk_netdev_event_unregister(priv); + break; } j1939_priv_put(priv); diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index 3d8b588822f9..6272326dd614 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -440,7 +440,7 @@ static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len) return 0; } -static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) +static int j1939_sk_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); @@ -521,6 +521,9 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); if (ret) { j1939_netdev_stop(priv); + jsk->priv = NULL; + synchronize_rcu(); + j1939_priv_put(priv); goto out_release_sock; } @@ -532,7 +535,7 @@ static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) return ret; } -static int j1939_sk_connect(struct socket *sock, struct sockaddr *uaddr, +static int j1939_sk_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; @@ -1300,6 +1303,55 @@ void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) read_unlock_bh(&priv->j1939_socks_lock); } +void j1939_sk_netdev_event_unregister(struct j1939_priv *priv) +{ + struct sock *sk; + struct j1939_sock *jsk; + bool wait_rcu = false; + +rescan: /* The caller is holding a ref on this "priv" via j1939_priv_get_by_ndev(). */ + read_lock_bh(&priv->j1939_socks_lock); + list_for_each_entry(jsk, &priv->j1939_socks, list) { + /* Skip if j1939_jsk_add() is not called on this socket. */ + if (!(jsk->state & J1939_SOCK_BOUND)) + continue; + sk = &jsk->sk; + sock_hold(sk); + read_unlock_bh(&priv->j1939_socks_lock); + /* Check if j1939_jsk_del() is not yet called on this socket after holding + * socket's lock, for both j1939_sk_bind() and j1939_sk_release() call + * j1939_jsk_del() with socket's lock held. + */ + lock_sock(sk); + if (jsk->state & J1939_SOCK_BOUND) { + /* Neither j1939_sk_bind() nor j1939_sk_release() called j1939_jsk_del(). + * Make this socket no longer bound, by pretending as if j1939_sk_bind() + * dropped old references but did not get new references. + */ + j1939_jsk_del(priv, jsk); + j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); + j1939_netdev_stop(priv); + /* Call j1939_priv_put() now and prevent j1939_sk_sock_destruct() from + * calling the corresponding j1939_priv_put(). 
+ * + * j1939_sk_sock_destruct() is supposed to call j1939_priv_put() after + * an RCU grace period. But since the caller is holding a ref on this + * "priv", we can defer synchronize_rcu() until immediately before + * the caller calls j1939_priv_put(). + */ + j1939_priv_put(priv); + jsk->priv = NULL; + wait_rcu = true; + } + release_sock(sk); + sock_put(sk); + goto rescan; + } + read_unlock_bh(&priv->j1939_socks_lock); + if (wait_rcu) + synchronize_rcu(); +} + static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { diff --git a/net/can/raw.c b/net/can/raw.c index 76b867d21def..be1ef7cf4204 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -75,31 +75,31 @@ MODULE_ALIAS("can-proto-1"); */ struct uniqframe { - int skbcnt; const struct sk_buff *skb; + int skbcnt; unsigned int join_rx_count; }; struct raw_sock { struct sock sk; - int bound; - int ifindex; struct net_device *dev; netdevice_tracker dev_tracker; struct list_head notifier; - int loopback; - int recv_own_msgs; - int fd_frames; - int xl_frames; + int ifindex; + unsigned int bound:1; + unsigned int loopback:1; + unsigned int recv_own_msgs:1; + unsigned int fd_frames:1; + unsigned int xl_frames:1; + unsigned int join_filters:1; struct can_raw_vcid_options raw_vcid_opts; canid_t tx_vcid_shifted; canid_t rx_vcid_shifted; canid_t rx_vcid_mask_shifted; - int join_filters; + can_err_mask_t err_mask; int count; /* number of active filters */ struct can_filter dfilter; /* default/single filter */ struct can_filter *filter; /* pointer to filter(s) */ - can_err_mask_t err_mask; struct uniqframe __percpu *uniq; }; @@ -449,7 +449,7 @@ static int raw_release(struct socket *sock) return 0; } -static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) +static int raw_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; @@ -560,8 +560,8 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, struct can_filter sfilter; /* single filter */ struct net_device *dev = NULL; can_err_mask_t err_mask = 0; - int fd_frames; int count = 0; + int flag; int err = 0; if (level != SOL_CAN_RAW) @@ -682,44 +682,48 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, break; case CAN_RAW_LOOPBACK: - if (optlen != sizeof(ro->loopback)) + if (optlen != sizeof(flag)) return -EINVAL; - if (copy_from_sockptr(&ro->loopback, optval, optlen)) + if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; + ro->loopback = !!flag; break; case CAN_RAW_RECV_OWN_MSGS: - if (optlen != sizeof(ro->recv_own_msgs)) + if (optlen != sizeof(flag)) return -EINVAL; - if (copy_from_sockptr(&ro->recv_own_msgs, optval, optlen)) + if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; + ro->recv_own_msgs = !!flag; break; case CAN_RAW_FD_FRAMES: - if (optlen != sizeof(fd_frames)) + if (optlen != sizeof(flag)) return -EINVAL; - if (copy_from_sockptr(&fd_frames, optval, optlen)) + if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; /* Enabling CAN XL includes CAN FD */ - if (ro->xl_frames && !fd_frames) + if (ro->xl_frames && !flag) return -EINVAL; - ro->fd_frames = fd_frames; + ro->fd_frames = !!flag; break; case CAN_RAW_XL_FRAMES: - if (optlen != sizeof(ro->xl_frames)) + if (optlen != sizeof(flag)) return -EINVAL; - if (copy_from_sockptr(&ro->xl_frames, optval, optlen)) + if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; + ro->xl_frames = !!flag; + /* Enabling CAN XL includes 
CAN FD */ if (ro->xl_frames) ro->fd_frames = ro->xl_frames; @@ -739,12 +743,13 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, break; case CAN_RAW_JOIN_FILTERS: - if (optlen != sizeof(ro->join_filters)) + if (optlen != sizeof(flag)) return -EINVAL; - if (copy_from_sockptr(&ro->join_filters, optval, optlen)) + if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; + ro->join_filters = !!flag; break; default: @@ -758,6 +763,7 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); + int flag; int len; void *val; @@ -806,25 +812,29 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, case CAN_RAW_LOOPBACK: if (len > sizeof(int)) len = sizeof(int); - val = &ro->loopback; + flag = ro->loopback; + val = &flag; break; case CAN_RAW_RECV_OWN_MSGS: if (len > sizeof(int)) len = sizeof(int); - val = &ro->recv_own_msgs; + flag = ro->recv_own_msgs; + val = &flag; break; case CAN_RAW_FD_FRAMES: if (len > sizeof(int)) len = sizeof(int); - val = &ro->fd_frames; + flag = ro->fd_frames; + val = &flag; break; case CAN_RAW_XL_FRAMES: if (len > sizeof(int)) len = sizeof(int); - val = &ro->xl_frames; + flag = ro->xl_frames; + val = &flag; break; case CAN_RAW_XL_VCID_OPTS: { @@ -849,7 +859,8 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, case CAN_RAW_JOIN_FILTERS: if (len > sizeof(int)) len = sizeof(int); - val = &ro->join_filters; + flag = ro->join_filters; + val = &flag; break; default: @@ -881,20 +892,58 @@ static void raw_put_canxl_vcid(struct raw_sock *ro, struct sk_buff *skb) } } -static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu) +static inline bool raw_dev_cc_enabled(struct net_device *dev, + struct can_priv *priv) { - /* Classical CAN -> no checks for flags and device capabilities */ - if (can_is_can_skb(skb)) + /* The CANXL-only mode disables error-signalling on the CAN bus + * which is needed to send CAN CC/FD frames + */ + if (priv) + return !can_dev_in_xl_only_mode(priv); + + /* virtual CAN interfaces always support CAN CC */ + return true; +} + +static inline bool raw_dev_fd_enabled(struct net_device *dev, + struct can_priv *priv) +{ + /* check FD ctrlmode on real CAN interfaces */ + if (priv) + return (priv->ctrlmode & CAN_CTRLMODE_FD); + + /* check MTU for virtual CAN FD interfaces */ + return (READ_ONCE(dev->mtu) >= CANFD_MTU); +} + +static inline bool raw_dev_xl_enabled(struct net_device *dev, + struct can_priv *priv) +{ + /* check XL ctrlmode on real CAN interfaces */ + if (priv) + return (priv->ctrlmode & CAN_CTRLMODE_XL); + + /* check MTU for virtual CAN XL interfaces */ + return can_is_canxl_dev_mtu(READ_ONCE(dev->mtu)); +} + +static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb, + struct net_device *dev) +{ + struct can_priv *priv = safe_candev_priv(dev); + + /* Classical CAN */ + if (can_is_can_skb(skb) && raw_dev_cc_enabled(dev, priv)) return CAN_MTU; - /* CAN FD -> needs to be enabled and a CAN FD or CAN XL device */ + /* CAN FD */ if (ro->fd_frames && can_is_canfd_skb(skb) && - (mtu == CANFD_MTU || can_is_canxl_dev_mtu(mtu))) + raw_dev_fd_enabled(dev, priv)) return CANFD_MTU; - /* CAN XL -> needs to be enabled and a CAN XL device */ + /* CAN XL */ if (ro->xl_frames && can_is_canxl_skb(skb) && - can_is_canxl_dev_mtu(mtu)) + raw_dev_xl_enabled(dev, priv)) return CANXL_MTU; return 0; @@ -950,7 +999,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, 
size_t size) err = -EINVAL; /* check for valid CAN (CC/FD/XL) frame content */ - txmtu = raw_check_txframe(ro, skb, dev->mtu); + txmtu = raw_check_txframe(ro, skb, dev); if (!txmtu) goto free_skb; diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index 0aa21fcbf6ec..ea60e3ef0834 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -6,8 +6,7 @@ config CEPH_LIB select CRYPTO_AES select CRYPTO_CBC select CRYPTO_GCM - select CRYPTO_HMAC - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA256 select CRYPTO select KEYS default n diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index b71b1635916e..a21c157daf7d 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -631,6 +631,7 @@ static int handle_auth_session_key(struct ceph_auth_client *ac, u64 global_id, /* connection secret */ ceph_decode_32_safe(p, end, len, e_inval); + ceph_decode_need(p, end, len, e_inval); dout("%s connection secret blob len %d\n", __func__, len); if (len > 0) { dp = *p + ceph_x_encrypt_offset(); @@ -648,6 +649,7 @@ static int handle_auth_session_key(struct ceph_auth_client *ac, u64 global_id, /* service tickets */ ceph_decode_32_safe(p, end, len, e_inval); + ceph_decode_need(p, end, len, e_inval); dout("%s service tickets blob len %d\n", __func__, len); if (len > 0) { ret = ceph_x_proc_ticket_reply(ac, &th->session_key, diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 4c6441536d55..e734e57be083 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -786,41 +786,52 @@ void ceph_reset_client_addr(struct ceph_client *client) EXPORT_SYMBOL(ceph_reset_client_addr); /* - * true if we have the mon map (and have thus joined the cluster) - */ -static bool have_mon_and_osd_map(struct ceph_client *client) -{ - return client->monc.monmap && client->monc.monmap->epoch && - client->osdc.osdmap && client->osdc.osdmap->epoch; -} - -/* * mount: join the ceph cluster, and open root directory. 
*/ -int __ceph_open_session(struct ceph_client *client, unsigned long started) +int __ceph_open_session(struct ceph_client *client) { - unsigned long timeout = client->options->mount_timeout; - long err; + DEFINE_WAIT_FUNC(wait, woken_wake_function); + long timeout = ceph_timeout_jiffies(client->options->mount_timeout); + bool have_monmap, have_osdmap; + int err; /* open session, and wait for mon and osd maps */ err = ceph_monc_open_session(&client->monc); if (err < 0) return err; - while (!have_mon_and_osd_map(client)) { - if (timeout && time_after_eq(jiffies, started + timeout)) - return -ETIMEDOUT; + add_wait_queue(&client->auth_wq, &wait); + for (;;) { + mutex_lock(&client->monc.mutex); + err = client->auth_err; + have_monmap = client->monc.monmap && client->monc.monmap->epoch; + mutex_unlock(&client->monc.mutex); + + down_read(&client->osdc.lock); + have_osdmap = client->osdc.osdmap && client->osdc.osdmap->epoch; + up_read(&client->osdc.lock); + + if (err || (have_monmap && have_osdmap)) + break; + + if (signal_pending(current)) { + err = -ERESTARTSYS; + break; + } + + if (!timeout) { + err = -ETIMEDOUT; + break; + } /* wait */ dout("mount waiting for mon_map\n"); - err = wait_event_interruptible_timeout(client->auth_wq, - have_mon_and_osd_map(client) || (client->auth_err < 0), - ceph_timeout_jiffies(timeout)); - if (err < 0) - return err; - if (client->auth_err < 0) - return client->auth_err; + timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout); } + remove_wait_queue(&client->auth_wq, &wait); + + if (err) + return err; pr_info("client%llu fsid %pU\n", ceph_client_gid(client), &client->fsid); @@ -833,12 +844,11 @@ EXPORT_SYMBOL(__ceph_open_session); int ceph_open_session(struct ceph_client *client) { int ret; - unsigned long started = jiffies; /* note the start time */ dout("open_session start\n"); mutex_lock(&client->mount_mutex); - ret = __ceph_open_session(client, started); + ret = __ceph_open_session(client); mutex_unlock(&client->mount_mutex); return ret; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index 2110439f8a24..83c270bce63c 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -36,8 +36,9 @@ static int monmap_show(struct seq_file *s, void *p) int i; struct ceph_client *client = s->private; + mutex_lock(&client->monc.mutex); if (client->monc.monmap == NULL) - return 0; + goto out_unlock; seq_printf(s, "epoch %d\n", client->monc.monmap->epoch); for (i = 0; i < client->monc.monmap->num_mon; i++) { @@ -48,6 +49,9 @@ static int monmap_show(struct seq_file *s, void *p) ENTITY_NAME(inst->name), ceph_pr_addr(&inst->addr)); } + +out_unlock: + mutex_unlock(&client->monc.mutex); return 0; } @@ -56,13 +60,14 @@ static int osdmap_show(struct seq_file *s, void *p) int i; struct ceph_client *client = s->private; struct ceph_osd_client *osdc = &client->osdc; - struct ceph_osdmap *map = osdc->osdmap; + struct ceph_osdmap *map; struct rb_node *n; + down_read(&osdc->lock); + map = osdc->osdmap; if (map == NULL) - return 0; + goto out_unlock; - down_read(&osdc->lock); seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch, osdc->epoch_barrier, map->flags); @@ -131,6 +136,7 @@ static int osdmap_show(struct seq_file *s, void *p) seq_printf(s, "]\n"); } +out_unlock: up_read(&osdc->lock); return 0; } diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d1b5705dc0c6..70b25f4ecba6 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -252,7 +252,8 @@ int __init ceph_msgr_init(void) * The number of active work items is limited by the number of * 
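For context, the reworked __ceph_open_session() above switches from wait_event_interruptible_timeout() to an open-coded waiter so the map/err checks can be done under the right locks. Below is a minimal, self-contained sketch of that generic wait_woken pattern; wait_for_cond(), my_wq, cond and arg are illustrative stand-ins (assumptions), not ceph identifiers.

#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/errno.h>

/* Wait until cond(arg) is true, a signal arrives, or the jiffies timeout
 * expires. Mirrors the loop structure used by __ceph_open_session().
 */
static int wait_for_cond(struct wait_queue_head *my_wq,
			 bool (*cond)(void *), void *arg, long timeout)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int err = 0;

	add_wait_queue(my_wq, &wait);
	for (;;) {
		if (cond(arg))		/* re-check the condition each pass */
			break;
		if (signal_pending(current)) {
			err = -ERESTARTSYS;
			break;
		}
		if (!timeout) {
			err = -ETIMEDOUT;
			break;
		}
		/* sleep; wait_woken() returns the remaining timeout */
		timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
	}
	remove_wait_queue(my_wq, &wait);
	return err;
}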
connections, so leave @max_active at default. */ - ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_MEM_RECLAIM, 0); + ceph_msgr_wq = alloc_workqueue("ceph-msgr", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (ceph_msgr_wq) return 0; @@ -459,7 +460,7 @@ int ceph_tcp_connect(struct ceph_connection *con) set_sock_callbacks(sock, con); con_sock_state_connecting(con); - ret = kernel_connect(sock, (struct sockaddr *)&ss, sizeof(ss), + ret = kernel_connect(sock, (struct sockaddr_unsized *)&ss, sizeof(ss), O_NONBLOCK); if (ret == -EINPROGRESS) { dout("connect %s EINPROGRESS sk_state = %u\n", @@ -1524,7 +1525,7 @@ static void con_fault_finish(struct ceph_connection *con) * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. */ - if (con->v1.auth_retry) { + if (!ceph_msgr2(from_msgr(con->msgr)) && con->v1.auth_retry) { dout("auth_retry %d, invalidating\n", con->v1.auth_retry); if (con->ops->invalidate_authorizer) con->ops->invalidate_authorizer(con); @@ -1714,9 +1715,10 @@ static void clear_standby(struct ceph_connection *con) { /* come back from STANDBY? */ if (con->state == CEPH_CON_S_STANDBY) { - dout("clear_standby %p and ++connect_seq\n", con); + dout("clear_standby %p\n", con); con->state = CEPH_CON_S_PREOPEN; - con->v1.connect_seq++; + if (!ceph_msgr2(from_msgr(con->msgr))) + con->v1.connect_seq++; WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_WRITE_PENDING)); WARN_ON(ceph_con_flag_test(con, CEPH_CON_F_KEEPALIVE_PENDING)); } @@ -1792,9 +1794,9 @@ void ceph_msg_revoke(struct ceph_msg *msg) WARN_ON(con->state != CEPH_CON_S_OPEN); dout("%s con %p msg %p was sending\n", __func__, con, msg); if (ceph_msgr2(from_msgr(con->msgr))) - ceph_con_v2_revoke(con); + ceph_con_v2_revoke(con, msg); else - ceph_con_v1_revoke(con); + ceph_con_v1_revoke(con, msg); ceph_msg_put(con->out_msg); con->out_msg = NULL; } else { @@ -2109,11 +2111,13 @@ int ceph_con_in_msg_alloc(struct ceph_connection *con, return ret; } -void ceph_con_get_out_msg(struct ceph_connection *con) +struct ceph_msg *ceph_con_get_out_msg(struct ceph_connection *con) { struct ceph_msg *msg; - BUG_ON(list_empty(&con->out_queue)); + if (list_empty(&con->out_queue)) + return NULL; + msg = list_first_entry(&con->out_queue, struct ceph_msg, list_head); WARN_ON(msg->con != con); @@ -2140,7 +2144,7 @@ void ceph_con_get_out_msg(struct ceph_connection *con) * message or in case of a fault. */ WARN_ON(con->out_msg); - con->out_msg = ceph_msg_get(msg); + return con->out_msg = ceph_msg_get(msg); } /* diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 0cb61c76b9b8..c9e002d96319 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -169,10 +169,9 @@ static void prepare_message_data(struct ceph_msg *msg, u32 data_len) * Prepare footer for currently outgoing message, and finish things * off. Assumes out_kvec* are already valid.. we just add on to the end. */ -static void prepare_write_message_footer(struct ceph_connection *con) +static void prepare_write_message_footer(struct ceph_connection *con, + struct ceph_msg *m) { - struct ceph_msg *m = con->out_msg; - m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; dout("prepare_write_message_footer %p\n", con); @@ -192,9 +191,9 @@ static void prepare_write_message_footer(struct ceph_connection *con) /* * Prepare headers for the next outgoing message. 
*/ -static void prepare_write_message(struct ceph_connection *con) +static void prepare_write_message(struct ceph_connection *con, + struct ceph_msg *m) { - struct ceph_msg *m; u32 crc; con_out_kvec_reset(con); @@ -210,9 +209,6 @@ static void prepare_write_message(struct ceph_connection *con) &con->v1.out_temp_ack); } - ceph_con_get_out_msg(con); - m = con->out_msg; - dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", m, con->out_seq, le16_to_cpu(m->hdr.type), le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), @@ -231,31 +227,31 @@ static void prepare_write_message(struct ceph_connection *con) /* fill in hdr crc and finalize hdr */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); - con->out_msg->hdr.crc = cpu_to_le32(crc); - memcpy(&con->v1.out_hdr, &con->out_msg->hdr, sizeof(con->v1.out_hdr)); + m->hdr.crc = cpu_to_le32(crc); + memcpy(&con->v1.out_hdr, &m->hdr, sizeof(con->v1.out_hdr)); /* fill in front and middle crc, footer */ crc = crc32c(0, m->front.iov_base, m->front.iov_len); - con->out_msg->footer.front_crc = cpu_to_le32(crc); + m->footer.front_crc = cpu_to_le32(crc); if (m->middle) { crc = crc32c(0, m->middle->vec.iov_base, m->middle->vec.iov_len); - con->out_msg->footer.middle_crc = cpu_to_le32(crc); + m->footer.middle_crc = cpu_to_le32(crc); } else - con->out_msg->footer.middle_crc = 0; + m->footer.middle_crc = 0; dout("%s front_crc %u middle_crc %u\n", __func__, - le32_to_cpu(con->out_msg->footer.front_crc), - le32_to_cpu(con->out_msg->footer.middle_crc)); - con->out_msg->footer.flags = 0; + le32_to_cpu(m->footer.front_crc), + le32_to_cpu(m->footer.middle_crc)); + m->footer.flags = 0; /* is there a data payload? */ - con->out_msg->footer.data_crc = 0; + m->footer.data_crc = 0; if (m->data_length) { - prepare_message_data(con->out_msg, m->data_length); + prepare_message_data(m, m->data_length); con->v1.out_more = 1; /* data + footer will follow */ } else { /* no, queue up footer too and be done */ - prepare_write_message_footer(con); + prepare_write_message_footer(con, m); } ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); @@ -462,9 +458,9 @@ out: * 0 -> socket full, but more to do * <0 -> error */ -static int write_partial_message_data(struct ceph_connection *con) +static int write_partial_message_data(struct ceph_connection *con, + struct ceph_msg *msg) { - struct ceph_msg *msg = con->out_msg; struct ceph_msg_data_cursor *cursor = &msg->cursor; bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC); u32 crc; @@ -516,7 +512,7 @@ static int write_partial_message_data(struct ceph_connection *con) else msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC; con_out_kvec_reset(con); - prepare_write_message_footer(con); + prepare_write_message_footer(con, msg); return 1; /* must return > 0 to indicate success */ } @@ -1472,6 +1468,7 @@ bad_tag: */ int ceph_con_v1_try_write(struct ceph_connection *con) { + struct ceph_msg *msg; int ret = 1; dout("try_write start %p state %d\n", con, con->state); @@ -1518,14 +1515,15 @@ more: } /* msg pages? */ - if (con->out_msg) { + msg = con->out_msg; + if (msg) { if (con->v1.out_msg_done) { - ceph_msg_put(con->out_msg); + ceph_msg_put(msg); con->out_msg = NULL; /* we're done with this one */ goto do_next; } - ret = write_partial_message_data(con); + ret = write_partial_message_data(con, msg); if (ret == 1) goto more; /* we need to send the footer, too! */ if (ret == 0) @@ -1545,8 +1543,8 @@ do_next: goto more; } /* is anything else pending? 
*/ - if (!list_empty(&con->out_queue)) { - prepare_write_message(con); + if ((msg = ceph_con_get_out_msg(con)) != NULL) { + prepare_write_message(con, msg); goto more; } if (con->in_seq > con->in_seq_acked) { @@ -1564,10 +1562,8 @@ out: return ret; } -void ceph_con_v1_revoke(struct ceph_connection *con) +void ceph_con_v1_revoke(struct ceph_connection *con, struct ceph_msg *msg) { - struct ceph_msg *msg = con->out_msg; - WARN_ON(con->v1.out_skip); /* footer */ if (con->v1.out_msg_done) { diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 5483b4eed94e..833e57849c1d 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -709,7 +709,7 @@ static int setup_crypto(struct ceph_connection *con, dout("%s con %p con_mode %d session_key_len %d con_secret_len %d\n", __func__, con, con->v2.con_mode, session_key_len, con_secret_len); - WARN_ON(con->v2.hmac_tfm || con->v2.gcm_tfm || con->v2.gcm_req); + WARN_ON(con->v2.hmac_key_set || con->v2.gcm_tfm || con->v2.gcm_req); if (con->v2.con_mode != CEPH_CON_MODE_CRC && con->v2.con_mode != CEPH_CON_MODE_SECURE) { @@ -723,22 +723,8 @@ static int setup_crypto(struct ceph_connection *con, return 0; /* auth_none */ } - noio_flag = memalloc_noio_save(); - con->v2.hmac_tfm = crypto_alloc_shash("hmac(sha256)", 0, 0); - memalloc_noio_restore(noio_flag); - if (IS_ERR(con->v2.hmac_tfm)) { - ret = PTR_ERR(con->v2.hmac_tfm); - con->v2.hmac_tfm = NULL; - pr_err("failed to allocate hmac tfm context: %d\n", ret); - return ret; - } - - ret = crypto_shash_setkey(con->v2.hmac_tfm, session_key, - session_key_len); - if (ret) { - pr_err("failed to set hmac key: %d\n", ret); - return ret; - } + hmac_sha256_preparekey(&con->v2.hmac_key, session_key, session_key_len); + con->v2.hmac_key_set = true; if (con->v2.con_mode == CEPH_CON_MODE_CRC) { WARN_ON(con_secret_len); @@ -793,38 +779,26 @@ static int setup_crypto(struct ceph_connection *con, return 0; /* auth_x, secure mode */ } -static int ceph_hmac_sha256(struct ceph_connection *con, - const struct kvec *kvecs, int kvec_cnt, u8 *hmac) +static void ceph_hmac_sha256(struct ceph_connection *con, + const struct kvec *kvecs, int kvec_cnt, + u8 hmac[SHA256_DIGEST_SIZE]) { - SHASH_DESC_ON_STACK(desc, con->v2.hmac_tfm); /* tfm arg is ignored */ - int ret; + struct hmac_sha256_ctx ctx; int i; - dout("%s con %p hmac_tfm %p kvec_cnt %d\n", __func__, con, - con->v2.hmac_tfm, kvec_cnt); + dout("%s con %p hmac_key_set %d kvec_cnt %d\n", __func__, con, + con->v2.hmac_key_set, kvec_cnt); - if (!con->v2.hmac_tfm) { + if (!con->v2.hmac_key_set) { memset(hmac, 0, SHA256_DIGEST_SIZE); - return 0; /* auth_none */ - } - - desc->tfm = con->v2.hmac_tfm; - ret = crypto_shash_init(desc); - if (ret) - goto out; - - for (i = 0; i < kvec_cnt; i++) { - ret = crypto_shash_update(desc, kvecs[i].iov_base, - kvecs[i].iov_len); - if (ret) - goto out; + return; /* auth_none */ } - ret = crypto_shash_final(desc, hmac); - -out: - shash_desc_zero(desc); - return ret; /* auth_x, both plain and secure modes */ + /* auth_x, both plain and secure modes */ + hmac_sha256_init(&ctx, &con->v2.hmac_key); + for (i = 0; i < kvec_cnt; i++) + hmac_sha256_update(&ctx, kvecs[i].iov_base, kvecs[i].iov_len); + hmac_sha256_final(&ctx, hmac); } static void gcm_inc_nonce(struct ceph_gcm_nonce *nonce) @@ -1087,13 +1061,16 @@ static int decrypt_control_remainder(struct ceph_connection *con) static int process_v2_sparse_read(struct ceph_connection *con, struct page **pages, int spos) { - struct ceph_msg_data_cursor *cursor = &con->v2.in_cursor; + struct 
ceph_msg_data_cursor cursor; int ret; + ceph_msg_data_cursor_init(&cursor, con->in_msg, + con->in_msg->sparse_read_total); + for (;;) { char *buf = NULL; - ret = con->ops->sparse_read(con, cursor, &buf); + ret = con->ops->sparse_read(con, &cursor, &buf); if (ret <= 0) return ret; @@ -1111,11 +1088,11 @@ static int process_v2_sparse_read(struct ceph_connection *con, } else { struct bio_vec bv; - get_bvec_at(cursor, &bv); + get_bvec_at(&cursor, &bv); len = min_t(int, len, bv.bv_len); memcpy_page(bv.bv_page, bv.bv_offset, spage, soff, len); - ceph_msg_data_advance(cursor, len); + ceph_msg_data_advance(&cursor, len); } spos += len; ret -= len; @@ -1455,17 +1432,14 @@ static int prepare_auth_request_more(struct ceph_connection *con, static int prepare_auth_signature(struct ceph_connection *con) { void *buf; - int ret; buf = alloc_conn_buf(con, head_onwire_len(SHA256_DIGEST_SIZE, con_secure(con))); if (!buf) return -ENOMEM; - ret = ceph_hmac_sha256(con, con->v2.in_sign_kvecs, - con->v2.in_sign_kvec_cnt, CTRL_BODY(buf)); - if (ret) - return ret; + ceph_hmac_sha256(con, con->v2.in_sign_kvecs, con->v2.in_sign_kvec_cnt, + CTRL_BODY(buf)); return prepare_control(con, FRAME_TAG_AUTH_SIGNATURE, buf, SHA256_DIGEST_SIZE); @@ -1564,8 +1538,7 @@ static int prepare_keepalive2(struct ceph_connection *con) struct timespec64 now; ktime_get_real_ts64(&now); - dout("%s con %p timestamp %lld.%09ld\n", __func__, con, now.tv_sec, - now.tv_nsec); + dout("%s con %p timestamp %ptSp\n", __func__, con, &now); ceph_encode_timespec64(ts, &now); @@ -1589,10 +1562,11 @@ static int prepare_ack(struct ceph_connection *con) return prepare_control(con, FRAME_TAG_ACK, con->v2.out_buf, 8); } -static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted) +static void prepare_epilogue_plain(struct ceph_connection *con, + struct ceph_msg *msg, bool aborted) { dout("%s con %p msg %p aborted %d crcs %u %u %u\n", __func__, con, - con->out_msg, aborted, con->v2.out_epil.front_crc, + msg, aborted, con->v2.out_epil.front_crc, con->v2.out_epil.middle_crc, con->v2.out_epil.data_crc); encode_epilogue_plain(con, aborted); @@ -1603,10 +1577,9 @@ static void prepare_epilogue_plain(struct ceph_connection *con, bool aborted) * For "used" empty segments, crc is -1. For unused (trailing) * segments, crc is 0. */ -static void prepare_message_plain(struct ceph_connection *con) +static void prepare_message_plain(struct ceph_connection *con, + struct ceph_msg *msg) { - struct ceph_msg *msg = con->out_msg; - prepare_head_plain(con, con->v2.out_buf, sizeof(struct ceph_msg_header2), NULL, 0, false); @@ -1647,7 +1620,7 @@ static void prepare_message_plain(struct ceph_connection *con) con->v2.out_state = OUT_S_QUEUE_DATA; } else { con->v2.out_epil.data_crc = 0; - prepare_epilogue_plain(con, false); + prepare_epilogue_plain(con, msg, false); con->v2.out_state = OUT_S_FINISH_MESSAGE; } } @@ -1659,7 +1632,8 @@ static void prepare_message_plain(struct ceph_connection *con) * allocate pages for the entire tail of the message (currently up * to ~32M) and two sgs arrays (up to ~256K each)... 
*/ -static int prepare_message_secure(struct ceph_connection *con) +static int prepare_message_secure(struct ceph_connection *con, + struct ceph_msg *msg) { void *zerop = page_address(ceph_zero_page); struct sg_table enc_sgt = {}; @@ -1674,7 +1648,7 @@ static int prepare_message_secure(struct ceph_connection *con) if (ret) return ret; - tail_len = tail_onwire_len(con->out_msg, true); + tail_len = tail_onwire_len(msg, true); if (!tail_len) { /* * Empty message: once the head is written, @@ -1685,7 +1659,7 @@ static int prepare_message_secure(struct ceph_connection *con) } encode_epilogue_secure(con, false); - ret = setup_message_sgs(&sgt, con->out_msg, zerop, zerop, zerop, + ret = setup_message_sgs(&sgt, msg, zerop, zerop, zerop, &con->v2.out_epil, NULL, 0, false); if (ret) goto out; @@ -1714,7 +1688,7 @@ static int prepare_message_secure(struct ceph_connection *con) goto out; dout("%s con %p msg %p sg_cnt %d enc_page_cnt %d\n", __func__, con, - con->out_msg, sgt.orig_nents, enc_page_cnt); + msg, sgt.orig_nents, enc_page_cnt); con->v2.out_state = OUT_S_QUEUE_ENC_PAGE; out: @@ -1723,19 +1697,19 @@ out: return ret; } -static int prepare_message(struct ceph_connection *con) +static int prepare_message(struct ceph_connection *con, struct ceph_msg *msg) { int lens[] = { sizeof(struct ceph_msg_header2), - front_len(con->out_msg), - middle_len(con->out_msg), - data_len(con->out_msg) + front_len(msg), + middle_len(msg), + data_len(msg) }; struct ceph_frame_desc desc; int ret; dout("%s con %p msg %p logical %d+%d+%d+%d\n", __func__, con, - con->out_msg, lens[0], lens[1], lens[2], lens[3]); + msg, lens[0], lens[1], lens[2], lens[3]); if (con->in_seq > con->in_seq_acked) { dout("%s con %p in_seq_acked %llu -> %llu\n", __func__, con, @@ -1746,15 +1720,15 @@ static int prepare_message(struct ceph_connection *con) reset_out_kvecs(con); init_frame_desc(&desc, FRAME_TAG_MESSAGE, lens, 4); encode_preamble(&desc, con->v2.out_buf); - fill_header2(CTRL_BODY(con->v2.out_buf), &con->out_msg->hdr, + fill_header2(CTRL_BODY(con->v2.out_buf), &msg->hdr, con->in_seq_acked); if (con_secure(con)) { - ret = prepare_message_secure(con); + ret = prepare_message_secure(con, msg); if (ret) return ret; } else { - prepare_message_plain(con); + prepare_message_plain(con, msg); } ceph_con_flag_set(con, CEPH_CON_F_WRITE_PENDING); @@ -2460,10 +2434,8 @@ static int process_auth_signature(struct ceph_connection *con, return -EINVAL; } - ret = ceph_hmac_sha256(con, con->v2.out_sign_kvecs, - con->v2.out_sign_kvec_cnt, hmac); - if (ret) - return ret; + ceph_hmac_sha256(con, con->v2.out_sign_kvecs, con->v2.out_sign_kvec_cnt, + hmac); ceph_decode_need(&p, end, SHA256_DIGEST_SIZE, bad); if (crypto_memneq(p, hmac, SHA256_DIGEST_SIZE)) { @@ -2759,8 +2731,7 @@ static int process_keepalive2_ack(struct ceph_connection *con, ceph_decode_need(&p, end, sizeof(struct ceph_timespec), bad); ceph_decode_timespec64(&con->last_keepalive_ack, p); - dout("%s con %p timestamp %lld.%09ld\n", __func__, con, - con->last_keepalive_ack.tv_sec, con->last_keepalive_ack.tv_nsec); + dout("%s con %p timestamp %ptSp\n", __func__, con, &con->last_keepalive_ack); return 0; @@ -3184,20 +3155,20 @@ int ceph_con_v2_try_read(struct ceph_connection *con) } } -static void queue_data(struct ceph_connection *con) +static void queue_data(struct ceph_connection *con, struct ceph_msg *msg) { struct bio_vec bv; con->v2.out_epil.data_crc = -1; - ceph_msg_data_cursor_init(&con->v2.out_cursor, con->out_msg, - data_len(con->out_msg)); + ceph_msg_data_cursor_init(&con->v2.out_cursor, 
msg, + data_len(msg)); get_bvec_at(&con->v2.out_cursor, &bv); set_out_bvec(con, &bv, true); con->v2.out_state = OUT_S_QUEUE_DATA_CONT; } -static void queue_data_cont(struct ceph_connection *con) +static void queue_data_cont(struct ceph_connection *con, struct ceph_msg *msg) { struct bio_vec bv; @@ -3218,7 +3189,7 @@ static void queue_data_cont(struct ceph_connection *con) * we are done. */ reset_out_kvecs(con); - prepare_epilogue_plain(con, false); + prepare_epilogue_plain(con, msg, false); con->v2.out_state = OUT_S_FINISH_MESSAGE; } @@ -3250,7 +3221,7 @@ static void queue_enc_page(struct ceph_connection *con) con->v2.out_state = OUT_S_FINISH_MESSAGE; } -static void queue_zeros(struct ceph_connection *con) +static void queue_zeros(struct ceph_connection *con, struct ceph_msg *msg) { dout("%s con %p out_zero %d\n", __func__, con, con->v2.out_zero); @@ -3267,7 +3238,7 @@ static void queue_zeros(struct ceph_connection *con) * Once it's written, we are done patching up for the revoke. */ reset_out_kvecs(con); - prepare_epilogue_plain(con, true); + prepare_epilogue_plain(con, msg, true); con->v2.out_state = OUT_S_FINISH_MESSAGE; } @@ -3294,6 +3265,7 @@ static void finish_message(struct ceph_connection *con) static int populate_out_iter(struct ceph_connection *con) { + struct ceph_msg *msg; int ret; dout("%s con %p state %d out_state %d\n", __func__, con, con->state, @@ -3309,18 +3281,18 @@ static int populate_out_iter(struct ceph_connection *con) switch (con->v2.out_state) { case OUT_S_QUEUE_DATA: WARN_ON(!con->out_msg); - queue_data(con); + queue_data(con, con->out_msg); goto populated; case OUT_S_QUEUE_DATA_CONT: WARN_ON(!con->out_msg); - queue_data_cont(con); + queue_data_cont(con, con->out_msg); goto populated; case OUT_S_QUEUE_ENC_PAGE: queue_enc_page(con); goto populated; case OUT_S_QUEUE_ZEROS: WARN_ON(con->out_msg); /* revoked */ - queue_zeros(con); + queue_zeros(con, con->out_msg); goto populated; case OUT_S_FINISH_MESSAGE: finish_message(con); @@ -3339,9 +3311,8 @@ static int populate_out_iter(struct ceph_connection *con) pr_err("prepare_keepalive2 failed: %d\n", ret); return ret; } - } else if (!list_empty(&con->out_queue)) { - ceph_con_get_out_msg(con); - ret = prepare_message(con); + } else if ((msg = ceph_con_get_out_msg(con)) != NULL) { + ret = prepare_message(con, msg); if (ret) { pr_err("prepare_message failed: %d\n", ret); return ret; @@ -3453,17 +3424,18 @@ static u32 crc32c_zeros(u32 crc, int zero_len) return crc; } -static void prepare_zero_front(struct ceph_connection *con, int resid) +static void prepare_zero_front(struct ceph_connection *con, + struct ceph_msg *msg, int resid) { int sent; - WARN_ON(!resid || resid > front_len(con->out_msg)); - sent = front_len(con->out_msg) - resid; + WARN_ON(!resid || resid > front_len(msg)); + sent = front_len(msg) - resid; dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); if (sent) { con->v2.out_epil.front_crc = - crc32c(-1, con->out_msg->front.iov_base, sent); + crc32c(-1, msg->front.iov_base, sent); con->v2.out_epil.front_crc = crc32c_zeros(con->v2.out_epil.front_crc, resid); } else { @@ -3474,17 +3446,18 @@ static void prepare_zero_front(struct ceph_connection *con, int resid) out_zero_add(con, resid); } -static void prepare_zero_middle(struct ceph_connection *con, int resid) +static void prepare_zero_middle(struct ceph_connection *con, + struct ceph_msg *msg, int resid) { int sent; - WARN_ON(!resid || resid > middle_len(con->out_msg)); - sent = middle_len(con->out_msg) - resid; + WARN_ON(!resid || resid > 
middle_len(msg)); + sent = middle_len(msg) - resid; dout("%s con %p sent %d resid %d\n", __func__, con, sent, resid); if (sent) { con->v2.out_epil.middle_crc = - crc32c(-1, con->out_msg->middle->vec.iov_base, sent); + crc32c(-1, msg->middle->vec.iov_base, sent); con->v2.out_epil.middle_crc = crc32c_zeros(con->v2.out_epil.middle_crc, resid); } else { @@ -3495,61 +3468,64 @@ static void prepare_zero_middle(struct ceph_connection *con, int resid) out_zero_add(con, resid); } -static void prepare_zero_data(struct ceph_connection *con) +static void prepare_zero_data(struct ceph_connection *con, + struct ceph_msg *msg) { dout("%s con %p\n", __func__, con); - con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(con->out_msg)); - out_zero_add(con, data_len(con->out_msg)); + con->v2.out_epil.data_crc = crc32c_zeros(-1, data_len(msg)); + out_zero_add(con, data_len(msg)); } -static void revoke_at_queue_data(struct ceph_connection *con) +static void revoke_at_queue_data(struct ceph_connection *con, + struct ceph_msg *msg) { int boundary; int resid; - WARN_ON(!data_len(con->out_msg)); + WARN_ON(!data_len(msg)); WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); resid = iov_iter_count(&con->v2.out_iter); - boundary = front_len(con->out_msg) + middle_len(con->out_msg); + boundary = front_len(msg) + middle_len(msg); if (resid > boundary) { resid -= boundary; WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); dout("%s con %p was sending head\n", __func__, con); - if (front_len(con->out_msg)) - prepare_zero_front(con, front_len(con->out_msg)); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); - prepare_zero_data(con); + if (front_len(msg)) + prepare_zero_front(con, msg, front_len(msg)); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); + prepare_zero_data(con, msg); WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); con->v2.out_state = OUT_S_QUEUE_ZEROS; return; } - boundary = middle_len(con->out_msg); + boundary = middle_len(msg); if (resid > boundary) { resid -= boundary; dout("%s con %p was sending front\n", __func__, con); - prepare_zero_front(con, resid); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); - prepare_zero_data(con); - queue_zeros(con); + prepare_zero_front(con, msg, resid); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); + prepare_zero_data(con, msg); + queue_zeros(con, msg); return; } WARN_ON(!resid); dout("%s con %p was sending middle\n", __func__, con); - prepare_zero_middle(con, resid); - prepare_zero_data(con); - queue_zeros(con); + prepare_zero_middle(con, msg, resid); + prepare_zero_data(con, msg); + queue_zeros(con, msg); } -static void revoke_at_queue_data_cont(struct ceph_connection *con) +static void revoke_at_queue_data_cont(struct ceph_connection *con, + struct ceph_msg *msg) { int sent, resid; /* current piece of data */ - WARN_ON(!data_len(con->out_msg)); + WARN_ON(!data_len(msg)); WARN_ON(!iov_iter_is_bvec(&con->v2.out_iter)); resid = iov_iter_count(&con->v2.out_iter); WARN_ON(!resid || resid > con->v2.out_bvec.bv_len); @@ -3568,10 +3544,11 @@ static void revoke_at_queue_data_cont(struct ceph_connection *con) con->v2.out_iter.count -= resid; out_zero_add(con, con->v2.out_cursor.total_resid); - queue_zeros(con); + queue_zeros(con, msg); } -static void revoke_at_finish_message(struct ceph_connection *con) +static void revoke_at_finish_message(struct ceph_connection *con, + struct ceph_msg *msg) { int boundary; int resid; @@ -3579,39 +3556,39 @@ static void 
revoke_at_finish_message(struct ceph_connection *con) WARN_ON(!iov_iter_is_kvec(&con->v2.out_iter)); resid = iov_iter_count(&con->v2.out_iter); - if (!front_len(con->out_msg) && !middle_len(con->out_msg) && - !data_len(con->out_msg)) { + if (!front_len(msg) && !middle_len(msg) && + !data_len(msg)) { WARN_ON(!resid || resid > MESSAGE_HEAD_PLAIN_LEN); dout("%s con %p was sending head (empty message) - noop\n", __func__, con); return; } - boundary = front_len(con->out_msg) + middle_len(con->out_msg) + + boundary = front_len(msg) + middle_len(msg) + CEPH_EPILOGUE_PLAIN_LEN; if (resid > boundary) { resid -= boundary; WARN_ON(resid > MESSAGE_HEAD_PLAIN_LEN); dout("%s con %p was sending head\n", __func__, con); - if (front_len(con->out_msg)) - prepare_zero_front(con, front_len(con->out_msg)); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); + if (front_len(msg)) + prepare_zero_front(con, msg, front_len(msg)); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; WARN_ON(iov_iter_count(&con->v2.out_iter) != resid); con->v2.out_state = OUT_S_QUEUE_ZEROS; return; } - boundary = middle_len(con->out_msg) + CEPH_EPILOGUE_PLAIN_LEN; + boundary = middle_len(msg) + CEPH_EPILOGUE_PLAIN_LEN; if (resid > boundary) { resid -= boundary; dout("%s con %p was sending front\n", __func__, con); - prepare_zero_front(con, resid); - if (middle_len(con->out_msg)) - prepare_zero_middle(con, middle_len(con->out_msg)); + prepare_zero_front(con, msg, resid); + if (middle_len(msg)) + prepare_zero_middle(con, msg, middle_len(msg)); con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; - queue_zeros(con); + queue_zeros(con, msg); return; } @@ -3619,9 +3596,9 @@ static void revoke_at_finish_message(struct ceph_connection *con) if (resid > boundary) { resid -= boundary; dout("%s con %p was sending middle\n", __func__, con); - prepare_zero_middle(con, resid); + prepare_zero_middle(con, msg, resid); con->v2.out_iter.count -= CEPH_EPILOGUE_PLAIN_LEN; - queue_zeros(con); + queue_zeros(con, msg); return; } @@ -3629,7 +3606,7 @@ static void revoke_at_finish_message(struct ceph_connection *con) dout("%s con %p was sending epilogue - noop\n", __func__, con); } -void ceph_con_v2_revoke(struct ceph_connection *con) +void ceph_con_v2_revoke(struct ceph_connection *con, struct ceph_msg *msg) { WARN_ON(con->v2.out_zero); @@ -3642,13 +3619,13 @@ void ceph_con_v2_revoke(struct ceph_connection *con) switch (con->v2.out_state) { case OUT_S_QUEUE_DATA: - revoke_at_queue_data(con); + revoke_at_queue_data(con, msg); break; case OUT_S_QUEUE_DATA_CONT: - revoke_at_queue_data_cont(con); + revoke_at_queue_data_cont(con, msg); break; case OUT_S_FINISH_MESSAGE: - revoke_at_finish_message(con); + revoke_at_finish_message(con, msg); break; default: WARN(1, "bad out_state %d", con->v2.out_state); @@ -3814,10 +3791,8 @@ void ceph_con_v2_reset_protocol(struct ceph_connection *con) memzero_explicit(&con->v2.in_gcm_nonce, CEPH_GCM_IV_LEN); memzero_explicit(&con->v2.out_gcm_nonce, CEPH_GCM_IV_LEN); - if (con->v2.hmac_tfm) { - crypto_free_shash(con->v2.hmac_tfm); - con->v2.hmac_tfm = NULL; - } + memzero_explicit(&con->v2.hmac_key, sizeof(con->v2.hmac_key)); + con->v2.hmac_key_set = false; if (con->v2.gcm_req) { aead_request_free(con->v2.gcm_req); con->v2.gcm_req = NULL; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index ab66b599ac47..c227ececa925 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -314,7 +314,7 @@ static void 
__schedule_delayed(struct ceph_mon_client *monc) delay = CEPH_MONC_PING_INTERVAL; dout("__schedule_delayed after %lu\n", delay); - mod_delayed_work(system_wq, &monc->delayed_work, + mod_delayed_work(system_percpu_wq, &monc->delayed_work, round_jiffies_relative(delay)); } diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 295098873861..d245fa508e1c 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -1504,8 +1504,6 @@ static int decode_new_primary_temp(void **p, void *end, u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) { - BUG_ON(osd >= map->max_osd); - if (!map->osd_primary_affinity) return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; @@ -1514,8 +1512,6 @@ u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) { - BUG_ON(osd >= map->max_osd); - if (!map->osd_primary_affinity) { int i; @@ -1577,6 +1573,8 @@ static int decode_new_primary_affinity(void **p, void *end, ceph_decode_32_safe(p, end, osd, e_inval); ceph_decode_32_safe(p, end, aff, e_inval); + if (osd >= map->max_osd) + goto e_inval; ret = set_primary_affinity(map, osd, aff); if (ret) @@ -1879,7 +1877,9 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, ceph_decode_need(p, end, 2*sizeof(u32), e_inval); osd = ceph_decode_32(p); w = ceph_decode_32(p); - BUG_ON(osd >= map->max_osd); + if (osd >= map->max_osd) + goto e_inval; + osdmap_info(map, "osd%d weight 0x%x %s\n", osd, w, w == CEPH_OSD_IN ? "(in)" : (w == CEPH_OSD_OUT ? "(out)" : "")); @@ -1905,13 +1905,15 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, u32 xorstate; osd = ceph_decode_32(p); + if (osd >= map->max_osd) + goto e_inval; + if (struct_v >= 5) xorstate = ceph_decode_32(p); else xorstate = ceph_decode_8(p); if (xorstate == 0) xorstate = CEPH_OSD_UP; - BUG_ON(osd >= map->max_osd); if ((map->osd_state[osd] & CEPH_OSD_UP) && (xorstate & CEPH_OSD_UP)) osdmap_info(map, "osd%d down\n", osd); @@ -1937,7 +1939,9 @@ static int decode_new_up_state_weight(void **p, void *end, u8 struct_v, struct ceph_entity_addr addr; osd = ceph_decode_32(p); - BUG_ON(osd >= map->max_osd); + if (osd >= map->max_osd) + goto e_inval; + if (struct_v >= 7) ret = ceph_decode_entity_addrvec(p, end, msgr2, &addr); else diff --git a/net/compat.c b/net/compat.c index 485db8ee9b28..2c9bd0edac99 100644 --- a/net/compat.c +++ b/net/compat.c @@ -460,10 +460,10 @@ COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) ret = __sys_accept4(a0, compat_ptr(a1), compat_ptr(a[2]), 0); break; case SYS_GETSOCKNAME: - ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2])); + ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 0); break; case SYS_GETPEERNAME: - ret = __sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2])); + ret = __sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2]), 1); break; case SYS_SOCKETPAIR: ret = __sys_socketpair(a0, a1, a[2], compat_ptr(a[3])); diff --git a/net/core/Makefile b/net/core/Makefile index b2a76ce33932..9ef2099c5426 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o obj-y += hotdata.o obj-y += netdev_rx_queue.o +obj-y += netdev_queues.o obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 2e538399757f..850dd736ccd1 100644 --- a/net/core/bpf_sk_storage.c 
+++ b/net/core/bpf_sk_storage.c @@ -50,16 +50,14 @@ void bpf_sk_storage_free(struct sock *sk) { struct bpf_local_storage *sk_storage; - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage) goto out; bpf_local_storage_destroy(sk_storage); out: - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); } static void bpf_sk_storage_map_free(struct bpf_map *map) @@ -138,7 +136,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk, { struct bpf_local_storage_elem *copy_selem; - copy_selem = bpf_selem_alloc(smap, newsk, NULL, true, false, GFP_ATOMIC); + copy_selem = bpf_selem_alloc(smap, newsk, NULL, false, GFP_ATOMIC); if (!copy_selem) return NULL; @@ -161,8 +159,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) RCU_INIT_POINTER(newsk->sk_bpf_storage, NULL); - migrate_disable(); - rcu_read_lock(); + rcu_read_lock_dont_migrate(); sk_storage = rcu_dereference(sk->sk_bpf_storage); if (!sk_storage || hlist_empty(&sk_storage->list)) @@ -199,7 +196,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } else { ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC); if (ret) { - bpf_selem_free(copy_selem, smap, true); + bpf_selem_free(copy_selem, true); atomic_sub(smap->elem_size, &newsk->sk_omem_alloc); bpf_map_put(map); @@ -213,8 +210,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk) } out: - rcu_read_unlock(); - migrate_enable(); + rcu_read_unlock_migrate(); /* In case of an error, don't free anything explicitly here, the * caller is responsible to call bpf_sk_storage_free. diff --git a/net/core/datagram.c b/net/core/datagram.c index 94cc4705e91d..c285c6465923 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -345,7 +345,7 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, spin_unlock_bh(&sk_queue->lock); } - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); return err; } EXPORT_SYMBOL(__sk_queue_drop_skb); @@ -618,6 +618,20 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iter); +int skb_copy_datagram_from_iter_full(struct sk_buff *skb, int offset, + struct iov_iter *from, int len) +{ + struct iov_iter_state state; + int ret; + + iov_iter_save_state(from, &state); + ret = skb_copy_datagram_from_iter(skb, offset, from, len); + if (ret) + iov_iter_restore(from, &state); + return ret; +} +EXPORT_SYMBOL(skb_copy_datagram_from_iter_full); + int zerocopy_fill_skb_from_iter(struct sk_buff *skb, struct iov_iter *from, size_t length) { @@ -906,21 +920,22 @@ fault: EXPORT_SYMBOL(skb_copy_and_csum_datagram_msg); /** - * datagram_poll - generic datagram poll + * datagram_poll_queue - same as datagram_poll, but on a specific receive + * queue * @file: file struct * @sock: socket * @wait: poll table + * @rcv_queue: receive queue to poll * - * Datagram poll: Again totally generic. This also handles - * sequenced packet sockets providing the socket receive queue - * is only ever holding data ready to receive. + * Performs polling on the given receive queue, handling shutdown, error, + * and connection state. This is useful for protocols that deliver + * userspace-bound packets through a custom queue instead of + * sk->sk_receive_queue. * - * Note: when you *don't* use this routine for this protocol, - * and you use a different write policy from sock_writeable() - * then please supply your own write_space callback. 
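The new skb_copy_datagram_from_iter_full() helper added above rewinds the iov_iter on failure. A hedged sketch of how a datagram sendmsg path might use it is below; example_build_dgram() and its calling convention are hypothetical, only the helper's signature comes from the patch.

#include <linux/skbuff.h>
#include <linux/err.h>
#include <net/sock.h>

/* Allocate an skb for a len-byte datagram and copy the payload from
 * userspace. If the copy fails, msg->msg_iter has already been restored
 * by the helper, so the caller may retry or simply report the error.
 */
static struct sk_buff *example_build_dgram(struct sock *sk, struct msghdr *msg,
					   size_t len)
{
	struct sk_buff *skb;
	int err;

	skb = sock_alloc_send_skb(sk, len, msg->msg_flags & MSG_DONTWAIT, &err);
	if (!skb)
		return ERR_PTR(err);

	skb_put(skb, len);
	err = skb_copy_datagram_from_iter_full(skb, 0, &msg->msg_iter, len);
	if (err) {
		kfree_skb(skb);
		return ERR_PTR(err);
	}
	return skb;
}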
+ * Return: poll bitmask indicating the socket's current state */ -__poll_t datagram_poll(struct file *file, struct socket *sock, - poll_table *wait) +__poll_t datagram_poll_queue(struct file *file, struct socket *sock, + poll_table *wait, struct sk_buff_head *rcv_queue) { struct sock *sk = sock->sk; __poll_t mask; @@ -942,7 +957,7 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, mask |= EPOLLHUP; /* readable? */ - if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) + if (!skb_queue_empty_lockless(rcv_queue)) mask |= EPOLLIN | EPOLLRDNORM; /* Connection-based need to check for termination and startup */ @@ -964,4 +979,27 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, return mask; } +EXPORT_SYMBOL(datagram_poll_queue); + +/** + * datagram_poll - generic datagram poll + * @file: file struct + * @sock: socket + * @wait: poll table + * + * Datagram poll: Again totally generic. This also handles + * sequenced packet sockets providing the socket receive queue + * is only ever holding data ready to receive. + * + * Note: when you *don't* use this routine for this protocol, + * and you use a different write policy from sock_writeable() + * then please supply your own write_space callback. + * + * Return: poll bitmask indicating the socket's current state + */ +__poll_t datagram_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + return datagram_poll_queue(file, sock, wait, + &sock->sk->sk_receive_queue); +} EXPORT_SYMBOL(datagram_poll); diff --git a/net/core/dev.c b/net/core/dev.c index 68dc47d7e700..9094c0fb8c68 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1163,6 +1163,7 @@ void netdev_copy_name(struct net_device *dev, char *name) strscpy(name, dev->name, IFNAMSIZ); } while (read_seqretry(&netdev_rename_lock, seq)); } +EXPORT_IPV6_MOD_GPL(netdev_copy_name); /** * netdev_get_name - get a netdevice name, knowing its ifindex. @@ -2462,9 +2463,9 @@ int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb) return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb); } -static inline int deliver_skb(struct sk_buff *skb, - struct packet_type *pt_prev, - struct net_device *orig_dev) +static int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev, + struct net_device *orig_dev) { if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) return -ENOMEM; @@ -2483,7 +2484,7 @@ static inline void deliver_ptype_list_skb(struct sk_buff *skb, list_for_each_entry_rcu(ptype, ptype_list, list) { if (ptype->type != type) continue; - if (pt_prev) + if (unlikely(pt_prev)) deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } @@ -2544,7 +2545,7 @@ again: if (skb_loop_sk(ptype, skb)) continue; - if (pt_prev) { + if (unlikely(pt_prev)) { deliver_skb(skb2, pt_prev, skb->dev); pt_prev = ptype; continue; @@ -3373,6 +3374,13 @@ static void __netif_reschedule(struct Qdisc *q) void __netif_schedule(struct Qdisc *q) { + /* If q->defer_list is not empty, at least one thread is + * in __dev_xmit_skb() before llist_del_all(&q->defer_list). + * This thread will attempt to run the queue. + */ + if (!llist_empty(&q->defer_list)) + return; + if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state)) __netif_reschedule(q); } @@ -3768,8 +3776,14 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) features &= ~dev->gso_partial_features; - /* Make sure to clear the IPv4 ID mangling feature if the - * IPv4 header has the potential to be fragmented. 
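The datagram_poll_queue() helper introduced above lets a protocol report EPOLLIN based on a private receive queue instead of sk->sk_receive_queue. A short sketch of a proto_ops.poll handler built on it follows; struct my_proto_sock, rx_userspace and my_proto_poll() are hypothetical names, only datagram_poll_queue() itself is from the patch.

#include <linux/poll.h>
#include <linux/skbuff.h>
#include <net/sock.h>

/* Hypothetical protocol-private state carrying its own userspace-bound
 * queue; struct sock must stay the first member so sock->sk casts work.
 */
struct my_proto_sock {
	struct sock		sk;
	struct sk_buff_head	rx_userspace;
};

static __poll_t my_proto_poll(struct file *file, struct socket *sock,
			      poll_table *wait)
{
	struct my_proto_sock *mp = (struct my_proto_sock *)sock->sk;

	/* Same semantics as datagram_poll(), but readability is derived
	 * from the custom rx_userspace queue.
	 */
	return datagram_poll_queue(file, sock, wait, &mp->rx_userspace);
}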
+ /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header + * has the potential to be fragmented so that TSO does not generate + * segments with the same ID. For encapsulated packets, the ID mangling + * feature is guaranteed not to use the same ID for the outer IPv4 + * headers of the generated segments if the headers have the potential + * to be fragmented, so there is no need to clear the IPv4 ID mangling + * feature (see the section about NETIF_F_TSO_MANGLEID in + * segmentation-offloads.rst). */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { struct iphdr *iph = skb->encapsulation ? @@ -3779,6 +3793,18 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, features &= ~NETIF_F_TSO_MANGLEID; } + /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers, + * so neither does TSO that depends on it. + */ + if (features & NETIF_F_IPV6_CSUM && + (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6 || + (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && + vlan_get_protocol(skb) == htons(ETH_P_IPV6))) && + skb_transport_header_was_set(skb) && + skb_network_header_len(skb) != sizeof(struct ipv6hdr) && + !ipv6_has_hopopt_jumbo(skb)) + features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4); + return features; } @@ -3895,6 +3921,38 @@ sw_checksum: } EXPORT_SYMBOL(skb_csum_hwoffload_help); +/* Checks if this SKB belongs to an HW offloaded socket + * and whether any SW fallbacks are required based on dev. + * Check decrypted mark in case skb_orphan() cleared socket. + */ +static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, + struct net_device *dev) +{ +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev, + struct sk_buff *skb); + struct sock *sk = skb->sk; + + sk_validate = NULL; + if (sk) { + if (sk_fullsock(sk)) + sk_validate = sk->sk_validate_xmit_skb; + else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT) + sk_validate = inet_twsk(sk)->tw_validate_xmit_skb; + } + + if (sk_validate) { + skb = sk_validate(sk, dev, skb); + } else if (unlikely(skb_is_decrypted(skb))) { + pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); + kfree_skb(skb); + skb = NULL; + } +#endif + + return skb; +} + static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, struct net_device *dev) { @@ -4011,17 +4069,23 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d } EXPORT_SYMBOL_GPL(validate_xmit_skb_list); -static void qdisc_pkt_len_init(struct sk_buff *skb) +static void qdisc_pkt_len_segs_init(struct sk_buff *skb) { - const struct skb_shared_info *shinfo = skb_shinfo(skb); + struct skb_shared_info *shinfo = skb_shinfo(skb); + u16 gso_segs; qdisc_skb_cb(skb)->pkt_len = skb->len; + if (!shinfo->gso_size) { + qdisc_skb_cb(skb)->pkt_segs = 1; + return; + } + + qdisc_skb_cb(skb)->pkt_segs = gso_segs = shinfo->gso_segs; /* To get more precise estimation of bytes sent on wire, * we add to pkt_len the headers size of all segments */ - if (shinfo->gso_size && skb_transport_header_was_set(skb)) { - u16 gso_segs = shinfo->gso_segs; + if (skb_transport_header_was_set(skb)) { unsigned int hdr_len; /* mac layer + network layer */ @@ -4054,6 +4118,8 @@ static void qdisc_pkt_len_init(struct sk_buff *skb) if (payload <= 0) return; gso_segs = DIV_ROUND_UP(payload, shinfo->gso_size); + shinfo->gso_segs = gso_segs; + qdisc_skb_cb(skb)->pkt_segs = gso_segs; } qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len; } @@ -4075,9 +4141,10 @@ static inline int 
__dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, struct net_device *dev, struct netdev_queue *txq) { + struct sk_buff *next, *to_free = NULL, *to_free2 = NULL; spinlock_t *root_lock = qdisc_lock(q); - struct sk_buff *to_free = NULL; - bool contended; + struct llist_node *ll_list, *first_n; + unsigned long defer_count = 0; int rc; qdisc_calculate_pkt_len(skb, q); @@ -4093,9 +4160,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, if (unlikely(!nolock_qdisc_is_empty(q))) { rc = dev_qdisc_enqueue(skb, q, &to_free, txq); __qdisc_run(q); - qdisc_run_end(q); + to_free2 = qdisc_run_end(q); - goto no_lock_out; + goto free_skbs; } qdisc_bstats_cpu_update(q, skb); @@ -4103,81 +4170,93 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, !nolock_qdisc_is_empty(q)) __qdisc_run(q); - qdisc_run_end(q); - return NET_XMIT_SUCCESS; + to_free2 = qdisc_run_end(q); + rc = NET_XMIT_SUCCESS; + goto free_skbs; } rc = dev_qdisc_enqueue(skb, q, &to_free, txq); - qdisc_run(q); - -no_lock_out: - if (unlikely(to_free)) - kfree_skb_list_reason(to_free, - tcf_get_drop_reason(to_free)); - return rc; + to_free2 = qdisc_run(q); + goto free_skbs; } - if (unlikely(READ_ONCE(q->owner) == smp_processor_id())) { - kfree_skb_reason(skb, SKB_DROP_REASON_TC_RECLASSIFY_LOOP); - return NET_XMIT_DROP; - } - /* - * Heuristic to force contended enqueues to serialize on a - * separate lock before trying to get qdisc main lock. - * This permits qdisc->running owner to get the lock more - * often and dequeue packets faster. - * On PREEMPT_RT it is possible to preempt the qdisc owner during xmit - * and then other tasks will only enqueue packets. The packets will be - * sent after the qdisc owner is scheduled again. To prevent this - * scenario the task always serialize on the lock. + /* Open code llist_add(&skb->ll_node, &q->defer_list) + queue limit. + * In the try_cmpxchg() loop, we want to increment q->defer_count + * at most once to limit the number of skbs in defer_list. + * We perform the defer_count increment only if the list is not empty, + * because some arches have slow atomic_long_inc_return(). + */ + first_n = READ_ONCE(q->defer_list.first); + do { + if (first_n && !defer_count) { + defer_count = atomic_long_inc_return(&q->defer_count); + if (unlikely(defer_count > READ_ONCE(q->limit))) { + kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_DROP); + return NET_XMIT_DROP; + } + } + skb->ll_node.next = first_n; + } while (!try_cmpxchg(&q->defer_list.first, &first_n, &skb->ll_node)); + + /* If defer_list was not empty, we know the cpu which queued + * the first skb will process the whole list for us. */ - contended = qdisc_is_running(q) || IS_ENABLED(CONFIG_PREEMPT_RT); - if (unlikely(contended)) - spin_lock(&q->busylock); + if (first_n) + return NET_XMIT_SUCCESS; spin_lock(root_lock); + + ll_list = llist_del_all(&q->defer_list); + /* There is a small race because we clear defer_count not atomically + * with the prior llist_del_all(). This means defer_list could grow + * over q->limit. 
+ */ + atomic_long_set(&q->defer_count, 0); + + ll_list = llist_reverse_order(ll_list); + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { - __qdisc_drop(skb, &to_free); + llist_for_each_entry_safe(skb, next, ll_list, ll_node) + __qdisc_drop(skb, &to_free); rc = NET_XMIT_DROP; - } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && - qdisc_run_begin(q)) { + goto unlock; + } + if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) && + !llist_next(ll_list) && qdisc_run_begin(q)) { /* * This is a work-conserving queue; there are no old skbs * waiting to be sent out; and the qdisc is not running - * xmit the skb directly. */ + DEBUG_NET_WARN_ON_ONCE(skb != llist_entry(ll_list, + struct sk_buff, + ll_node)); qdisc_bstats_update(q, skb); - - if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) { - if (unlikely(contended)) { - spin_unlock(&q->busylock); - contended = false; - } + if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) __qdisc_run(q); - } - - qdisc_run_end(q); + to_free2 = qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { - WRITE_ONCE(q->owner, smp_processor_id()); - rc = dev_qdisc_enqueue(skb, q, &to_free, txq); - WRITE_ONCE(q->owner, -1); - if (qdisc_run_begin(q)) { - if (unlikely(contended)) { - spin_unlock(&q->busylock); - contended = false; - } - __qdisc_run(q); - qdisc_run_end(q); + int count = 0; + + llist_for_each_entry_safe(skb, next, ll_list, ll_node) { + prefetch(next); + prefetch(&next->priority); + skb_mark_not_on_list(skb); + rc = dev_qdisc_enqueue(skb, q, &to_free, txq); + count++; } + to_free2 = qdisc_run(q); + if (count != 1) + rc = NET_XMIT_SUCCESS; } +unlock: spin_unlock(root_lock); - if (unlikely(to_free)) - kfree_skb_list_reason(to_free, - tcf_get_drop_reason(to_free)); - if (unlikely(contended)) - spin_unlock(&q->busylock); + +free_skbs: + tcf_kfree_skb_list(to_free); + tcf_kfree_skb_list(to_free2); return rc; } @@ -4282,7 +4361,7 @@ static int tc_run(struct tcx_entry *entry, struct sk_buff *skb, return ret; tc_skb_cb(skb)->mru = 0; - tc_skb_cb(skb)->post_ct = false; + qdisc_skb_cb(skb)->post_ct = false; tcf_set_drop_reason(skb, *drop_reason); mini_qdisc_bstats_cpu_update(miniq, skb); @@ -4348,12 +4427,12 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, return skb; bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); - if (*pt_prev) { + if (unlikely(*pt_prev)) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } - qdisc_skb_cb(skb)->pkt_len = skb->len; + qdisc_pkt_len_segs_init(skb); tcx_set_ingress(skb, true); if (static_branch_unlikely(&tcx_needed_key)) { @@ -4541,6 +4620,32 @@ u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb, } EXPORT_SYMBOL(dev_pick_tx_zero); +int sk_tx_queue_get(const struct sock *sk) +{ + int resel, val; + + if (!sk) + return -1; + /* Paired with WRITE_ONCE() in sk_tx_queue_clear() + * and sk_tx_queue_set(). 
+ */ + val = READ_ONCE(sk->sk_tx_queue_mapping); + + if (val == NO_QUEUE_MAPPING) + return -1; + + if (!sk_fullsock(sk)) + return val; + + resel = READ_ONCE(sock_net(sk)->core.sysctl_txq_reselection); + if (resel && time_is_before_jiffies( + READ_ONCE(sk->sk_tx_queue_mapping_jiffies) + resel)) + return -1; + + return val; +} +EXPORT_SYMBOL(sk_tx_queue_get); + u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { @@ -4556,8 +4661,7 @@ u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, if (new_index < 0) new_index = skb_tx_hash(dev, sb_dev, skb); - if (queue_index != new_index && sk && - sk_fullsock(sk) && + if (sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); @@ -4639,7 +4743,7 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_update_prio(skb); - qdisc_pkt_len_init(skb); + qdisc_pkt_len_segs_init(skb); tcx_set_ingress(skb, false); #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { @@ -4837,9 +4941,40 @@ static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) return hash_32(hash, flow_table->log); } +#ifdef CONFIG_RFS_ACCEL +/** + * rps_flow_is_active - check whether the flow is recently active. + * @rflow: Specific flow to check activity. + * @flow_table: per-queue flowtable that @rflow belongs to. + * @cpu: CPU saved in @rflow. + * + * If the CPU has processed many packets since the flow's last activity + * (beyond 10 times the table size), the flow is considered stale. + * + * Return: true if flow was recently active. + */ +static bool rps_flow_is_active(struct rps_dev_flow *rflow, + struct rps_dev_flow_table *flow_table, + unsigned int cpu) +{ + unsigned int flow_last_active; + unsigned int sd_input_head; + + if (cpu >= nr_cpu_ids) + return false; + + sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head); + flow_last_active = READ_ONCE(rflow->last_qtail); + + return (int)(sd_input_head - flow_last_active) < + (int)(10 << flow_table->log); +} +#endif + static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, - struct rps_dev_flow *rflow, u16 next_cpu) + struct rps_dev_flow *rflow, u16 next_cpu, u32 hash, + u32 flow_id) { if (next_cpu < nr_cpu_ids) { u32 head; @@ -4847,8 +4982,9 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; + struct rps_dev_flow *tmp_rflow; + unsigned int tmp_cpu; u16 rxq_index; - u32 flow_id; int rc; /* Should we steer this flow to a different hardware queue? 
*/ @@ -4863,14 +4999,29 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = rfs_slot(skb_get_hash(skb), flow_table); + + tmp_rflow = &flow_table->flows[flow_id]; + tmp_cpu = READ_ONCE(tmp_rflow->cpu); + + if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) { + if (rps_flow_is_active(tmp_rflow, flow_table, + tmp_cpu)) { + if (hash != READ_ONCE(tmp_rflow->hash) || + next_cpu == tmp_cpu) + goto out; + } + } + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; + old_rflow = rflow; - rflow = &flow_table->flows[flow_id]; + rflow = tmp_rflow; WRITE_ONCE(rflow->filter, rc); + WRITE_ONCE(rflow->hash, hash); + if (old_rflow->filter == rc) WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: @@ -4896,6 +5047,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow_table *flow_table; struct rps_map *map; int cpu = -1; + u32 flow_id; u32 tcpu; u32 hash; @@ -4942,7 +5094,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ - rflow = &flow_table->flows[rfs_slot(hash, flow_table)]; + flow_id = rfs_slot(hash, flow_table); + rflow = &flow_table->flows[flow_id]; tcpu = rflow->cpu; /* @@ -4961,7 +5114,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; - rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash, + flow_id); } if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { @@ -5005,17 +5159,16 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; - unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id < (1UL << flow_table->log)) { + unsigned int cpu; + rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); - if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && - ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - - READ_ONCE(rflow->last_qtail)) < - (int)(10 << flow_table->log))) + if (READ_ONCE(rflow->filter) == filter_id && + rps_flow_is_active(rflow, flow_table, cpu)) expire = false; } rcu_read_unlock(); @@ -5081,8 +5234,9 @@ static void napi_schedule_rps(struct softnet_data *sd) __napi_schedule_irqoff(&mysd->backlog); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) +void kick_defer_list_purge(unsigned int cpu) { + struct softnet_data *sd = &per_cpu(softnet_data, cpu); unsigned long flags; if (use_backlog_threads()) { @@ -5102,14 +5256,15 @@ void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) int netdev_flow_limit_table_len __read_mostly = (1 << 12); #endif -static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen) +static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen, + int max_backlog) { #ifdef CONFIG_NET_FLOW_LIMIT - struct sd_flow_limit *fl; - struct softnet_data *sd; unsigned int old_flow, new_flow; + const struct softnet_data *sd; + struct sd_flow_limit *fl; - if (qlen < (READ_ONCE(net_hotdata.max_backlog) >> 1)) + if (likely(qlen < (max_backlog >> 1))) return false; sd = this_cpu_ptr(&softnet_data); @@ -5154,19 +5309,19 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, 
u32 tail; reason = SKB_DROP_REASON_DEV_READY; - if (!netif_running(skb->dev)) + if (unlikely(!netif_running(skb->dev))) goto bad_dev; - reason = SKB_DROP_REASON_CPU_BACKLOG; sd = &per_cpu(softnet_data, cpu); qlen = skb_queue_len_lockless(&sd->input_pkt_queue); max_backlog = READ_ONCE(net_hotdata.max_backlog); - if (unlikely(qlen > max_backlog)) + if (unlikely(qlen > max_backlog) || + skb_flow_limit(skb, qlen, max_backlog)) goto cpu_backlog_drop; backlog_lock_irq_save(sd, &flags); qlen = skb_queue_len(&sd->input_pkt_queue); - if (qlen <= max_backlog && !skb_flow_limit(skb, qlen)) { + if (likely(qlen <= max_backlog)) { if (!qlen) { /* Schedule NAPI for backlog device. We can use * non atomic operation as we own the queue lock. @@ -5187,7 +5342,8 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, backlog_unlock_irq_restore(sd, &flags); cpu_backlog_drop: - atomic_inc(&sd->dropped); + reason = SKB_DROP_REASON_CPU_BACKLOG; + numa_drop_add(&sd->drop_counters, 1); bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); @@ -5593,8 +5749,9 @@ static __latent_entropy void net_tx_action(void) rcu_read_lock(); while (head) { - struct Qdisc *q = head; spinlock_t *root_lock = NULL; + struct sk_buff *to_free; + struct Qdisc *q = head; head = head->next_sched; @@ -5621,9 +5778,10 @@ static __latent_entropy void net_tx_action(void) } clear_bit(__QDISC_STATE_SCHED, &q->state); - qdisc_run(q); + to_free = qdisc_run(q); if (root_lock) spin_unlock(root_lock); + tcf_kfree_skb_list(to_free); } rcu_read_unlock(); @@ -5733,7 +5891,7 @@ static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev, if (nf_hook_ingress_active(skb)) { int ingress_retval; - if (*pt_prev) { + if (unlikely(*pt_prev)) { *ret = deliver_skb(skb, *pt_prev, orig_dev); *pt_prev = NULL; } @@ -5810,13 +5968,13 @@ another_round: list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all, list) { - if (pt_prev) + if (unlikely(pt_prev)) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) { - if (pt_prev) + if (unlikely(pt_prev)) ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = ptype; } @@ -5847,7 +6005,7 @@ skip_classify: } if (skb_vlan_tag_present(skb)) { - if (pt_prev) { + if (unlikely(pt_prev)) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } @@ -5859,7 +6017,7 @@ skip_classify: rx_handler = rcu_dereference(skb->dev->rx_handler); if (rx_handler) { - if (pt_prev) { + if (unlikely(pt_prev)) { ret = deliver_skb(skb, pt_prev, orig_dev); pt_prev = NULL; } @@ -6616,24 +6774,25 @@ bool napi_complete_done(struct napi_struct *n, int work_done) } EXPORT_SYMBOL(napi_complete_done); -static void skb_defer_free_flush(struct softnet_data *sd) +static void skb_defer_free_flush(void) { + struct llist_node *free_list; struct sk_buff *skb, *next; + struct skb_defer_node *sdn; + int node; - /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ - if (!READ_ONCE(sd->defer_list)) - return; + for_each_node(node) { + sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node; - spin_lock(&sd->defer_lock); - skb = sd->defer_list; - sd->defer_list = NULL; - sd->defer_count = 0; - spin_unlock(&sd->defer_lock); + if (llist_empty(&sdn->defer_list)) + continue; + atomic_long_set(&sdn->defer_count, 0); + free_list = llist_del_all(&sdn->defer_list); - while (skb != NULL) { - next = skb->next; - napi_consume_skb(skb, 1); - skb = next; + llist_for_each_entry_safe(skb, next, free_list, ll_node) { + prefetch(next); + 
napi_consume_skb(skb, 1); + } } } @@ -6761,7 +6920,7 @@ count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); - skb_defer_free_flush(this_cpu_ptr(&softnet_data)); + skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); @@ -6939,7 +7098,8 @@ static void napi_stop_kthread(struct napi_struct *napi) */ if ((val & NAPIF_STATE_SCHED_THREADED) || !(val & NAPIF_STATE_SCHED)) { - new = val & (~NAPIF_STATE_THREADED); + new = val & (~(NAPIF_STATE_THREADED | + NAPIF_STATE_THREADED_BUSY_POLL)); } else { msleep(20); continue; @@ -6953,7 +7113,7 @@ static void napi_stop_kthread(struct napi_struct *napi) * the kthread. */ while (true) { - if (!test_bit(NAPIF_STATE_SCHED_THREADED, &napi->state)) + if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state)) break; msleep(20); @@ -6963,6 +7123,16 @@ static void napi_stop_kthread(struct napi_struct *napi) napi->thread = NULL; } +static void napi_set_threaded_state(struct napi_struct *napi, + enum netdev_napi_threaded threaded_mode) +{ + bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED; + bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL; + + assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll); +} + int napi_set_threaded(struct napi_struct *napi, enum netdev_napi_threaded threaded) { @@ -6989,7 +7159,7 @@ int napi_set_threaded(struct napi_struct *napi, } else { /* Make sure kthread is created before THREADED bit is set. */ smp_mb__before_atomic(); - assign_bit(NAPI_STATE_THREADED, &napi->state, threaded); + napi_set_threaded_state(napi, threaded); } return 0; @@ -6999,7 +7169,7 @@ int netif_set_threaded(struct net_device *dev, enum netdev_napi_threaded threaded) { struct napi_struct *napi; - int err = 0; + int i, err = 0; netdev_assert_locked_or_invisible(dev); @@ -7021,6 +7191,10 @@ int netif_set_threaded(struct net_device *dev, list_for_each_entry(napi, &dev->napi_list, dev_list) WARN_ON_ONCE(napi_set_threaded(napi, threaded)); + /* Override the config for all NAPIs even if currently not listed */ + for (i = 0; i < dev->num_napi_configs; i++) + dev->napi_config[i].threaded = threaded; + return err; } @@ -7353,8 +7527,9 @@ void netif_napi_add_weight_locked(struct net_device *dev, * Clear dev->threaded if kthread creation failed so that * threaded mode will not be enabled in napi_enable(). 
*/ - if (dev->threaded && napi_kthread_create(napi)) - dev->threaded = NETDEV_NAPI_THREADED_DISABLED; + if (napi_get_threaded_config(dev, napi)) + if (napi_kthread_create(napi)) + dev->threaded = NETDEV_NAPI_THREADED_DISABLED; netif_napi_set_irq_locked(napi, -1); } EXPORT_SYMBOL(netif_napi_add_weight_locked); @@ -7376,7 +7551,9 @@ void napi_disable_locked(struct napi_struct *n) } new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC; - new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL); + new &= ~(NAPIF_STATE_THREADED | + NAPIF_STATE_THREADED_BUSY_POLL | + NAPIF_STATE_PREFER_BUSY_POLL); } while (!try_cmpxchg(&n->state, &val, new)); hrtimer_cancel(&n->timer); @@ -7588,7 +7765,7 @@ static int napi_thread_wait(struct napi_struct *napi) return -1; } -static void napi_threaded_poll_loop(struct napi_struct *napi) +static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll) { struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; struct softnet_data *sd; @@ -7615,24 +7792,49 @@ static void napi_threaded_poll_loop(struct napi_struct *napi) local_irq_disable(); net_rps_action_and_irq_enable(sd); } - skb_defer_free_flush(sd); + skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); + + /* When busy poll is enabled, the old packets are not flushed in + * napi_complete_done. So flush them here. + */ + if (busy_poll) + gro_flush_normal(&napi->gro, HZ >= 1000); local_bh_enable(); + /* Call cond_resched here to avoid watchdog warnings. */ + if (repoll || busy_poll) { + rcu_softirq_qs_periodic(last_qs); + cond_resched(); + } + if (!repoll) break; - - rcu_softirq_qs_periodic(last_qs); - cond_resched(); } } static int napi_threaded_poll(void *data) { struct napi_struct *napi = data; + bool want_busy_poll; + bool in_busy_poll; + unsigned long val; + + while (!napi_thread_wait(napi)) { + val = READ_ONCE(napi->state); + + want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL; + in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL; + + if (unlikely(val & NAPIF_STATE_DISABLE)) + want_busy_poll = false; + + if (want_busy_poll != in_busy_poll) + assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state, + want_busy_poll); - while (!napi_thread_wait(napi)) - napi_threaded_poll_loop(napi); + napi_threaded_poll_loop(napi, want_busy_poll); + } return 0; } @@ -7657,7 +7859,7 @@ start: for (;;) { struct napi_struct *n; - skb_defer_free_flush(sd); + skb_defer_free_flush(); if (list_empty(&list)) { if (list_empty(&repoll)) { @@ -9780,7 +9982,7 @@ DECLARE_RWSEM(dev_addr_sem); /* "sa" is a true struct sockaddr with limited "sa_data" member. */ int netif_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { - size_t size = sizeof(sa->sa_data_min); + size_t size = sizeof(sa->sa_data); struct net_device *dev; int ret = 0; @@ -11873,6 +12075,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, goto free_all; dev->cfg_pending = dev->cfg; + dev->num_napi_configs = maxqs; napi_config_sz = array_size(maxqs, sizeof(*dev->napi_config)); dev->napi_config = kvzalloc(napi_config_sz, GFP_KERNEL_ACCOUNT); if (!dev->napi_config) @@ -12070,6 +12273,35 @@ static void dev_memory_provider_uninstall(struct net_device *dev) } } +/* devices must be UP and netdev_lock()'d */ +static void netif_close_many_and_unlock(struct list_head *close_head) +{ + struct net_device *dev, *tmp; + + netif_close_many(close_head, false); + + /* ... 
now unlock them */ + list_for_each_entry_safe(dev, tmp, close_head, close_list) { + netdev_unlock(dev); + list_del_init(&dev->close_list); + } +} + +static void netif_close_many_and_unlock_cond(struct list_head *close_head) +{ +#ifdef CONFIG_LOCKDEP + /* We can only track up to MAX_LOCK_DEPTH locks per task. + * + * Reserve half the available slots for additional locks possibly + * taken by notifiers and (soft)irqs. + */ + unsigned int limit = MAX_LOCK_DEPTH / 2; + + if (lockdep_depth(current) > limit) + netif_close_many_and_unlock(close_head); +#endif +} + void unregister_netdevice_many_notify(struct list_head *head, u32 portid, const struct nlmsghdr *nlh) { @@ -12102,17 +12334,18 @@ void unregister_netdevice_many_notify(struct list_head *head, /* If device is running, close it first. Start with ops locked... */ list_for_each_entry(dev, head, unreg_list) { + if (!(dev->flags & IFF_UP)) + continue; if (netdev_need_ops_lock(dev)) { list_add_tail(&dev->close_list, &close_head); netdev_lock(dev); } + netif_close_many_and_unlock_cond(&close_head); } - netif_close_many(&close_head, true); - /* ... now unlock them and go over the rest. */ + netif_close_many_and_unlock(&close_head); + /* ... now go over the rest. */ list_for_each_entry(dev, head, unreg_list) { - if (netdev_need_ops_lock(dev)) - netdev_unlock(dev); - else + if (!netdev_need_ops_lock(dev)) list_add_tail(&dev->close_list, &close_head); } netif_close_many(&close_head, true); @@ -12510,6 +12743,94 @@ netdev_features_t netdev_increment_features(netdev_features_t all, } EXPORT_SYMBOL(netdev_increment_features); +/** + * netdev_compute_master_upper_features - compute features from lowers + * @dev: the upper device + * @update_header: whether to update the upper device's header_len/headroom/tailroom + * + * Recompute the upper device's features based on all lower devices.
+ */ +void netdev_compute_master_upper_features(struct net_device *dev, bool update_header) +{ + unsigned int dst_release_flag = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM; + netdev_features_t gso_partial_features = MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES; + netdev_features_t xfrm_features = MASTER_UPPER_DEV_XFRM_FEATURES; + netdev_features_t mpls_features = MASTER_UPPER_DEV_MPLS_FEATURES; + netdev_features_t vlan_features = MASTER_UPPER_DEV_VLAN_FEATURES; + netdev_features_t enc_features = MASTER_UPPER_DEV_ENC_FEATURES; + unsigned short max_header_len = ETH_HLEN; + unsigned int tso_max_size = TSO_MAX_SIZE; + unsigned short max_headroom = 0; + unsigned short max_tailroom = 0; + u16 tso_max_segs = TSO_MAX_SEGS; + struct net_device *lower_dev; + struct list_head *iter; + + mpls_features = netdev_base_features(mpls_features); + vlan_features = netdev_base_features(vlan_features); + enc_features = netdev_base_features(enc_features); + + netdev_for_each_lower_dev(dev, lower_dev, iter) { + gso_partial_features = netdev_increment_features(gso_partial_features, + lower_dev->gso_partial_features, + MASTER_UPPER_DEV_GSO_PARTIAL_FEATURES); + + vlan_features = netdev_increment_features(vlan_features, + lower_dev->vlan_features, + MASTER_UPPER_DEV_VLAN_FEATURES); + + enc_features = netdev_increment_features(enc_features, + lower_dev->hw_enc_features, + MASTER_UPPER_DEV_ENC_FEATURES); + + if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) + xfrm_features = netdev_increment_features(xfrm_features, + lower_dev->hw_enc_features, + MASTER_UPPER_DEV_XFRM_FEATURES); + + mpls_features = netdev_increment_features(mpls_features, + lower_dev->mpls_features, + MASTER_UPPER_DEV_MPLS_FEATURES); + + dst_release_flag &= lower_dev->priv_flags; + + if (update_header) { + max_header_len = max(max_header_len, lower_dev->hard_header_len); + max_headroom = max(max_headroom, lower_dev->needed_headroom); + max_tailroom = max(max_tailroom, lower_dev->needed_tailroom); + } + + tso_max_size = min(tso_max_size, lower_dev->tso_max_size); + tso_max_segs = min(tso_max_segs, lower_dev->tso_max_segs); + } + + dev->gso_partial_features = gso_partial_features; + dev->vlan_features = vlan_features; + dev->hw_enc_features = enc_features | NETIF_F_GSO_ENCAP_ALL | + NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + if (IS_ENABLED(CONFIG_XFRM_OFFLOAD)) + dev->hw_enc_features |= xfrm_features; + dev->mpls_features = mpls_features; + + dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; + if ((dev->priv_flags & IFF_XMIT_DST_RELEASE_PERM) && + dst_release_flag == (IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM)) + dev->priv_flags |= IFF_XMIT_DST_RELEASE; + + if (update_header) { + dev->hard_header_len = max_header_len; + dev->needed_headroom = max_headroom; + dev->needed_tailroom = max_tailroom; + } + + netif_set_tso_max_segs(dev, tso_max_segs); + netif_set_tso_max_size(dev, tso_max_size); + + netdev_change_features(dev); +} +EXPORT_SYMBOL(netdev_compute_master_upper_features); + static struct hlist_head * __net_init netdev_create_hash(void) { int i; @@ -12823,7 +13144,7 @@ static void run_backlog_napi(unsigned int cpu) { struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu); - napi_threaded_poll_loop(&sd->backlog); + napi_threaded_poll_loop(&sd->backlog, false); } static void backlog_napi_setup(unsigned int cpu) @@ -12890,7 +13211,6 @@ static int __init net_dev_init(void) sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); - spin_lock_init(&sd->defer_lock); gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; @@ -12900,6 
+13220,11 @@ static int __init net_dev_init(void) if (net_page_pool_create(i)) goto out; } + net_hotdata.skb_defer_nodes = + __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids, + __alignof__(struct skb_defer_node)); + if (!net_hotdata.skb_defer_nodes) + goto out; if (use_backlog_threads()) smpboot_register_percpu_thread(&backlog_threads); diff --git a/net/core/dev.h b/net/core/dev.h index ab69edc0c3e3..da18536cbd35 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -29,7 +29,6 @@ struct napi_struct * netdev_napi_by_id_lock(struct net *net, unsigned int napi_id); struct net_device *dev_get_by_napi_id(unsigned int napi_id); -struct net_device *netdev_get_by_index_lock(struct net *net, int ifindex); struct net_device *__netdev_put_lock(struct net_device *dev, struct net *net); struct net_device * netdev_xa_find_lock(struct net *net, struct net_device *dev, @@ -317,12 +316,23 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n, static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n) { + if (test_bit(NAPI_STATE_THREADED_BUSY_POLL, &n->state)) + return NETDEV_NAPI_THREADED_BUSY_POLL; + if (test_bit(NAPI_STATE_THREADED, &n->state)) return NETDEV_NAPI_THREADED_ENABLED; return NETDEV_NAPI_THREADED_DISABLED; } +static inline enum netdev_napi_threaded +napi_get_threaded_config(struct net_device *dev, struct napi_struct *n) +{ + if (n->config) + return n->config->threaded; + return dev->threaded; +} + int napi_set_threaded(struct napi_struct *n, enum netdev_napi_threaded threaded); @@ -349,7 +359,7 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi) WARN_ON(READ_ONCE(napi->list_owner) != -1); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); +void kick_defer_list_purge(unsigned int cpu); #define XMIT_RECURSION_LIMIT 8 diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 9c0ad7f4b5d8..53a53357cfef 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -249,10 +249,11 @@ int net_hwtstamp_validate(const struct kernel_hwtstamp_config *cfg) * * Helper for calling the default hardware provider timestamping. * - * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), and - * there only exists a phydev->mii_ts->hwtstamp() method. So this will return - * -EOPNOTSUPP for phylib for now, which is still more accurate than letting - * the netdev handle the GET request. + * Note: phy_mii_ioctl() only handles SIOCSHWTSTAMP (not SIOCGHWTSTAMP), but + * phydev->mii_ts has both hwtstamp_get() and hwtstamp_set() methods. So this + * will return -EOPNOTSUPP for phylib only if hwtstamp_get() is not + * implemented for now, which is still more accurate than letting the netdev + * handle the GET request. 
*/ int dev_get_hwtstamp_phylib(struct net_device *dev, struct kernel_hwtstamp_config *cfg) @@ -443,6 +444,9 @@ static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd, struct ifreq ifrr; int err; + if (!kernel_cfg->ifr) + return -EINVAL; + strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ); ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru; @@ -464,8 +468,15 @@ int generic_hwtstamp_get_lower(struct net_device *dev, if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_get) - return dev_get_hwtstamp_phylib(dev, kernel_cfg); + if (ops->ndo_hwtstamp_get) { + int err; + + netdev_lock_ops(dev); + err = dev_get_hwtstamp_phylib(dev, kernel_cfg); + netdev_unlock_ops(dev); + + return err; + } /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); @@ -481,8 +492,15 @@ int generic_hwtstamp_set_lower(struct net_device *dev, if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_set) - return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + if (ops->ndo_hwtstamp_set) { + int err; + + netdev_lock_ops(dev); + err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + netdev_unlock_ops(dev); + + return err; + } /* Legacy path: unconverted lower driver */ return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); @@ -582,7 +600,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, if (ifr->ifr_hwaddr.sa_family != dev->type) return -EINVAL; memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof(ifr->ifr_hwaddr.sa_data_min), + min(sizeof(ifr->ifr_hwaddr.sa_data), (size_t)dev->addr_len)); netdev_lock_ops(dev); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); diff --git a/net/core/devmem.c b/net/core/devmem.c index 24c591ab38ae..ec4217d6c0b4 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -17,6 +17,7 @@ #include <net/page_pool/helpers.h> #include <net/page_pool/memory_provider.h> #include <net/sock.h> +#include <net/tcp.h> #include <trace/events/page_pool.h> #include "devmem.h" @@ -96,9 +97,9 @@ net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding) index = offset / PAGE_SIZE; niov = &owner->area.niovs[index]; - niov->pp_magic = 0; - niov->pp = NULL; - atomic_long_set(&niov->pp_ref_count, 0); + niov->desc.pp_magic = 0; + niov->desc.pp = NULL; + atomic_long_set(&niov->desc.pp_ref_count, 0); return niov; } @@ -176,6 +177,7 @@ err_close_rxq: struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, + struct device *dma_dev, enum dma_data_direction direction, unsigned int dmabuf_fd, struct netdev_nl_sock *priv, struct netlink_ext_ack *extack) @@ -188,6 +190,11 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned long virtual; int err; + if (!dma_dev) { + NL_SET_ERR_MSG(extack, "Device doesn't support DMA"); + return ERR_PTR(-EOPNOTSUPP); + } + dmabuf = dma_buf_get(dmabuf_fd); if (IS_ERR(dmabuf)) return ERR_CAST(dmabuf); @@ -209,7 +216,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, binding->dmabuf = dmabuf; binding->direction = direction; - binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent); + binding->attachment = dma_buf_attach(binding->dmabuf, dma_dev); if (IS_ERR(binding->attachment)) { err = PTR_ERR(binding->attachment); NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); @@ -351,7 +358,8 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id) { struct net_devmem_dmabuf_binding *binding; - struct dst_entry *dst = __sk_dst_get(sk); + struct 
net_device *dst_dev; + struct dst_entry *dst; int err = 0; binding = net_devmem_lookup_dmabuf(dmabuf_id); @@ -360,16 +368,35 @@ struct net_devmem_dmabuf_binding *net_devmem_get_binding(struct sock *sk, goto out_err; } + rcu_read_lock(); + dst = __sk_dst_get(sk); + /* If dst is NULL (route expired), attempt to rebuild it. */ + if (unlikely(!dst)) { + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) { + err = -EHOSTUNREACH; + goto out_unlock; + } + dst = __sk_dst_get(sk); + if (unlikely(!dst)) { + err = -ENODEV; + goto out_unlock; + } + } + /* The dma-addrs in this binding are only reachable to the corresponding * net_device. */ - if (!dst || !dst->dev || dst->dev->ifindex != binding->dev->ifindex) { + dst_dev = dst_dev_rcu(dst); + if (unlikely(!dst_dev) || unlikely(dst_dev != binding->dev)) { err = -ENODEV; - goto out_err; + goto out_unlock; } + rcu_read_unlock(); return binding; +out_unlock: + rcu_read_unlock(); out_err: if (binding) net_devmem_dmabuf_binding_put(binding); diff --git a/net/core/devmem.h b/net/core/devmem.h index 41cd6e1c9141..0b43a648cd2e 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -85,6 +85,7 @@ struct dmabuf_genpool_chunk_owner { void __net_devmem_dmabuf_binding_free(struct work_struct *wq); struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, + struct device *dma_dev, enum dma_data_direction direction, unsigned int dmabuf_fd, struct netdev_nl_sock *priv, struct netlink_ext_ack *extack); @@ -93,7 +94,6 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding); int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, struct net_devmem_dmabuf_binding *binding, struct netlink_ext_ack *extack); -void net_devmem_bind_tx_release(struct sock *sk); static inline struct dmabuf_genpool_chunk_owner * net_devmem_iov_to_chunk_owner(const struct net_iov *niov) @@ -170,6 +170,7 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov) static inline struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, + struct device *dma_dev, enum dma_data_direction direction, unsigned int dmabuf_fd, struct netdev_nl_sock *priv, diff --git a/net/core/dst.c b/net/core/dst.c index e2de8b68c41d..e9d35f49c9e7 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -150,7 +150,7 @@ void dst_dev_put(struct dst_entry *dst) dst->ops->ifdown(dst, dev); WRITE_ONCE(dst->input, dst_discard); WRITE_ONCE(dst->output, dst_discard_out); - WRITE_ONCE(dst->dev, blackhole_netdev); + rcu_assign_pointer(dst->dev_rcu, blackhole_netdev); netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, GFP_ATOMIC); } diff --git a/net/core/filter.c b/net/core/filter.c index da391e2b0788..616e0520a0bb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2281,6 +2281,7 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, if (IS_ERR(dst)) goto out_drop; + skb_dst_drop(skb); skb_dst_set(skb, dst); } else if (nh->nh_family != AF_INET6) { goto out_drop; @@ -2373,7 +2374,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)), + .flowi4_dscp = ip4h_dscp(ip4h), .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, @@ -2389,6 +2390,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, goto out_drop; } + skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); } @@ -2456,6 +2458,13 
@@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) if (unlikely(flags & (~(BPF_F_INGRESS) | BPF_F_REDIRECT_INTERNAL))) return -EINVAL; + /* BPF test infra's convert___skb_to_skb() can create type-less + * GSO packets. gso_features_check() will detect this as a bad + * offload. However, let's not leak them out in the first place. + */ + if (unlikely(skb_is_gso(skb) && !skb_shinfo(skb)->gso_type)) + return -EBADMSG; + dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex); if (unlikely(!dev)) return -EINVAL; @@ -3251,11 +3260,11 @@ static void bpf_skb_change_protocol(struct sk_buff *skb, u16 proto) static int bpf_skb_generic_push(struct sk_buff *skb, u32 off, u32 len) { - /* Caller already did skb_cow() with len as headroom, + /* Caller already did skb_cow() with meta_len+len as headroom, * so no need to do it here. */ skb_push(skb, len); - memmove(skb->data, skb->data + len, off); + skb_postpush_data_move(skb, len, off); memset(skb->data + off, 0, len); /* No skb_postpush_rcsum(skb, skb->data + off, len) @@ -3279,7 +3288,7 @@ static int bpf_skb_generic_pop(struct sk_buff *skb, u32 off, u32 len) old_data = skb->data; __skb_pull(skb, len); skb_postpull_rcsum(skb, old_data + off, len); - memmove(skb->data, old_data, off); + skb_postpull_data_move(skb, len, off); return 0; } @@ -3324,10 +3333,11 @@ static int bpf_skb_net_hdr_pop(struct sk_buff *skb, u32 off, u32 len) static int bpf_skb_proto_4_to_6(struct sk_buff *skb) { const u32 len_diff = sizeof(struct ipv6hdr) - sizeof(struct iphdr); + const u8 meta_len = skb_metadata_len(skb); u32 off = skb_mac_header_len(skb); int ret; - ret = skb_cow(skb, len_diff); + ret = skb_cow(skb, meta_len + len_diff); if (unlikely(ret < 0)) return ret; @@ -3487,6 +3497,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, u8 inner_mac_len = flags >> BPF_ADJ_ROOM_ENCAP_L2_SHIFT; bool encap = flags & BPF_F_ADJ_ROOM_ENCAP_L3_MASK; u16 mac_len = 0, inner_net = 0, inner_trans = 0; + const u8 meta_len = skb_metadata_len(skb); unsigned int gso_type = SKB_GSO_DODGY; int ret; @@ -3497,7 +3508,7 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, return -ENOTSUPP; } - ret = skb_cow_head(skb, len_diff); + ret = skb_cow_head(skb, meta_len + len_diff); if (unlikely(ret < 0)) return ret; @@ -3871,15 +3882,17 @@ static const struct bpf_func_proto sk_skb_change_tail_proto = { static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, u64 flags) { + const u8 meta_len = skb_metadata_len(skb); u32 max_len = BPF_SKB_MAX_LEN; u32 new_len = skb->len + head_room; int ret; - if (unlikely(flags || (!skb_is_gso(skb) && new_len > max_len) || + if (unlikely(flags || (int)head_room < 0 || + (!skb_is_gso(skb) && new_len > max_len) || new_len < skb->len)) return -EINVAL; - ret = skb_cow(skb, head_room); + ret = skb_cow(skb, meta_len + head_room); if (likely(!ret)) { /* Idea for this helper is that we currently only * allow to expand on mac header. This means that @@ -3891,6 +3904,7 @@ static inline int __bpf_skb_change_head(struct sk_buff *skb, u32 head_room, * for redirection into L2 device.
*/ __skb_push(skb, head_room); + skb_postpush_data_move(skb, head_room, 0); memset(skb->data, 0, head_room); skb_reset_mac_header(skb); skb_reset_mac_len(skb); @@ -4153,34 +4167,45 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) return 0; } -static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, - enum xdp_mem_type mem_type, bool release) +static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, + bool tail, bool release) { - struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); + struct xdp_buff *zc_frag = tail ? xsk_buff_get_tail(xdp) : + xsk_buff_get_head(xdp); if (release) { - xsk_buff_del_tail(zc_frag); - __xdp_return(0, mem_type, false, zc_frag); + xsk_buff_del_frag(zc_frag); } else { - zc_frag->data_end -= shrink; + if (tail) + zc_frag->data_end -= shrink; + else + zc_frag->data += shrink; } + + return zc_frag; } static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, - int shrink) + int shrink, bool tail) { enum xdp_mem_type mem_type = xdp->rxq->mem.type; bool release = skb_frag_size(frag) == shrink; + netmem_ref netmem = skb_frag_netmem(frag); + struct xdp_buff *zc_frag = NULL; if (mem_type == MEM_TYPE_XSK_BUFF_POOL) { - bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release); - goto out; + netmem = 0; + zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release); } - if (release) - __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL); + if (release) { + __xdp_return(netmem, mem_type, false, zc_frag); + } else { + if (!tail) + skb_frag_off_add(frag, shrink); + skb_frag_size_sub(frag, shrink); + } -out: return release; } @@ -4198,18 +4223,15 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) len_free += shrink; offset -= shrink; - if (bpf_xdp_shrink_data(xdp, frag, shrink)) { + if (bpf_xdp_shrink_data(xdp, frag, shrink, true)) n_frags_free++; - } else { - skb_frag_size_sub(frag, shrink); - break; - } } sinfo->nr_frags -= n_frags_free; sinfo->xdp_frags_size -= len_free; if (unlikely(!sinfo->nr_frags)) { xdp_buff_clear_frags_flag(xdp); + xdp_buff_clear_frag_pfmemalloc(xdp); xdp->data_end -= offset; } @@ -5723,6 +5745,77 @@ static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +static int sk_bpf_set_get_bypass_prot_mem(struct sock *sk, + char *optval, int optlen, + bool getopt) +{ + int val; + + if (optlen != sizeof(int)) + return -EINVAL; + + if (!sk_has_account(sk)) + return -EOPNOTSUPP; + + if (getopt) { + *(int *)optval = sk->sk_bypass_prot_mem; + return 0; + } + + val = *(int *)optval; + if (val < 0 || val > 1) + return -EINVAL; + + sk->sk_bypass_prot_mem = val; + return 0; +} + +BPF_CALL_5(bpf_sock_create_setsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) + return sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, false); + + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_create_setsockopt_proto = { + .func = bpf_sock_create_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_create_getsockopt, struct sock *, sk, int, level, + int, optname, char *, optval, int, optlen) +{ + if (level == SOL_SOCKET && optname == SK_BPF_BYPASS_PROT_MEM) { + int err = 
sk_bpf_set_get_bypass_prot_mem(sk, optval, optlen, true); + + if (err) + memset(optval, 0, optlen); + + return err; + } + + return __bpf_getsockopt(sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_create_getsockopt_proto = { + .func = bpf_sock_create_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -5897,7 +5990,7 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, return err; if (((struct sockaddr_in *)addr)->sin_port == htons(0)) flags |= BIND_FORCE_ADDRESS_NO_PORT; - return __inet_bind(sk, addr, addr_len, flags); + return __inet_bind(sk, (struct sockaddr_unsized *)addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) @@ -5907,7 +6000,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); + return ipv6_bpf_stub->inet6_bind(sk, (struct sockaddr_unsized *)addr, + addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ @@ -6020,7 +6114,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl4.flowi4_iif = params->ifindex; fl4.flowi4_oif = 0; } - fl4.flowi4_tos = params->tos & INET_DSCP_MASK; + fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; @@ -6411,9 +6505,12 @@ BPF_CALL_5(bpf_skb_check_mtu, struct sk_buff *, skb, */ if (skb_is_gso(skb)) { ret = BPF_MTU_CHK_RET_SUCCESS; - if (flags & BPF_MTU_CHK_SEGS && - !skb_gso_validate_network_len(skb, mtu)) - ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + if (flags & BPF_MTU_CHK_SEGS) { + if (!skb_transport_header_was_set(skb)) + return -EINVAL; + if (!skb_gso_validate_network_len(skb, mtu)) + ret = BPF_MTU_CHK_RET_SEGS_TOOBIG; + } } out: *mtu_len = mtu; @@ -6767,7 +6864,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; bool refcounted = false; struct sock *sk = NULL; @@ -6776,7 +6872,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, __be32 dst4 = tuple->ipv4.daddr; if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, hinfo, NULL, 0, + sk = __inet_lookup(net, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); @@ -6790,7 +6886,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, hinfo, NULL, 0, + sk = __inet6_lookup(net, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); @@ -7431,6 +7527,8 @@ u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type, offsetof(struct xdp_sock, FIELD)); \ } while (0) + BTF_TYPE_EMIT(struct bpf_xdp_sock); + switch (si->off) { case offsetof(struct bpf_xdp_sock, queue_id): BPF_XDP_SOCK_GET(queue_id); @@ -8051,6 +8149,20 @@ 
sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_cg_sock_proto; case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; + case BPF_FUNC_setsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + return &bpf_sock_create_setsockopt_proto; + default: + return NULL; + } + case BPF_FUNC_getsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + return &bpf_sock_create_getsockopt_proto; + default: + return NULL; + } default: return bpf_base_func_proto(func_id, prog); } @@ -9284,13 +9396,17 @@ static bool sock_addr_is_valid_access(int off, int size, return false; info->reg_type = PTR_TO_SOCKET; break; - default: - if (type == BPF_READ) { - if (size != size_default) - return false; - } else { + case bpf_ctx_range(struct bpf_sock_addr, user_family): + case bpf_ctx_range(struct bpf_sock_addr, family): + case bpf_ctx_range(struct bpf_sock_addr, type): + case bpf_ctx_range(struct bpf_sock_addr, protocol): + if (type != BPF_READ) return false; - } + if (size != size_default) + return false; + break; + default: + return false; } return true; @@ -11990,6 +12106,28 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return func; } +/** + * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area. + * @skb: socket buffer carrying the metadata + * @offset: offset into the metadata area, must be <= skb_metadata_len() + */ +void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset) +{ + return skb_metadata_end(skb) - skb_metadata_len(skb) + offset; +} + +int __bpf_skb_meta_store_bytes(struct sk_buff *skb, u32 offset, + const void *from, u32 len, u64 flags) +{ + if (unlikely(flags)) + return -EINVAL; + if (unlikely(bpf_try_make_writable(skb, 0))) + return -EFAULT; + + memmove(bpf_skb_meta_pointer(skb, offset), from, len); + return 0; +} + __bpf_kfunc_start_defs(); __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, struct bpf_dynptr *ptr__uninit) @@ -12007,6 +12145,36 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags, return 0; } +/** + * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area. + * @skb_: socket buffer carrying the metadata + * @flags: future use, must be zero + * @ptr__uninit: dynptr to initialize + * + * Set up a dynptr for access to the metadata area earlier allocated from the + * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to + * &__sk_buff->data_meta. + * + * Return: + * * %0 - dynptr ready to use + * * %-EINVAL - invalid flags, dynptr set to null + */ +__bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags, + struct bpf_dynptr *ptr__uninit) +{ + struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit; + struct sk_buff *skb = (struct sk_buff *)skb_; + + if (flags) { + bpf_dynptr_set_null(ptr); + return -EINVAL; + } + + bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb)); + + return 0; +} + __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags, struct bpf_dynptr *ptr__uninit) { @@ -12160,6 +12328,98 @@ __bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops, return 0; } +/** + * bpf_xdp_pull_data() - Pull in non-linear xdp data. 
+ * @x: &xdp_md associated with the XDP buffer + * @len: length of data to be made directly accessible in the linear part + * + * Pull in data in case the XDP buffer associated with @x is non-linear and + * not all @len bytes are in the linear data area. + * + * Direct packet access allows reading and writing linear XDP data through + * packet pointers (i.e., &xdp_md->data + offsets). The amount of data which + * ends up in the linear part of the xdp_buff depends on the NIC and its + * configuration. When a frag-capable XDP program wants to directly access + * headers that may be in the non-linear area, call this kfunc to make sure + * the data is available in the linear area. Alternatively, use dynptr or + * bpf_xdp_{load,store}_bytes() to access data without pulling. + * + * This kfunc can also be used with bpf_xdp_adjust_head() to decapsulate + * headers in the non-linear data area. + * + * A call to this kfunc may reduce headroom. If there is not enough tailroom + * in the linear data area, metadata and data will be shifted down. + * + * A call to this kfunc may change the buffer geometry. + * Therefore, at load time, all checks on pointers previously done by the + * verifier are invalidated and must be performed again, if the kfunc is used + * in combination with direct packet access. + * + * Return: + * * %0 - success + * * %-EINVAL - invalid len + */ +__bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len) +{ + struct xdp_buff *xdp = (struct xdp_buff *)x; + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + int i, delta, shift, headroom, tailroom, n_frags_free = 0; + void *data_hard_end = xdp_data_hard_end(xdp); + int data_len = xdp->data_end - xdp->data; + void *start; + + if (len <= data_len) + return 0; + + if (unlikely(len > xdp_get_buff_len(xdp))) + return -EINVAL; + + start = xdp_data_meta_unsupported(xdp) ?
xdp->data : xdp->data_meta; + + headroom = start - xdp->data_hard_start - sizeof(struct xdp_frame); + tailroom = data_hard_end - xdp->data_end; + + delta = len - data_len; + if (unlikely(delta > tailroom + headroom)) + return -EINVAL; + + shift = delta - tailroom; + if (shift > 0) { + memmove(start - shift, start, xdp->data_end - start); + + xdp->data_meta -= shift; + xdp->data -= shift; + xdp->data_end -= shift; + } + + for (i = 0; i < sinfo->nr_frags && delta; i++) { + skb_frag_t *frag = &sinfo->frags[i]; + u32 shrink = min_t(u32, delta, skb_frag_size(frag)); + + memcpy(xdp->data_end, skb_frag_address(frag), shrink); + + xdp->data_end += shrink; + sinfo->xdp_frags_size -= shrink; + delta -= shrink; + if (bpf_xdp_shrink_data(xdp, frag, shrink, false)) + n_frags_free++; + } + + if (unlikely(n_frags_free)) { + memmove(sinfo->frags, sinfo->frags + n_frags_free, + (sinfo->nr_frags - n_frags_free) * sizeof(skb_frag_t)); + + sinfo->nr_frags -= n_frags_free; + + if (!sinfo->nr_frags) { + xdp_buff_clear_frags_flag(xdp); + xdp_buff_clear_frag_pfmemalloc(xdp); + } + } + + return 0; +} + __bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags, @@ -12181,8 +12441,13 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_skb) BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_kfunc_check_set_skb) +BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta) +BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS) +BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta) + BTF_KFUNCS_START(bpf_kfunc_check_set_xdp) BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) +BTF_ID_FLAGS(func, bpf_xdp_pull_data) BTF_KFUNCS_END(bpf_kfunc_check_set_xdp) BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr) @@ -12202,6 +12467,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .set = &bpf_kfunc_check_set_skb, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_skb_meta, +}; + static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_xdp, @@ -12237,6 +12507,8 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, &bpf_kfunc_set_sock_addr); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 7d426a8e29f3..f112156db587 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -90,10 +90,12 @@ static void est_timer(struct timer_list *t) rate = (b_packets - est->last_packets) << (10 - est->intvl_log); rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); + preempt_disable_nested(); write_seqcount_begin(&est->seq); est->avbps += brate; est->avpps += rate; write_seqcount_end(&est->seq); + preempt_enable_nested(); est->last_bytes = b_bytes; est->last_packets = b_packets; diff --git a/net/core/gro.c b/net/core/gro.c index b350e5b69549..76f9c3712422 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include <net/psp.h> 
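For reference, the bpf_xdp_pull_data() kfunc added above is meant to be called from frag-aware XDP programs before using direct packet access on headers that may live in the non-linear area. A minimal sketch of such a caller, assuming the usual libbpf conventions for kfunc declarations and the "xdp.frags" section name (the 128-byte pull size and the program name are likewise illustrative, not taken from the patch):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Kfunc declared as an external kernel symbol, per common libbpf usage. */
extern int bpf_xdp_pull_data(struct xdp_md *xdp, __u32 len) __ksym;

SEC("xdp.frags")
int pull_headers(struct xdp_md *ctx)
{
	void *data, *data_end;

	/* Make sure the first 128 bytes are linear so the headers can be
	 * parsed with direct packet access.
	 */
	if (bpf_xdp_pull_data(ctx, 128))
		return XDP_PASS;

	/* The pull may change the buffer geometry, so packet pointers must
	 * be re-derived and re-checked afterwards.
	 */
	data = (void *)(long)ctx->data;
	data_end = (void *)(long)ctx->data_end;
	if (data + 128 > data_end)
		return XDP_PASS;

	/* ... parse Ethernet/IP/L4 headers here ... */
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";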
#include <net/gro.h> #include <net/dst_metadata.h> #include <net/busy_poll.h> @@ -376,6 +377,7 @@ static void gro_list_prepare(const struct list_head *head, diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); diffs |= gro_list_prepare_tc_ext(skb, p, diffs); + diffs |= __psp_skb_coalesce_diff(skb, p, diffs); } NAPI_GRO_CB(p)->same_flow = !diffs; @@ -637,6 +639,8 @@ EXPORT_SYMBOL(gro_receive_skb); static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { + struct skb_shared_info *shinfo; + if (unlikely(skb->pfmemalloc)) { consume_skb(skb); return; @@ -653,8 +657,12 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb->ip_summed = CHECKSUM_NONE; - skb_shinfo(skb)->gso_type = 0; - skb_shinfo(skb)->gso_size = 0; + + shinfo = skb_shinfo(skb); + shinfo->gso_type = 0; + shinfo->gso_size = 0; + shinfo->hwtstamps.hwtstamp = 0; + if (unlikely(skb->slow_gro)) { skb_orphan(skb); skb_ext_reset(skb); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index ff8e5b64bf6b..a725d21159a6 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -8,11 +8,13 @@ struct gro_cell { struct sk_buff_head napi_skbs; struct napi_struct napi; + local_lock_t bh_lock; }; int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) { struct net_device *dev = skb->dev; + bool have_bh_lock = false; struct gro_cell *cell; int res; @@ -25,6 +27,8 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) goto unlock; } + local_lock_nested_bh(&gcells->cells->bh_lock); + have_bh_lock = true; cell = this_cpu_ptr(gcells->cells); if (skb_queue_len(&cell->napi_skbs) > READ_ONCE(net_hotdata.max_backlog)) { @@ -42,6 +46,8 @@ drop: res = NET_RX_SUCCESS; unlock: + if (have_bh_lock) + local_unlock_nested_bh(&gcells->cells->bh_lock); rcu_read_unlock(); return res; } @@ -55,7 +61,9 @@ static int gro_cell_poll(struct napi_struct *napi, int budget) int work_done = 0; while (work_done < budget) { + __local_lock_nested_bh(&cell->bh_lock); skb = __skb_dequeue(&cell->napi_skbs); + __local_unlock_nested_bh(&cell->bh_lock); if (!skb) break; napi_gro_receive(napi, skb); @@ -79,6 +87,7 @@ int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) struct gro_cell *cell = per_cpu_ptr(gcells->cells, i); __skb_queue_head_init(&cell->napi_skbs); + local_lock_init(&cell->bh_lock); set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state); diff --git a/net/core/hotdata.c b/net/core/hotdata.c index 95d0a4df1006..dddd5c287cf0 100644 --- a/net/core/hotdata.c +++ b/net/core/hotdata.c @@ -20,7 +20,7 @@ struct net_hotdata net_hotdata __cacheline_aligned = { .dev_tx_weight = 64, .dev_rx_weight = 64, .sysctl_max_skb_frags = MAX_SKB_FRAGS, - .sysctl_skb_defer_max = 64, + .sysctl_skb_defer_max = 128, .sysctl_mem_pcpu_rsv = SK_MEMORY_PCPU_RESERVE }; EXPORT_SYMBOL(net_hotdata); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 864f3bbc3a4c..212cde35affa 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -157,9 +157,9 @@ static void linkwatch_schedule_work(int urgent) * override the existing timer. 
*/ if (test_bit(LW_URGENT, &linkwatch_flags)) - mod_delayed_work(system_unbound_wq, &linkwatch_work, 0); + mod_delayed_work(system_dfl_wq, &linkwatch_work, 0); else - queue_delayed_work(system_unbound_wq, &linkwatch_work, delay); + queue_delayed_work(system_dfl_wq, &linkwatch_work, delay); } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index ae74634310a3..9f40be0c3e71 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -8,12 +8,12 @@ #include <linux/skbuff.h> #include <linux/types.h> #include <linux/bpf.h> +#include <net/flow.h> #include <net/lwtunnel.h> #include <net/gre.h> #include <net/ip.h> #include <net/ip6_route.h> #include <net/ipv6_stubs.h> -#include <net/inet_dscp.h> struct bpf_lwt_prog { struct bpf_prog *prog; @@ -209,7 +209,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl4.flowi4_oif = oif; fl4.flowi4_mark = skb->mark; fl4.flowi4_uid = sock_net_uid(net, sk); - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = iph->protocol; fl4.daddr = iph->daddr; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index bddfa389effa..96a3b1a93252 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -81,7 +81,7 @@ static struct hlist_head *neigh_get_dev_table(struct net_device *dev, int family } /* - Neighbour hash table buckets are protected with rwlock tbl->lock. + Neighbour hash table buckets are protected with tbl->lock. - All the scans/updates to hash buckets MUST be made under this lock. - NOTHING clever should be made under this lock: no callbacks @@ -149,7 +149,7 @@ static void neigh_update_gc_list(struct neighbour *n) { bool on_gc_list, exempt_from_gc; - write_lock_bh(&n->tbl->lock); + spin_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; @@ -172,14 +172,14 @@ static void neigh_update_gc_list(struct neighbour *n) } out: write_unlock(&n->lock); - write_unlock_bh(&n->tbl->lock); + spin_unlock_bh(&n->tbl->lock); } static void neigh_update_managed_list(struct neighbour *n) { bool on_managed_list, add_to_managed; - write_lock_bh(&n->tbl->lock); + spin_lock_bh(&n->tbl->lock); write_lock(&n->lock); if (n->dead) goto out; @@ -193,7 +193,7 @@ static void neigh_update_managed_list(struct neighbour *n) list_add_tail(&n->managed_list, &n->tbl->managed_list); out: write_unlock(&n->lock); - write_unlock_bh(&n->tbl->lock); + spin_unlock_bh(&n->tbl->lock); } static void neigh_update_flags(struct neighbour *neigh, u32 flags, int *notify, @@ -263,7 +263,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); list_for_each_entry_safe(n, tmp, &tbl->gc_list, gc_list) { if (refcount_read(&n->refcnt) == 1) { @@ -292,7 +292,7 @@ static int neigh_forced_gc(struct neigh_table *tbl) WRITE_ONCE(tbl->last_flush, jiffies); unlock: - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); return shrunk; } @@ -454,23 +454,23 @@ static void neigh_flush_table(struct neigh_table *tbl) void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) { - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, false); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); } EXPORT_SYMBOL(neigh_changeaddr); static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm) { - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); if (likely(dev)) { neigh_flush_dev(tbl, dev, skip_perm); } else { 
DEBUG_NET_WARN_ON_ONCE(skip_perm); neigh_flush_table(tbl); } - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); pneigh_ifdown(tbl, dev, skip_perm); pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL, @@ -687,7 +687,7 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, n->confirmed = jiffies - (NEIGH_VAR(n->parms, BASE_REACHABLE_TIME) << 1); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); @@ -722,13 +722,13 @@ ___neigh_create(struct neigh_table *tbl, const void *pkey, hlist_add_head_rcu(&n->dev_list, neigh_get_dev_table(dev, tbl->family)); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); neigh_dbg(2, "neigh %p is created\n", n); rc = n; out: return rc; out_tbl_unlock: - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); out_neigh_release: if (!exempt_from_gc) atomic_dec(&tbl->gc_entries); @@ -982,7 +982,7 @@ static void neigh_periodic_work(struct work_struct *work) NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); @@ -995,8 +995,7 @@ static void neigh_periodic_work(struct work_struct *work) WRITE_ONCE(tbl->last_rand, jiffies); list_for_each_entry(p, &tbl->parms_list, list) - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); } if (atomic_read(&tbl->entries) < READ_ONCE(tbl->gc_thresh1)) @@ -1037,9 +1036,9 @@ static void neigh_periodic_work(struct work_struct *work) * It's fine to release lock here, even if hash table * grows while we are preempted. */ - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); cond_resched(); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); } @@ -1050,7 +1049,7 @@ out: */ queue_delayed_work(system_power_efficient_wq, &tbl->gc_work, NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME) >> 1); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); } static __inline__ int neigh_max_probes(struct neighbour *n) @@ -1642,12 +1641,12 @@ static void neigh_managed_work(struct work_struct *work) managed_work.work); struct neighbour *neigh; - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); list_for_each_entry(neigh, &tbl->managed_list, managed_list) neigh_event_send_probe(neigh, NULL, false); queue_delayed_work(system_power_efficient_wq, &tbl->managed_work, NEIGH_VAR(&tbl->parms, INTERVAL_PROBE_TIME_MS)); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); } static void neigh_proxy_process(struct timer_list *t) @@ -1749,8 +1748,7 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, if (p) { p->tbl = tbl; refcount_set(&p->refcnt, 1); - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); p->qlen = 0; netdev_hold(dev, &p->dev_tracker, GFP_KERNEL); p->dev = dev; @@ -1763,9 +1761,9 @@ struct neigh_parms *neigh_parms_alloc(struct net_device *dev, return NULL; } - write_lock_bh(&tbl->lock); - list_add(&p->list, &tbl->parms.list); - write_unlock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); + list_add_rcu(&p->list, &tbl->parms.list); + spin_unlock_bh(&tbl->lock); neigh_parms_data_state_cleanall(p); } @@ -1785,10 +1783,12 @@ void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) { if (!parms || parms == &tbl->parms) return; - write_lock_bh(&tbl->lock); - list_del(&parms->list); + + 
spin_lock_bh(&tbl->lock); + list_del_rcu(&parms->list); parms->dead = 1; - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); + netdev_put(parms->dev, &parms->dev_tracker); call_rcu(&parms->rcu_head, neigh_rcu_free_parms); } @@ -1810,8 +1810,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) list_add(&tbl->parms.list, &tbl->parms_list); write_pnet(&tbl->parms.net, &init_net); refcount_set(&tbl->parms.refcnt, 1); - tbl->parms.reachable_time = - neigh_rand_reach_time(NEIGH_VAR(&tbl->parms, BASE_REACHABLE_TIME)); + neigh_set_reach_time(&tbl->parms); tbl->parms.qlen = 0; tbl->stats = alloc_percpu(struct neigh_statistics); @@ -1838,7 +1837,7 @@ void neigh_table_init(int index, struct neigh_table *tbl) else WARN_ON(tbl->entry_size % NEIGH_PRIV_ALIGN); - rwlock_init(&tbl->lock); + spin_lock_init(&tbl->lock); mutex_init(&tbl->phash_lock); INIT_DEFERRABLE_WORK(&tbl->gc_work, neigh_periodic_work); @@ -1981,10 +1980,10 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, err = __neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, NETLINK_CB(skb).portid, extack); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); out: return err; @@ -2179,7 +2178,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) return -ENOBUFS; if ((parms->dev && - nla_put_u32(skb, NDTPA_IFINDEX, parms->dev->ifindex)) || + nla_put_u32(skb, NDTPA_IFINDEX, READ_ONCE(parms->dev->ifindex))) || nla_put_u32(skb, NDTPA_REFCNT, refcount_read(&parms->refcnt)) || nla_put_u32(skb, NDTPA_QUEUE_LENBYTES, NEIGH_VAR(parms, QUEUE_LEN_BYTES)) || @@ -2194,7 +2193,7 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms) NEIGH_VAR(parms, MCAST_PROBES)) || nla_put_u32(skb, NDTPA_MCAST_REPROBES, NEIGH_VAR(parms, MCAST_REPROBES)) || - nla_put_msecs(skb, NDTPA_REACHABLE_TIME, parms->reachable_time, + nla_put_msecs(skb, NDTPA_REACHABLE_TIME, READ_ONCE(parms->reachable_time), NDTPA_PAD) || nla_put_msecs(skb, NDTPA_BASE_REACHABLE_TIME, NEIGH_VAR(parms, BASE_REACHABLE_TIME), NDTPA_PAD) || @@ -2231,8 +2230,6 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); - - read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; @@ -2258,11 +2255,9 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, .ndtc_proxy_qlen = READ_ONCE(tbl->proxy_queue.qlen), }; - rcu_read_lock(); nht = rcu_dereference(tbl->nht); ndc.ndtc_hash_rnd = nht->hash_rnd[0]; ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1); - rcu_read_unlock(); if (nla_put(skb, NDTA_CONFIG, sizeof(ndc), &ndc)) goto nla_put_failure; @@ -2300,12 +2295,10 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, if (neightbl_fill_parms(skb, &tbl->parms) < 0) goto nla_put_failure; - read_unlock_bh(&tbl->lock); nlmsg_end(skb, nlh); return 0; nla_put_failure: - read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } @@ -2324,8 +2317,6 @@ static int neightbl_fill_param_info(struct sk_buff *skb, return -EMSGSIZE; ndtmsg = nlmsg_data(nlh); - - read_lock_bh(&tbl->lock); ndtmsg->ndtm_family = tbl->family; ndtmsg->ndtm_pad1 = 0; ndtmsg->ndtm_pad2 = 0; @@ -2334,11 +2325,9 @@ static int neightbl_fill_param_info(struct sk_buff *skb, neightbl_fill_parms(skb, parms) < 0) goto errout; - read_unlock_bh(&tbl->lock); 
nlmsg_end(skb, nlh); return 0; errout: - read_unlock_bh(&tbl->lock); nlmsg_cancel(skb, nlh); return -EMSGSIZE; } @@ -2375,9 +2364,9 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + struct nlattr *tb[NDTA_MAX + 1]; struct neigh_table *tbl; struct ndtmsg *ndtmsg; - struct nlattr *tb[NDTA_MAX+1]; bool found = false; int err, tidx; @@ -2393,26 +2382,33 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, ndtmsg = nlmsg_data(nlh); + rcu_read_lock(); + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { - tbl = rcu_dereference_rtnl(neigh_tables[tidx]); + tbl = rcu_dereference(neigh_tables[tidx]); if (!tbl) continue; + if (ndtmsg->ndtm_family && tbl->family != ndtmsg->ndtm_family) continue; + if (nla_strcmp(tb[NDTA_NAME], tbl->id) == 0) { found = true; break; } } - if (!found) - return -ENOENT; + if (!found) { + rcu_read_unlock(); + err = -ENOENT; + goto errout; + } /* * We acquire tbl->lock to be nice to the periodic timers and * make sure they always see a consistent set of values. */ - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); if (tb[NDTA_PARMS]) { struct nlattr *tbp[NDTPA_MAX+1]; @@ -2475,8 +2471,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, * only be effective after the next time neigh_periodic_work * decides to recompute it (can be multiple minutes) */ - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); break; case NDTPA_GC_STALETIME: NEIGH_VAR_SET(p, GC_STALETIME, @@ -2532,7 +2527,8 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, err = 0; errout_tbl_lock: - write_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); + rcu_read_unlock(); errout: return err; } @@ -2579,10 +2575,12 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family; + rcu_read_lock(); + for (tidx = 0; tidx < NEIGH_NR_TABLES; tidx++) { struct neigh_parms *p; - tbl = rcu_dereference_rtnl(neigh_tables[tidx]); + tbl = rcu_dereference(neigh_tables[tidx]); if (!tbl) continue; @@ -2596,7 +2594,7 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) nidx = 0; p = list_next_entry(&tbl->parms, list); - list_for_each_entry_from(p, &tbl->parms_list, list) { + list_for_each_entry_from_rcu(p, &tbl->parms_list, list) { if (!net_eq(neigh_parms_net(p), net)) continue; @@ -2616,6 +2614,8 @@ static int neightbl_dump_info(struct sk_buff *skb, struct netlink_callback *cb) neigh_skip = 0; } out: + rcu_read_unlock(); + cb->args[0] = tidx; cb->args[1] = nidx; @@ -3127,14 +3127,14 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void rcu_read_lock(); nht = rcu_dereference(tbl->nht); - read_lock_bh(&tbl->lock); /* avoid resizes */ + spin_lock_bh(&tbl->lock); /* avoid resizes */ for (chain = 0; chain < (1 << nht->hash_shift); chain++) { struct neighbour *n; neigh_for_each_in_bucket(n, &nht->hash_heads[chain]) cb(n, cookie); } - read_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_for_each); @@ -3404,7 +3404,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl rcu_read_lock(); state->nht = rcu_dereference(tbl->nht); - read_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); return *pos ? 
neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } @@ -3444,7 +3444,7 @@ void neigh_seq_stop(struct seq_file *seq, void *v) struct neigh_seq_state *state = seq->private; struct neigh_table *tbl = state->tbl; - read_unlock_bh(&tbl->lock); + spin_unlock_bh(&tbl->lock); rcu_read_unlock(); } EXPORT_SYMBOL(neigh_seq_stop); @@ -3721,8 +3721,7 @@ static int neigh_proc_base_reachable_time(const struct ctl_table *ctl, int write * only be effective after the next time neigh_periodic_work * decides to recompute it */ - p->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(p, BASE_REACHABLE_TIME)); + neigh_set_reach_time(p); } return ret; } @@ -3918,8 +3917,10 @@ static const struct rtnl_msg_handler neigh_rtnl_msg_handlers[] __initconst = { {.msgtype = RTM_DELNEIGH, .doit = neigh_delete}, {.msgtype = RTM_GETNEIGH, .doit = neigh_get, .dumpit = neigh_dump_info, .flags = RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, - {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info}, - {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set}, + {.msgtype = RTM_GETNEIGHTBL, .dumpit = neightbl_dump_info, + .flags = RTNL_FLAG_DUMP_UNLOCKED}, + {.msgtype = RTM_SETNEIGHTBL, .doit = neightbl_set, + .flags = RTNL_FLAG_DOIT_UNLOCKED}, }; static int __init neigh_init(void) diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 4f0f0709a1cb..70e0e9a3b650 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -145,7 +145,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - READ_ONCE(sd->processed), atomic_read(&sd->dropped), + READ_ONCE(sd->processed), + numa_drop_read(&sd->drop_counters), READ_ONCE(sd->time_squeeze), 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c28cd6665444..ca878525ad7c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1120,8 +1120,10 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, return -ENOMEM; table->log = ilog2(mask) + 1; - for (count = 0; count <= mask; count++) + for (count = 0; count <= mask; count++) { table->flows[count].cpu = RPS_NO_CPU; + table->flows[count].filter = RPS_NO_FILTER; + } } else { table = NULL; } @@ -1328,7 +1330,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) struct netdev_rx_queue *queue = &dev->_rx[i]; struct kobject *kobj = &queue->kobj; - if (!refcount_read(&dev_net(dev)->ns.count)) + if (!check_net(dev_net(dev))) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); @@ -2061,7 +2063,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; - if (!refcount_read(&dev_net(dev)->ns.count)) + if (!check_net(dev_net(dev))) queue->kobj.uevent_suppress = 1; if (netdev_uses_bql(dev)) @@ -2315,7 +2317,7 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; - if (!refcount_read(&dev_net(ndev)->ns.count)) + if (!check_net(dev_net(ndev))) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 1b6f3826dd0e..a6e6a964a287 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -20,6 +20,7 @@ #include <linux/sched/task.h> #include <linux/uidgid.h> #include <linux/proc_fs.h> +#include <linux/nstree.h> #include 
<net/aligned_data.h> #include <net/sock.h> @@ -314,7 +315,7 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp) { int id; - if (refcount_read(&net->ns.count) == 0) + if (!check_net(net)) return NETNSA_NSID_NOT_ASSIGNED; spin_lock(&net->nsid_lock); @@ -394,13 +395,19 @@ static __net_init void preinit_net_sysctl(struct net *net) net->core.sysctl_optmem_max = 128 * 1024; net->core.sysctl_txrehash = SOCK_TXREHASH_ENABLED; net->core.sysctl_tstamp_allow_data = 1; + net->core.sysctl_txq_reselection = msecs_to_jiffies(1000); } /* init code that must occur even if setup_net() is not called. */ -static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns) +static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns) { + int ret; + + ret = ns_common_init(net); + if (ret) + return ret; + refcount_set(&net->passive, 1); - refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt"); ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt"); @@ -420,6 +427,7 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ INIT_LIST_HEAD(&net->ptype_all); INIT_LIST_HEAD(&net->ptype_specific); preinit_net_sysctl(net); + return 0; } /* @@ -432,7 +440,7 @@ static __net_init int setup_net(struct net *net) LIST_HEAD(net_exit_list); int error = 0; - net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie); + net->net_cookie = ns_tree_gen_id(net); list_for_each_entry(ops, &pernet_list, list) { error = ops_init(ops, net); @@ -442,6 +450,7 @@ static __net_init int setup_net(struct net *net) down_write(&net_rwsem); list_add_tail_rcu(&net->list, &net_namespace_list); up_write(&net_rwsem); + ns_tree_add_raw(net); out: return error; @@ -539,7 +548,7 @@ void net_drop_ns(void *p) net_passive_dec(net); } -struct net *copy_net_ns(unsigned long flags, +struct net *copy_net_ns(u64 flags, struct user_namespace *user_ns, struct net *old_net) { struct ucounts *ucounts; @@ -559,7 +568,9 @@ struct net *copy_net_ns(unsigned long flags, goto dec_ucounts; } - preinit_net(net, user_ns); + rv = preinit_net(net, user_ns); + if (rv < 0) + goto dec_ucounts; net->ucounts = ucounts; get_user_ns(user_ns); @@ -573,6 +584,7 @@ struct net *copy_net_ns(unsigned long flags, if (rv < 0) { put_userns: + ns_common_free(net); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); #endif @@ -659,8 +671,10 @@ static void cleanup_net(struct work_struct *work) /* Don't let anyone else find us. */ down_write(&net_rwsem); - llist_for_each_entry(net, net_kill_list, cleanup_list) + llist_for_each_entry(net, net_kill_list, cleanup_list) { + ns_tree_remove(net); list_del_rcu(&net->list); + } /* Cache last net. After we unlock rtnl, no one new net * added to net_namespace_list can assign nsid pointer * to a net from net_kill_list (see peernet2id_alloc()). 
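For readers following the setup_net()/cleanup_net() changes above: each subsystem that wants per-namespace state hooks into exactly this ops_init()/exit walk over pernet_list. A minimal, purely illustrative registration looks like:

/* Illustrative per-netns subsystem: setup_net() calls .init for every new
 * namespace, cleanup_net() calls .exit when the namespace is torn down.
 */
static int __net_init example_net_init(struct net *net)
{
	/* allocate and attach per-namespace state here */
	return 0;
}

static void __net_exit example_net_exit(struct net *net)
{
	/* release per-namespace state here */
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
};

/* typically called from a module or subsystem init function:
 *	register_pernet_subsys(&example_net_ops);
 */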
@@ -693,6 +707,7 @@ static void cleanup_net(struct work_struct *work) /* Finally it is safe to free my network namespace structure */ list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) { list_del_init(&net->exit_list); + ns_common_free(net); dec_net_namespaces(net->ucounts); #ifdef CONFIG_KEYS key_remove_domain(net->key_domain); @@ -812,31 +827,12 @@ static void net_ns_net_debugfs(struct net *net) static __net_init int net_ns_net_init(struct net *net) { -#ifdef CONFIG_NET_NS - net->ns.ops = &netns_operations; -#endif - net->ns.inum = PROC_NET_INIT_INO; - if (net != &init_net) { - int ret = ns_alloc_inum(&net->ns); - if (ret) - return ret; - } net_ns_net_debugfs(net); return 0; } -static __net_exit void net_ns_net_exit(struct net *net) -{ - /* - * Initial network namespace doesn't exit so we don't need any - * special checks here. - */ - ns_free_inum(&net->ns); -} - static struct pernet_operations __net_initdata net_ns_ops = { .init = net_ns_net_init, - .exit = net_ns_net_exit, }; static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { @@ -1227,15 +1223,13 @@ static void __init netns_ipv4_struct_check(void) sysctl_tcp_wmem); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_tx, sysctl_ip_fwd_use_pmtu); - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_tx, 33); - - /* TXRX readonly hotpath cache lines */ - CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_txrx, - sysctl_tcp_moderate_rcvbuf); - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_txrx, 1); /* RX readonly hotpath cache line */ CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_moderate_rcvbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, + sysctl_tcp_rcvbuf_low_rtt); + CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_ip_early_demux); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_early_demux); @@ -1245,7 +1239,6 @@ static void __init netns_ipv4_struct_check(void) sysctl_tcp_reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct netns_ipv4, netns_ipv4_read_rx, sysctl_tcp_rmem); - CACHELINE_ASSERT_GROUP_SIZE(struct netns_ipv4, netns_ipv4_read_rx, 22); } #endif @@ -1282,7 +1275,12 @@ void __init net_ns_init(void) #ifdef CONFIG_KEYS init_net.key_domain = &init_net_key_domain; #endif - preinit_net(&init_net, &init_user_ns); + /* + * This currently cannot fail as the initial network namespace + * has a static inode number. + */ + if (preinit_net(&init_net, &init_user_ns)) + panic("Could not preinitialize the initial network namespace"); down_write(&pernet_ops_rwsem); if (setup_net(&init_net)) @@ -1517,11 +1515,6 @@ static struct ns_common *netns_get(struct task_struct *task) return net ? 
&net->ns : NULL; } -static inline struct net *to_net_ns(struct ns_common *ns) -{ - return container_of(ns, struct net, ns); -} - static void netns_put(struct ns_common *ns) { put_net(to_net_ns(ns)); @@ -1548,7 +1541,6 @@ static struct user_namespace *netns_owner(struct ns_common *ns) const struct proc_ns_operations netns_operations = { .name = "net", - .type = CLONE_NEWNET, .get = netns_get, .put = netns_put, .install = netns_install, diff --git a/net/core/netdev-genl-gen.c b/net/core/netdev-genl-gen.c index e9a2a6f26cb7..ba673e81716f 100644 --- a/net/core/netdev-genl-gen.c +++ b/net/core/netdev-genl-gen.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/netdev.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include <net/netlink.h> #include <net/genetlink.h> @@ -97,7 +98,7 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range), [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, }, [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, }, - [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1), + [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 2), }; /* NETDEV_CMD_BIND_TX - do */ diff --git a/net/core/netdev-genl-gen.h b/net/core/netdev-genl-gen.h index cf3fad74511f..cffc08517a41 100644 --- a/net/core/netdev-genl-gen.h +++ b/net/core/netdev-genl-gen.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/netdev.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_NETDEV_GEN_H #define _LINUX_NETDEV_GEN_H diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 6314eb7bdf69..470fabbeacd9 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -869,16 +869,79 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, return err; } -int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) +static int netdev_nl_read_rxq_bitmap(struct genl_info *info, + u32 rxq_bitmap_len, + unsigned long *rxq_bitmap) { + const int maxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1; struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; + struct nlattr *attr; + int rem, err = 0; + u32 rxq_idx; + + nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, + genlmsg_data(info->genlhdr), + genlmsg_len(info->genlhdr), rem) { + err = nla_parse_nested(tb, maxtype, attr, + netdev_queue_id_nl_policy, info->extack); + if (err < 0) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || + NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) + return -EINVAL; + + if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { + NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); + return -EINVAL; + } + + rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); + if (rxq_idx >= rxq_bitmap_len) { + NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_ID]); + return -EINVAL; + } + + bitmap_set(rxq_bitmap, rxq_idx, 1); + } + + return 0; +} + +static struct device * +netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap, + struct netlink_ext_ack *extack) +{ + struct device *dma_dev = NULL; + u32 rxq_idx, prev_rxq_idx; + + for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { + struct device *rxq_dma_dev; + + rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx); + if (dma_dev && rxq_dma_dev != dma_dev) { + 
NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)", + rxq_idx, prev_rxq_idx); + return ERR_PTR(-EOPNOTSUPP); + } + + dma_dev = rxq_dma_dev; + prev_rxq_idx = rxq_idx; + } + + return dma_dev; +} + +int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) +{ struct net_devmem_dmabuf_binding *binding; u32 ifindex, dmabuf_fd, rxq_idx; struct netdev_nl_sock *priv; struct net_device *netdev; + unsigned long *rxq_bitmap; + struct device *dma_dev; struct sk_buff *rsp; - struct nlattr *attr; - int rem, err = 0; + int err = 0; void *hdr; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || @@ -921,36 +984,31 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock; } - binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd, - priv, info->extack); - if (IS_ERR(binding)) { - err = PTR_ERR(binding); + rxq_bitmap = bitmap_zalloc(netdev->real_num_rx_queues, GFP_KERNEL); + if (!rxq_bitmap) { + err = -ENOMEM; goto err_unlock; } - nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, - genlmsg_data(info->genlhdr), - genlmsg_len(info->genlhdr), rem) { - err = nla_parse_nested( - tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr, - netdev_queue_id_nl_policy, info->extack); - if (err < 0) - goto err_unbind; - - if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || - NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) { - err = -EINVAL; - goto err_unbind; - } + err = netdev_nl_read_rxq_bitmap(info, netdev->real_num_rx_queues, + rxq_bitmap); + if (err) + goto err_rxq_bitmap; - if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { - NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); - err = -EINVAL; - goto err_unbind; - } + dma_dev = netdev_nl_get_dma_dev(netdev, rxq_bitmap, info->extack); + if (IS_ERR(dma_dev)) { + err = PTR_ERR(dma_dev); + goto err_rxq_bitmap; + } - rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); + binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE, + dmabuf_fd, priv, info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_rxq_bitmap; + } + for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding, info->extack); if (err) @@ -964,6 +1022,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) if (err) goto err_unbind; + bitmap_free(rxq_bitmap); + netdev_unlock(netdev); mutex_unlock(&priv->lock); @@ -972,6 +1032,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) err_unbind: net_devmem_unbind_dmabuf(binding); +err_rxq_bitmap: + bitmap_free(rxq_bitmap); err_unlock: netdev_unlock(netdev); err_unlock_sock: @@ -986,6 +1048,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) struct net_devmem_dmabuf_binding *binding; struct netdev_nl_sock *priv; struct net_device *netdev; + struct device *dma_dev; u32 ifindex, dmabuf_fd; struct sk_buff *rsp; int err = 0; @@ -1032,8 +1095,9 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock_netdev; } - binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, priv, - info->extack); + dma_dev = netdev_queue_get_dma_dev(netdev, 0); + binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE, + dmabuf_fd, priv, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock_netdev; diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c new file mode 100644 index 
000000000000..251f27a8307f --- /dev/null +++ b/net/core/netdev_queues.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <net/netdev_queues.h> + +/** + * netdev_queue_get_dma_dev() - get dma device for zero-copy operations + * @dev: net_device + * @idx: queue index + * + * Get dma device for zero-copy operations to be used for this queue. + * When such device is not available or valid, the function will return NULL. + * + * Return: Device or NULL on error + */ +struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) +{ + const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops; + struct device *dma_dev; + + if (queue_ops && queue_ops->ndo_queue_get_dma_dev) + dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx); + else + dma_dev = dev->dev.parent; + + return dma_dev && dma_dev->dma_mask ? dma_dev : NULL; +} + diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 3bf1151d8061..c7d9341b7630 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -9,6 +9,15 @@ #include "page_pool_priv.h" +/* See also page_pool_is_unreadable() */ +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) +{ + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); + + return !!rxq->mp_params.mp_ops; +} +EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); + int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) { struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); diff --git a/net/core/netmem_priv.h b/net/core/netmem_priv.h index cd95394399b4..23175cb2bd86 100644 --- a/net/core/netmem_priv.h +++ b/net/core/netmem_priv.h @@ -5,19 +5,19 @@ static inline unsigned long netmem_get_pp_magic(netmem_ref netmem) { - return __netmem_clear_lsb(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; + return netmem_to_nmdesc(netmem)->pp_magic & ~PP_DMA_INDEX_MASK; } static inline void netmem_or_pp_magic(netmem_ref netmem, unsigned long pp_magic) { - __netmem_clear_lsb(netmem)->pp_magic |= pp_magic; + netmem_to_nmdesc(netmem)->pp_magic |= pp_magic; } static inline void netmem_clear_pp_magic(netmem_ref netmem) { - WARN_ON_ONCE(__netmem_clear_lsb(netmem)->pp_magic & PP_DMA_INDEX_MASK); + WARN_ON_ONCE(netmem_to_nmdesc(netmem)->pp_magic & PP_DMA_INDEX_MASK); - __netmem_clear_lsb(netmem)->pp_magic = 0; + netmem_to_nmdesc(netmem)->pp_magic = 0; } static inline bool netmem_is_pp(netmem_ref netmem) @@ -27,13 +27,13 @@ static inline bool netmem_is_pp(netmem_ref netmem) static inline void netmem_set_pp(netmem_ref netmem, struct page_pool *pool) { - __netmem_clear_lsb(netmem)->pp = pool; + netmem_to_nmdesc(netmem)->pp = pool; } static inline void netmem_set_dma_addr(netmem_ref netmem, unsigned long dma_addr) { - __netmem_clear_lsb(netmem)->dma_addr = dma_addr; + netmem_to_nmdesc(netmem)->dma_addr = dma_addr; } static inline unsigned long netmem_get_dma_index(netmem_ref netmem) @@ -43,7 +43,7 @@ static inline unsigned long netmem_get_dma_index(netmem_ref netmem) if (WARN_ON_ONCE(netmem_is_net_iov(netmem))) return 0; - magic = __netmem_clear_lsb(netmem)->pp_magic; + magic = netmem_to_nmdesc(netmem)->pp_magic; return (magic & PP_DMA_INDEX_MASK) >> PP_DMA_INDEX_SHIFT; } @@ -57,6 +57,6 @@ static inline void netmem_set_dma_index(netmem_ref netmem, return; magic = netmem_get_pp_magic(netmem) | (id << PP_DMA_INDEX_SHIFT); - __netmem_clear_lsb(netmem)->pp_magic = magic; + netmem_to_nmdesc(netmem)->pp_magic = magic; } #endif diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 5f65b62346d4..09f72f10813c 100644 --- 
a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -228,19 +228,16 @@ static void refill_skbs(struct netpoll *np) { struct sk_buff_head *skb_pool; struct sk_buff *skb; - unsigned long flags; skb_pool = &np->skb_pool; - spin_lock_irqsave(&skb_pool->lock, flags); - while (skb_pool->qlen < MAX_SKBS) { + while (READ_ONCE(skb_pool->qlen) < MAX_SKBS) { skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); if (!skb) break; - __skb_queue_tail(skb_pool, skb); + skb_queue_tail(skb_pool, skb); } - spin_unlock_irqrestore(&skb_pool->lock, flags); } static void zap_completion_queue(void) @@ -557,6 +554,7 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) int err; skb_queue_head_init(&np->skb_pool); + INIT_WORK(&np->refill_wq, refill_skbs_work_handler); if (ndev->priv_flags & IFF_DISABLE_NETPOLL) { np_err(np, "%s doesn't support polling, aborting\n", @@ -591,11 +589,9 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) np->dev = ndev; strscpy(np->dev_name, ndev->name, IFNAMSIZ); - npinfo->netpoll = np; /* fill up the skb queue */ refill_skbs(np); - INIT_WORK(&np->refill_wq, refill_skbs_work_handler); /* last thing to do is link it to the net device structure */ rcu_assign_pointer(ndev->npinfo, npinfo); @@ -815,6 +811,10 @@ static void __netpoll_cleanup(struct netpoll *np) if (!npinfo) return; + /* At this point, there is a single npinfo instance per netdevice, and + * its refcnt tracks how many netpoll structures are linked to it. We + * only perform npinfo cleanup when the refcnt decrements to zero. + */ if (refcount_dec_and_test(&npinfo->refcnt)) { const struct net_device_ops *ops; @@ -824,8 +824,7 @@ static void __netpoll_cleanup(struct netpoll *np) RCU_INIT_POINTER(np->dev->npinfo, NULL); call_rcu(&npinfo->rcu, rcu_cleanup_netpoll_info); - } else - RCU_INIT_POINTER(np->dev->npinfo, NULL); + } skb_pool_flush(np); } @@ -835,7 +834,7 @@ void __netpoll_free(struct netpoll *np) ASSERT_RTNL(); /* Wait for transmitting packets to finish before freeing. */ - synchronize_rcu(); + synchronize_net(); __netpoll_cleanup(np); kfree(np); } diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 05e2e22a8f7c..265a729431bb 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -211,11 +211,7 @@ static int page_pool_init(struct page_pool *pool, return -EINVAL; if (pool->p.pool_size) - ring_qsize = pool->p.pool_size; - - /* Sanity limit mem that can be pinned down */ - if (ring_qsize > 32768) - return -E2BIG; + ring_qsize = min(pool->p.pool_size, 16384); /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. 
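With the page_pool_init() change above, an oversized pool_size is clamped to 16384 ring entries instead of being rejected with -E2BIG. A rough driver-side setup that this affects (device and sizes are hypothetical) might look like:

/* Hypothetical driver-side page pool creation; values are examples only. */
struct page_pool_params pp_params = {
	.flags		= PP_FLAG_DMA_MAP,
	.order		= 0,
	.pool_size	= 65536,	/* now silently clamped, not -E2BIG */
	.nid		= NUMA_NO_NODE,
	.dev		= &pdev->dev,	/* hypothetical parent device */
	.dma_dir	= DMA_FROM_DEVICE,
	.max_len	= PAGE_SIZE,
	.offset		= 0,
};
struct page_pool *pool = page_pool_create(&pp_params);

if (IS_ERR(pool))
	return PTR_ERR(pool);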
* DMA_BIDIRECTIONAL is for allowing page used for DMA sending, @@ -287,8 +283,10 @@ static int page_pool_init(struct page_pool *pool, } if (pool->mp_ops) { - if (!pool->dma_map || !pool->dma_sync) - return -EOPNOTSUPP; + if (!pool->dma_map || !pool->dma_sync) { + err = -EOPNOTSUPP; + goto free_ptr_ring; + } if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { err = -EFAULT; @@ -303,12 +301,16 @@ static int page_pool_init(struct page_pool *pool, } static_branch_inc(&page_pool_mem_providers); + } else if (pool->p.order > MAX_PAGE_ORDER) { + err = -EINVAL; + goto free_ptr_ring; } return 0; free_ptr_ring: ptr_ring_cleanup(&pool->ring, NULL); + xa_destroy(&pool->dma_mapped); #ifdef CONFIG_PAGE_POOL_STATS if (!pool->system) free_percpu(pool->recycle_stats); @@ -470,11 +472,60 @@ page_pool_dma_sync_for_device(const struct page_pool *pool, } } +static int page_pool_register_dma_index(struct page_pool *pool, + netmem_ref netmem, gfp_t gfp) +{ + int err = 0; + u32 id; + + if (unlikely(!PP_DMA_INDEX_BITS)) + goto out; + + if (in_softirq()) + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + else + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + if (err) { + WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + goto out; + } + + netmem_set_dma_index(netmem, id); +out: + return err; +} + +static int page_pool_release_dma_index(struct page_pool *pool, + netmem_ref netmem) +{ + struct page *old, *page = netmem_to_page(netmem); + unsigned long id; + + if (unlikely(!PP_DMA_INDEX_BITS)) + return 0; + + id = netmem_get_dma_index(netmem); + if (!id) + return -1; + + if (in_softirq()) + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); + else + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); + if (old != page) + return -1; + + netmem_set_dma_index(netmem, 0); + + return 0; +} + static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; int err; - u32 id; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit @@ -493,18 +544,10 @@ static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t g goto unmap_failed; } - if (in_softirq()) - err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), - PP_DMA_INDEX_LIMIT, gfp); - else - err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), - PP_DMA_INDEX_LIMIT, gfp); - if (err) { - WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + err = page_pool_register_dma_index(pool, netmem, gfp); + if (err) goto unset_failed; - } - netmem_set_dma_index(netmem, id); page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; @@ -553,6 +596,12 @@ static noinline netmem_ref __page_pool_alloc_netmems_slow(struct page_pool *pool netmem_ref netmem; int i, nr_pages; + /* Unconditionally set NOWARN if allocating from NAPI. + * Drivers forget to set it, and OOM reports on packet Rx are useless. 
+ */ + if ((gfp & GFP_ATOMIC) == GFP_ATOMIC) + gfp |= __GFP_NOWARN; + /* Don't support bulk alloc for high-order pages */ if (unlikely(pp_order)) return page_to_netmem(__page_pool_alloc_page_order(pool, gfp)); @@ -676,8 +725,6 @@ void page_pool_clear_pp_info(netmem_ref netmem) static __always_inline void __page_pool_release_netmem_dma(struct page_pool *pool, netmem_ref netmem) { - struct page *old, *page = netmem_to_page(netmem); - unsigned long id; dma_addr_t dma; if (!pool->dma_map) @@ -686,15 +733,7 @@ static __always_inline void __page_pool_release_netmem_dma(struct page_pool *poo */ return; - id = netmem_get_dma_index(netmem); - if (!id) - return; - - if (in_softirq()) - old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); - else - old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); - if (old != page) + if (page_pool_release_dma_index(pool, netmem)) return; dma = page_pool_get_dma_addr_netmem(netmem); @@ -704,7 +743,6 @@ static __always_inline void __page_pool_release_netmem_dma(struct page_pool *poo PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); page_pool_set_dma_addr_netmem(netmem, 0); - netmem_set_dma_index(netmem, 0); } /* Disconnects a page (from a page_pool). API users can have a need @@ -1201,6 +1239,35 @@ void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), pool->xdp_mem_id = mem->id; } +/** + * page_pool_enable_direct_recycling() - mark page pool as owned by NAPI + * @pool: page pool to modify + * @napi: NAPI instance to associate the page pool with + * + * Associate a page pool with a NAPI instance for lockless page recycling. + * This is useful when a new page pool has to be added to a NAPI instance + * without disabling that NAPI instance, to mark the point at which control + * path "hands over" the page pool to the NAPI instance. In most cases driver + * can simply set the @napi field in struct page_pool_params, and does not + * have to call this helper. + * + * The function is idempotent, but does not implement any refcounting. + * Single page_pool_disable_direct_recycling() will disable recycling, + * no matter how many times enable was called. + */ +void page_pool_enable_direct_recycling(struct page_pool *pool, + struct napi_struct *napi) +{ + if (READ_ONCE(pool->p.napi) == napi) + return; + WARN_ON(!napi || pool->p.napi); + + mutex_lock(&page_pools_lock); + WRITE_ONCE(pool->p.napi, napi); + mutex_unlock(&page_pools_lock); +} +EXPORT_SYMBOL(page_pool_enable_direct_recycling); + void page_pool_disable_direct_recycling(struct page_pool *pool) { /* Disable direct recycling based on pool->cpuid. diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 0ebe5461d4d9..d41b03fd1f63 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -114,6 +114,7 @@ #include <linux/sys.h> #include <linux/types.h> +#include <linux/minmax.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kernel.h> @@ -2841,8 +2842,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, } i = 0; - frag_len = (datalen/frags) < PAGE_SIZE ? - (datalen/frags) : PAGE_SIZE; + frag_len = min_t(int, datalen / frags, PAGE_SIZE); while (datalen > 0) { if (unlikely(!pkt_dev->page)) { int node = numa_node_id(); @@ -2859,8 +2859,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, if (i == (frags - 1)) skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], pkt_dev->page, 0, - (datalen < PAGE_SIZE ? 
- datalen : PAGE_SIZE)); + min(datalen, PAGE_SIZE)); else skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], pkt_dev->page, 0, frag_len); diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 63de5c635842..897a8f01a67b 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -77,9 +77,7 @@ void reqsk_queue_alloc(struct request_sock_queue *queue) * a simple spin lock - one must consider sock_owned_by_user() and arrange * to use sk_add_backlog() stuff. But what really makes it infeasible is the * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to - * acquire a child's lock while holding listener's socket lock. A corner - * case might also exist in tcp_v4_hnd_req() that will trigger this locking - * order. + * acquire a child's lock while holding listener's socket lock. * * This function also sets "treq->tfo_listener" to false. * treq->tfo_listener is used by the listener so it is protected by the diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 094b085cff20..b1ed55141d8a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -478,7 +478,7 @@ static int rtnl_unregister(int protocol, int msgtype) * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol * @protocol : Protocol family or PF_UNSPEC * - * Identical to calling rtnl_unregster() for all registered message types + * Identical to calling rtnl_unregister() for all registered message types * of a certain protocol family. */ void rtnl_unregister_all(int protocol) @@ -1270,13 +1270,13 @@ static size_t rtnl_dpll_pin_size(const struct net_device *dev) static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { - return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + size_t size; + + size = NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ + nla_total_size(IFALIASZ) /* IFLA_IFALIAS */ + nla_total_size(IFNAMSIZ) /* IFLA_QDISC */ + nla_total_size_64bit(sizeof(struct rtnl_link_ifmap)) - + nla_total_size(sizeof(struct rtnl_link_stats)) - + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)) + nla_total_size(MAX_ADDR_LEN) /* IFLA_ADDRESS */ + nla_total_size(MAX_ADDR_LEN) /* IFLA_BROADCAST */ + nla_total_size(4) /* IFLA_TXQLEN */ @@ -1326,7 +1326,15 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + + nla_total_size(2) /* IFLA_HEADROOM */ + + nla_total_size(2) /* IFLA_TAILROOM */ + 0; + + if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS)) + size += nla_total_size(sizeof(struct rtnl_link_stats)) + + nla_total_size_64bit(sizeof(struct rtnl_link_stats64)); + + return size; } static int rtnl_vf_ports_fill(struct sk_buff *skb, struct net_device *dev) @@ -2091,7 +2099,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, - atomic_read(&dev->carrier_down_count))) + atomic_read(&dev->carrier_down_count)) || + nla_put_u16(skb, IFLA_HEADROOM, + READ_ONCE(dev->needed_headroom)) || + nla_put_u16(skb, IFLA_TAILROOM, + READ_ONCE(dev->needed_tailroom))) goto nla_put_failure; if (rtnl_fill_proto_down(skb, dev)) @@ -2117,7 +2129,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (rtnl_phys_switch_id_fill(skb, dev)) goto nla_put_failure; - if (rtnl_fill_stats(skb, dev)) + if (!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS) && + rtnl_fill_stats(skb, dev)) goto 
nla_put_failure; if (rtnl_fill_vf(skb, dev, ext_filter_mask)) @@ -2243,6 +2256,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT }, + [IFLA_HEADROOM] = { .type = NLA_REJECT }, + [IFLA_TAILROOM] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -4707,9 +4722,6 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, int err; u16 vid; - if (!netlink_capable(skb, CAP_NET_ADMIN)) - return -EPERM; - if (!del_bulk) { err = nlmsg_parse_deprecated(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); diff --git a/net/core/scm.c b/net/core/scm.c index 072d5742440a..cd87f66671aa 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -273,15 +273,13 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) check_object_size(data, cmlen - sizeof(*cm), true); - if (!user_write_access_begin(cm, cmlen)) - goto efault; - - unsafe_put_user(cmlen, &cm->cmsg_len, efault_end); - unsafe_put_user(level, &cm->cmsg_level, efault_end); - unsafe_put_user(type, &cm->cmsg_type, efault_end); - unsafe_copy_to_user(CMSG_USER_DATA(cm), data, - cmlen - sizeof(*cm), efault_end); - user_write_access_end(); + scoped_user_write_access_size(cm, cmlen, efault) { + unsafe_put_user(cmlen, &cm->cmsg_len, efault); + unsafe_put_user(level, &cm->cmsg_level, efault); + unsafe_put_user(type, &cm->cmsg_type, efault); + unsafe_copy_to_user(CMSG_USER_DATA(cm), data, + cmlen - sizeof(*cm), efault); + } } else { struct cmsghdr *cm = msg->msg_control; @@ -299,8 +297,6 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) msg->msg_controllen -= cmlen; return 0; -efault_end: - user_write_access_end(); efault: return -EFAULT; } diff --git a/net/core/selftests.c b/net/core/selftests.c index 3d79133a91a6..8b81feb82c4a 100644 --- a/net/core/selftests.c +++ b/net/core/selftests.c @@ -14,46 +14,10 @@ #include <net/tcp.h> #include <net/udp.h> -struct net_packet_attrs { - const unsigned char *src; - const unsigned char *dst; - u32 ip_src; - u32 ip_dst; - bool tcp; - u16 sport; - u16 dport; - int timeout; - int size; - int max_size; - u8 id; - u16 queue_mapping; - bool bad_csum; -}; - -struct net_test_priv { - struct net_packet_attrs *packet; - struct packet_type pt; - struct completion comp; - int double_vlan; - int vlan_id; - int ok; -}; - -struct netsfhdr { - __be32 version; - __be64 magic; - u8 id; -} __packed; - static u8 net_test_next_id; -#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ - sizeof(struct netsfhdr)) -#define NET_TEST_PKT_MAGIC 0xdeadcafecafedeadULL -#define NET_LB_TIMEOUT msecs_to_jiffies(200) - -static struct sk_buff *net_test_get_skb(struct net_device *ndev, - struct net_packet_attrs *attr) +struct sk_buff *net_test_get_skb(struct net_device *ndev, u8 id, + struct net_packet_attrs *attr) { struct sk_buff *skb = NULL; struct udphdr *uhdr = NULL; @@ -142,8 +106,8 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, shdr = skb_put(skb, sizeof(*shdr)); shdr->version = 0; shdr->magic = cpu_to_be64(NET_TEST_PKT_MAGIC); - attr->id = net_test_next_id; - shdr->id = net_test_next_id++; + attr->id = id; + shdr->id = id; if (attr->size) { void *payload = skb_put(skb, attr->size); @@ -190,6 +154,7 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, return skb; } +EXPORT_SYMBOL_GPL(net_test_get_skb); static int 
net_test_loopback_validate(struct sk_buff *skb, struct net_device *ndev, @@ -286,12 +251,13 @@ static int __net_test_loopback(struct net_device *ndev, tpriv->packet = attr; dev_add_pack(&tpriv->pt); - skb = net_test_get_skb(ndev, attr); + skb = net_test_get_skb(ndev, net_test_next_id, attr); if (!skb) { ret = -ENOMEM; goto cleanup; } + net_test_next_id++; ret = dev_direct_xmit(skb, attr->queue_mapping); if (ret < 0) { goto cleanup; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ee0274417948..a00808f7be6a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,7 +79,9 @@ #include <net/mptcp.h> #include <net/mctp.h> #include <net/page_pool/helpers.h> +#include <net/psp/types.h> #include <net/dropreason.h> +#include <net/xdp_sock.h> #include <linux/uaccess.h> #include <trace/events/skb.h> @@ -221,9 +223,9 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) skb_panic(skb, sz, addr, __func__); } -#define NAPI_SKB_CACHE_SIZE 64 -#define NAPI_SKB_CACHE_BULK 16 -#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) +#define NAPI_SKB_CACHE_SIZE 128 +#define NAPI_SKB_CACHE_BULK 32 +#define NAPI_SKB_CACHE_FREE 32 struct napi_alloc_cache { local_lock_t bh_lock; @@ -273,17 +275,23 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) } EXPORT_SYMBOL(__netdev_alloc_frag_align); -static struct sk_buff *napi_skb_cache_get(void) +/* Cache kmem_cache_size(net_hotdata.skbuff_cache) to help the compiler + * remove dead code (and skbuff_cache_size) when CONFIG_KASAN is unset. + */ +static u32 skbuff_cache_size __read_mostly; + +static struct sk_buff *napi_skb_cache_get(bool alloc) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; local_lock_nested_bh(&napi_alloc_cache.bh_lock); if (unlikely(!nc->skb_count)) { - nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, - GFP_ATOMIC | __GFP_NOWARN, - NAPI_SKB_CACHE_BULK, - nc->skb_cache); + if (alloc) + nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache, + GFP_ATOMIC | __GFP_NOWARN, + NAPI_SKB_CACHE_BULK, + nc->skb_cache); if (unlikely(!nc->skb_count)) { local_unlock_nested_bh(&napi_alloc_cache.bh_lock); return NULL; @@ -291,8 +299,10 @@ static struct sk_buff *napi_skb_cache_get(void) } skb = nc->skb_cache[--nc->skb_count]; + if (nc->skb_count) + prefetch(nc->skb_cache[nc->skb_count - 1]); local_unlock_nested_bh(&napi_alloc_cache.bh_lock); - kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache)); + kasan_mempool_unpoison_object(skb, skbuff_cache_size); return skb; } @@ -344,11 +354,9 @@ u32 napi_skb_cache_get_bulk(void **skbs, u32 n) get: for (u32 base = nc->skb_count - n, i = 0; i < n; i++) { - u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache); - skbs[i] = nc->skb_cache[base + i]; - kasan_mempool_unpoison_object(skbs[i], cache_size); + kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size); memset(skbs[i], 0, offsetof(struct sk_buff, tail)); } @@ -525,7 +533,7 @@ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) { struct sk_buff *skb; - skb = napi_skb_cache_get(); + skb = napi_skb_cache_get(true); if (unlikely(!skb)) return NULL; @@ -640,25 +648,38 @@ out: struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, int flags, int node) { + struct sk_buff *skb = NULL; struct kmem_cache *cache; - struct sk_buff *skb; bool pfmemalloc; u8 *data; - cache = (flags & SKB_ALLOC_FCLONE) - ? 
net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache; - if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) gfp_mask |= __GFP_MEMALLOC; - /* Get the HEAD */ - if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI && - likely(node == NUMA_NO_NODE || node == numa_mem_id())) - skb = napi_skb_cache_get(); - else + if (flags & SKB_ALLOC_FCLONE) { + cache = net_hotdata.skbuff_fclone_cache; + goto fallback; + } + cache = net_hotdata.skbuff_cache; + if (unlikely(node != NUMA_NO_NODE && node != numa_mem_id())) + goto fallback; + + if (flags & SKB_ALLOC_NAPI) { + skb = napi_skb_cache_get(true); + if (unlikely(!skb)) + return NULL; + } else if (!in_hardirq() && !irqs_disabled()) { + local_bh_disable(); + skb = napi_skb_cache_get(false); + local_bh_enable(); + } + + if (!skb) { +fallback: skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node); - if (unlikely(!skb)) - return NULL; + if (unlikely(!skb)) + return NULL; + } prefetchw(skb); /* We do our best to align skb_shared_info on a separate cache @@ -1135,12 +1156,22 @@ void skb_release_head_state(struct sk_buff *skb) skb_dst_drop(skb); if (skb->destructor) { DEBUG_NET_WARN_ON_ONCE(in_hardirq()); - skb->destructor(skb); - } -#if IS_ENABLED(CONFIG_NF_CONNTRACK) - nf_conntrack_put(skb_nfct(skb)); +#ifdef CONFIG_INET + INDIRECT_CALL_4(skb->destructor, + tcp_wfree, __sock_wfree, sock_wfree, + xsk_destruct_skb, + skb); +#else + INDIRECT_CALL_2(skb->destructor, + sock_wfree, xsk_destruct_skb, + skb); + #endif - skb_ext_put(skb); + skb->destructor = NULL; + skb->sk = NULL; + } + nf_reset_ct(skb); + skb_ext_reset(skb); } /* Free everything but the sk_buff shell. */ @@ -1416,7 +1447,6 @@ void __consume_stateless_skb(struct sk_buff *skb) static void napi_skb_cache_put(struct sk_buff *skb) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - u32 i; if (!kasan_mempool_poison_object(skb)) return; @@ -1425,13 +1455,16 @@ static void napi_skb_cache_put(struct sk_buff *skb) nc->skb_cache[nc->skb_count++] = skb; if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) { - for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++) + u32 i, remaining = NAPI_SKB_CACHE_SIZE - NAPI_SKB_CACHE_FREE; + + for (i = remaining; i < NAPI_SKB_CACHE_SIZE; i++) kasan_mempool_unpoison_object(nc->skb_cache[i], - kmem_cache_size(net_hotdata.skbuff_cache)); + skbuff_cache_size); - kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF, - nc->skb_cache + NAPI_SKB_CACHE_HALF); - nc->skb_count = NAPI_SKB_CACHE_HALF; + kmem_cache_free_bulk(net_hotdata.skbuff_cache, + NAPI_SKB_CACHE_FREE, + nc->skb_cache + remaining); + nc->skb_count = remaining; } local_unlock_nested_bh(&napi_alloc_cache.bh_lock); } @@ -1457,13 +1490,18 @@ void napi_skb_free_stolen_head(struct sk_buff *skb) void napi_consume_skb(struct sk_buff *skb, int budget) { /* Zero budget indicate non-NAPI context called us, like netpoll */ - if (unlikely(!budget)) { + if (unlikely(!budget || !skb)) { dev_consume_skb_any(skb); return; } DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + if (skb->alloc_cpu != smp_processor_id() && !skb_shared(skb)) { + skb_release_head_state(skb); + return skb_attempt_defer_free(skb); + } + if (!skb_unref(skb)) return; @@ -2217,6 +2255,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone); * * All the pointers pointing into skb header may change and must be * reloaded after call to this function. 
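For readers unfamiliar with the INDIRECT_CALL_*() wrappers used in the reworked skb_release_head_state() above: on retpoline-enabled builds they replace one indirect call with a chain of direct-call comparisons (without retpolines they collapse to the plain indirect call). Roughly:

/* Rough expansion of INDIRECT_CALL_2(skb->destructor, sock_wfree,
 * xsk_destruct_skb, skb): compare against the likely targets first,
 * fall back to the indirect call only if none match.
 */
if (skb->destructor == sock_wfree)
	sock_wfree(skb);
else if (skb->destructor == xsk_destruct_skb)
	xsk_destruct_skb(skb);
else
	skb->destructor(skb);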
+ * + * Note: If you skb_push() the start of the buffer after reallocating the + * header, call skb_postpush_data_move() first to move the metadata out of + * the way before writing to &sk_buff->data. */ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, @@ -2288,8 +2330,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb->nohdr = 0; atomic_set(&skb_shinfo(skb)->dataref, 1); - skb_metadata_clear(skb); - /* It is not generally safe to change skb->truesize. * For the moment, we really care of rx path, or * when skb is orphaned (not attached to a socket). @@ -3112,7 +3152,9 @@ static bool __splice_segment(struct page *page, unsigned int poff, poff += flen; plen -= flen; *len -= flen; - } while (*len && plen); + if (!*len) + return true; + } while (plen); return false; } @@ -5060,6 +5102,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_MCTP_FLOWS) [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), #endif +#if IS_ENABLED(CONFIG_INET_PSP) + [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -5110,6 +5155,8 @@ void __init skb_init(void) offsetof(struct sk_buff, cb), sizeof_field(struct sk_buff, cb), NULL); + skbuff_cache_size = kmem_cache_size(net_hotdata.skbuff_cache); + net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", sizeof(struct sk_buff_fclones), 0, @@ -6667,7 +6714,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len, return NULL; while (data_len) { - if (nr_frags == MAX_SKB_FRAGS - 1) + if (nr_frags == MAX_SKB_FRAGS) goto failure; while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order)) order--; @@ -7042,6 +7089,7 @@ void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, skb->active_extensions = 1 << id; return skb_ext_get_ptr(ext, id); } +EXPORT_SYMBOL_NS_GPL(__skb_ext_set, "NETDEV_INTERNAL"); /** * skb_ext_add - allocate space for given extension, COW if needed @@ -7178,8 +7226,9 @@ static void kfree_skb_napi_cache(struct sk_buff *skb) */ void skb_attempt_defer_free(struct sk_buff *skb) { + struct skb_defer_node *sdn; + unsigned long defer_count; int cpu = skb->alloc_cpu; - struct softnet_data *sd; unsigned int defer_max; bool kick; @@ -7192,28 +7241,26 @@ nodefer: kfree_skb_napi_cache(skb); DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); DEBUG_NET_WARN_ON_ONCE(skb->destructor); + DEBUG_NET_WARN_ON_ONCE(skb_nfct(skb)); + + sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id(); - sd = &per_cpu(softnet_data, cpu); defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); - if (READ_ONCE(sd->defer_count) >= defer_max) + defer_count = atomic_long_inc_return(&sdn->defer_count); + + if (defer_count >= defer_max) goto nodefer; - spin_lock_bh(&sd->defer_lock); - /* Send an IPI every time queue reaches half capacity. */ - kick = sd->defer_count == (defer_max >> 1); - /* Paired with the READ_ONCE() few lines above */ - WRITE_ONCE(sd->defer_count, sd->defer_count + 1); + llist_add(&skb->ll_node, &sdn->defer_list); - skb->next = sd->defer_list; - /* Paired with READ_ONCE() in skb_defer_free_flush() */ - WRITE_ONCE(sd->defer_list, skb); - spin_unlock_bh(&sd->defer_lock); + /* Send an IPI every time queue reaches half capacity. */ + kick = (defer_count - 1) == (defer_max >> 1); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). 
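The skb_attempt_defer_free() producer path above is now just an atomic counter increment plus a lock-free llist_add(); the matching flush on the owning CPU is not in this hunk, but presumably detaches the whole list at once, along these lines:

/* Hypothetical consumer-side sketch for one skb_defer_node: detach the
 * lock-free list in a single operation, then free each deferred skb.
 */
struct llist_node *head = llist_del_all(&sdn->defer_list);
struct sk_buff *skb, *next;

atomic_long_set(&sdn->defer_count, 0);
llist_for_each_entry_safe(skb, next, head, ll_node)
	napi_consume_skb(skb, 1);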
*/ if (unlikely(kick)) - kick_defer_list_purge(sd, cpu); + kick_defer_list_purge(cpu); } static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 83c78379932e..2ac7731e1e0a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -876,7 +876,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) sk_psock_stop(psock); INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); - queue_rcu_work(system_wq, &psock->rwork); + queue_rcu_work(system_percpu_wq, &psock->rwork); } EXPORT_SYMBOL_GPL(sk_psock_drop); diff --git a/net/core/sock.c b/net/core/sock.c index 7c26ec8dce63..45c98bf524b2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -155,7 +155,7 @@ static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); -static void sock_def_write_space_wfree(struct sock *sk); +static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc); static void sock_def_write_space(struct sock *sk); /** @@ -281,12 +281,12 @@ static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Run time adjustable parameters. */ -__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; +__u32 sysctl_wmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_wmem_max); -__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; +__u32 sysctl_rmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_rmem_max); -__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; -__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; +__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT; +__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); @@ -491,13 +491,13 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; } if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); return -ENOBUFS; } @@ -562,7 +562,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, skb->dev = NULL; if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); reason = SKB_DROP_REASON_SOCKET_RCVBUFF; goto discard_and_relse; } @@ -585,7 +585,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, reason = SKB_DROP_REASON_PFMEMALLOC; if (err == -ENOBUFS) reason = SKB_DROP_REASON_SOCKET_BACKLOG; - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); goto discard_and_relse; } @@ -1032,7 +1032,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) bool charged; int pages; - if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) + if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) @@ -1041,22 +1041,28 @@ static int sock_reserve_memory(struct sock *sk, int bytes) pages = sk_mem_pages(bytes); /* pre-charge to memcg */ - charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + charged = mem_cgroup_sk_charge(sk, pages, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!charged) return -ENOMEM; + if (sk->sk_bypass_prot_mem) + goto success; + /* pre-charge to forward_alloc */ sk_memory_allocated_add(sk, pages); allocated = sk_memory_allocated(sk); + /* If the system goes into memory pressure with this * precharge, give up and return error. 
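The mem_cgroup_sk_enabled()/mem_cgroup_sk_charge() helpers driving the conversions above are defined elsewhere in this series; judging purely from the call-site replacements, they are presumably thin wrappers along these lines:

/* Presumed shape, inferred from the call-site conversions in this hunk;
 * the real definitions live outside this diff.
 */
static inline bool mem_cgroup_sk_enabled(const struct sock *sk)
{
	return mem_cgroup_sockets_enabled && mem_cgroup_from_sk(sk);
}

static inline bool mem_cgroup_sk_charge(const struct sock *sk,
					unsigned int nr_pages, gfp_t gfp)
{
	return mem_cgroup_charge_skmem(mem_cgroup_from_sk(sk), nr_pages, gfp);
}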
*/ if (allocated > sk_prot_mem_limits(sk, 1)) { sk_memory_allocated_sub(sk, pages); - mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); + mem_cgroup_sk_uncharge(sk, pages); return -ENOMEM; } + +success: sk_forward_alloc_add(sk, pages << PAGE_SHIFT); WRITE_ONCE(sk->sk_reserved_mem, @@ -2300,8 +2306,13 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, * why we need sk_prot_creator -acme */ sk->sk_prot = sk->sk_prot_creator = prot; + + if (READ_ONCE(net->core.sysctl_bypass_prot_mem)) + sk->sk_bypass_prot_mem = 1; + sk->sk_kern_sock = kern; sock_lock_init(sk); + sk->sk_net_refcnt = kern ? 0 : 1; if (likely(sk->sk_net_refcnt)) { get_net_track(net, &sk->ns_tracker, priority); @@ -2313,7 +2324,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, } sock_net_set(sk, net); - refcount_set(&sk->sk_wmem_alloc, 1); + refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); mem_cgroup_sk_alloc(sk); cgroup_sk_alloc(&sk->sk_cgrp_data); @@ -2451,13 +2462,16 @@ static void sk_init_common(struct sock *sk) } /** - * sk_clone_lock - clone a socket, and lock its clone - * @sk: the socket to clone - * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * sk_clone - clone a socket + * @sk: the socket to clone + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * @lock: if true, lock the cloned sk * - * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) + * If @lock is true, the clone is locked by bh_lock_sock(), and + * caller must unlock socket even in error path by bh_unlock_sock(). */ -struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) +struct sock *sk_clone(const struct sock *sk, const gfp_t priority, + bool lock) { struct proto *prot = READ_ONCE(sk->sk_prot); struct sk_filter *filter; @@ -2486,16 +2500,19 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, false, priority); } + sk_node_init(&newsk->sk_node); sock_lock_init(newsk); - bh_lock_sock(newsk); + + if (lock) + bh_lock_sock(newsk); + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; newsk->sk_backlog.len = 0; atomic_set(&newsk->sk_rmem_alloc, 0); - /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */ - refcount_set(&newsk->sk_wmem_alloc, 1); + refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); atomic_set(&newsk->sk_omem_alloc, 0); sk_init_common(newsk); @@ -2505,15 +2522,18 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; - atomic_set(&newsk->sk_drops, 0); + DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters); + sk_drops_reset(newsk); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); +#ifdef CONFIG_MEMCG /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; +#endif cgroup_sk_clone(&newsk->sk_cgrp_data); @@ -2577,14 +2597,15 @@ free: * destructor and make plain sk_free() */ newsk->sk_destruct = NULL; - bh_unlock_sock(newsk); + if (lock) + bh_unlock_sock(newsk); sk_free(newsk); newsk = NULL; goto out; } -EXPORT_SYMBOL_GPL(sk_clone_lock); +EXPORT_SYMBOL_GPL(sk_clone); -static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) +static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) { bool is_ipv6 = false; u32 max_size; @@ -2594,8 +2615,8 @@ static u32 sk_dst_gso_max_size(struct sock *sk, 
struct dst_entry *dst) !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ - max_size = is_ipv6 ? READ_ONCE(dst_dev(dst)->gso_max_size) : - READ_ONCE(dst_dev(dst)->gso_ipv4_max_size); + max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) : + READ_ONCE(dev->gso_ipv4_max_size); if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) max_size = GSO_LEGACY_MAX_SIZE; @@ -2604,9 +2625,12 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { + const struct net_device *dev; u32 max_segs = 1; - sk->sk_route_caps = dst_dev(dst)->features; + rcu_read_lock(); + dev = dst_dev_rcu(dst); + sk->sk_route_caps = dev->features; if (sk_is_tcp(sk)) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -2622,13 +2646,14 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); + sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ - max_segs = max_t(u32, READ_ONCE(dst_dev(dst)->gso_max_segs), 1); + max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; sk_dst_set(sk, dst); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(sk_setup_caps); @@ -2642,16 +2667,18 @@ EXPORT_SYMBOL_GPL(sk_setup_caps); */ void sock_wfree(struct sk_buff *skb) { - struct sock *sk = skb->sk; unsigned int len = skb->truesize; + struct sock *sk = skb->sk; bool free; + int old; if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { if (sock_flag(sk, SOCK_RCU_FREE) && sk->sk_write_space == sock_def_write_space) { rcu_read_lock(); - free = refcount_sub_and_test(len, &sk->sk_wmem_alloc); - sock_def_write_space_wfree(sk); + free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc, + &old); + sock_def_write_space_wfree(sk, old - len); rcu_read_unlock(); if (unlikely(free)) __sk_free(sk); @@ -2688,6 +2715,8 @@ void __sock_wfree(struct sk_buff *skb) void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) { + int old_wmem; + skb_orphan(skb); #ifdef CONFIG_INET if (unlikely(!sk_fullsock(sk))) @@ -2701,7 +2730,15 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) * is enough to guarantee sk_free() won't free this sock until * all in-flight packets are completed */ - refcount_add(skb->truesize, &sk->sk_wmem_alloc); + __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem); + + /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket + * is in a host queue (qdisc, NIC queue). + * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue + * based on XPS for better performance. + * Otherwise clear ooo_okay to not risk Out Of Order delivery. + */ + skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS); } EXPORT_SYMBOL(skb_set_owner_w); @@ -2780,28 +2817,6 @@ void sock_pfree(struct sk_buff *skb) EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ -unsigned long __sock_i_ino(struct sock *sk) -{ - unsigned long ino; - - read_lock(&sk->sk_callback_lock); - ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; - read_unlock(&sk->sk_callback_lock); - return ino; -} -EXPORT_SYMBOL(__sock_i_ino); - -unsigned long sock_i_ino(struct sock *sk) -{ - unsigned long ino; - - local_bh_disable(); - ino = __sock_i_ino(sk); - local_bh_enable(); - return ino; -} -EXPORT_SYMBOL(sock_i_ino); - /* * Allocate a skb from the socket's send buffer. 
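The skb_set_owner_w() hunk above relies on a biased write-memory counter: sk_wmem_alloc starts at SK_WMEM_ALLOC_BIAS rather than 1, so the value read back before adding the new skb's truesize tells whether any other TX packet from this socket is still in flight, which is what gates skb->ooo_okay. A minimal sketch of that idea, using plain C11 atomics and hypothetical names rather than the kernel's refcount_t helpers:

	/* Sketch only: a biased in-flight byte counter. "counter == BIAS"
	 * means "no packet currently queued", so the sender may safely let
	 * the stack pick a new TX queue (ooo_okay) without reordering risk.
	 * Names and types are illustrative, not the kernel's.
	 */
	#include <stdatomic.h>
	#include <stdbool.h>

	#define WMEM_BIAS 1U            /* stands in for SK_WMEM_ALLOC_BIAS */

	struct toy_sock {
		atomic_uint wmem_alloc;  /* bytes of skbs not yet released */
	};

	static void toy_sock_init(struct toy_sock *sk)
	{
		atomic_init(&sk->wmem_alloc, WMEM_BIAS);
	}

	/* Charge @len bytes for a packet handed to the TX path. Returns true
	 * if nothing else was in flight, i.e. queue reselection is safe.
	 */
	static bool toy_charge_tx(struct toy_sock *sk, unsigned int len)
	{
		unsigned int old = atomic_fetch_add(&sk->wmem_alloc, len);

		return old == WMEM_BIAS;
	}

	/* Called from the skb destructor once the packet leaves the host. */
	static void toy_uncharge_tx(struct toy_sock *sk, unsigned int len)
	{
		atomic_fetch_sub(&sk->wmem_alloc, len);
	}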
*/ @@ -3151,8 +3166,11 @@ bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) return true; - sk_enter_memory_pressure(sk); + if (!sk->sk_bypass_prot_mem) + sk_enter_memory_pressure(sk); + sk_stream_moderate_sndbuf(sk); + return false; } EXPORT_SYMBOL(sk_page_frag_refill); @@ -3180,23 +3198,27 @@ void __release_sock(struct sock *sk) __acquires(&sk->sk_lock.slock) { struct sk_buff *skb, *next; + int nb = 0; while ((skb = sk->sk_backlog.head) != NULL) { sk->sk_backlog.head = sk->sk_backlog.tail = NULL; spin_unlock_bh(&sk->sk_lock.slock); - do { + while (1) { next = skb->next; prefetch(next); DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); skb_mark_not_on_list(skb); sk_backlog_rcv(sk, skb); - cond_resched(); - skb = next; - } while (skb != NULL); + if (!skb) + break; + + if (!(++nb & 15)) + cond_resched(); + } spin_lock_bh(&sk->sk_lock.slock); } @@ -3263,20 +3285,25 @@ EXPORT_SYMBOL(sk_wait_data); */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { - struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL; + bool memcg_enabled = false, charged = false; struct proto *prot = sk->sk_prot; - bool charged = true; - long allocated; + long allocated = 0; - sk_memory_allocated_add(sk, amt); - allocated = sk_memory_allocated(sk); + if (!sk->sk_bypass_prot_mem) { + sk_memory_allocated_add(sk, amt); + allocated = sk_memory_allocated(sk); + } - if (memcg) { - charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()); + if (mem_cgroup_sk_enabled(sk)) { + memcg_enabled = true; + charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge()); if (!charged) goto suppress_allocation; } + if (!allocated) + return 1; + /* Under limit. */ if (allocated <= sk_prot_mem_limits(sk, 0)) { sk_leave_memory_pressure(sk); @@ -3346,21 +3373,20 @@ suppress_allocation: */ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { /* Force charge with __GFP_NOFAIL */ - if (memcg && !charged) { - mem_cgroup_charge_skmem(memcg, amt, - gfp_memcg_charge() | __GFP_NOFAIL); - } + if (memcg_enabled && !charged) + mem_cgroup_sk_charge(sk, amt, + gfp_memcg_charge() | __GFP_NOFAIL); return 1; } } - if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) - trace_sock_exceed_buf_limit(sk, prot, allocated, kind); + trace_sock_exceed_buf_limit(sk, prot, allocated, kind); - sk_memory_allocated_sub(sk, amt); + if (allocated) + sk_memory_allocated_sub(sk, amt); - if (memcg && charged) - mem_cgroup_uncharge_skmem(memcg, amt); + if (charged) + mem_cgroup_sk_uncharge(sk, amt); return 0; } @@ -3396,10 +3422,13 @@ EXPORT_SYMBOL(__sk_mem_schedule); */ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { - sk_memory_allocated_sub(sk, amount); + if (mem_cgroup_sk_enabled(sk)) + mem_cgroup_sk_uncharge(sk, amount); + + if (sk->sk_bypass_prot_mem) + return; - if (mem_cgroup_sockets_enabled && sk->sk_memcg) - mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); + sk_memory_allocated_sub(sk, amount); if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) @@ -3419,6 +3448,24 @@ void __sk_mem_reclaim(struct sock *sk, int amount) } EXPORT_SYMBOL(__sk_mem_reclaim); +void __sk_charge(struct sock *sk, gfp_t gfp) +{ + int amt; + + gfp |= __GFP_NOFAIL; + if (mem_cgroup_from_sk(sk)) { + /* The socket has not been accepted yet, no need + * to look at newsk->sk_wmem_queued. 
+ */ + amt = sk_mem_pages(sk->sk_forward_alloc + + atomic_read(&sk->sk_rmem_alloc)); + if (amt) + mem_cgroup_sk_charge(sk, amt, gfp); + } + + kmem_cache_charge(sk, gfp); +} + int sk_set_peek_off(struct sock *sk, int val) { WRITE_ONCE(sk->sk_peek_off, val); @@ -3433,13 +3480,13 @@ EXPORT_SYMBOL_GPL(sk_set_peek_off); * function, some default processing is provided. */ -int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) +int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) { return -EOPNOTSUPP; } EXPORT_SYMBOL(sock_no_bind); -int sock_no_connect(struct socket *sock, struct sockaddr *saddr, +int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags) { return -EOPNOTSUPP; @@ -3593,12 +3640,12 @@ static void sock_def_write_space(struct sock *sk) * for SOCK_RCU_FREE sockets under RCU read section and after putting * ->sk_wmem_alloc. */ -static void sock_def_write_space_wfree(struct sock *sk) +static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc) { /* Do not wake up a writer until he can make "significant" * progress. --DaveM */ - if (sock_writeable(sk)) { + if (__sock_writeable(sk, wmem_alloc)) { struct socket_wq *wq = rcu_dereference(sk->sk_wq); /* rely on refcount_sub from sock_wfree() */ @@ -3713,7 +3760,7 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); - atomic_set(&sk->sk_drops, 0); + sk_drops_reset(sk); } EXPORT_SYMBOL(sock_init_data_uid); @@ -3973,7 +4020,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); } #ifdef CONFIG_PROC_FS @@ -4366,7 +4413,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time) EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ -int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) +int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len) { if (!sk->sk_prot->bind_add) return -EOPNOTSUPP; @@ -4454,7 +4501,9 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); +#ifdef CONFIG_MEMCG CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); +#endif CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); @@ -4463,31 +4512,34 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); - 
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm); - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index b23594c767f2..026ce9bd9e5e 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -348,7 +348,7 @@ static struct pernet_operations diag_net_ops = { static int __init sock_diag_init(void) { - broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0); + broadcast_wq = alloc_workqueue("sock_diag_events", WQ_PERCPU, 0); BUG_ON(!broadcast_wq); return register_pernet_subsys(&diag_net_ops); } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 8cf04b57ade1..8d4decb2606f 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -668,6 +668,13 @@ static struct ctl_table netns_core_table[] = { .proc_handler = proc_dou8vec_minmax, }, { + .procname = "txq_reselection_ms", + .data = &init_net.core.sysctl_txq_reselection, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_ms_jiffies, + }, + { .procname = "tstamp_allow_data", .data = &init_net.core.sysctl_tstamp_allow_data, .maxlen = sizeof(u8), @@ -676,6 +683,15 @@ static struct ctl_table netns_core_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE }, + { + .procname = "bypass_prot_mem", + .data = &init_net.core.sysctl_bypass_prot_mem, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, /* sysctl_core_net_init() will set the values after this * to readonly in network namespaces */ diff --git a/net/core/xdp.c b/net/core/xdp.c index 491334b9b8be..9100e160113a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -663,9 +663,8 @@ struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp) u32 tsize; tsize = sinfo->xdp_frags_truesize ? 
: nr_frags * xdp->frame_sz; - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, tsize, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + tsize, xdp_buff_get_skb_flags(xdp)); } skb->protocol = eth_type_trans(skb, rxq->dev); @@ -692,7 +691,7 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, struct skb_shared_info *sinfo = skb_shinfo(skb); const struct skb_shared_info *xinfo; u32 nr_frags, tsize = 0; - bool pfmemalloc = false; + u32 flags = 0; xinfo = xdp_get_shared_info_from_buff(xdp); nr_frags = xinfo->nr_frags; @@ -714,11 +713,12 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, __skb_fill_page_desc_noacc(sinfo, i, page, offset, len); tsize += truesize; - pfmemalloc |= page_is_pfmemalloc(page); + if (page_is_pfmemalloc(page)) + flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } - xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size, - tsize, pfmemalloc); + xdp_update_skb_frags_info(skb, nr_frags, xinfo->xdp_frags_size, tsize, + flags); return true; } @@ -823,10 +823,9 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, skb_metadata_set(skb, xdpf->metasize); if (unlikely(xdp_frame_has_frags(xdpf))) - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdpf->frame_sz, - xdp_frame_is_frag_pfmemalloc(xdpf)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + nr_frags * xdpf->frame_sz, + xdp_frame_get_skb_flags(xdpf)); /* Essential SKB info: protocol and skb->dev */ skb->protocol = eth_type_trans(skb, dev); diff --git a/net/devlink/core.c b/net/devlink/core.c index 7203c39532fc..58093f49c090 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -320,7 +320,7 @@ static void devlink_release(struct work_struct *work) void devlink_put(struct devlink *devlink) { if (refcount_dec_and_test(&devlink->refcount)) - queue_rcu_work(system_wq, &devlink->rwork); + queue_rcu_work(system_percpu_wq, &devlink->rwork); } struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp) diff --git a/net/devlink/health.c b/net/devlink/health.c index b3ce8ecbb7fb..136a67c36a20 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -60,6 +60,7 @@ struct devlink_health_reporter { struct devlink_port *devlink_port; struct devlink_fmsg *dump_fmsg; u64 graceful_period; + u64 burst_period; bool auto_recover; bool auto_dump; u8 health_state; @@ -108,11 +109,14 @@ devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port, static struct devlink_health_reporter * __devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; - if (WARN_ON(graceful_period && !ops->recover)) + if (WARN_ON(ops->default_graceful_period && !ops->recover)) + return ERR_PTR(-EINVAL); + + if (WARN_ON(ops->default_burst_period && !ops->default_graceful_period)) return ERR_PTR(-EINVAL); reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); @@ -122,7 +126,8 @@ __devlink_health_reporter_create(struct devlink *devlink, reporter->priv = priv; reporter->ops = ops; reporter->devlink = devlink; - reporter->graceful_period = graceful_period; + reporter->graceful_period = ops->default_graceful_period; + reporter->burst_period = ops->default_burst_period; reporter->auto_recover = !!ops->recover; reporter->auto_dump = !!ops->dump; return reporter; @@ -134,13 +139,12 @@ __devlink_health_reporter_create(struct devlink 
*devlink, * * @port: devlink_port to which health reports will relate * @ops: devlink health reporter ops - * @graceful_period: min time (in msec) between recovery attempts * @priv: driver priv pointer */ struct devlink_health_reporter * devl_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; @@ -150,8 +154,7 @@ devl_port_health_reporter_create(struct devlink_port *port, ops->name)) return ERR_PTR(-EEXIST); - reporter = __devlink_health_reporter_create(port->devlink, ops, - graceful_period, priv); + reporter = __devlink_health_reporter_create(port->devlink, ops, priv); if (IS_ERR(reporter)) return reporter; @@ -164,14 +167,13 @@ EXPORT_SYMBOL_GPL(devl_port_health_reporter_create); struct devlink_health_reporter * devlink_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; struct devlink *devlink = port->devlink; devl_lock(devlink); - reporter = devl_port_health_reporter_create(port, ops, - graceful_period, priv); + reporter = devl_port_health_reporter_create(port, ops, priv); devl_unlock(devlink); return reporter; } @@ -182,13 +184,12 @@ EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); * * @devlink: devlink instance which the health reports will relate * @ops: devlink health reporter ops - * @graceful_period: min time (in msec) between recovery attempts * @priv: driver priv pointer */ struct devlink_health_reporter * devl_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; @@ -197,8 +198,7 @@ devl_health_reporter_create(struct devlink *devlink, if (devlink_health_reporter_find_by_name(devlink, ops->name)) return ERR_PTR(-EEXIST); - reporter = __devlink_health_reporter_create(devlink, ops, - graceful_period, priv); + reporter = __devlink_health_reporter_create(devlink, ops, priv); if (IS_ERR(reporter)) return reporter; @@ -210,13 +210,12 @@ EXPORT_SYMBOL_GPL(devl_health_reporter_create); struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; devl_lock(devlink); - reporter = devl_health_reporter_create(devlink, ops, - graceful_period, priv); + reporter = devl_health_reporter_create(devlink, ops, priv); devl_unlock(devlink); return reporter; } @@ -298,6 +297,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, reporter->graceful_period)) goto reporter_nest_cancel; if (reporter->ops->recover && + devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, + reporter->burst_period)) + goto reporter_nest_cancel; + if (reporter->ops->recover && nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, reporter->auto_recover)) goto reporter_nest_cancel; @@ -462,16 +465,33 @@ int devlink_nl_health_reporter_set_doit(struct sk_buff *skb, if (!reporter->ops->recover && (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] || + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD])) return -EOPNOTSUPP; if (!reporter->ops->dump && 
info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) return -EOPNOTSUPP; - if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) { reporter->graceful_period = nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); + if (!reporter->graceful_period) + reporter->burst_period = 0; + } + + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]) { + u64 burst_period = + nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]); + + if (!reporter->graceful_period && burst_period) { + NL_SET_ERR_MSG_MOD(info->extack, + "Cannot set burst period without a grace period."); + return -EINVAL; + } + + reporter->burst_period = burst_period; + } if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) reporter->auto_recover = @@ -514,11 +534,25 @@ static void devlink_recover_notify(struct devlink_health_reporter *reporter, devlink_nl_notify_send_desc(devlink, msg, &desc); } +static bool +devlink_health_reporter_in_burst(struct devlink_health_reporter *reporter) +{ + unsigned long burst_threshold = reporter->last_recovery_ts + + msecs_to_jiffies(reporter->burst_period); + + return time_is_after_jiffies(burst_threshold); +} + void devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter) { reporter->recovery_count++; - reporter->last_recovery_ts = jiffies; + if (!devlink_health_reporter_in_burst(reporter)) + /* When burst period is set, last_recovery_ts marks the first + * recovery within the burst period, not necessarily the last + * one. + */ + reporter->last_recovery_ts = jiffies; } EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done); @@ -592,12 +626,37 @@ dump_err: return err; } +static bool +devlink_health_recover_abort(struct devlink_health_reporter *reporter, + enum devlink_health_reporter_state prev_state) +{ + unsigned long recover_ts_threshold; + + if (!reporter->auto_recover) + return false; + + /* abort if the previous error wasn't recovered */ + if (prev_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) + return true; + + if (devlink_health_reporter_in_burst(reporter)) + return false; + + recover_ts_threshold = reporter->last_recovery_ts + + msecs_to_jiffies(reporter->burst_period) + + msecs_to_jiffies(reporter->graceful_period); + if (reporter->last_recovery_ts && reporter->recovery_count && + time_is_after_jiffies(recover_ts_threshold)) + return true; + + return false; +} + int devlink_health_report(struct devlink_health_reporter *reporter, const char *msg, void *priv_ctx) { enum devlink_health_reporter_state prev_health_state; struct devlink *devlink = reporter->devlink; - unsigned long recover_ts_threshold; int ret; /* write a log message of the current error */ @@ -608,13 +667,7 @@ int devlink_health_report(struct devlink_health_reporter *reporter, reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); - /* abort if the previous error wasn't recovered */ - recover_ts_threshold = reporter->last_recovery_ts + - msecs_to_jiffies(reporter->graceful_period); - if (reporter->auto_recover && - (prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY || - (reporter->last_recovery_ts && reporter->recovery_count && - time_is_after_jiffies(recover_ts_threshold)))) { + if (devlink_health_recover_abort(reporter, prev_health_state)) { trace_devlink_health_recover_aborted(devlink, reporter->ops->name, reporter->health_state, diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c 
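The reworked abort logic above gives recoveries a burst window: recoveries that land within burst_period of the first one are treated as a single event (last_recovery_ts is not advanced), and only once that window closes does the graceful_period cool-down apply. A small standalone sketch of the same decision, using a plain millisecond clock instead of jiffies; the struct and function names are illustrative only:

	/* Sketch of the grace/burst abort decision made when a new health
	 * error is reported. now_ms stands in for the jiffies-based clock.
	 */
	#include <stdbool.h>
	#include <stdint.h>

	struct toy_reporter {
		uint64_t last_recovery_ms;   /* first recovery of current burst */
		uint64_t recovery_count;
		uint64_t graceful_period_ms; /* min quiet time between recoveries */
		uint64_t burst_period_ms;    /* window where recoveries count as one */
		bool     auto_recover;
		bool     healthy;            /* previous error was recovered */
	};

	static bool in_burst(const struct toy_reporter *r, uint64_t now_ms)
	{
		return now_ms < r->last_recovery_ms + r->burst_period_ms;
	}

	/* Returns true if an automatic recovery attempt should be aborted. */
	static bool recover_abort(const struct toy_reporter *r, uint64_t now_ms)
	{
		uint64_t threshold;

		if (!r->auto_recover)
			return false;
		if (!r->healthy)            /* previous error never recovered */
			return true;
		if (in_burst(r, now_ms))    /* still inside the burst window */
			return false;

		threshold = r->last_recovery_ms + r->burst_period_ms +
			    r->graceful_period_ms;
		return r->last_recovery_ms && r->recovery_count &&
		       now_ms < threshold;
	}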
index d97c326a9045..f4c61c2b4f22 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/devlink.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include <net/netlink.h> #include <net/genetlink.h> @@ -229,7 +230,7 @@ static const struct nla_policy devlink_eswitch_get_nl_policy[DEVLINK_ATTR_DEV_NA static const struct nla_policy devlink_eswitch_set_nl_policy[DEVLINK_ATTR_ESWITCH_ENCAP_MODE + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, - [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_MAX(NLA_U16, 1), + [DEVLINK_ATTR_ESWITCH_MODE] = NLA_POLICY_MAX(NLA_U16, 2), [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = NLA_POLICY_MAX(NLA_U8, 3), [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = NLA_POLICY_MAX(NLA_U8, 1), }; @@ -301,12 +302,13 @@ static const struct nla_policy devlink_param_get_dump_nl_policy[DEVLINK_ATTR_DEV }; /* DEVLINK_CMD_PARAM_SET - do */ -static const struct nla_policy devlink_param_set_nl_policy[DEVLINK_ATTR_PARAM_VALUE_CMODE + 1] = { +static const struct nla_policy devlink_param_set_nl_policy[DEVLINK_ATTR_PARAM_RESET_DEFAULT + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PARAM_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PARAM_TYPE] = NLA_POLICY_VALIDATE_FN(NLA_U8, &devlink_attr_param_type_validate), [DEVLINK_ATTR_PARAM_VALUE_CMODE] = NLA_POLICY_MAX(NLA_U8, 2), + [DEVLINK_ATTR_PARAM_RESET_DEFAULT] = { .type = NLA_FLAG, }, }; /* DEVLINK_CMD_REGION_GET - do */ @@ -389,7 +391,7 @@ static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLIN }; /* DEVLINK_CMD_HEALTH_REPORTER_SET - do */ -static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP + 1] = { +static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, @@ -397,6 +399,7 @@ static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATT [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64, }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8, }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8, }, + [DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD] = { .type = NLA_U64, }, }; /* DEVLINK_CMD_HEALTH_REPORTER_RECOVER - do */ @@ -918,7 +921,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_param_set_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_param_set_nl_policy, - .maxattr = DEVLINK_ATTR_PARAM_VALUE_CMODE, + .maxattr = DEVLINK_ATTR_PARAM_RESET_DEFAULT, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { @@ -1032,7 +1035,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_health_reporter_set_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_health_reporter_set_nl_policy, - .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, + .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/net/devlink/netlink_gen.h b/net/devlink/netlink_gen.h index 09cc6f264ccf..2817d53a0eba 100644 --- a/net/devlink/netlink_gen.h +++ b/net/devlink/netlink_gen.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated 
from: */ /* Documentation/netlink/specs/devlink.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_DEVLINK_GEN_H #define _LINUX_DEVLINK_GEN_H diff --git a/net/devlink/param.c b/net/devlink/param.c index 41dcc86cfd94..e0ea93eded43 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -102,6 +102,21 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME, .type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, + .name = DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME, + .type = DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS, + .name = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME, + .type = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_MAX_MAC_PER_VF, + .name = DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_NAME, + .type = DEVLINK_PARAM_GENERIC_MAX_MAC_PER_VF_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) @@ -159,11 +174,12 @@ devlink_param_cmode_is_supported(const struct devlink_param *param, static int devlink_param_get(struct devlink *devlink, const struct devlink_param *param, - struct devlink_param_gset_ctx *ctx) + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) { if (!param->get) return -EOPNOTSUPP; - return param->get(devlink, param->id, ctx); + return param->get(devlink, param->id, ctx, extack); } static int devlink_param_set(struct devlink *devlink, @@ -176,68 +192,121 @@ static int devlink_param_set(struct devlink *devlink, return param->set(devlink, param->id, ctx, extack); } -static int -devlink_nl_param_value_fill_one(struct sk_buff *msg, - enum devlink_param_type type, - enum devlink_param_cmode cmode, - union devlink_param_value val) +static int devlink_param_get_default(struct devlink *devlink, + const struct devlink_param *param, + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) { - struct nlattr *param_value_attr; + if (!param->get_default) + return -EOPNOTSUPP; - param_value_attr = nla_nest_start_noflag(msg, - DEVLINK_ATTR_PARAM_VALUE); - if (!param_value_attr) - goto nla_put_failure; + return param->get_default(devlink, param->id, ctx, extack); +} - if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode)) - goto value_nest_cancel; +static int devlink_param_reset_default(struct devlink *devlink, + const struct devlink_param *param, + enum devlink_param_cmode cmode, + struct netlink_ext_ack *extack) +{ + if (!param->reset_default) + return -EOPNOTSUPP; + return param->reset_default(devlink, param->id, cmode, extack); +} + +static int +devlink_nl_param_value_put(struct sk_buff *msg, enum devlink_param_type type, + int nla_type, union devlink_param_value val, + bool flag_as_u8) +{ switch (type) { case DEVLINK_PARAM_TYPE_U8: - if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8)) - goto value_nest_cancel; + if (nla_put_u8(msg, nla_type, val.vu8)) + return -EMSGSIZE; break; case DEVLINK_PARAM_TYPE_U16: - if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16)) - goto value_nest_cancel; + if (nla_put_u16(msg, nla_type, val.vu16)) + return -EMSGSIZE; break; case DEVLINK_PARAM_TYPE_U32: - if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32)) - goto value_nest_cancel; + if (nla_put_u32(msg, nla_type, val.vu32)) + return -EMSGSIZE; break; case DEVLINK_PARAM_TYPE_U64: - if (devlink_nl_put_u64(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, - val.vu64)) - goto value_nest_cancel; 
+ if (devlink_nl_put_u64(msg, nla_type, val.vu64)) + return -EMSGSIZE; break; case DEVLINK_PARAM_TYPE_STRING: - if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, - val.vstr)) - goto value_nest_cancel; + if (nla_put_string(msg, nla_type, val.vstr)) + return -EMSGSIZE; break; case DEVLINK_PARAM_TYPE_BOOL: - if (val.vbool && - nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA)) - goto value_nest_cancel; + /* default values of type bool are encoded with u8, so that + * false can be distinguished from not present + */ + if (flag_as_u8) { + if (nla_put_u8(msg, nla_type, val.vbool)) + return -EMSGSIZE; + } else { + if (val.vbool && nla_put_flag(msg, nla_type)) + return -EMSGSIZE; + } break; } + return 0; +} + +static int +devlink_nl_param_value_fill_one(struct sk_buff *msg, + enum devlink_param_type type, + enum devlink_param_cmode cmode, + union devlink_param_value val, + union devlink_param_value default_val, + bool has_default) +{ + struct nlattr *param_value_attr; + int err = -EMSGSIZE; + + param_value_attr = nla_nest_start_noflag(msg, + DEVLINK_ATTR_PARAM_VALUE); + if (!param_value_attr) + return -EMSGSIZE; + + if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode)) + goto value_nest_cancel; + + err = devlink_nl_param_value_put(msg, type, + DEVLINK_ATTR_PARAM_VALUE_DATA, + val, false); + if (err) + goto value_nest_cancel; + + if (has_default) { + err = devlink_nl_param_value_put(msg, type, + DEVLINK_ATTR_PARAM_VALUE_DEFAULT, + default_val, true); + if (err) + goto value_nest_cancel; + } nla_nest_end(msg, param_value_attr); return 0; value_nest_cancel: nla_nest_cancel(msg, param_value_attr); -nla_put_failure: - return -EMSGSIZE; + return err; } static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, unsigned int port_index, struct devlink_param_item *param_item, enum devlink_command cmd, - u32 portid, u32 seq, int flags) + u32 portid, u32 seq, int flags, + struct netlink_ext_ack *extack) { + union devlink_param_value default_value[DEVLINK_PARAM_CMODE_MAX + 1]; union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1]; + bool default_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {}; bool param_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {}; const struct devlink_param *param = param_item->param; struct devlink_param_gset_ctx ctx; @@ -258,12 +327,26 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, param_value[i] = param_item->driverinit_value; else return -EOPNOTSUPP; + + if (param_item->driverinit_value_valid) { + default_value[i] = param_item->driverinit_default; + default_value_set[i] = true; + } } else { ctx.cmode = i; - err = devlink_param_get(devlink, param, &ctx); + err = devlink_param_get(devlink, param, &ctx, extack); if (err) return err; param_value[i] = ctx.val; + + err = devlink_param_get_default(devlink, param, &ctx, + extack); + if (!err) { + default_value[i] = ctx.val; + default_value_set[i] = true; + } else if (err != -EOPNOTSUPP) { + return err; + } } param_value_set[i] = true; } @@ -300,7 +383,9 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, if (!param_value_set[i]) continue; err = devlink_nl_param_value_fill_one(msg, param->type, - i, param_value[i]); + i, param_value[i], + default_value[i], + default_value_set[i]); if (err) goto values_list_nest_cancel; } @@ -342,7 +427,7 @@ static void devlink_param_notify(struct devlink *devlink, if (!msg) return; err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd, - 0, 0, 0); + 0, 0, 0, NULL); if (err) { nlmsg_free(msg); 
return; @@ -385,7 +470,8 @@ static int devlink_nl_param_get_dump_one(struct sk_buff *msg, err = devlink_nl_param_fill(msg, devlink, 0, param_item, DEVLINK_CMD_PARAM_GET, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, flags); + cb->nlh->nlmsg_seq, flags, + cb->extack); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { @@ -494,8 +580,8 @@ int devlink_nl_param_get_doit(struct sk_buff *skb, return -ENOMEM; err = devlink_nl_param_fill(msg, devlink, 0, param_item, - DEVLINK_CMD_PARAM_GET, - info->snd_portid, info->snd_seq, 0); + DEVLINK_CMD_PARAM_GET, info->snd_portid, + info->snd_seq, 0, info->extack); if (err) { nlmsg_free(msg); return err; @@ -516,6 +602,7 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, struct devlink_param_item *param_item; const struct devlink_param *param; union devlink_param_value value; + bool reset_default; int err = 0; param_item = devlink_param_get_from_info(params, info); @@ -527,13 +614,18 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, return err; if (param_type != param->type) return -EINVAL; - err = devlink_param_value_get_from_info(param, info, &value); - if (err) - return err; - if (param->validate) { - err = param->validate(devlink, param->id, value, info->extack); + + reset_default = info->attrs[DEVLINK_ATTR_PARAM_RESET_DEFAULT]; + if (!reset_default) { + err = devlink_param_value_get_from_info(param, info, &value); if (err) return err; + if (param->validate) { + err = param->validate(devlink, param->id, value, + info->extack); + if (err) + return err; + } } if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE)) @@ -543,6 +635,15 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, return -EOPNOTSUPP; if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) { + if (reset_default) { + if (!param_item->driverinit_value_valid) { + NL_SET_ERR_MSG(info->extack, + "Default value not available"); + return -EOPNOTSUPP; + } + value = param_item->driverinit_default; + } + param_item->driverinit_value_new = value; param_item->driverinit_value_new_valid = true; } else { @@ -550,7 +651,12 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, return -EOPNOTSUPP; ctx.val = value; ctx.cmode = cmode; - err = devlink_param_set(devlink, param, &ctx, info->extack); + if (reset_default) + err = devlink_param_reset_default(devlink, param, cmode, + info->extack); + else + err = devlink_param_set(devlink, param, &ctx, + info->extack); if (err) return err; } @@ -798,6 +904,7 @@ void devl_param_driverinit_value_set(struct devlink *devlink, u32 param_id, param_item->driverinit_value = init_val; param_item->driverinit_value_valid = true; + param_item->driverinit_default = init_val; devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); } diff --git a/net/devlink/port.c b/net/devlink/port.c index 939081a0e615..93d8a25bb920 100644 --- a/net/devlink/port.c +++ b/net/devlink/port.c @@ -1333,8 +1333,8 @@ int devlink_port_netdevice_event(struct notifier_block *nb, return NOTIFY_OK; } -static int __devlink_port_attrs_set(struct devlink_port *devlink_port, - enum devlink_port_flavour flavour) +static void __devlink_port_attrs_set(struct devlink_port *devlink_port, + enum devlink_port_flavour flavour) { struct devlink_port_attrs *attrs = &devlink_port->attrs; @@ -1347,7 +1347,6 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port, } else { devlink_port->switch_port = false; } - return 0; } /** @@ -1357,17 +1356,13 @@ static int __devlink_port_attrs_set(struct devlink_port 
*devlink_port, * @attrs: devlink port attrs */ void devlink_port_attrs_set(struct devlink_port *devlink_port, - struct devlink_port_attrs *attrs) + const struct devlink_port_attrs *attrs) { - int ret; - ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + WARN_ON(attrs->splittable && attrs->split); devlink_port->attrs = *attrs; - ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); - if (ret) - return; - WARN_ON(attrs->splittable && attrs->split); + __devlink_port_attrs_set(devlink_port, attrs->flavour); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); @@ -1383,14 +1378,10 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro u16 pf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; - int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_PF); - if (ret) - return; + __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); attrs->pci_pf.controller = controller; attrs->pci_pf.pf = pf; attrs->pci_pf.external = external; @@ -1411,14 +1402,10 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro u16 pf, u16 vf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; - int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_VF); - if (ret) - return; + __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); attrs->pci_vf.controller = controller; attrs->pci_vf.pf = pf; attrs->pci_vf.vf = vf; @@ -1439,14 +1426,10 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro u16 pf, u32 sf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; - int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_SF); - if (ret) - return; + __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); attrs->pci_sf.controller = controller; attrs->pci_sf.pf = pf; attrs->pci_sf.sf = sf; @@ -1519,7 +1502,7 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, struct devlink_port_attrs *attrs = &devlink_port->attrs; int n = 0; - if (!devlink_port->attrs_set) + if (!devlink_port->attrs_set || devlink_port->attrs.no_phys_port_name) return -EOPNOTSUPP; switch (attrs->flavour) { diff --git a/net/devlink/rate.c b/net/devlink/rate.c index 110b3fa8a0b1..d157a8419bca 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -34,7 +34,7 @@ devlink_rate_leaf_get_from_info(struct devlink *devlink, struct genl_info *info) static struct devlink_rate * devlink_rate_node_get_by_name(struct devlink *devlink, const char *node_name) { - static struct devlink_rate *devlink_rate; + struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate) && @@ -819,8 +819,8 @@ EXPORT_SYMBOL_GPL(devl_rate_leaf_destroy); */ void devl_rate_nodes_destroy(struct devlink *devlink) { - static struct devlink_rate *devlink_rate, *tmp; const struct devlink_ops *ops = devlink->ops; + struct devlink_rate *devlink_rate, *tmp; devl_assert_locked(devlink); @@ -828,13 +828,15 @@ void devl_rate_nodes_destroy(struct devlink *devlink) if (!devlink_rate->parent) continue; - refcount_dec(&devlink_rate->parent->refcnt); if (devlink_rate_is_leaf(devlink_rate)) ops->rate_leaf_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); else if 
(devlink_rate_is_node(devlink_rate)) ops->rate_node_parent_set(devlink_rate, NULL, devlink_rate->priv, NULL, NULL); + + refcount_dec(&devlink_rate->parent->refcnt); + devlink_rate->parent = NULL; } list_for_each_entry_safe(devlink_rate, tmp, &devlink->rate_list, list) { if (devlink_rate_is_node(devlink_rate)) { diff --git a/net/devlink/region.c b/net/devlink/region.c index 63fb297f6d67..d6e5805cf3a0 100644 --- a/net/devlink/region.c +++ b/net/devlink/region.c @@ -50,7 +50,7 @@ devlink_port_region_get_by_name(struct devlink_port *port, struct devlink_region *region; list_for_each_entry(region, &port->region_list, list) - if (!strcmp(region->ops->name, region_name)) + if (!strcmp(region->port_ops->name, region_name)) return region; return NULL; diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 82b084cc1cc6..53da62984447 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -78,7 +78,6 @@ int dns_query(struct net *net, { struct key *rkey; struct user_key_payload *upayload; - const struct cred *saved_cred; size_t typelen, desclen; char *desc, *cp; int ret, len; @@ -124,9 +123,8 @@ int dns_query(struct net *net, /* make the upcall, using special credentials to prevent the use of * add_key() to preinstall malicious redirections */ - saved_cred = override_creds(dns_resolver_cache); - rkey = request_key_net(&key_type_dns_resolver, desc, net, options); - revert_creds(saved_cred); + scoped_with_creds(dns_resolver_cache) + rkey = request_key_net(&key_type_dns_resolver, desc, net, options); kfree(desc); if (IS_ERR(rkey)) { ret = PTR_ERR(rkey); diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 869cbe57162f..f86b30742122 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -104,6 +104,14 @@ config NET_DSA_TAG_MTK Say Y or M if you want to enable support for tagging frames for Mediatek switches. +config NET_DSA_TAG_MXL_GSW1XX + tristate "Tag driver for MaxLinear GSW1xx switches" + help + The GSW1xx family of switches supports an 8-byte special tag which + can be used on the CPU port of the switch. + Say Y or M if you want to enable support for tagging frames for + MaxLinear GSW1xx switches. + config NET_DSA_TAG_KSZ tristate "Tag driver for Microchip 8795/937x/9477/9893 families of switches" help @@ -190,4 +198,10 @@ config NET_DSA_TAG_XRS700X Say Y or M if you want to enable support for tagging frames for Arrow SpeedChips XRS700x switches that use a single byte tag trailer. +config NET_DSA_TAG_YT921X + tristate "Tag driver for Motorcomm YT921x switches" + help + Say Y or M if you want to enable support for tagging frames for + Motorcomm YT921x switches. 
+ endif diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 555c07cfeb71..42d173f5a701 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_NET_DSA_TAG_HELLCREEK) += tag_hellcreek.o obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o +obj-$(CONFIG_NET_DSA_TAG_MXL_GSW1XX) += tag_mxl-gsw1xx.o obj-$(CONFIG_NET_DSA_TAG_NONE) += tag_none.o obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o obj-$(CONFIG_NET_DSA_TAG_OCELOT_8021Q) += tag_ocelot_8021q.o @@ -39,6 +40,7 @@ obj-$(CONFIG_NET_DSA_TAG_SJA1105) += tag_sja1105.o obj-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o obj-$(CONFIG_NET_DSA_TAG_VSC73XX_8021Q) += tag_vsc73xx_8021q.o obj-$(CONFIG_NET_DSA_TAG_XRS700X) += tag_xrs700x.o +obj-$(CONFIG_NET_DSA_TAG_YT921X) += tag_yt921x.o # for tracing framework to find trace.h CFLAGS_trace.o := -I$(src) diff --git a/net/dsa/conduit.c b/net/dsa/conduit.c index 4ae255cfb23f..a1b044467bd6 100644 --- a/net/dsa/conduit.c +++ b/net/dsa/conduit.c @@ -26,7 +26,7 @@ static int dsa_conduit_get_regs_len(struct net_device *dev) int ret = 0; int len; - if (ops->get_regs_len) { + if (ops && ops->get_regs_len) { netdev_lock_ops(dev); len = ops->get_regs_len(dev); netdev_unlock_ops(dev); @@ -59,7 +59,7 @@ static void dsa_conduit_get_regs(struct net_device *dev, int port = cpu_dp->index; int len; - if (ops->get_regs_len && ops->get_regs) { + if (ops && ops->get_regs_len && ops->get_regs) { netdev_lock_ops(dev); len = ops->get_regs_len(dev); if (len < 0) { @@ -87,30 +87,56 @@ static void dsa_conduit_get_regs(struct net_device *dev, } } +static ssize_t dsa_conduit_append_port_stats(struct dsa_switch *ds, int port, + u64 *data, size_t start) +{ + int count; + + if (!ds->ops->get_sset_count) + return 0; + + count = ds->ops->get_sset_count(ds, port, ETH_SS_STATS); + if (count < 0) + return count; + + if (ds->ops->get_ethtool_stats) + ds->ops->get_ethtool_stats(ds, port, data + start); + + return count; +} + static void dsa_conduit_get_ethtool_stats(struct net_device *dev, struct ethtool_stats *stats, - uint64_t *data) + u64 *data) { - struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_port *dp, *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; - struct dsa_switch *ds = cpu_dp->ds; - int port = cpu_dp->index; - int count = 0; + struct dsa_switch_tree *dst = cpu_dp->dst; + int count, mcount = 0; - if (ops->get_sset_count && ops->get_ethtool_stats) { + if (ops && ops->get_sset_count && ops->get_ethtool_stats) { netdev_lock_ops(dev); - count = ops->get_sset_count(dev, ETH_SS_STATS); + mcount = ops->get_sset_count(dev, ETH_SS_STATS); ops->get_ethtool_stats(dev, stats, data); netdev_unlock_ops(dev); } - if (ds->ops->get_ethtool_stats) - ds->ops->get_ethtool_stats(ds, port, data + count); + list_for_each_entry(dp, &dst->ports, list) { + if (!dsa_port_is_dsa(dp) && !dsa_port_is_cpu(dp)) + continue; + + count = dsa_conduit_append_port_stats(dp->ds, dp->index, + data, mcount); + if (count < 0) + return; + + mcount += count; + } } static void dsa_conduit_get_ethtool_phy_stats(struct net_device *dev, struct ethtool_stats *stats, - uint64_t *data) + u64 *data) { struct dsa_port *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; @@ -118,11 +144,11 @@ static void dsa_conduit_get_ethtool_phy_stats(struct net_device *dev, int port = cpu_dp->index; int count = 0; - if (dev->phydev && !ops->get_ethtool_phy_stats) { + if (dev->phydev && (!ops || 
!ops->get_ethtool_phy_stats)) { count = phy_ethtool_get_sset_count(dev->phydev); if (count >= 0) phy_ethtool_get_stats(dev->phydev, stats, data); - } else if (ops->get_sset_count && ops->get_ethtool_phy_stats) { + } else if (ops && ops->get_sset_count && ops->get_ethtool_phy_stats) { netdev_lock_ops(dev); count = ops->get_sset_count(dev, ETH_SS_PHY_STATS); ops->get_ethtool_phy_stats(dev, stats, data); @@ -136,45 +162,82 @@ static void dsa_conduit_get_ethtool_phy_stats(struct net_device *dev, ds->ops->get_ethtool_phy_stats(ds, port, data + count); } +static void dsa_conduit_append_port_sset_count(struct dsa_switch *ds, int port, + int sset, int *count) +{ + if (ds->ops->get_sset_count) + *count += ds->ops->get_sset_count(ds, port, sset); +} + static int dsa_conduit_get_sset_count(struct net_device *dev, int sset) { - struct dsa_port *cpu_dp = dev->dsa_ptr; + struct dsa_port *dp, *cpu_dp = dev->dsa_ptr; const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; - struct dsa_switch *ds = cpu_dp->ds; + struct dsa_switch_tree *dst = cpu_dp->dst; int count = 0; netdev_lock_ops(dev); if (sset == ETH_SS_PHY_STATS && dev->phydev && - !ops->get_ethtool_phy_stats) + (!ops || !ops->get_ethtool_phy_stats)) count = phy_ethtool_get_sset_count(dev->phydev); - else if (ops->get_sset_count) + else if (ops && ops->get_sset_count) count = ops->get_sset_count(dev, sset); netdev_unlock_ops(dev); if (count < 0) count = 0; - if (ds->ops->get_sset_count) - count += ds->ops->get_sset_count(ds, cpu_dp->index, sset); + list_for_each_entry(dp, &dst->ports, list) { + if (!dsa_port_is_dsa(dp) && !dsa_port_is_cpu(dp)) + continue; + + dsa_conduit_append_port_sset_count(dp->ds, dp->index, sset, + &count); + } return count; } -static void dsa_conduit_get_strings(struct net_device *dev, uint32_t stringset, - uint8_t *data) +static ssize_t dsa_conduit_append_port_strings(struct dsa_switch *ds, int port, + u32 stringset, u8 *data, + size_t start) { - struct dsa_port *cpu_dp = dev->dsa_ptr; - const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; - struct dsa_switch *ds = cpu_dp->ds; - int port = cpu_dp->index; int len = ETH_GSTRING_LEN; - int mcount = 0, count, i; - uint8_t pfx[4]; - uint8_t *ndata; + u8 pfx[8], *ndata; + int count, i; + + if (!ds->ops->get_strings) + return 0; - snprintf(pfx, sizeof(pfx), "p%.2d", port); + snprintf(pfx, sizeof(pfx), "s%.2d_p%.2d", ds->index, port); /* We do not want to be NULL-terminated, since this is a prefix */ pfx[sizeof(pfx) - 1] = '_'; + ndata = data + start * len; + /* This function copies ETH_GSTRINGS_LEN bytes, we will mangle + * the output after to prepend our CPU port prefix we + * constructed earlier + */ + ds->ops->get_strings(ds, port, stringset, ndata); + count = ds->ops->get_sset_count(ds, port, stringset); + if (count < 0) + return count; + + for (i = 0; i < count; i++) { + memmove(ndata + (i * len + sizeof(pfx)), + ndata + i * len, len - sizeof(pfx)); + memcpy(ndata + i * len, pfx, sizeof(pfx)); + } + + return count; +} + +static void dsa_conduit_get_strings(struct net_device *dev, u32 stringset, + u8 *data) +{ + struct dsa_port *dp, *cpu_dp = dev->dsa_ptr; + const struct ethtool_ops *ops = cpu_dp->orig_ethtool_ops; + struct dsa_switch_tree *dst = cpu_dp->dst; + int count, mcount = 0; netdev_lock_ops(dev); if (stringset == ETH_SS_PHY_STATS && dev->phydev && @@ -192,21 +255,17 @@ static void dsa_conduit_get_strings(struct net_device *dev, uint32_t stringset, } netdev_unlock_ops(dev); - if (ds->ops->get_strings) { - ndata = data + mcount * len; - /* This function copies 
ETH_GSTRINGS_LEN bytes, we will mangle - * the output after to prepend our CPU port prefix we - * constructed earlier - */ - ds->ops->get_strings(ds, port, stringset, ndata); - count = ds->ops->get_sset_count(ds, port, stringset); + list_for_each_entry(dp, &dst->ports, list) { + if (!dsa_port_is_dsa(dp) && !dsa_port_is_cpu(dp)) + continue; + + count = dsa_conduit_append_port_strings(dp->ds, dp->index, + stringset, data, + mcount); if (count < 0) return; - for (i = 0; i < count; i++) { - memmove(ndata + (i * len + sizeof(pfx)), - ndata + i * len, len - sizeof(pfx)); - memcpy(ndata + i * len, pfx, sizeof(pfx)); - } + + mcount += count; } } diff --git a/net/dsa/devlink.c b/net/dsa/devlink.c index f41f9fc2194e..ed342f345692 100644 --- a/net/dsa/devlink.c +++ b/net/dsa/devlink.c @@ -182,7 +182,8 @@ static const struct devlink_ops dsa_devlink_ops = { }; int dsa_devlink_param_get(struct devlink *dl, u32 id, - struct devlink_param_gset_ctx *ctx) + struct devlink_param_gset_ctx *ctx, + struct netlink_ext_ack *extack) { struct dsa_switch *ds = dsa_devlink_to_ds(dl); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 5b01a0e43ebe..a20efabe778f 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -9,6 +9,7 @@ #include <linux/device.h> #include <linux/err.h> +#include <linux/if_hsr.h> #include <linux/list.h> #include <linux/module.h> #include <linux/netdevice.h> @@ -1766,6 +1767,70 @@ bool dsa_mdb_present_in_other_db(struct dsa_switch *ds, int port, } EXPORT_SYMBOL_GPL(dsa_mdb_present_in_other_db); +/* Helpers for switches without specific HSR offloads, but which can implement + * NETIF_F_HW_HSR_DUP because their tagger uses dsa_xmit_port_mask() + */ +int dsa_port_simple_hsr_validate(struct dsa_switch *ds, int port, + struct net_device *hsr, + struct netlink_ext_ack *extack) +{ + enum hsr_port_type type; + int err; + + err = hsr_get_port_type(hsr, dsa_to_port(ds, port)->user, &type); + if (err) + return err; + + if (type != HSR_PT_SLAVE_A && type != HSR_PT_SLAVE_B) { + NL_SET_ERR_MSG_MOD(extack, + "Only HSR slave ports can be offloaded"); + return -EOPNOTSUPP; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_port_simple_hsr_validate); + +int dsa_port_simple_hsr_join(struct dsa_switch *ds, int port, + struct net_device *hsr, + struct netlink_ext_ack *extack) +{ + struct dsa_port *dp = dsa_to_port(ds, port), *other_dp; + int err; + + err = dsa_port_simple_hsr_validate(ds, port, hsr, extack); + if (err) + return err; + + dsa_hsr_foreach_port(other_dp, ds, hsr) { + if (other_dp != dp) { + dp->user->features |= NETIF_F_HW_HSR_DUP; + other_dp->user->features |= NETIF_F_HW_HSR_DUP; + break; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_port_simple_hsr_join); + +int dsa_port_simple_hsr_leave(struct dsa_switch *ds, int port, + struct net_device *hsr) +{ + struct dsa_port *dp = dsa_to_port(ds, port), *other_dp; + + dsa_hsr_foreach_port(other_dp, ds, hsr) { + if (other_dp != dp) { + dp->user->features &= ~NETIF_F_HW_HSR_DUP; + other_dp->user->features &= ~NETIF_F_HW_HSR_DUP; + break; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_port_simple_hsr_leave); + static const struct dsa_stubs __dsa_stubs = { .conduit_hwtstamp_validate = __dsa_conduit_hwtstamp_validate, }; diff --git a/net/dsa/port.c b/net/dsa/port.c index 082573ae6864..ca3a7f52229b 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1909,6 +1909,9 @@ void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr) struct dsa_switch *ds = dp->ds; int err; + if (!dp->hsr_dev) + return; + dp->hsr_dev = NULL; if (ds->ops->port_hsr_leave) { diff --git 
a/net/dsa/tag.h b/net/dsa/tag.h index 5d80ddad4ff6..cf52283fe9df 100644 --- a/net/dsa/tag.h +++ b/net/dsa/tag.h @@ -319,6 +319,24 @@ static inline void *dsa_etype_header_pos_tx(struct sk_buff *skb) return skb->data + 2 * ETH_ALEN; } +static inline unsigned long dsa_xmit_port_mask(const struct sk_buff *skb, + const struct net_device *dev) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + unsigned long mask = BIT(dp->index); + + if (IS_ENABLED(CONFIG_HSR) && + unlikely(dev->features & NETIF_F_HW_HSR_DUP)) { + struct net_device *hsr_dev = dp->hsr_dev; + struct dsa_port *other_dp; + + dsa_hsr_foreach_port(other_dp, dp->ds, hsr_dev) + mask |= BIT(other_dp->index); + } + + return mask; +} + /* Create 2 modaliases per tagging protocol, one to auto-load the module * given the ID reported by get_tag_protocol(), and the other by name. */ diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 26bb657ceac3..cf9420439054 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -92,6 +92,7 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, { struct dsa_port *dp = dsa_user_to_port(dev); u16 queue = skb_get_queue_mapping(skb); + u16 port_mask; u8 *brcm_tag; /* The Ethernet switch we are interfaced with needs packets to be at @@ -119,10 +120,9 @@ static struct sk_buff *brcm_tag_xmit_ll(struct sk_buff *skb, brcm_tag[0] = (1 << BRCM_OPCODE_SHIFT) | ((queue & BRCM_IG_TC_MASK) << BRCM_IG_TC_SHIFT); brcm_tag[1] = 0; - brcm_tag[2] = 0; - if (dp->index == 8) - brcm_tag[2] = BRCM_IG_DSTMAP2_MASK; - brcm_tag[3] = (1 << dp->index) & BRCM_IG_DSTMAP1_MASK; + port_mask = dsa_xmit_port_mask(skb, dev); + brcm_tag[2] = (port_mask >> 8) & BRCM_IG_DSTMAP2_MASK; + brcm_tag[3] = port_mask & BRCM_IG_DSTMAP1_MASK; /* Now tell the conduit network device about the desired output queue * as well @@ -176,7 +176,8 @@ static struct sk_buff *brcm_tag_rcv_ll(struct sk_buff *skb, /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, BRCM_TAG_LEN); - dsa_default_offload_fwd_mark(skb); + if (likely(!is_link_local_ether_addr(eth_hdr(skb)->h_dest))) + dsa_default_offload_fwd_mark(skb); return skb; } @@ -224,12 +225,14 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, { int len = BRCM_LEG_TAG_LEN; int source_port; + __be16 *proto; u8 *brcm_tag; if (unlikely(!pskb_may_pull(skb, BRCM_LEG_TAG_LEN + VLAN_HLEN))) return NULL; brcm_tag = dsa_etype_header_pos_rx(skb); + proto = (__be16 *)(brcm_tag + BRCM_LEG_TAG_LEN); source_port = brcm_tag[5] & BRCM_LEG_PORT_ID; @@ -237,14 +240,19 @@ static struct sk_buff *brcm_leg_tag_rcv(struct sk_buff *skb, if (!skb->dev) return NULL; - /* VLAN tag is added by BCM63xx internal switch */ - if (netdev_uses_dsa(skb->dev)) + /* The internal switch in BCM63XX SoCs always tags on egress on the CPU + * port. We use VID 0 internally for untagged traffic, so strip the tag + * if the TCI field is all 0, and keep it otherwise to also retain + * e.g. 802.1p tagged packets. 
+ */ + if (proto[0] == htons(ETH_P_8021Q) && proto[1] == 0) len += VLAN_HLEN; /* Remove Broadcom tag and update checksum */ skb_pull_rcsum(skb, len); - dsa_default_offload_fwd_mark(skb); + if (likely(!is_link_local_ether_addr(eth_hdr(skb)->h_dest))) + dsa_default_offload_fwd_mark(skb); dsa_strip_etype_header(skb, len); diff --git a/net/dsa/tag_gswip.c b/net/dsa/tag_gswip.c index 51a1f46a567f..5fa436121087 100644 --- a/net/dsa/tag_gswip.c +++ b/net/dsa/tag_gswip.c @@ -48,8 +48,7 @@ /* Byte 3 */ #define GSWIP_TX_DPID_EN BIT(0) -#define GSWIP_TX_PORT_MAP_SHIFT 1 -#define GSWIP_TX_PORT_MAP_MASK GENMASK(6, 1) +#define GSWIP_TX_PORT_MAP GENMASK(6, 1) #define GSWIP_RX_HEADER_LEN 8 @@ -61,7 +60,6 @@ static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_port *dp = dsa_user_to_port(dev); u8 *gswip_tag; skb_push(skb, GSWIP_TX_HEADER_LEN); @@ -70,7 +68,7 @@ static struct sk_buff *gswip_tag_xmit(struct sk_buff *skb, gswip_tag[0] = GSWIP_TX_SLPID_CPU; gswip_tag[1] = GSWIP_TX_DPID_ELAN; gswip_tag[2] = GSWIP_TX_PORT_MAP_EN | GSWIP_TX_PORT_MAP_SEL; - gswip_tag[3] = BIT(dp->index + GSWIP_TX_PORT_MAP_SHIFT) & GSWIP_TX_PORT_MAP_MASK; + gswip_tag[3] = FIELD_PREP(GSWIP_TX_PORT_MAP, dsa_xmit_port_mask(skb, dev)); gswip_tag[3] |= GSWIP_TX_DPID_EN; return skb; diff --git a/net/dsa/tag_hellcreek.c b/net/dsa/tag_hellcreek.c index 663b25785d95..544ab15685a2 100644 --- a/net/dsa/tag_hellcreek.c +++ b/net/dsa/tag_hellcreek.c @@ -20,7 +20,6 @@ static struct sk_buff *hellcreek_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_port *dp = dsa_user_to_port(dev); u8 *tag; /* Calculate checksums (if required) before adding the trailer tag to @@ -33,7 +32,7 @@ static struct sk_buff *hellcreek_xmit(struct sk_buff *skb, /* Tag encoding */ tag = skb_put(skb, HELLCREEK_TAG_LEN); - *tag = BIT(dp->index); + *tag = dsa_xmit_port_mask(skb, dev); return skb; } diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c index 0b7564b53790..9170a0148cc4 100644 --- a/net/dsa/tag_ksz.c +++ b/net/dsa/tag_ksz.c @@ -120,7 +120,6 @@ static struct sk_buff *ksz_common_rcv(struct sk_buff *skb, static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_port *dp = dsa_user_to_port(dev); struct ethhdr *hdr; u8 *tag; @@ -131,7 +130,7 @@ static struct sk_buff *ksz8795_xmit(struct sk_buff *skb, struct net_device *dev) tag = skb_put(skb, KSZ_INGRESS_TAG_LEN); hdr = skb_eth_hdr(skb); - *tag = 1 << dp->index; + *tag = dsa_xmit_port_mask(skb, dev); if (is_link_local_ether_addr(hdr->h_dest)) *tag |= KSZ8795_TAIL_TAG_OVERRIDE; @@ -294,21 +293,12 @@ static struct sk_buff *ksz9477_xmit(struct sk_buff *skb, tag = skb_put(skb, KSZ9477_INGRESS_TAG_LEN); hdr = skb_eth_hdr(skb); - val = BIT(dp->index); - + val = dsa_xmit_port_mask(skb, dev); val |= FIELD_PREP(KSZ9477_TAIL_TAG_PRIO, prio); if (is_link_local_ether_addr(hdr->h_dest)) val |= KSZ9477_TAIL_TAG_OVERRIDE; - if (dev->features & NETIF_F_HW_HSR_DUP) { - struct net_device *hsr_dev = dp->hsr_dev; - struct dsa_port *other_dp; - - dsa_hsr_foreach_port(other_dp, dp->ds, hsr_dev) - val |= BIT(other_dp->index); - } - *tag = cpu_to_be16(val); return ksz_defer_xmit(dp, skb); @@ -371,8 +361,7 @@ static struct sk_buff *ksz9893_xmit(struct sk_buff *skb, tag = skb_put(skb, KSZ_INGRESS_TAG_LEN); hdr = skb_eth_hdr(skb); - *tag = BIT(dp->index); - + *tag = dsa_xmit_port_mask(skb, dev); *tag |= FIELD_PREP(KSZ9893_TAIL_TAG_PRIO, prio); if (is_link_local_ether_addr(hdr->h_dest)) @@ -436,8 +425,7 @@ static struct sk_buff *lan937x_xmit(struct 
sk_buff *skb, tag = skb_put(skb, LAN937X_EGRESS_TAG_LEN); - val = BIT(dp->index); - + val = dsa_xmit_port_mask(skb, dev); val |= FIELD_PREP(LAN937X_TAIL_TAG_PRIO, prio); if (is_link_local_ether_addr(hdr->h_dest)) diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c index b670e3c53e91..dea3eecaf093 100644 --- a/net/dsa/tag_mtk.c +++ b/net/dsa/tag_mtk.c @@ -54,7 +54,8 @@ static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, * whether that's a combined special tag with 802.1Q header. */ mtk_tag[0] = xmit_tpid; - mtk_tag[1] = (1 << dp->index) & MTK_HDR_XMIT_DP_BIT_MASK; + mtk_tag[1] = FIELD_PREP(MTK_HDR_XMIT_DP_BIT_MASK, + dsa_xmit_port_mask(skb, dev)); /* Tag control information is kept for 802.1Q */ if (xmit_tpid == MTK_HDR_XMIT_UNTAGGED) { diff --git a/net/dsa/tag_mxl-gsw1xx.c b/net/dsa/tag_mxl-gsw1xx.c new file mode 100644 index 000000000000..60f7c445e656 --- /dev/null +++ b/net/dsa/tag_mxl-gsw1xx.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * DSA driver Special Tag support for MaxLinear GSW1xx switch chips + * + * Copyright (C) 2025 Daniel Golle <daniel@makrotopia.org> + * Copyright (C) 2023 - 2024 MaxLinear Inc. + */ + +#include <linux/bitops.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <net/dsa.h> + +#include "tag.h" + +/* To define the outgoing port and to discover the incoming port a special + * tag is used by the GSW1xx. + * + * Dest MAC Src MAC special TAG EtherType + * ...| 1 2 3 4 5 6 | 1 2 3 4 5 6 | 1 2 3 4 5 6 7 8 | 1 2 |... + * |<--------------->| + */ + +#define GSW1XX_TAG_NAME "gsw1xx" + +/* special tag header length (RX and TX) */ +#define GSW1XX_HEADER_LEN 8 + +/* Word 0 = Ethertype -> 0x88C3 */ + +/* Word 1 */ +#define GSW1XX_TX_PORT_MAP GENMASK(7, 0) +#define GSW1XX_TX_PORT_MAP_EN BIT(15) +#define GSW1XX_TX_CLASS_EN BIT(14) +#define GSW1XX_TX_TIME_STAMP_EN BIT(13) +#define GSW1XX_TX_LRN_DIS BIT(12) +#define GSW1XX_TX_CLASS GENMASK(11, 8) + +/* special tag in RX path header */ +/* Word 2 */ +#define GSW1XX_RX_PORT_MAP GENMASK(15, 8) + +static struct sk_buff *gsw1xx_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + __be16 *gsw1xx_tag; + u16 tag; + + /* provide additional space 'GSW1XX_HEADER_LEN' bytes */ + skb_push(skb, GSW1XX_HEADER_LEN); + + /* add space between MAC address and Ethertype */ + dsa_alloc_etype_header(skb, GSW1XX_HEADER_LEN); + + /* special tag ingress */ + gsw1xx_tag = dsa_etype_header_pos_tx(skb); + gsw1xx_tag[0] = htons(ETH_P_MXLGSW); + + tag = FIELD_PREP(GSW1XX_TX_PORT_MAP, dsa_xmit_port_mask(skb, dev)) | + GSW1XX_TX_PORT_MAP_EN | GSW1XX_TX_LRN_DIS; + gsw1xx_tag[1] = htons(tag); + gsw1xx_tag[2] = 0; + gsw1xx_tag[3] = 0; + + return skb; +} + +static struct sk_buff *gsw1xx_tag_rcv(struct sk_buff *skb, + struct net_device *dev) +{ + int port; + __be16 *gsw1xx_tag; + + if (unlikely(!pskb_may_pull(skb, GSW1XX_HEADER_LEN))) { + dev_warn_ratelimited(&dev->dev, "Dropping packet, cannot pull SKB\n"); + return NULL; + } + + gsw1xx_tag = dsa_etype_header_pos_rx(skb); + + if (unlikely(ntohs(gsw1xx_tag[0]) != ETH_P_MXLGSW)) { + dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid special tag\n"); + dev_warn_ratelimited(&dev->dev, "Tag: %8ph\n", gsw1xx_tag); + return NULL; + } + + /* Get source port information */ + port = FIELD_GET(GSW1XX_RX_PORT_MAP, ntohs(gsw1xx_tag[1])); + skb->dev = dsa_conduit_find_user(dev, 0, port); + if (!skb->dev) { + dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n"); + dev_warn_ratelimited(&dev->dev, "Tag: %8ph\n", gsw1xx_tag); + return NULL; 
+ } + + /* remove the GSW1xx special tag between MAC addresses and the current + * ethertype field. + */ + skb_pull_rcsum(skb, GSW1XX_HEADER_LEN); + dsa_strip_etype_header(skb, GSW1XX_HEADER_LEN); + + return skb; +} + +static const struct dsa_device_ops gsw1xx_netdev_ops = { + .name = GSW1XX_TAG_NAME, + .proto = DSA_TAG_PROTO_MXL_GSW1XX, + .xmit = gsw1xx_tag_xmit, + .rcv = gsw1xx_tag_rcv, + .needed_headroom = GSW1XX_HEADER_LEN, +}; + +MODULE_DESCRIPTION("DSA tag driver for MaxLinear GSW1xx 8 byte protocol"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MXL_GSW1XX, GSW1XX_TAG_NAME); + +module_dsa_tag_driver(gsw1xx_netdev_ops); diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c index bf6608fc6be7..3405def79c2d 100644 --- a/net/dsa/tag_ocelot.c +++ b/net/dsa/tag_ocelot.c @@ -46,11 +46,10 @@ static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev, static struct sk_buff *ocelot_xmit(struct sk_buff *skb, struct net_device *netdev) { - struct dsa_port *dp = dsa_user_to_port(netdev); void *injection; ocelot_xmit_common(skb, netdev, cpu_to_be32(0x8880000a), &injection); - ocelot_ifh_set_dest(injection, BIT_ULL(dp->index)); + ocelot_ifh_set_dest(injection, dsa_xmit_port_mask(skb, netdev)); return skb; } @@ -58,11 +57,10 @@ static struct sk_buff *ocelot_xmit(struct sk_buff *skb, static struct sk_buff *seville_xmit(struct sk_buff *skb, struct net_device *netdev) { - struct dsa_port *dp = dsa_user_to_port(netdev); void *injection; ocelot_xmit_common(skb, netdev, cpu_to_be32(0x88800005), &injection); - seville_ifh_set_dest(injection, BIT_ULL(dp->index)); + seville_ifh_set_dest(injection, dsa_xmit_port_mask(skb, netdev)); return skb; } diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 0cf61286b426..6d56a28c914c 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -14,7 +14,6 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_port *dp = dsa_user_to_port(dev); __be16 *phdr; u16 hdr; @@ -26,7 +25,7 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) /* Set the version field, and set destination port information */ hdr = FIELD_PREP(QCA_HDR_XMIT_VERSION, QCA_HDR_VERSION); hdr |= QCA_HDR_XMIT_FROM_CPU; - hdr |= FIELD_PREP(QCA_HDR_XMIT_DP_BIT, BIT(dp->index)); + hdr |= FIELD_PREP(QCA_HDR_XMIT_DP_BIT, dsa_xmit_port_mask(skb, dev)); *phdr = htons(hdr); diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c index feaefa0e179b..3cc63eacfa03 100644 --- a/net/dsa/tag_rtl4_a.c +++ b/net/dsa/tag_rtl4_a.c @@ -57,7 +57,7 @@ static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb, out = (RTL4_A_PROTOCOL_RTL8366RB << RTL4_A_PROTOCOL_SHIFT); /* The lower bits indicate the port number */ - out |= BIT(dp->index); + out |= dsa_xmit_port_mask(skb, dev); p = (__be16 *)(tag + 2); *p = htons(out); diff --git a/net/dsa/tag_rtl8_4.c b/net/dsa/tag_rtl8_4.c index 15c2bae2b429..2464545da4d2 100644 --- a/net/dsa/tag_rtl8_4.c +++ b/net/dsa/tag_rtl8_4.c @@ -103,7 +103,6 @@ static void rtl8_4_write_tag(struct sk_buff *skb, struct net_device *dev, void *tag) { - struct dsa_port *dp = dsa_user_to_port(dev); __be16 tag16[RTL8_4_TAG_LEN / 2]; /* Set Realtek EtherType */ @@ -116,7 +115,7 @@ static void rtl8_4_write_tag(struct sk_buff *skb, struct net_device *dev, tag16[2] = htons(FIELD_PREP(RTL8_4_LEARN_DIS, 1)); /* Zero ALLOW; set RX (CPU->switch) forwarding port mask */ - tag16[3] = htons(FIELD_PREP(RTL8_4_RX, BIT(dp->index))); + tag16[3] = htons(FIELD_PREP(RTL8_4_RX, dsa_xmit_port_mask(skb, dev))); 
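As a worked example of the GSW1xx TX tag assembled in gsw1xx_tag_xmit() above (the values follow directly from the field definitions shown; the chosen port is arbitrary): a frame sent towards user port 3 with no HSR duplication yields a port map of BIT(3) = 0x08, so word 1 becomes GSW1XX_TX_PORT_MAP_EN | GSW1XX_TX_LRN_DIS | 0x08 = 0x9008, and the eight bytes inserted between the source MAC and the original EtherType are:

    88 c3  90 08  00 00  00 00
    word0  word1  word2  word3

with word 0 being ETH_P_MXLGSW (0x88C3) and words 2 and 3 left zero on transmit.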
memcpy(tag, tag16, RTL8_4_TAG_LEN); } diff --git a/net/dsa/tag_rzn1_a5psw.c b/net/dsa/tag_rzn1_a5psw.c index 69d51221b1e5..10994b3470f6 100644 --- a/net/dsa/tag_rzn1_a5psw.c +++ b/net/dsa/tag_rzn1_a5psw.c @@ -39,7 +39,6 @@ struct a5psw_tag { static struct sk_buff *a5psw_tag_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_port *dp = dsa_user_to_port(dev); struct a5psw_tag *ptag; u32 data2_val; @@ -60,7 +59,7 @@ static struct sk_buff *a5psw_tag_xmit(struct sk_buff *skb, struct net_device *de ptag = dsa_etype_header_pos_tx(skb); - data2_val = FIELD_PREP(A5PSW_CTRL_DATA_PORT, BIT(dp->index)); + data2_val = FIELD_PREP(A5PSW_CTRL_DATA_PORT, dsa_xmit_port_mask(skb, dev)); ptag->ctrl_tag = htons(ETH_P_DSA_A5PSW); ptag->ctrl_data = htons(A5PSW_CTRL_DATA_FORCE_FORWARD); ptag->ctrl_data2_lo = htons(data2_val); diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 22742a53d6f4..4dce24cfe6a7 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -14,12 +14,11 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_port *dp = dsa_user_to_port(dev); u8 *trailer; trailer = skb_put(skb, 4); trailer[0] = 0x80; - trailer[1] = 1 << dp->index; + trailer[1] = dsa_xmit_port_mask(skb, dev); trailer[2] = 0x10; trailer[3] = 0x00; diff --git a/net/dsa/tag_xrs700x.c b/net/dsa/tag_xrs700x.c index 68d4633ddd5e..a05219f702c6 100644 --- a/net/dsa/tag_xrs700x.c +++ b/net/dsa/tag_xrs700x.c @@ -13,16 +13,10 @@ static struct sk_buff *xrs700x_xmit(struct sk_buff *skb, struct net_device *dev) { - struct dsa_port *partner, *dp = dsa_user_to_port(dev); u8 *trailer; trailer = skb_put(skb, 1); - trailer[0] = BIT(dp->index); - - if (dp->hsr_dev) - dsa_hsr_foreach_port(partner, dp->ds, dp->hsr_dev) - if (partner != dp) - trailer[0] |= BIT(partner->index); + trailer[0] = dsa_xmit_port_mask(skb, dev); return skb; } diff --git a/net/dsa/tag_yt921x.c b/net/dsa/tag_yt921x.c new file mode 100644 index 000000000000..6bbfd42dc5df --- /dev/null +++ b/net/dsa/tag_yt921x.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Motorcomm YT921x Switch Extended CPU Port Tagging + * + * Copyright (c) 2025 David Yang <mmyangfl@gmail.com> + * + * +----+----+-------+-----+----+--------- + * | DA | SA | TagET | Tag | ET | Payload ... + * +----+----+-------+-----+----+--------- + * 6 6 2 6 2 N + * + * Tag Ethertype: CPU_TAG_TPID_TPID (default: ETH_P_YT921X = 0x9988) + * * Hardcoded for the moment, but still configurable. Discuss it if there + * are conflicts somewhere and/or you want to change it for some reason. + * Tag: + * 2: VLAN Tag + * 2: Rx Port + * 15b: Rx Port Valid + * 14b-11b: Rx Port + * 10b-0b: Cmd? 
+ * 2: Tx Port(s) + * 15b: Tx Port(s) Valid + * 10b-0b: Tx Port(s) Mask + */ + +#include <linux/etherdevice.h> + +#include "tag.h" + +#define YT921X_TAG_NAME "yt921x" + +#define YT921X_TAG_LEN 8 + +#define YT921X_TAG_PORT_EN BIT(15) +#define YT921X_TAG_RX_PORT_M GENMASK(14, 11) +#define YT921X_TAG_RX_CMD_M GENMASK(10, 0) +#define YT921X_TAG_RX_CMD(x) FIELD_PREP(YT921X_TAG_RX_CMD_M, (x)) +#define YT921X_TAG_RX_CMD_FORWARDED 0x80 +#define YT921X_TAG_RX_CMD_UNK_UCAST 0xb2 +#define YT921X_TAG_RX_CMD_UNK_MCAST 0xb4 +#define YT921X_TAG_TX_PORTS GENMASK(10, 0) + +static struct sk_buff * +yt921x_tag_xmit(struct sk_buff *skb, struct net_device *netdev) +{ + __be16 *tag; + u16 tx; + + skb_push(skb, YT921X_TAG_LEN); + dsa_alloc_etype_header(skb, YT921X_TAG_LEN); + + tag = dsa_etype_header_pos_tx(skb); + + tag[0] = htons(ETH_P_YT921X); + /* VLAN tag unrelated when TX */ + tag[1] = 0; + tag[2] = 0; + tx = FIELD_PREP(YT921X_TAG_TX_PORTS, dsa_xmit_port_mask(skb, netdev)) | + YT921X_TAG_PORT_EN; + tag[3] = htons(tx); + + return skb; +} + +static struct sk_buff * +yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev) +{ + unsigned int port; + __be16 *tag; + u16 cmd; + u16 rx; + + if (unlikely(!pskb_may_pull(skb, YT921X_TAG_LEN))) + return NULL; + + tag = dsa_etype_header_pos_rx(skb); + + if (unlikely(tag[0] != htons(ETH_P_YT921X))) { + dev_warn_ratelimited(&netdev->dev, + "Unexpected EtherType 0x%04x\n", + ntohs(tag[0])); + return NULL; + } + + /* Locate which port this is coming from */ + rx = ntohs(tag[2]); + if (unlikely((rx & YT921X_TAG_PORT_EN) == 0)) { + dev_warn_ratelimited(&netdev->dev, + "Unexpected rx tag 0x%04x\n", rx); + return NULL; + } + + port = FIELD_GET(YT921X_TAG_RX_PORT_M, rx); + skb->dev = dsa_conduit_find_user(netdev, 0, port); + if (unlikely(!skb->dev)) { + dev_warn_ratelimited(&netdev->dev, + "Couldn't decode source port %u\n", port); + return NULL; + } + + cmd = FIELD_GET(YT921X_TAG_RX_CMD_M, rx); + switch (cmd) { + case YT921X_TAG_RX_CMD_FORWARDED: + /* Already forwarded by hardware */ + dsa_default_offload_fwd_mark(skb); + break; + case YT921X_TAG_RX_CMD_UNK_UCAST: + case YT921X_TAG_RX_CMD_UNK_MCAST: + /* NOTE: hardware doesn't distinguish between TRAP (copy to CPU + * only) and COPY (forward and copy to CPU). In order to perform + * a soft switch, NEVER use COPY action in the switch driver. 
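To make the tag layout above concrete (a worked example using only the field definitions in this file): a frame received from user port 2 that the hardware has already forwarded carries

    tag[2] = YT921X_TAG_PORT_EN | FIELD_PREP(YT921X_TAG_RX_PORT_M, 2) | YT921X_TAG_RX_CMD_FORWARDED
           = 0x8000 | 0x1000 | 0x0080 = 0x9080

which is why the YT921X_TAG_RX_CMD_FORWARDED case sets the offload forwarding mark rather than letting the bridge forward the frame again in software; on transmit towards port 2 only, yt921x_tag_xmit() writes tag[3] = YT921X_TAG_PORT_EN | BIT(2) = 0x8004.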
+ */ + break; + default: + dev_warn_ratelimited(&netdev->dev, + "Unexpected rx cmd 0x%02x\n", cmd); + break; + } + + /* Remove YT921x tag and update checksum */ + skb_pull_rcsum(skb, YT921X_TAG_LEN); + dsa_strip_etype_header(skb, YT921X_TAG_LEN); + + return skb; +} + +static const struct dsa_device_ops yt921x_netdev_ops = { + .name = YT921X_TAG_NAME, + .proto = DSA_TAG_PROTO_YT921X, + .xmit = yt921x_tag_xmit, + .rcv = yt921x_tag_rcv, + .needed_headroom = YT921X_TAG_LEN, +}; + +MODULE_DESCRIPTION("DSA tag driver for Motorcomm YT921x switches"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_YT921X, YT921X_TAG_NAME); + +module_dsa_tag_driver(yt921x_netdev_ops); diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 4e3651101b86..13a63b48b7ee 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -154,9 +154,9 @@ EXPORT_SYMBOL(eth_get_headlen); */ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) { - unsigned short _service_access_point; const unsigned short *sap; const struct ethhdr *eth; + __be16 res; skb->dev = dev; skb_reset_mac_header(skb); @@ -181,15 +181,15 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) * the protocol design and runs IPX over 802.3 without an 802.2 LLC * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This * won't work for fault tolerant netware but does for the rest. + * We use skb->dev as temporary storage to not hit + * CONFIG_STACKPROTECTOR_STRONG=y costs on some platforms. */ - sap = skb_header_pointer(skb, 0, sizeof(*sap), &_service_access_point); - if (sap && *sap == 0xFFFF) - return htons(ETH_P_802_3); + sap = skb_header_pointer(skb, 0, sizeof(*sap), &skb->dev); + res = (sap && *sap == 0xFFFF) ? htons(ETH_P_802_3) : htons(ETH_P_802_2); - /* - * Real 802.2 LLC - */ - return htons(ETH_P_802_2); + /* restore skb->dev in case it was mangled by skb_header_pointer(). 
*/ + skb->dev = dev; + return res; } EXPORT_SYMBOL(eth_type_trans); @@ -613,7 +613,10 @@ EXPORT_SYMBOL(fwnode_get_mac_address); */ int device_get_mac_address(struct device *dev, char *addr) { - return fwnode_get_mac_address(dev_fwnode(dev), addr); + if (!fwnode_get_mac_address(dev_fwnode(dev), addr)) + return 0; + + return nvmem_get_mac_address(dev, addr); } EXPORT_SYMBOL(device_get_mac_address); diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index a1490c4afe6b..629c10916670 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -8,5 +8,5 @@ ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \ - module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o mm.o \ - phy.o tsconfig.o + module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o \ + phy.o tsconfig.o mse.o diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 4f58648a27ad..369c05cf8163 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -233,6 +233,10 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(800000, DR4_2, Full), __DEFINE_LINK_MODE_NAME(800000, SR4, Full), __DEFINE_LINK_MODE_NAME(800000, VR4, Full), + __DEFINE_LINK_MODE_NAME(1600000, CR8, Full), + __DEFINE_LINK_MODE_NAME(1600000, KR8, Full), + __DEFINE_LINK_MODE_NAME(1600000, DR8, Full), + __DEFINE_LINK_MODE_NAME(1600000, DR8_2, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -422,6 +426,10 @@ const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(800000, DR4_2, Full), __DEFINE_LINK_MODE_PARAMS(800000, SR4, Full), __DEFINE_LINK_MODE_PARAMS(800000, VR4, Full), + __DEFINE_LINK_MODE_PARAMS(1600000, CR8, Full), + __DEFINE_LINK_MODE_PARAMS(1600000, KR8, Full), + __DEFINE_LINK_MODE_PARAMS(1600000, DR8, Full), + __DEFINE_LINK_MODE_PARAMS(1600000, DR8_2, Full), }; static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS); EXPORT_SYMBOL_GPL(link_mode_params); @@ -577,6 +585,26 @@ int __ethtool_get_link(struct net_device *dev) return netif_running(dev) && dev->ethtool_ops->get_link(dev); } +int ethtool_get_rx_ring_count(struct net_device *dev) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxnfc rx_rings = {}; + int ret; + + if (ops->get_rx_ring_count) + return ops->get_rx_ring_count(dev); + + if (!ops->get_rxnfc) + return -EOPNOTSUPP; + + rx_rings.cmd = ETHTOOL_GRXRINGS; + ret = ops->get_rxnfc(dev, &rx_rings, NULL); + if (ret < 0) + return ret; + + return rx_rings.data; +} + static int ethtool_get_rxnfc_rule_count(struct net_device *dev) { const struct ethtool_ops *ops = dev->ethtool_ops; @@ -905,7 +933,7 @@ int ethtool_net_get_ts_info_by_phc(struct net_device *dev, int err; if (!ops->get_ts_info) - return -ENODEV; + return -EOPNOTSUPP; /* Does ptp comes from netdev */ ethtool_init_tsinfo(info); @@ -973,7 +1001,7 @@ int ethtool_get_ts_info_by_phc(struct net_device *dev, int err; err = ethtool_net_get_ts_info_by_phc(dev, info, hwprov_desc); - if (err == -ENODEV) { + if (err == -ENODEV || err == -EOPNOTSUPP) { struct phy_device *phy; phy = ethtool_phy_get_ts_info_by_phc(dev, info, hwprov_desc); diff --git a/net/ethtool/common.h b/net/ethtool/common.h index c4d084dde5bf..1609cf4e53eb 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -54,6 +54,8 @@ void ethtool_ringparam_get_cfg(struct net_device *dev, struct 
kernel_ethtool_ringparam *kparam, struct netlink_ext_ack *extack); +int ethtool_get_rx_ring_count(struct net_device *dev); + int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info); int ethtool_get_ts_info_by_phc(struct net_device *dev, struct kernel_ethtool_ts_info *info, diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c index e7d3f2c352a3..4669e74cbcaa 100644 --- a/net/ethtool/fec.c +++ b/net/ethtool/fec.c @@ -17,6 +17,7 @@ struct fec_reply_data { u64 stats[1 + ETHTOOL_MAX_LANES]; u8 cnt; } corr, uncorr, corr_bits; + struct ethtool_fec_hist fec_stat_hist; }; #define FEC_REPDATA(__reply_base) \ @@ -113,7 +114,10 @@ static int fec_prepare_data(const struct ethnl_req_info *req_base, struct ethtool_fec_stats stats; ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8); - dev->ethtool_ops->get_fec_stats(dev, &stats); + ethtool_stats_init((u64 *)data->fec_stat_hist.values, + sizeof(data->fec_stat_hist.values) / 8); + dev->ethtool_ops->get_fec_stats(dev, &stats, + &data->fec_stat_hist); fec_stats_recalc(&data->corr, &stats.corrected_blocks); fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks); @@ -157,13 +161,77 @@ static int fec_reply_size(const struct ethnl_req_info *req_base, len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */ nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */ - if (req_base->flags & ETHTOOL_FLAG_STATS) + if (req_base->flags & ETHTOOL_FLAG_STATS) { len += 3 * nla_total_size_64bit(sizeof(u64) * (1 + ETHTOOL_MAX_LANES)); + /* add FEC bins information */ + len += (nla_total_size(0) + /* _A_FEC_HIST */ + nla_total_size(4) + /* _A_FEC_HIST_BIN_LOW */ + nla_total_size(4) + /* _A_FEC_HIST_BIN_HI */ + /* _A_FEC_HIST_BIN_VAL + per-lane values */ + nla_total_size_64bit(sizeof(u64)) + + nla_total_size_64bit(sizeof(u64) * ETHTOOL_MAX_LANES)) * + ETHTOOL_FEC_HIST_MAX; + } return len; } +static int fec_put_hist(struct sk_buff *skb, + const struct ethtool_fec_hist *hist) +{ + const struct ethtool_fec_hist_range *ranges = hist->ranges; + const struct ethtool_fec_hist_value *values = hist->values; + struct nlattr *nest; + int i, j; + u64 sum; + + if (!ranges) + return 0; + + for (i = 0; i < ETHTOOL_FEC_HIST_MAX; i++) { + if (i && !ranges[i].low && !ranges[i].high) + break; + + if (WARN_ON_ONCE(values[i].sum == ETHTOOL_STAT_NOT_SET && + values[i].per_lane[0] == ETHTOOL_STAT_NOT_SET)) + break; + + nest = nla_nest_start(skb, ETHTOOL_A_FEC_STAT_HIST); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_LOW, + ranges[i].low) || + nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_HIGH, + ranges[i].high)) + goto err_cancel_hist; + sum = 0; + for (j = 0; j < ETHTOOL_MAX_LANES; j++) { + if (values[i].per_lane[j] == ETHTOOL_STAT_NOT_SET) + break; + sum += values[i].per_lane[j]; + } + if (nla_put_uint(skb, ETHTOOL_A_FEC_HIST_BIN_VAL, + values[i].sum == ETHTOOL_STAT_NOT_SET ? 
+ sum : values[i].sum)) + goto err_cancel_hist; + if (j && nla_put_64bit(skb, ETHTOOL_A_FEC_HIST_BIN_VAL_PER_LANE, + sizeof(u64) * j, + values[i].per_lane, + ETHTOOL_A_FEC_HIST_PAD)) + goto err_cancel_hist; + + nla_nest_end(skb, nest); + } + + return 0; + +err_cancel_hist: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) { struct nlattr *nest; @@ -183,6 +251,9 @@ static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD)) goto err_cancel; + if (fec_put_hist(skb, &data->fec_stat_hist)) + goto err_cancel; + nla_nest_end(skb, nest); return 0; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 43a7854e784e..fa83ddade4f8 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1014,6 +1014,28 @@ static bool flow_type_hashable(u32 flow_type) return false; } +static bool flow_type_v6(u32 flow_type) +{ + switch (flow_type) { + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + case AH_ESP_V6_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case IPV6_FLOW: + case GTPU_V6_FLOW: + case GTPC_V6_FLOW: + case GTPC_TEID_V6_FLOW: + case GTPU_EH_V6_FLOW: + case GTPU_UL_V6_FLOW: + case GTPU_DL_V6_FLOW: + return true; + } + + return false; +} + /* When adding a new type, update the assert and, if it's hashable, add it to * the flow_type_hashable switch case. */ @@ -1077,6 +1099,9 @@ ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr) if (rc) return rc; + if (info.data & RXH_IP6_FL && !flow_type_v6(info.flow_type)) + return -EINVAL; + if (info.flow_type & FLOW_RSS && info.rss_context && !ops->rxfh_per_ctx_fields) return -EINVAL; @@ -1183,18 +1208,41 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, return 0; } +static noinline_for_stack int ethtool_get_rxrings(struct net_device *dev, + u32 cmd, + void __user *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size; + int ret; + + info_size = sizeof(info); + ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); + if (ret) + return ret; + + ret = ethtool_get_rx_ring_count(dev); + if (ret < 0) + return ret; + + info.data = ret; + + return ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL); +} + static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { - struct ethtool_rxnfc info; - size_t info_size = sizeof(info); const struct ethtool_ops *ops = dev->ethtool_ops; - int ret; + struct ethtool_rxnfc info; void *rule_buf = NULL; + size_t info_size; + int ret; if (!ops->get_rxnfc) return -EOPNOTSUPP; + info_size = sizeof(info); ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); if (ret) return ret; @@ -1221,8 +1269,8 @@ err_out: } static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, - struct ethtool_rxnfc *rx_rings, - u32 size) + int num_rx_rings, + u32 size) { int i; @@ -1231,7 +1279,7 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, /* Validate ring indices */ for (i = 0; i < size; i++) - if (indir[i] >= rx_rings->data) + if (indir[i] >= num_rx_rings) return -EINVAL; return 0; @@ -1302,13 +1350,12 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxfh_param rxfh_dev = {}; struct netlink_ext_ack *extack = NULL; - struct ethtool_rxnfc rx_rings; + int num_rx_rings; u32 user_size, i; int ret; u32 
ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); - if (!ops->get_rxfh_indir_size || !ops->set_rxfh || - !ops->get_rxnfc) + if (!ops->get_rxfh_indir_size || !ops->set_rxfh) return -EOPNOTSUPP; rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev); @@ -1328,20 +1375,21 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, if (!rxfh_dev.indir) return -ENOMEM; - rx_rings.cmd = ETHTOOL_GRXRINGS; - ret = ops->get_rxnfc(dev, &rx_rings, NULL); - if (ret) + num_rx_rings = ethtool_get_rx_ring_count(dev); + if (num_rx_rings < 0) { + ret = num_rx_rings; goto out; + } if (user_size == 0) { u32 *indir = rxfh_dev.indir; for (i = 0; i < rxfh_dev.indir_size; i++) - indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + indir[i] = ethtool_rxfh_indir_default(i, num_rx_rings); } else { ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + ringidx_offset, - &rx_rings, + num_rx_rings, rxfh_dev.indir_size); if (ret) goto out; @@ -1483,14 +1531,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param rxfh_dev = {}; struct ethtool_rxfh_context *ctx = NULL; struct netlink_ext_ack *extack = NULL; - struct ethtool_rxnfc rx_rings; struct ethtool_rxfh rxfh; bool create = false; + int num_rx_rings; u8 *rss_config; int ntf = 0; int ret; - if (!ops->get_rxnfc || !ops->set_rxfh) + if (!ops->set_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) @@ -1546,10 +1594,11 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (!rss_config) return -ENOMEM; - rx_rings.cmd = ETHTOOL_GRXRINGS; - ret = ops->get_rxnfc(dev, &rx_rings, NULL); - if (ret) + num_rx_rings = ethtool_get_rx_ring_count(dev); + if (num_rx_rings < 0) { + ret = num_rx_rings; goto out_free; + } /* rxfh.indir_size == 0 means reset the indir table to default (master * context) or delete the context (other RSS contexts). 
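Both ethtool_set_rxfh_indir() and ethtool_set_rxfh() above now obtain the ring count through ethtool_get_rx_ring_count(), which prefers the new get_rx_ring_count op and only falls back to the ETHTOOL_GRXRINGS path of get_rxnfc. A sketch of the driver side, under the assumption (taken from the call site) that the op receives only the netdev and returns the number of RX rings; the driver name, ops table and private field are hypothetical:

    static u32 foo_get_rx_ring_count(struct net_device *dev)
    {
            struct foo_priv *priv = netdev_priv(dev);

            return priv->num_rx_rings;      /* hypothetical field */
    }

    static const struct ethtool_ops foo_ethtool_ops = {
            .get_rx_ring_count      = foo_get_rx_ring_count,
            /* .get_rxnfc is no longer needed just to answer ETHTOOL_GRXRINGS */
    };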
@@ -1562,7 +1611,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh_dev.indir_size = dev_indir_size; ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + rss_cfg_offset, - &rx_rings, + num_rx_rings, rxfh.indir_size); if (ret) goto out_free; @@ -1574,7 +1623,8 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh_dev.indir_size = dev_indir_size; indir = rxfh_dev.indir; for (i = 0; i < dev_indir_size; i++) - indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + indir[i] = + ethtool_rxfh_indir_default(i, num_rx_rings); } else { rxfh_dev.rss_delete = true; } @@ -3352,6 +3402,8 @@ __dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr, rc = ethtool_set_rxfh_fields(dev, ethcmd, useraddr); break; case ETHTOOL_GRXRINGS: + rc = ethtool_get_rxrings(dev, ethcmd, useraddr); + break; case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: diff --git a/net/ethtool/mse.c b/net/ethtool/mse.c new file mode 100644 index 000000000000..6aac004c3ffc --- /dev/null +++ b/net/ethtool/mse.c @@ -0,0 +1,329 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ethtool.h> +#include <linux/phy.h> +#include <linux/slab.h> + +#include "netlink.h" +#include "common.h" + +/* Channels A-D only; WORST and LINK are exclusive alternatives */ +#define PHY_MSE_CHANNEL_COUNT 4 + +struct mse_req_info { + struct ethnl_req_info base; +}; + +struct mse_snapshot_entry { + struct phy_mse_snapshot snapshot; + int channel; +}; + +struct mse_reply_data { + struct ethnl_reply_data base; + struct phy_mse_capability capability; + struct mse_snapshot_entry *snapshots; + unsigned int num_snapshots; +}; + +static struct mse_reply_data * +mse_repdata(const struct ethnl_reply_data *reply_base) +{ + return container_of(reply_base, struct mse_reply_data, base); +} + +const struct nla_policy ethnl_mse_get_policy[] = { + [ETHTOOL_A_MSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy_phy), +}; + +static int get_snapshot_if_supported(struct phy_device *phydev, + struct mse_reply_data *data, + unsigned int *idx, u32 cap_bit, + enum phy_mse_channel channel) +{ + int ret; + + if (data->capability.supported_caps & cap_bit) { + ret = phydev->drv->get_mse_snapshot(phydev, channel, + &data->snapshots[*idx].snapshot); + if (ret) + return ret; + data->snapshots[*idx].channel = channel; + (*idx)++; + } + + return 0; +} + +static int mse_get_channels(struct phy_device *phydev, + struct mse_reply_data *data) +{ + unsigned int i = 0; + int ret; + + if (!data->capability.supported_caps) + return 0; + + data->snapshots = kcalloc(PHY_MSE_CHANNEL_COUNT, + sizeof(*data->snapshots), GFP_KERNEL); + if (!data->snapshots) + return -ENOMEM; + + /* Priority 1: Individual channels */ + ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_A, + PHY_MSE_CHANNEL_A); + if (ret) + return ret; + ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_B, + PHY_MSE_CHANNEL_B); + if (ret) + return ret; + ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_C, + PHY_MSE_CHANNEL_C); + if (ret) + return ret; + ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_CHANNEL_D, + PHY_MSE_CHANNEL_D); + if (ret) + return ret; + + /* If any individual channels were found, we are done. */ + if (i > 0) { + data->num_snapshots = i; + return 0; + } + + /* Priority 2: Worst channel, if no individual channels supported. 
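For the driver side of the ops consumed by mse_prepare_data() and mse_get_channels() above: the capability bits, channel enum and structure members used below are the ones visible in this file, while the driver itself, its register and the numeric values are purely illustrative. A PHY that can only report a link-wide average MSE would be served by the link-wide fallback further down in mse_get_channels():

    static int foo_get_mse_capability(struct phy_device *phydev,
                                      struct phy_mse_capability *cap)
    {
            cap->supported_caps = PHY_MSE_CAP_LINK | PHY_MSE_CAP_AVG;
            cap->max_average_mse = 0xffff;          /* device-specific scale */
            cap->refresh_rate_ps = 1000000000ULL;   /* one snapshot per 1 ms */
            cap->num_symbols = 8000;                /* symbols per measurement */
            return 0;
    }

    static int foo_get_mse_snapshot(struct phy_device *phydev,
                                    enum phy_mse_channel channel,
                                    struct phy_mse_snapshot *snapshot)
    {
            int val;

            if (channel != PHY_MSE_CHANNEL_LINK)
                    return -EOPNOTSUPP;

            val = phy_read(phydev, FOO_MSE_REG);    /* FOO_MSE_REG: hypothetical */
            if (val < 0)
                    return val;

            snapshot->average_mse = val;
            return 0;
    }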
*/ + ret = get_snapshot_if_supported(phydev, data, &i, + PHY_MSE_CAP_WORST_CHANNEL, + PHY_MSE_CHANNEL_WORST); + if (ret) + return ret; + + /* If worst channel was found, we are done. */ + if (i > 0) { + data->num_snapshots = i; + return 0; + } + + /* Priority 3: Link-wide, if nothing else is supported. */ + ret = get_snapshot_if_supported(phydev, data, &i, PHY_MSE_CAP_LINK, + PHY_MSE_CHANNEL_LINK); + if (ret) + return ret; + + data->num_snapshots = i; + return 0; +} + +static int mse_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + const struct genl_info *info) +{ + struct mse_reply_data *data = mse_repdata(reply_base); + struct net_device *dev = reply_base->dev; + struct phy_device *phydev; + int ret; + + phydev = ethnl_req_get_phydev(req_base, info->attrs, + ETHTOOL_A_MSE_HEADER, info->extack); + if (IS_ERR(phydev)) + return PTR_ERR(phydev); + if (!phydev) + return -EOPNOTSUPP; + + ret = ethnl_ops_begin(dev); + if (ret) + return ret; + + mutex_lock(&phydev->lock); + + if (!phydev->drv || !phydev->drv->get_mse_capability || + !phydev->drv->get_mse_snapshot) { + ret = -EOPNOTSUPP; + goto out_unlock; + } + if (!phydev->link) { + ret = -ENETDOWN; + goto out_unlock; + } + + ret = phydev->drv->get_mse_capability(phydev, &data->capability); + if (ret) + goto out_unlock; + + ret = mse_get_channels(phydev, data); + +out_unlock: + mutex_unlock(&phydev->lock); + ethnl_ops_complete(dev); + if (ret) + kfree(data->snapshots); + return ret; +} + +static void mse_cleanup_data(struct ethnl_reply_data *reply_base) +{ + struct mse_reply_data *data = mse_repdata(reply_base); + + kfree(data->snapshots); +} + +static int mse_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct mse_reply_data *data = mse_repdata(reply_base); + size_t len = 0; + unsigned int i; + + /* ETHTOOL_A_MSE_CAPABILITIES */ + len += nla_total_size(0); + if (data->capability.supported_caps & PHY_MSE_CAP_AVG) + /* ETHTOOL_A_MSE_CAPABILITIES_MAX_AVERAGE_MSE */ + len += nla_total_size(sizeof(u64)); + if (data->capability.supported_caps & (PHY_MSE_CAP_PEAK | + PHY_MSE_CAP_WORST_PEAK)) + /* ETHTOOL_A_MSE_CAPABILITIES_MAX_PEAK_MSE */ + len += nla_total_size(sizeof(u64)); + /* ETHTOOL_A_MSE_CAPABILITIES_REFRESH_RATE_PS */ + len += nla_total_size(sizeof(u64)); + /* ETHTOOL_A_MSE_CAPABILITIES_NUM_SYMBOLS */ + len += nla_total_size(sizeof(u64)); + + for (i = 0; i < data->num_snapshots; i++) { + size_t snapshot_len = 0; + + /* Per-channel nest (e.g., ETHTOOL_A_MSE_CHANNEL_A / _B / _C / + * _D / _WORST_CHANNEL / _LINK) + */ + snapshot_len += nla_total_size(0); + + if (data->capability.supported_caps & PHY_MSE_CAP_AVG) + snapshot_len += nla_total_size(sizeof(u64)); + if (data->capability.supported_caps & PHY_MSE_CAP_PEAK) + snapshot_len += nla_total_size(sizeof(u64)); + if (data->capability.supported_caps & PHY_MSE_CAP_WORST_PEAK) + snapshot_len += nla_total_size(sizeof(u64)); + + len += snapshot_len; + } + + return len; +} + +static int mse_channel_to_attr(int ch) +{ + switch (ch) { + case PHY_MSE_CHANNEL_A: + return ETHTOOL_A_MSE_CHANNEL_A; + case PHY_MSE_CHANNEL_B: + return ETHTOOL_A_MSE_CHANNEL_B; + case PHY_MSE_CHANNEL_C: + return ETHTOOL_A_MSE_CHANNEL_C; + case PHY_MSE_CHANNEL_D: + return ETHTOOL_A_MSE_CHANNEL_D; + case PHY_MSE_CHANNEL_WORST: + return ETHTOOL_A_MSE_WORST_CHANNEL; + case PHY_MSE_CHANNEL_LINK: + return ETHTOOL_A_MSE_LINK; + default: + return -EINVAL; + } +} + +static int mse_fill_reply(struct sk_buff *skb, + const struct 
ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct mse_reply_data *data = mse_repdata(reply_base); + struct nlattr *nest; + unsigned int i; + int ret; + + nest = nla_nest_start(skb, ETHTOOL_A_MSE_CAPABILITIES); + if (!nest) + return -EMSGSIZE; + + if (data->capability.supported_caps & PHY_MSE_CAP_AVG) { + ret = nla_put_uint(skb, + ETHTOOL_A_MSE_CAPABILITIES_MAX_AVERAGE_MSE, + data->capability.max_average_mse); + if (ret < 0) + goto nla_put_nest_failure; + } + + if (data->capability.supported_caps & (PHY_MSE_CAP_PEAK | + PHY_MSE_CAP_WORST_PEAK)) { + ret = nla_put_uint(skb, ETHTOOL_A_MSE_CAPABILITIES_MAX_PEAK_MSE, + data->capability.max_peak_mse); + if (ret < 0) + goto nla_put_nest_failure; + } + + ret = nla_put_uint(skb, ETHTOOL_A_MSE_CAPABILITIES_REFRESH_RATE_PS, + data->capability.refresh_rate_ps); + if (ret < 0) + goto nla_put_nest_failure; + + ret = nla_put_uint(skb, ETHTOOL_A_MSE_CAPABILITIES_NUM_SYMBOLS, + data->capability.num_symbols); + if (ret < 0) + goto nla_put_nest_failure; + + nla_nest_end(skb, nest); + + for (i = 0; i < data->num_snapshots; i++) { + const struct mse_snapshot_entry *s = &data->snapshots[i]; + int chan_attr; + + chan_attr = mse_channel_to_attr(s->channel); + if (chan_attr < 0) + return chan_attr; + + nest = nla_nest_start(skb, chan_attr); + if (!nest) + return -EMSGSIZE; + + if (data->capability.supported_caps & PHY_MSE_CAP_AVG) { + ret = nla_put_uint(skb, + ETHTOOL_A_MSE_SNAPSHOT_AVERAGE_MSE, + s->snapshot.average_mse); + if (ret) + goto nla_put_nest_failure; + } + if (data->capability.supported_caps & PHY_MSE_CAP_PEAK) { + ret = nla_put_uint(skb, ETHTOOL_A_MSE_SNAPSHOT_PEAK_MSE, + s->snapshot.peak_mse); + if (ret) + goto nla_put_nest_failure; + } + if (data->capability.supported_caps & PHY_MSE_CAP_WORST_PEAK) { + ret = nla_put_uint(skb, + ETHTOOL_A_MSE_SNAPSHOT_WORST_PEAK_MSE, + s->snapshot.worst_peak_mse); + if (ret) + goto nla_put_nest_failure; + } + + nla_nest_end(skb, nest); + } + + return 0; + +nla_put_nest_failure: + nla_nest_cancel(skb, nest); + return ret; +} + +const struct ethnl_request_ops ethnl_mse_request_ops = { + .request_cmd = ETHTOOL_MSG_MSE_GET, + .reply_cmd = ETHTOOL_MSG_MSE_GET_REPLY, + .hdr_attr = ETHTOOL_A_MSE_HEADER, + .req_info_size = sizeof(struct mse_req_info), + .reply_data_size = sizeof(struct mse_reply_data), + + .prepare_data = mse_prepare_data, + .cleanup_data = mse_cleanup_data, + .reply_size = mse_reply_size, + .fill_reply = mse_fill_reply, +}; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 2f813f25f07e..6e5f0f4f815a 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -420,6 +420,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_TSCONFIG_GET] = ðnl_tsconfig_request_ops, [ETHTOOL_MSG_TSCONFIG_SET] = ðnl_tsconfig_request_ops, [ETHTOOL_MSG_PHY_GET] = ðnl_phy_request_ops, + [ETHTOOL_MSG_MSE_GET] = ðnl_mse_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -1534,6 +1535,15 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_rss_delete_policy, .maxattr = ARRAY_SIZE(ethnl_rss_delete_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_MSE_GET, + .doit = ethnl_default_doit, + .start = ethnl_perphy_start, + .dumpit = ethnl_perphy_dumpit, + .done = ethnl_perphy_done, + .policy = ethnl_mse_get_policy, + .maxattr = ARRAY_SIZE(ethnl_mse_get_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 
1d4f9ecb3d26..89010eaa67df 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -442,6 +442,7 @@ extern const struct ethnl_request_ops ethnl_plca_status_request_ops; extern const struct ethnl_request_ops ethnl_mm_request_ops; extern const struct ethnl_request_ops ethnl_phy_request_ops; extern const struct ethnl_request_ops ethnl_tsconfig_request_ops; +extern const struct ethnl_request_ops ethnl_mse_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -497,6 +498,7 @@ extern const struct nla_policy ethnl_module_fw_flash_act_policy[ETHTOOL_A_MODULE extern const struct nla_policy ethnl_phy_get_policy[ETHTOOL_A_PHY_HEADER + 1]; extern const struct nla_policy ethnl_tsconfig_get_policy[ETHTOOL_A_TSCONFIG_HEADER + 1]; extern const struct nla_policy ethnl_tsconfig_set_policy[ETHTOOL_A_TSCONFIG_MAX + 1]; +extern const struct nla_policy ethnl_mse_get_policy[ETHTOOL_A_MSE_HEADER + 1]; int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 992e98abe9dd..4dced53be4b3 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -536,35 +536,36 @@ void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context) #define RFH_MASK (RXH_L2DA | RXH_VLAN | RXH_IP_SRC | RXH_IP_DST | \ RXH_L3_PROTO | RXH_L4_B_0_1 | RXH_L4_B_2_3 | \ RXH_GTP_TEID | RXH_DISCARD) +#define RFH_MASKv6 (RFH_MASK | RXH_IP6_FL) static const struct nla_policy ethnl_rss_flows_policy[] = { [ETHTOOL_A_FLOW_ETHER] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_IP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_TCP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_UDP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_SCTP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_AH_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_AH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPC4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPC_TEID4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, 
RFH_MASK), + [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_EH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_UL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_DL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), }; const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1] = { @@ -619,23 +620,22 @@ rss_set_prep_indir(struct net_device *dev, struct genl_info *info, struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh, bool *reset, bool *mod) { - const struct ethtool_ops *ops = dev->ethtool_ops; struct netlink_ext_ack *extack = info->extack; struct nlattr **tb = info->attrs; - struct ethtool_rxnfc rx_rings; size_t alloc_size; + int num_rx_rings; u32 user_size; int i, err; if (!tb[ETHTOOL_A_RSS_INDIR]) return 0; - if (!data->indir_size || !ops->get_rxnfc) + if (!data->indir_size) return -EOPNOTSUPP; - rx_rings.cmd = ETHTOOL_GRXRINGS; - err = ops->get_rxnfc(dev, &rx_rings, NULL); - if (err) + err = ethtool_get_rx_ring_count(dev); + if (err < 0) return err; + num_rx_rings = err; if (nla_len(tb[ETHTOOL_A_RSS_INDIR]) % 4) { NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_INDIR]); @@ -664,7 +664,7 @@ rss_set_prep_indir(struct net_device *dev, struct genl_info *info, nla_memcpy(rxfh->indir, tb[ETHTOOL_A_RSS_INDIR], alloc_size); for (i = 0; i < user_size; i++) { - if (rxfh->indir[i] < rx_rings.data) + if (rxfh->indir[i] < num_rx_rings) continue; NL_SET_ERR_MSG_ATTR_FMT(extack, tb[ETHTOOL_A_RSS_INDIR], @@ -681,7 +681,7 @@ rss_set_prep_indir(struct net_device *dev, struct genl_info *info, } else { for (i = 0; i < data->indir_size; i++) rxfh->indir[i] = - ethtool_rxfh_indir_default(i, rx_rings.data); + ethtool_rxfh_indir_default(i, num_rx_rings); } *mod |= memcmp(rxfh->indir, data->indir_table, data->indir_size); diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c index 2be356bdfe87..169b413b31fc 100644 --- a/net/ethtool/tsconfig.c +++ b/net/ethtool/tsconfig.c @@ -423,13 +423,11 @@ static int ethnl_set_tsconfig(struct ethnl_req_info *req_base, return ret; } - if (hwprov_mod || config_mod) { - ret = tsconfig_send_reply(dev, info); - if (ret && ret != -EOPNOTSUPP) { - NL_SET_ERR_MSG(info->extack, - "error while reading the new configuration set"); - return ret; - } + ret = tsconfig_send_reply(dev, info); + if (ret && ret != -EOPNOTSUPP) { + NL_SET_ERR_MSG(info->extack, + "error while reading the new configuration set"); + return ret; } /* tsconfig has no notification */ diff --git a/net/handshake/genl.c b/net/handshake/genl.c index f55d14d7b726..870612609491 100644 --- a/net/handshake/genl.c +++ b/net/handshake/genl.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/handshake.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include <net/netlink.h> #include <net/genetlink.h> diff --git a/net/handshake/genl.h b/net/handshake/genl.h index ae72a596f6cc..8d3e18672daf 100644 --- a/net/handshake/genl.h +++ b/net/handshake/genl.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* 
Documentation/netlink/specs/handshake.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_HANDSHAKE_GEN_H #define _LINUX_HANDSHAKE_GEN_H diff --git a/net/handshake/netlink.c b/net/handshake/netlink.c index 7e46d130dce2..1d33a4675a48 100644 --- a/net/handshake/netlink.c +++ b/net/handshake/netlink.c @@ -93,7 +93,7 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info) struct handshake_net *hn = handshake_pernet(net); struct handshake_req *req = NULL; struct socket *sock; - int class, fd, err; + int class, err; err = -EOPNOTSUPP; if (!hn) @@ -106,27 +106,25 @@ int handshake_nl_accept_doit(struct sk_buff *skb, struct genl_info *info) err = -EAGAIN; req = handshake_req_next(hn, class); - if (!req) - goto out_status; - - sock = req->hr_sk->sk_socket; - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) { - err = fd; - goto out_complete; - } - - err = req->hr_proto->hp_accept(req, info, fd); - if (err) { - put_unused_fd(fd); - goto out_complete; + if (req) { + sock = req->hr_sk->sk_socket; + + FD_PREPARE(fdf, O_CLOEXEC, sock->file); + if (fdf.err) { + err = fdf.err; + goto out_complete; + } + + get_file(sock->file); /* FD_PREPARE() consumes a reference. */ + err = req->hr_proto->hp_accept(req, info, fd_prepare_fd(fdf)); + if (err) + goto out_complete; /* Automatic cleanup handles fput */ + + trace_handshake_cmd_accept(net, req, req->hr_sk, fd_prepare_fd(fdf)); + fd_publish(fdf); + return 0; } - fd_install(fd, get_file(sock->file)); - - trace_handshake_cmd_accept(net, req, req->hr_sk, fd); - return 0; - out_complete: handshake_complete(req, -EIO, NULL); out_status: diff --git a/net/handshake/tlshd.c b/net/handshake/tlshd.c index 081093dfd553..8f9532a15f43 100644 --- a/net/handshake/tlshd.c +++ b/net/handshake/tlshd.c @@ -259,6 +259,7 @@ static int tls_handshake_accept(struct handshake_req *req, out_cancel: genlmsg_cancel(msg, hdr); + nlmsg_free(msg); out: return ret; } diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 88657255fec1..d1bfc49b5f01 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -49,7 +49,7 @@ static bool hsr_check_carrier(struct hsr_port *master) ASSERT_RTNL(); - hsr_for_each_port(master->hsr, port) { + hsr_for_each_port_rtnl(master->hsr, port) { if (port->type != HSR_PT_MASTER && is_slave_up(port->dev)) { netif_carrier_on(master->dev); return true; @@ -105,7 +105,7 @@ int hsr_get_max_mtu(struct hsr_priv *hsr) struct hsr_port *port; mtu_max = ETH_DATA_LEN; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type != HSR_PT_MASTER) mtu_max = min(port->dev->mtu, mtu_max); @@ -139,7 +139,7 @@ static int hsr_dev_open(struct net_device *dev) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -172,7 +172,7 @@ static int hsr_dev_close(struct net_device *dev) struct hsr_priv *hsr; hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -205,7 +205,7 @@ static netdev_features_t hsr_features_recompute(struct hsr_priv *hsr, * may become enabled. 
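The hsr_for_each_port() to hsr_for_each_port_rtnl() conversions in this file follow one rule: RTNL-protected control paths (the ndo callbacks touched here) switch to the new iterator, defined in hsr_main.h further below with a lockdep_rtnl_is_held() cookie, while datapath users keep the plain RCU iterator and must hold rcu_read_lock(), as the hsr_dev_xmit() hunk in this file now does. Roughly, with hypothetical helpers standing in for the real bodies:

    /* datapath: RCU read-side critical section */
    rcu_read_lock();
    hsr_for_each_port(hsr, port)
            count_frames(port);             /* hypothetical */
    rcu_read_unlock();

    /* control path: RTNL already held */
    ASSERT_RTNL();
    hsr_for_each_port_rtnl(hsr, port)
            reconfigure_port(port);         /* hypothetical */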
*/ features &= ~NETIF_F_ONE_FOR_ALL; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) features = netdev_increment_features(features, port->dev->features, mask); @@ -226,6 +226,7 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) struct hsr_priv *hsr = netdev_priv(dev); struct hsr_port *master; + rcu_read_lock(); master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); if (master) { skb->dev = master->dev; @@ -238,6 +239,8 @@ static netdev_tx_t hsr_dev_xmit(struct sk_buff *skb, struct net_device *dev) dev_core_stats_tx_dropped_inc(dev); dev_kfree_skb_any(skb); } + rcu_read_unlock(); + return NETDEV_TX_OK; } @@ -317,6 +320,9 @@ static void send_hsr_supervision_frame(struct hsr_port *port, } hsr_stag = skb_put(skb, sizeof(struct hsr_sup_tag)); + skb_set_network_header(skb, ETH_HLEN + HSR_HLEN); + skb_reset_mac_len(skb); + set_hsr_stag_path(hsr_stag, (hsr->prot_version ? 0x0 : 0xf)); set_hsr_stag_HSR_ver(hsr_stag, hsr->prot_version); @@ -331,7 +337,7 @@ static void send_hsr_supervision_frame(struct hsr_port *port, } hsr_stag->tlv.HSR_TLV_type = type; - /* TODO: Why 12 in HSRv0? */ + /* HSRv0 has 6 unused bytes after the MAC */ hsr_stag->tlv.HSR_TLV_length = hsr->prot_version ? sizeof(struct hsr_sup_payload) : 12; @@ -484,7 +490,7 @@ static void hsr_set_rx_mode(struct net_device *dev) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -506,7 +512,7 @@ static void hsr_change_rx_flags(struct net_device *dev, int change) hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER) continue; switch (port->type) { @@ -534,7 +540,7 @@ static int hsr_ndo_vlan_rx_add_vid(struct net_device *dev, hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { if (port->type == HSR_PT_MASTER || port->type == HSR_PT_INTERLINK) continue; @@ -580,7 +586,7 @@ static int hsr_ndo_vlan_rx_kill_vid(struct net_device *dev, hsr = netdev_priv(dev); - hsr_for_each_port(hsr, port) { + hsr_for_each_port_rtnl(hsr, port) { switch (port->type) { case HSR_PT_SLAVE_A: case HSR_PT_SLAVE_B: @@ -672,13 +678,38 @@ struct net_device *hsr_get_port_ndev(struct net_device *ndev, struct hsr_priv *hsr = netdev_priv(ndev); struct hsr_port *port; + rcu_read_lock(); hsr_for_each_port(hsr, port) - if (port->type == pt) + if (port->type == pt) { + dev_hold(port->dev); + rcu_read_unlock(); return port->dev; + } + rcu_read_unlock(); return NULL; } EXPORT_SYMBOL(hsr_get_port_ndev); +int hsr_get_port_type(struct net_device *hsr_dev, struct net_device *dev, + enum hsr_port_type *type) +{ + struct hsr_priv *hsr = netdev_priv(hsr_dev); + struct hsr_port *port; + + rcu_read_lock(); + hsr_for_each_port(hsr, port) { + if (port->dev == dev) { + *type = port->type; + rcu_read_unlock(); + return 0; + } + } + rcu_read_unlock(); + + return -EINVAL; +} +EXPORT_SYMBOL(hsr_get_port_type); + /* Default multicast address for HSR Supervision frames */ static const unsigned char def_multicast_addr[ETH_ALEN] __aligned(2) = { 0x01, 0x15, 0x4e, 0x00, 0x01, 0x00 diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index c67c0d35921d..339f0d220212 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -262,15 +262,23 @@ static struct sk_buff *prp_fill_rct(struct sk_buff *skb, return skb; } -static void hsr_set_path_id(struct hsr_ethhdr *hsr_ethhdr, +static void hsr_set_path_id(struct hsr_frame_info *frame, 
+ struct hsr_ethhdr *hsr_ethhdr, struct hsr_port *port) { int path_id; - if (port->type == HSR_PT_SLAVE_A) - path_id = 0; - else - path_id = 1; + if (port->hsr->prot_version) { + if (port->type == HSR_PT_SLAVE_A) + path_id = 0; + else + path_id = 1; + } else { + if (frame->is_supervision) + path_id = 0xf; + else + path_id = 1; + } set_hsr_tag_path(&hsr_ethhdr->hsr_tag, path_id); } @@ -304,7 +312,7 @@ static struct sk_buff *hsr_fill_tag(struct sk_buff *skb, else hsr_ethhdr = (struct hsr_ethhdr *)pc; - hsr_set_path_id(hsr_ethhdr, port); + hsr_set_path_id(frame, hsr_ethhdr, port); set_hsr_tag_LSDU_size(&hsr_ethhdr->hsr_tag, lsdu_size); hsr_ethhdr->hsr_tag.sequence_nr = htons(frame->sequence_nr); hsr_ethhdr->hsr_tag.encap_proto = hsr_ethhdr->ethhdr.h_proto; @@ -330,7 +338,7 @@ struct sk_buff *hsr_create_tagged_frame(struct hsr_frame_info *frame, (struct hsr_ethhdr *)skb_mac_header(frame->skb_hsr); /* set the lane id properly */ - hsr_set_path_id(hsr_ethhdr, port); + hsr_set_path_id(frame, hsr_ethhdr, port); return skb_clone(frame->skb_hsr, GFP_ATOMIC); } else if (port->dev->features & NETIF_F_HW_HSR_TAG_INS) { return skb_clone(frame->skb_std, GFP_ATOMIC); diff --git a/net/hsr/hsr_main.c b/net/hsr/hsr_main.c index 192893c3f2ec..bc94b07101d8 100644 --- a/net/hsr/hsr_main.c +++ b/net/hsr/hsr_main.c @@ -22,7 +22,7 @@ static bool hsr_slave_empty(struct hsr_priv *hsr) { struct hsr_port *port; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type != HSR_PT_MASTER) return false; return true; @@ -134,7 +134,7 @@ struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt) { struct hsr_port *port; - hsr_for_each_port(hsr, port) + hsr_for_each_port_rtnl(hsr, port) if (port->type == pt) return port; return NULL; diff --git a/net/hsr/hsr_main.h b/net/hsr/hsr_main.h index 135ec5fce019..33b0d2460c9b 100644 --- a/net/hsr/hsr_main.h +++ b/net/hsr/hsr_main.h @@ -224,6 +224,9 @@ struct hsr_priv { #define hsr_for_each_port(hsr, port) \ list_for_each_entry_rcu((port), &(hsr)->ports, port_list) +#define hsr_for_each_port_rtnl(hsr, port) \ + list_for_each_entry_rcu((port), &(hsr)->ports, port_list, lockdep_rtnl_is_held()) + struct hsr_port *hsr_port_get_hsr(struct hsr_priv *hsr, enum hsr_port_type pt); /* Caller must ensure skb is a valid HSR frame */ diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index b120470246cc..db0b0af7a692 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -34,12 +34,18 @@ static int hsr_newlink(struct net_device *dev, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); + struct net_device *link[2], *interlink = NULL; struct nlattr **data = params->data; enum hsr_version proto_version; unsigned char multicast_spec; u8 proto = HSR_PROTOCOL_HSR; - struct net_device *link[2], *interlink = NULL; + if (!net_eq(link_net, dev_net(dev))) { + NL_SET_ERR_MSG_MOD(extack, + "HSR slaves/interlink must be on the same net namespace than HSR link"); + return -EINVAL; + } + if (!data) { NL_SET_ERR_MSG_MOD(extack, "No slave devices specified"); return -EINVAL; @@ -160,12 +166,20 @@ static int hsr_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } + port = hsr_port_get_hsr(hsr, HSR_PT_INTERLINK); + if (port) { + if (nla_put_u32(skb, IFLA_HSR_INTERLINK, port->dev->ifindex)) + goto nla_put_failure; + } + if (nla_put(skb, IFLA_HSR_SUPERVISION_ADDR, ETH_ALEN, hsr->sup_multicast_addr) || nla_put_u16(skb, IFLA_HSR_SEQ_NR, hsr->sequence_nr)) goto nla_put_failure; if 
(hsr->prot_version == PRP_V1) proto = HSR_PROTOCOL_PRP; + else if (nla_put_u8(skb, IFLA_HSR_VERSION, hsr->prot_version)) + goto nla_put_failure; if (nla_put_u8(skb, IFLA_HSR_PROTOCOL, proto)) goto nla_put_failure; diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index b87b6a6fe070..afe06ba00ea4 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -63,8 +63,14 @@ static rx_handler_result_t hsr_handle_frame(struct sk_buff **pskb) skb_push(skb, ETH_HLEN); skb_reset_mac_header(skb); if ((!hsr->prot_version && protocol == htons(ETH_P_PRP)) || - protocol == htons(ETH_P_HSR)) + protocol == htons(ETH_P_HSR)) { + if (!pskb_may_pull(skb, ETH_HLEN + HSR_HLEN)) { + kfree_skb(skb); + goto finish_consume; + } + skb_set_network_header(skb, ETH_HLEN + HSR_HLEN); + } skb_reset_mac_len(skb); /* Only the frames received over the interlink port will assign a @@ -137,6 +143,7 @@ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev, struct netlink_ext_ack *extack) { + struct netdev_lag_upper_info lag_upper_info; struct net_device *hsr_dev; struct hsr_port *master; int res; @@ -153,7 +160,9 @@ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev, master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); hsr_dev = master->dev; - res = netdev_upper_dev_link(dev, hsr_dev, extack); + lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_BROADCAST; + lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN; + res = netdev_master_upper_dev_link(dev, hsr_dev, NULL, &lag_upper_info, extack); if (res) goto fail_upper_dev_link; @@ -198,14 +207,14 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, port->type = type; ether_addr_copy(port->original_macaddress, dev->dev_addr); + list_add_tail_rcu(&port->port_list, &hsr->ports); + if (type != HSR_PT_MASTER) { res = hsr_portdev_setup(hsr, dev, port, extack); if (res) goto fail_dev_setup; } - list_add_tail_rcu(&port->port_list, &hsr->ports); - master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); @@ -213,7 +222,8 @@ int hsr_add_port(struct hsr_priv *hsr, struct net_device *dev, return 0; fail_dev_setup: - kfree(port); + list_del_rcu(&port->port_list); + kfree_rcu(port, rcu); return res; } diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 18d267921bb5..e542fbe113e7 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -96,7 +96,7 @@ static int ieee802154_sock_sendmsg(struct socket *sock, struct msghdr *msg, return sk->sk_prot->sendmsg(sk, msg, len); } -static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr, +static int ieee802154_sock_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; @@ -107,7 +107,7 @@ static int ieee802154_sock_bind(struct socket *sock, struct sockaddr *uaddr, return sock_no_bind(sock, uaddr, addr_len); } -static int ieee802154_sock_connect(struct socket *sock, struct sockaddr *uaddr, +static int ieee802154_sock_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; @@ -193,7 +193,7 @@ static void raw_close(struct sock *sk, long timeout) sk_common_release(sk); } -static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len) +static int raw_bind(struct sock *sk, struct sockaddr_unsized *_uaddr, int len) { struct ieee802154_addr addr; struct sockaddr_ieee802154 *uaddr = (struct sockaddr_ieee802154 *)_uaddr; @@ -227,7 +227,7 @@ out: return err; } -static int 
raw_connect(struct sock *sk, struct sockaddr *uaddr, +static int raw_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { return -ENOTSUPP; @@ -485,7 +485,7 @@ static void dgram_close(struct sock *sk, long timeout) sk_common_release(sk); } -static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len) +static int dgram_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; struct ieee802154_addr haddr; @@ -563,7 +563,7 @@ static int dgram_ioctl(struct sock *sk, int cmd, int *karg) } /* FIXME: autobind */ -static int dgram_connect(struct sock *sk, struct sockaddr *uaddr, +static int dgram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int len) { struct sockaddr_ieee802154 *addr = (struct sockaddr_ieee802154 *)uaddr; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 12850a277251..b71c22475c51 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -760,9 +760,7 @@ config TCP_AO config TCP_MD5SIG bool "TCP: MD5 Signature Option support (RFC2385)" - select CRYPTO - select CRYPTO_MD5 - select TCP_SIGPOOL + select CRYPTO_LIB_MD5 help RFC2385 specifies a method of giving MD5 protection to TCP sessions. Its main (only?) use is to protect BGP sessions between core routers diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 76e38092cd8a..08d811f11896 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -102,6 +102,7 @@ #include <net/gro.h> #include <net/gso.h> #include <net/tcp.h> +#include <net/psp.h> #include <net/udp.h> #include <net/udplite.h> #include <net/ping.h> @@ -158,6 +159,7 @@ void inet_sock_destruct(struct sock *sk) kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); + psp_sk_assoc_free(sk); } EXPORT_SYMBOL(inet_sock_destruct); @@ -439,7 +441,7 @@ int inet_release(struct socket *sock) } EXPORT_SYMBOL(inet_release); -int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int inet_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { u32 flags = BIND_WITH_LOCK; int err; @@ -462,13 +464,13 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) return __inet_bind(sk, uaddr, addr_len, flags); } -int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +int inet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { return inet_bind_sk(sock->sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_bind); -int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, +int __inet_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len, u32 flags) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; @@ -565,7 +567,7 @@ out: return err; } -int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, +int inet_dgram_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; @@ -621,7 +623,7 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) * Connect to a remote host. There is regrettably still a little * TCP 'magic' in here. 
*/ -int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +int __inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags, int is_sendmsg) { struct sock *sk = sock->sk; @@ -739,7 +741,7 @@ sock_error: } EXPORT_SYMBOL(__inet_stream_connect); -int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, +int inet_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { int err; @@ -753,6 +755,11 @@ EXPORT_SYMBOL(inet_stream_connect); void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk) { + if (mem_cgroup_sockets_enabled) { + mem_cgroup_sk_alloc(newsk); + __sk_charge(newsk, GFP_KERNEL); + } + sock_rps_record_flow(newsk); WARN_ON(!((1 << newsk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_RECV | @@ -766,6 +773,7 @@ void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *new newsock->state = SS_CONNECTED; } +EXPORT_SYMBOL_GPL(__inet_accept); /* * Accept a pending connection. The TCP layer now gives BSD semantics. @@ -811,7 +819,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, } sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, + BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len, CGROUP_INET4_GETPEERNAME); } else { __be32 addr = inet->inet_rcv_saddr; @@ -819,7 +827,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; - BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, + BPF_CGROUP_RUN_SA_PROG(sk, sin, &sin_addr_len, CGROUP_INET4_GETSOCKNAME); } release_sock(sk); @@ -1393,14 +1401,10 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); - if (!skb->encapsulation || encap) { - udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); - fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); + fixedid = !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID << encap)); - /* fixed ID is invalid if DF bit is not set */ - if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) - goto out; - } + if (!skb->encapsulation || encap) + udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 5cfc1c939673..7f3863daaa40 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -170,7 +170,7 @@ struct neigh_table arp_tbl = { [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, @@ -1189,7 +1189,7 @@ static int arp_req_get(struct net *net, struct arpreq *r) read_lock_bh(&neigh->lock); memcpy(r->arp_ha.sa_data, neigh->ha, - min(dev->addr_len, sizeof(r->arp_ha.sa_data_min))); + min(dev->addr_len, sizeof(r->arp_ha.sa_data))); r->arp_flags = arp_state_to_flags(neigh); read_unlock_bh(&neigh->lock); @@ -1217,10 +1217,10 @@ int arp_invalidate(struct net_device *dev, __be32 ip, bool force) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN, 0); - write_lock_bh(&tbl->lock); + spin_lock_bh(&tbl->lock); neigh_release(neigh); neigh_remove_one(neigh); - write_unlock_bh(&tbl->lock); + 
spin_unlock_bh(&tbl->lock); } return err; diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 740af8541d2f..709021197e1c 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1715,8 +1715,7 @@ validate_return: */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { - unsigned char optbuf[sizeof(struct ip_options) + 40]; - struct ip_options *opt = (struct ip_options *)optbuf; + struct inet_skb_parm parm; int res; if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) @@ -1727,19 +1726,19 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) * so we can not use icmp_send and IPCB here. */ - memset(opt, 0, sizeof(struct ip_options)); - opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); + memset(&parm, 0, sizeof(parm)); + parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); - res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL); + res = __ip_options_compile(dev_net(skb->dev), &parm.opt, skb, NULL); rcu_read_unlock(); if (res) return; if (gateway) - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &parm); else - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &parm); } /** diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index c2b2cda1a7e5..1614593b6d72 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -16,7 +16,7 @@ #include <net/tcp_states.h> #include <net/sock_reuseport.h> -int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int __ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; @@ -84,7 +84,7 @@ out: } EXPORT_SYMBOL(__ip4_datagram_connect); -int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int ip4_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { int res; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c47d3828d4f6..942a887bf089 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -340,14 +340,13 @@ static void inetdev_destroy(struct in_device *in_dev) static int __init inet_blackhole_dev_init(void) { - int err = 0; + struct in_device *in_dev; rtnl_lock(); - if (!inetdev_init(blackhole_netdev)) - err = -ENOMEM; + in_dev = inetdev_init(blackhole_netdev); rtnl_unlock(); - return err; + return PTR_ERR_OR_ZERO(in_dev); } late_initcall(inet_blackhole_dev_init); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index f14a41ee4aa1..2c922afadb8f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -132,8 +132,8 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x) dport = encap->encap_dport; spin_unlock_bh(&x->lock); - sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4, - dport, x->props.saddr.a4, sport, 0); + sk = inet_lookup_established(net, x->id.daddr.a4, dport, + x->props.saddr.a4, sport, 0); if (!sk) return ERR_PTR(-ENOENT); diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index e0d94270da28..05828d4cb6cd 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -122,8 +122,10 @@ static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __be16 type = x->inner_mode.family == AF_INET6 ? 
htons(ETH_P_IPV6) - : htons(ETH_P_IP); + const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x, + XFRM_MODE_SKB_CB(skb)->protocol); + __be16 type = inner_mode->family == AF_INET6 ? htons(ETH_P_IPV6) + : htons(ETH_P_IP); return skb_eth_gso_segment(skb, features, type); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6e1b94796f67..1dab44e13d3b 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -32,6 +32,7 @@ #include <linux/list.h> #include <linux/slab.h> +#include <net/flow.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> @@ -293,7 +294,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))), + .flowi4_dscp = ip4h_dscp(ip_hdr(skb)), .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; @@ -358,7 +359,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; - fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_dscp = dscp; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; @@ -1372,7 +1373,7 @@ static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) struct flowi4 fl4 = { .flowi4_mark = frn->fl_mark, .daddr = frn->fl_addr, - .flowi4_tos = frn->fl_tos & INET_DSCP_MASK, + .flowi4_dscp = inet_dsfield_to_dscp(frn->fl_tos), .flowi4_scope = frn->fl_scope, }; struct fib_table *tb; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index fa58d6620ed6..51f0193092f0 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -23,6 +23,7 @@ #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/export.h> +#include <net/flow.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/route.h> @@ -193,8 +194,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, * to mask the upper three DSCP bits prior to matching to maintain * legacy behavior. */ - if (r->dscp_full && - (r->dscp ^ inet_dsfield_to_dscp(fl4->flowi4_tos)) & r->dscp_mask) + if (r->dscp_full && (r->dscp ^ fl4->flowi4_dscp) & r->dscp_mask) return 0; else if (!r->dscp_full && r->dscp && !fib_dscp_masked_match(r->dscp, fl4)) diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c index 3e30745e2c09..3970b6b7ace5 100644 --- a/net/ipv4/fou_core.c +++ b/net/ipv4/fou_core.c @@ -228,21 +228,27 @@ drop: return 0; } +static const struct net_offload *fou_gro_ops(const struct sock *sk, + int proto) +{ + const struct net_offload __rcu **offloads; + + /* FOU doesn't allow IPv4 on IPv6 sockets. */ + offloads = sk->sk_family == AF_INET6 ? inet6_offloads : inet_offloads; + return rcu_dereference(offloads[proto]); +} + static struct sk_buff *fou_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { - const struct net_offload __rcu **offloads; struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; struct sk_buff *pp = NULL; - u8 proto; if (!fou) goto out; - proto = fou->protocol; - /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel * header to the outer L3 tunnel header, or we are simply @@ -254,8 +260,7 @@ static struct sk_buff *fou_gro_receive(struct sock *sk, /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? 
inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, fou->protocol); if (!ops || !ops->callbacks.gro_receive) goto out; @@ -268,10 +273,8 @@ out: static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { - const struct net_offload __rcu **offloads; struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; - u8 proto; int err; if (!fou) { @@ -279,10 +282,7 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, goto out; } - proto = fou->protocol; - - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, fou->protocol); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) { err = -ENOSYS; goto out; @@ -323,7 +323,6 @@ static struct sk_buff *gue_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { - const struct net_offload __rcu **offloads; const struct net_offload *ops; struct sk_buff *pp = NULL; struct sk_buff *p; @@ -450,8 +449,7 @@ next_proto: /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, proto); if (!ops || !ops->callbacks.gro_receive) goto out; @@ -467,7 +465,6 @@ out: static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); - const struct net_offload __rcu **offloads; const struct net_offload *ops; unsigned int guehlen = 0; u8 proto; @@ -494,8 +491,7 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) return err; } - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, proto); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c index 3d9614609b2d..7a99639204b1 100644 --- a/net/ipv4/fou_nl.c +++ b/net/ipv4/fou_nl.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/fou.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include <net/netlink.h> #include <net/genetlink.h> @@ -18,9 +19,9 @@ const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1] = { [FOU_ATTR_TYPE] = { .type = NLA_U8, }, [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, }, - [FOU_ATTR_LOCAL_V6] = { .len = 16, }, + [FOU_ATTR_LOCAL_V6] = NLA_POLICY_EXACT_LEN(16), [FOU_ATTR_PEER_V4] = { .type = NLA_U32, }, - [FOU_ATTR_PEER_V6] = { .len = 16, }, + [FOU_ATTR_PEER_V6] = NLA_POLICY_EXACT_LEN(16), [FOU_ATTR_PEER_PORT] = { .type = NLA_BE16, }, [FOU_ATTR_IFINDEX] = { .type = NLA_S32, }, }; diff --git a/net/ipv4/fou_nl.h b/net/ipv4/fou_nl.h index 63a6c4ed803d..438342dc8507 100644 --- a/net/ipv4/fou_nl.h +++ b/net/ipv4/fou_nl.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/fou.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_FOU_GEN_H #define _LINUX_FOU_GEN_H diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 2ffe73ea644f..4abbec2f47ef 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -72,6 +72,7 @@ #include <linux/string.h> #include <linux/netfilter_ipv4.h> #include <linux/slab.h> +#include <net/flow.h> #include <net/snmp.h> #include <net/ip.h> #include <net/route.h> @@ 
-318,17 +319,17 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, return true; /* No rate limit on loopback */ - dev = dst_dev(dst); + rcu_read_lock(); + dev = dst_dev_rcu(dst); if (dev && (dev->flags & IFF_LOOPBACK)) goto out; - rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, l3mdev_master_ifindex_rcu(dev)); rc = inet_peer_xrlim_allow(peer, READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); - rcu_read_unlock(); out: + rcu_read_unlock(); if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); else @@ -444,7 +445,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.saddr = saddr; fl4.flowi4_mark = mark; fl4.flowi4_uid = sock_net_uid(net, NULL); - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))); + fl4.flowi4_dscp = ip4h_dscp(ip_hdr(skb)); fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); @@ -495,7 +496,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, fl4->saddr = saddr; fl4->flowi4_mark = mark; fl4->flowi4_uid = sock_net_uid(net, NULL); - fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4->flowi4_dscp = dscp; fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; @@ -544,14 +545,15 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, goto relookup_failed; } /* Ugh! */ - orefdst = skb_in->_skb_refdst; /* save old refdst */ - skb_dst_set(skb_in, NULL); + orefdst = skb_dstref_steal(skb_in); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, dscp, rt2->dst.dev) ? -EINVAL : 0; dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); - skb_in->_skb_refdst = orefdst; /* restore old refdst */ + /* steal dst entry from skb_in, don't drop refcnt */ + skb_dstref_steal(skb_in); + skb_dstref_restore(skb_in, orefdst); } if (err) @@ -580,6 +582,185 @@ relookup_failed: return ERR_PTR(err); } +struct icmp_ext_iio_addr4_subobj { + __be16 afi; + __be16 reserved; + __be32 addr4; +}; + +static unsigned int icmp_ext_iio_len(void) +{ + return sizeof(struct icmp_extobj_hdr) + + /* ifIndex */ + sizeof(__be32) + + /* Interface Address Sub-Object */ + sizeof(struct icmp_ext_iio_addr4_subobj) + + /* Interface Name Sub-Object. Length must be a multiple of 4 + * bytes. + */ + ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) + + /* MTU */ + sizeof(__be32); +} + +static unsigned int icmp_ext_max_len(u8 ext_objs) +{ + unsigned int ext_max_len; + + ext_max_len = sizeof(struct icmp_ext_hdr); + + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) + ext_max_len += icmp_ext_iio_len(); + + return ext_max_len; +} + +static __be32 icmp_ext_iio_addr4_find(const struct net_device *dev) +{ + struct in_device *in_dev; + struct in_ifaddr *ifa; + + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) + return 0; + + /* It is unclear from RFC 5837 which IP address should be chosen, but + * it makes sense to choose a global unicast address. + */ + in_dev_for_each_ifa_rcu(ifa, in_dev) { + if (READ_ONCE(ifa->ifa_flags) & IFA_F_SECONDARY) + continue; + if (ifa->ifa_scope != RT_SCOPE_UNIVERSE || + ipv4_is_multicast(ifa->ifa_address)) + continue; + return ifa->ifa_address; + } + + return 0; +} + +static void icmp_ext_iio_iif_append(struct net *net, struct sk_buff *skb, + int iif) +{ + struct icmp_ext_iio_name_subobj *name_subobj; + struct icmp_extobj_hdr *objh; + struct net_device *dev; + __be32 data; + + if (!iif) + return; + + /* Add the fields in the order specified by RFC 5837. 
*/ + objh = skb_put(skb, sizeof(*objh)); + objh->class_num = ICMP_EXT_OBJ_CLASS_IIO; + objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF); + + data = htonl(iif); + skb_put_data(skb, &data, sizeof(__be32)); + objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, iif); + if (!dev) + goto out; + + data = icmp_ext_iio_addr4_find(dev); + if (data) { + struct icmp_ext_iio_addr4_subobj *addr4_subobj; + + addr4_subobj = skb_put_zero(skb, sizeof(*addr4_subobj)); + addr4_subobj->afi = htons(ICMP_AFI_IP); + addr4_subobj->addr4 = data; + objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR; + } + + name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4)); + name_subobj->len = ALIGN(sizeof(*name_subobj), 4); + netdev_copy_name(dev, name_subobj->name); + objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME; + + data = htonl(READ_ONCE(dev->mtu)); + skb_put_data(skb, &data, sizeof(__be32)); + objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU; + +out: + rcu_read_unlock(); + objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh); +} + +static void icmp_ext_objs_append(struct net *net, struct sk_buff *skb, + u8 ext_objs, int iif) +{ + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) + icmp_ext_iio_iif_append(net, skb, iif); +} + +static struct sk_buff * +icmp_ext_append(struct net *net, struct sk_buff *skb_in, struct icmphdr *icmph, + unsigned int room, int iif) +{ + unsigned int payload_len, ext_max_len, ext_len; + struct icmp_ext_hdr *ext_hdr; + struct sk_buff *skb; + u8 ext_objs; + int nhoff; + + switch (icmph->type) { + case ICMP_DEST_UNREACH: + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + break; + default: + return NULL; + } + + ext_objs = READ_ONCE(net->ipv4.sysctl_icmp_errors_extension_mask); + if (!ext_objs) + return NULL; + + ext_max_len = icmp_ext_max_len(ext_objs); + if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room) + return NULL; + + skb = skb_clone(skb_in, GFP_ATOMIC); + if (!skb) + return NULL; + + nhoff = skb_network_offset(skb); + payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN); + + if (!pskb_network_may_pull(skb, payload_len)) + goto free_skb; + + if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) || + __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false)) + goto free_skb; + + if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC)) + goto free_skb; + + ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr)); + ext_hdr->version = ICMP_EXT_VERSION_2; + + icmp_ext_objs_append(net, skb, ext_objs, iif); + + /* Do not send an empty extension structure. */ + ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr; + if (ext_len == sizeof(*ext_hdr)) + goto free_skb; + + ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len); + /* The length of the original datagram in 32-bit words (RFC 4884). 
*/ + icmph->un.reserved[1] = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u32); + + return skb; + +free_skb: + consume_skb(skb); + return NULL; +} + /* * Send an ICMP message in response to a situation * @@ -592,13 +773,14 @@ relookup_failed: */ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, - const struct ip_options *opt) + const struct inet_skb_parm *parm) { struct iphdr *iph; int room; struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); bool apply_ratelimit = false; + struct sk_buff *ext_skb; struct ipcm_cookie ipc; struct flowi4 fl4; __be32 saddr; @@ -708,7 +890,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, rcu_read_lock(); if (rt_is_input_route(rt) && READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)) - dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); + dev = dev_get_by_index_rcu(net, parm->iif ? parm->iif : + inet_iif(skb_in)); if (dev) saddr = inet_select_addr(dev, iph->saddr, @@ -723,7 +906,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt)) + if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, + &parm->opt)) goto out_unlock; @@ -766,7 +950,12 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, if (room <= (int)sizeof(struct iphdr)) goto ende; - icmp_param.data_len = skb_in->len - icmp_param.offset; + ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room, + parm->iif); + if (ext_skb) + icmp_param.skb = ext_skb; + + icmp_param.data_len = icmp_param.skb->len - icmp_param.offset; if (icmp_param.data_len > room) icmp_param.data_len = room; icmp_param.head_len = sizeof(struct icmphdr); @@ -781,6 +970,9 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, trace_icmp_send(skb_in, type, code); icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt); + + if (ext_skb) + consume_skb(ext_skb); ende: ip_rt_put(rt); out_unlock: @@ -797,14 +989,16 @@ EXPORT_SYMBOL(__icmp_send); void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct sk_buff *cloned_skb = NULL; - struct ip_options opts = { 0 }; enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; + struct inet_skb_parm parm; struct nf_conn *ct; __be32 orig_ip; + memset(&parm, 0, sizeof(parm)); ct = nf_ct_get(skb_in, &ctinfo); - if (!ct || !(ct->status & IPS_SRC_NAT)) { - __icmp_send(skb_in, type, code, info, &opts); + if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { + __icmp_send(skb_in, type, code, info, &parm); return; } @@ -818,8 +1012,9 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) goto out; orig_ip = ip_hdr(skb_in)->saddr; - ip_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.ip; - __icmp_send(skb_in, type, code, info, &opts); + dir = CTINFO2DIR(ctinfo); + ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip; + __icmp_send(skb_in, type, code, info, &parm); ip_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); @@ -1495,6 +1690,7 @@ static int __net_init icmp_sk_init(struct net *net) net->ipv4.sysctl_icmp_ratelimit = 1 * HZ; net->ipv4.sysctl_icmp_ratemask = 0x1818; net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr = 0; + net->ipv4.sysctl_icmp_errors_extension_mask = 0; net->ipv4.sysctl_icmp_msgs_per_sec = 1000; net->ipv4.sysctl_icmp_msgs_burst = 50; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 1e2df51427fe..97d57c52b9ad 
100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -423,7 +423,7 @@ success: } static inline int sk_reuseport_match(struct inet_bind_bucket *tb, - struct sock *sk) + const struct sock *sk) { if (tb->fastreuseport <= 0) return 0; @@ -453,8 +453,9 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, ipv6_only_sock(sk), true, false); } -void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, - struct sock *sk) +void inet_csk_update_fastreuse(const struct sock *sk, + struct inet_bind_bucket *tb, + struct inet_bind2_bucket *tb2) { bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; @@ -501,6 +502,9 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, tb->fastreuseport = 0; } } + + tb2->fastreuse = tb->fastreuse; + tb2->fastreuseport = tb->fastreuseport; } /* Obtain a reference to a local port for the given sock, @@ -582,7 +586,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) } success: - inet_csk_update_fastreuse(tb, sk); + inet_csk_update_fastreuse(sk, tb, tb2); if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, tb2, port); @@ -706,44 +710,18 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) spin_unlock_bh(&queue->fastopenq.lock); } -out: release_sock(sk); - if (newsk && mem_cgroup_sockets_enabled) { - gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL; - int amt = 0; - /* atomically get the memory usage, set and charge the - * newsk->sk_memcg. - */ - lock_sock(newsk); - - mem_cgroup_sk_alloc(newsk); - if (newsk->sk_memcg) { - /* The socket has not been accepted yet, no need - * to look at newsk->sk_wmem_queued. - */ - amt = sk_mem_pages(newsk->sk_forward_alloc + - atomic_read(&newsk->sk_rmem_alloc)); - } - - if (amt) - mem_cgroup_charge_skmem(newsk->sk_memcg, amt, gfp); - kmem_cache_charge(newsk, gfp); - - release_sock(newsk); - } if (req) reqsk_put(req); - if (newsk) - inet_init_csk_locks(newsk); - + inet_init_csk_locks(newsk); return newsk; + out_err: - newsk = NULL; - req = NULL; + release_sock(sk); arg->err = error; - goto out; + return NULL; } EXPORT_SYMBOL(inet_csk_accept); @@ -759,9 +737,9 @@ void inet_csk_init_xmit_timers(struct sock *sk, { struct inet_connection_sock *icsk = inet_csk(sk); - timer_setup(&icsk->icsk_retransmit_timer, retransmit_handler, 0); + timer_setup(&sk->tcp_retransmit_timer, retransmit_handler, 0); timer_setup(&icsk->icsk_delack_timer, delack_handler, 0); - timer_setup(&sk->sk_timer, keepalive_handler, 0); + timer_setup(&icsk->icsk_keepalive_timer, keepalive_handler, 0); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } @@ -772,9 +750,9 @@ void inet_csk_clear_xmit_timers(struct sock *sk) smp_store_release(&icsk->icsk_pending, 0); smp_store_release(&icsk->icsk_ack.pending, 0); - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer(sk, &sk->tcp_retransmit_timer); sk_stop_timer(sk, &icsk->icsk_delack_timer); - sk_stop_timer(sk, &sk->sk_timer); + sk_stop_timer(sk, &icsk->icsk_keepalive_timer); } void inet_csk_clear_xmit_timers_sync(struct sock *sk) @@ -787,9 +765,9 @@ void inet_csk_clear_xmit_timers_sync(struct sock *sk) smp_store_release(&icsk->icsk_pending, 0); smp_store_release(&icsk->icsk_ack.pending, 0); - sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer_sync(sk, &sk->tcp_retransmit_timer); sk_stop_timer_sync(sk, &icsk->icsk_delack_timer); - sk_stop_timer_sync(sk, &sk->sk_timer); + sk_stop_timer_sync(sk, &icsk->icsk_keepalive_timer); } struct dst_entry *inet_csk_route_req(const struct sock *sk, @@ -907,7 +885,6 @@ 
reqsk_alloc_noprof(const struct request_sock_ops *ops, struct sock *sk_listener, sk_tx_queue_clear(req_to_sk(req)); req->saved_syn = NULL; req->syncookie = 0; - req->timeout = 0; req->num_timeout = 0; req->num_retrans = 0; req->sk = NULL; @@ -935,7 +912,6 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, ireq->ireq_state = TCP_NEW_SYN_RECV; write_pnet(&ireq->ireq_net, sock_net(sk_listener)); ireq->ireq_family = sk_listener->sk_family; - req->timeout = TCP_TIMEOUT_INIT; } return req; @@ -1118,16 +1094,18 @@ static void reqsk_timer_handler(struct timer_list *t) young <<= 1; } } + syn_ack_recalc(req, max_syn_ack_retries, READ_ONCE(queue->rskq_defer_accept), &expire, &resend); - req->rsk_ops->syn_ack_timeout(req); + tcp_syn_ack_timeout(req); + if (!expire && (!resend || !tcp_rtx_synack(sk_listener, req) || inet_rsk(req)->acked)) { if (req->num_timeout++ == 0) atomic_dec(&queue->young); - mod_timer(&req->rsk_timer, jiffies + reqsk_timeout(req, TCP_RTO_MAX)); + mod_timer(&req->rsk_timer, jiffies + tcp_reqsk_timeout(req)); if (!nreq) return; @@ -1164,8 +1142,7 @@ drop: reqsk_put(oreq); } -static bool reqsk_queue_hash_req(struct request_sock *req, - unsigned long timeout) +static bool reqsk_queue_hash_req(struct request_sock *req) { bool found_dup_sk = false; @@ -1173,8 +1150,9 @@ static bool reqsk_queue_hash_req(struct request_sock *req, return false; /* The timer needs to be setup after a successful insertion. */ + req->timeout = tcp_timeout_init((struct sock *)req); timer_setup(&req->rsk_timer, reqsk_timer_handler, TIMER_PINNED); - mod_timer(&req->rsk_timer, jiffies + timeout); + mod_timer(&req->rsk_timer, jiffies + req->timeout); /* before letting lookups find us, make sure all req fields * are committed to memory and refcnt initialized. @@ -1184,10 +1162,9 @@ static bool reqsk_queue_hash_req(struct request_sock *req, return true; } -bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - unsigned long timeout) +bool inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req) { - if (!reqsk_queue_hash_req(req, timeout)) + if (!reqsk_queue_hash_req(req)) return false; inet_csk_reqsk_queue_added(sk); @@ -1297,12 +1274,19 @@ void inet_csk_destroy_sock(struct sock *sk) xfrm_sk_free_policy(sk); - this_cpu_dec(*sk->sk_prot->orphan_count); + tcp_orphan_count_dec(); sock_put(sk); } EXPORT_SYMBOL(inet_csk_destroy_sock); +void inet_csk_prepare_for_destroy_sock(struct sock *sk) +{ + /* The below has to be done to allow calling inet_csk_destroy_sock */ + sock_set_flag(sk, SOCK_DEAD); + tcp_orphan_count_inc(); +} + /* This function allows to force a closure of a socket after the call to * tcp_create_openreq_child(). 
*/ @@ -1370,7 +1354,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, sock_orphan(child); - this_cpu_inc(*sk->sk_prot->orphan_count); + tcp_orphan_count_inc(); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 2fa53b16fe77..3f5b1418a610 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -20,9 +20,6 @@ #include <net/ipv6.h> #include <net/inet_common.h> #include <net/inet_connection_sock.h> -#include <net/inet_hashtables.h> -#include <net/inet_timewait_sock.h> -#include <net/inet6_hashtables.h> #include <net/bpf_sk_storage.h> #include <net/netlink.h> @@ -74,54 +71,29 @@ static void inet_diag_unlock_handler(const struct inet_diag_handler *handler) void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) { - r->idiag_family = sk->sk_family; + r->idiag_family = READ_ONCE(sk->sk_family); - r->id.idiag_sport = htons(sk->sk_num); - r->id.idiag_dport = sk->sk_dport; - r->id.idiag_if = sk->sk_bound_dev_if; + r->id.idiag_sport = htons(READ_ONCE(sk->sk_num)); + r->id.idiag_dport = READ_ONCE(sk->sk_dport); + r->id.idiag_if = READ_ONCE(sk->sk_bound_dev_if); sock_diag_save_cookie(sk, r->id.idiag_cookie); #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { - *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr; - *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr; + if (r->idiag_family == AF_INET6) { + data_race(*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr); + data_race(*(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr); } else #endif { memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); - r->id.idiag_src[0] = sk->sk_rcv_saddr; - r->id.idiag_dst[0] = sk->sk_daddr; + r->id.idiag_src[0] = READ_ONCE(sk->sk_rcv_saddr); + r->id.idiag_dst[0] = READ_ONCE(sk->sk_daddr); } } EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill); -static size_t inet_sk_attr_size(struct sock *sk, - const struct inet_diag_req_v2 *req, - bool net_admin) -{ - const struct inet_diag_handler *handler; - size_t aux = 0; - - rcu_read_lock(); - handler = rcu_dereference(inet_diag_table[req->sdiag_protocol]); - DEBUG_NET_WARN_ON_ONCE(!handler); - if (handler && handler->idiag_get_aux_size) - aux = handler->idiag_get_aux_size(sk, net_admin); - rcu_read_unlock(); - - return nla_total_size(sizeof(struct tcp_info)) - + nla_total_size(sizeof(struct inet_diag_msg)) - + inet_diag_msg_attrs_size() - + nla_total_size(sizeof(struct inet_diag_meminfo)) - + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) - + nla_total_size(TCP_CA_NAME_MAX) - + nla_total_size(sizeof(struct tcpvegas_info)) - + aux - + 64; -} - int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, struct user_namespace *user_ns, @@ -313,19 +285,19 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; - r->idiag_retrans = icsk->icsk_retransmits; + r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits); r->idiag_expires = - jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); + jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies); } else if (icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; - r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out); r->idiag_expires = - 
jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); - } else if (timer_pending(&sk->sk_timer)) { + jiffies_delta_to_msecs(tcp_timeout_expires(sk) - jiffies); + } else if (timer_pending(&icsk->icsk_keepalive_timer)) { r->idiag_timer = 2; - r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out); r->idiag_expires = - jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies); + jiffies_delta_to_msecs(icsk->icsk_keepalive_timer.expires - jiffies); } if ((ext & (1 << (INET_DIAG_INFO - 1))) && handler->idiag_info_size) { @@ -422,183 +394,6 @@ errout: } EXPORT_SYMBOL_GPL(inet_sk_diag_fill); -static int inet_twsk_diag_fill(struct sock *sk, - struct sk_buff *skb, - struct netlink_callback *cb, - u16 nlmsg_flags, bool net_admin) -{ - struct inet_timewait_sock *tw = inet_twsk(sk); - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, - sizeof(*r), nlmsg_flags); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - BUG_ON(tw->tw_state != TCP_TIME_WAIT); - - inet_diag_msg_common_fill(r, sk); - r->idiag_retrans = 0; - - r->idiag_state = READ_ONCE(tw->tw_substate); - r->idiag_timer = 3; - tmo = tw->tw_timer.expires - jiffies; - r->idiag_expires = jiffies_delta_to_msecs(tmo); - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = 0; - r->idiag_inode = 0; - - if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, - tw->tw_mark)) { - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; - } - - nlmsg_end(skb, nlh); - return 0; -} - -static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, - struct netlink_callback *cb, - u16 nlmsg_flags, bool net_admin) -{ - struct request_sock *reqsk = inet_reqsk(sk); - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - inet_diag_msg_common_fill(r, sk); - r->idiag_state = TCP_SYN_RECV; - r->idiag_timer = 1; - r->idiag_retrans = reqsk->num_retrans; - - BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != - offsetof(struct sock, sk_cookie)); - - tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; - r->idiag_expires = jiffies_delta_to_msecs(tmo); - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = 0; - r->idiag_inode = 0; - - if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, - inet_rsk(reqsk)->ir_mark)) { - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; - } - - nlmsg_end(skb, nlh); - return 0; -} - -static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - u16 nlmsg_flags, bool net_admin) -{ - if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); - - if (sk->sk_state == TCP_NEW_SYN_RECV) - return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); - - return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags, - net_admin); -} - -struct sock *inet_diag_find_one_icsk(struct net *net, - struct inet_hashinfo *hashinfo, - const struct inet_diag_req_v2 *req) -{ - struct sock *sk; - - rcu_read_lock(); - if (req->sdiag_family == AF_INET) - sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], - req->id.idiag_dport, req->id.idiag_src[0], - req->id.idiag_sport, req->id.idiag_if); -#if IS_ENABLED(CONFIG_IPV6) - else if (req->sdiag_family == AF_INET6) { - if (ipv6_addr_v4mapped((struct 
in6_addr *)req->id.idiag_dst) && - ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) - sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3], - req->id.idiag_dport, req->id.idiag_src[3], - req->id.idiag_sport, req->id.idiag_if); - else - sk = inet6_lookup(net, hashinfo, NULL, 0, - (struct in6_addr *)req->id.idiag_dst, - req->id.idiag_dport, - (struct in6_addr *)req->id.idiag_src, - req->id.idiag_sport, - req->id.idiag_if); - } -#endif - else { - rcu_read_unlock(); - return ERR_PTR(-EINVAL); - } - rcu_read_unlock(); - if (!sk) - return ERR_PTR(-ENOENT); - - if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { - sock_gen_put(sk); - return ERR_PTR(-ENOENT); - } - - return sk; -} -EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk); - -int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *req) -{ - struct sk_buff *in_skb = cb->skb; - bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); - struct net *net = sock_net(in_skb->sk); - struct sk_buff *rep; - struct sock *sk; - int err; - - sk = inet_diag_find_one_icsk(net, hashinfo, req); - if (IS_ERR(sk)) - return PTR_ERR(sk); - - rep = nlmsg_new(inet_sk_attr_size(sk, req, net_admin), GFP_KERNEL); - if (!rep) { - err = -ENOMEM; - goto out; - } - - err = sk_diag_fill(sk, rep, cb, req, 0, net_admin); - if (err < 0) { - WARN_ON(err == -EMSGSIZE); - nlmsg_free(rep); - goto out; - } - err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); - -out: - if (sk) - sock_gen_put(sk); - - return err; -} -EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); - static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, const struct nlmsghdr *nlh, int hdrlen, @@ -785,7 +580,7 @@ static void entry_fill_addrs(struct inet_diag_entry *entry, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { + if (entry->family == AF_INET6) { entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32; entry->daddr = sk->sk_v6_daddr.s6_addr32; } else @@ -796,31 +591,36 @@ static void entry_fill_addrs(struct inet_diag_entry *entry, } } -int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) +int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk) { - struct inet_sock *inet = inet_sk(sk); + const struct nlattr *bc = cb_data->inet_diag_nla_bc; + const struct inet_sock *inet = inet_sk(sk); struct inet_diag_entry entry; if (!bc) return 1; - entry.family = sk->sk_family; + entry.family = READ_ONCE(sk->sk_family); entry_fill_addrs(&entry, sk); - entry.sport = inet->inet_num; - entry.dport = ntohs(inet->inet_dport); - entry.ifindex = sk->sk_bound_dev_if; - entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; - if (sk_fullsock(sk)) - entry.mark = READ_ONCE(sk->sk_mark); - else if (sk->sk_state == TCP_NEW_SYN_RECV) - entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; - else if (sk->sk_state == TCP_TIME_WAIT) - entry.mark = inet_twsk(sk)->tw_mark; - else - entry.mark = 0; + entry.sport = READ_ONCE(inet->inet_num); + entry.dport = ntohs(READ_ONCE(inet->inet_dport)); + entry.ifindex = READ_ONCE(sk->sk_bound_dev_if); + if (cb_data->userlocks_needed) + entry.userlocks = sk_fullsock(sk) ? 
READ_ONCE(sk->sk_userlocks) : 0; + if (cb_data->mark_needed) { + if (sk_fullsock(sk)) + entry.mark = READ_ONCE(sk->sk_mark); + else if (sk->sk_state == TCP_NEW_SYN_RECV) + entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; + else if (sk->sk_state == TCP_TIME_WAIT) + entry.mark = inet_twsk(sk)->tw_mark; + else + entry.mark = 0; + } #ifdef CONFIG_SOCK_CGROUP_DATA - entry.cgroup_id = sk_fullsock(sk) ? - cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0; + if (cb_data->cgroup_needed) + entry.cgroup_id = sk_fullsock(sk) ? + cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0; #endif return inet_diag_bc_run(bc, &entry); @@ -920,16 +720,21 @@ static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len, } #endif -static int inet_diag_bc_audit(const struct nlattr *attr, +static int inet_diag_bc_audit(struct inet_diag_dump_data *cb_data, const struct sk_buff *skb) { - bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); + const struct nlattr *attr = cb_data->inet_diag_nla_bc; const void *bytecode, *bc; int bytecode_len, len; + bool net_admin; + + if (!attr) + return 0; - if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op)) + if (nla_len(attr) < sizeof(struct inet_diag_bc_op)) return -EINVAL; + net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); bytecode = bc = nla_data(attr); len = bytecode_len = nla_len(attr); @@ -961,14 +766,18 @@ static int inet_diag_bc_audit(const struct nlattr *attr, return -EPERM; if (!valid_markcond(bc, len, &min_len)) return -EINVAL; + cb_data->mark_needed = true; break; #ifdef CONFIG_SOCK_CGROUP_DATA case INET_DIAG_BC_CGROUP_COND: if (!valid_cgroupcond(bc, len, &min_len)) return -EINVAL; + cb_data->cgroup_needed = true; break; #endif case INET_DIAG_BC_AUTO: + cb_data->userlocks_needed = true; + fallthrough; case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: break; @@ -992,280 +801,6 @@ static int inet_diag_bc_audit(const struct nlattr *attr, return len == 0 ? 
0 : -EINVAL; } -static void twsk_build_assert(void) -{ - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != - offsetof(struct sock, sk_family)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) != - offsetof(struct inet_sock, inet_num)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) != - offsetof(struct inet_sock, inet_dport)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) != - offsetof(struct inet_sock, inet_rcv_saddr)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) != - offsetof(struct inet_sock, inet_daddr)); - -#if IS_ENABLED(CONFIG_IPV6) - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) != - offsetof(struct sock, sk_v6_rcv_saddr)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) != - offsetof(struct sock, sk_v6_daddr)); -#endif -} - -void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r) -{ - bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); - struct inet_diag_dump_data *cb_data = cb->data; - struct net *net = sock_net(skb->sk); - u32 idiag_states = r->idiag_states; - int i, num, s_i, s_num; - struct nlattr *bc; - struct sock *sk; - - bc = cb_data->inet_diag_nla_bc; - if (idiag_states & TCPF_SYN_RECV) - idiag_states |= TCPF_NEW_SYN_RECV; - s_i = cb->args[1]; - s_num = num = cb->args[2]; - - if (cb->args[0] == 0) { - if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) - goto skip_listen_ht; - - for (i = s_i; i <= hashinfo->lhash2_mask; i++) { - struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; - - num = 0; - ilb = &hashinfo->lhash2[i]; - - if (hlist_nulls_empty(&ilb->nulls_head)) { - s_num = 0; - continue; - } - spin_lock(&ilb->lock); - sk_nulls_for_each(sk, node, &ilb->nulls_head) { - struct inet_sock *inet = inet_sk(sk); - - if (!net_eq(sock_net(sk), net)) - continue; - - if (num < s_num) { - num++; - continue; - } - - if (r->sdiag_family != AF_UNSPEC && - sk->sk_family != r->sdiag_family) - goto next_listen; - - if (r->id.idiag_sport != inet->inet_sport && - r->id.idiag_sport) - goto next_listen; - - if (!inet_diag_bc_sk(bc, sk)) - goto next_listen; - - if (inet_sk_diag_fill(sk, inet_csk(sk), skb, - cb, r, NLM_F_MULTI, - net_admin) < 0) { - spin_unlock(&ilb->lock); - goto done; - } - -next_listen: - ++num; - } - spin_unlock(&ilb->lock); - - s_num = 0; - } -skip_listen_ht: - cb->args[0] = 1; - s_i = num = s_num = 0; - } - -/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets - * with bh disabled. - */ -#define SKARR_SZ 16 - - /* Dump bound but inactive (not listening, connecting, etc.) 
sockets */ - if (cb->args[0] == 1) { - if (!(idiag_states & TCPF_BOUND_INACTIVE)) - goto skip_bind_ht; - - for (i = s_i; i < hashinfo->bhash_size; i++) { - struct inet_bind_hashbucket *ibb; - struct inet_bind2_bucket *tb2; - struct sock *sk_arr[SKARR_SZ]; - int num_arr[SKARR_SZ]; - int idx, accum, res; - -resume_bind_walk: - num = 0; - accum = 0; - ibb = &hashinfo->bhash2[i]; - - if (hlist_empty(&ibb->chain)) { - s_num = 0; - continue; - } - spin_lock_bh(&ibb->lock); - inet_bind_bucket_for_each(tb2, &ibb->chain) { - if (!net_eq(ib2_net(tb2), net)) - continue; - - sk_for_each_bound(sk, &tb2->owners) { - struct inet_sock *inet = inet_sk(sk); - - if (num < s_num) - goto next_bind; - - if (sk->sk_state != TCP_CLOSE || - !inet->inet_num) - goto next_bind; - - if (r->sdiag_family != AF_UNSPEC && - r->sdiag_family != sk->sk_family) - goto next_bind; - - if (!inet_diag_bc_sk(bc, sk)) - goto next_bind; - - sock_hold(sk); - num_arr[accum] = num; - sk_arr[accum] = sk; - if (++accum == SKARR_SZ) - goto pause_bind_walk; -next_bind: - num++; - } - } -pause_bind_walk: - spin_unlock_bh(&ibb->lock); - - res = 0; - for (idx = 0; idx < accum; idx++) { - if (res >= 0) { - res = inet_sk_diag_fill(sk_arr[idx], - NULL, skb, cb, - r, NLM_F_MULTI, - net_admin); - if (res < 0) - num = num_arr[idx]; - } - sock_put(sk_arr[idx]); - } - if (res < 0) - goto done; - - cond_resched(); - - if (accum == SKARR_SZ) { - s_num = num + 1; - goto resume_bind_walk; - } - - s_num = 0; - } -skip_bind_ht: - cb->args[0] = 2; - s_i = num = s_num = 0; - } - - if (!(idiag_states & ~TCPF_LISTEN)) - goto out; - - for (i = s_i; i <= hashinfo->ehash_mask; i++) { - struct inet_ehash_bucket *head = &hashinfo->ehash[i]; - spinlock_t *lock = inet_ehash_lockp(hashinfo, i); - struct hlist_nulls_node *node; - struct sock *sk_arr[SKARR_SZ]; - int num_arr[SKARR_SZ]; - int idx, accum, res; - - if (hlist_nulls_empty(&head->chain)) - continue; - - if (i > s_i) - s_num = 0; - -next_chunk: - num = 0; - accum = 0; - spin_lock_bh(lock); - sk_nulls_for_each(sk, node, &head->chain) { - int state; - - if (!net_eq(sock_net(sk), net)) - continue; - if (num < s_num) - goto next_normal; - state = (sk->sk_state == TCP_TIME_WAIT) ? 
- READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state; - if (!(idiag_states & (1 << state))) - goto next_normal; - if (r->sdiag_family != AF_UNSPEC && - sk->sk_family != r->sdiag_family) - goto next_normal; - if (r->id.idiag_sport != htons(sk->sk_num) && - r->id.idiag_sport) - goto next_normal; - if (r->id.idiag_dport != sk->sk_dport && - r->id.idiag_dport) - goto next_normal; - twsk_build_assert(); - - if (!inet_diag_bc_sk(bc, sk)) - goto next_normal; - - if (!refcount_inc_not_zero(&sk->sk_refcnt)) - goto next_normal; - - num_arr[accum] = num; - sk_arr[accum] = sk; - if (++accum == SKARR_SZ) - break; -next_normal: - ++num; - } - spin_unlock_bh(lock); - res = 0; - for (idx = 0; idx < accum; idx++) { - if (res >= 0) { - res = sk_diag_fill(sk_arr[idx], skb, cb, r, - NLM_F_MULTI, net_admin); - if (res < 0) - num = num_arr[idx]; - } - sock_gen_put(sk_arr[idx]); - } - if (res < 0) - break; - cond_resched(); - if (accum == SKARR_SZ) { - s_num = num + 1; - goto next_chunk; - } - } - -done: - cb->args[1] = i; - cb->args[2] = num; -out: - ; -} -EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); - static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { @@ -1319,13 +854,10 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) kfree(cb_data); return err; } - nla = cb_data->inet_diag_nla_bc; - if (nla) { - err = inet_diag_bc_audit(nla, skb); - if (err) { - kfree(cb_data); - return err; - } + err = inet_diag_bc_audit(cb_data, skb); + if (err) { + kfree(cb_data); + return err; } nla = cb_data->inet_diag_nla_bpf_stgs; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 470ab17ceb51..025895eb6ec5 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -183,7 +183,7 @@ static void fqdir_work_fn(struct work_struct *work) rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); if (llist_add(&fqdir->free_list, &fqdir_free_list)) - queue_delayed_work(system_wq, &fqdir_free_work, HZ); + queue_delayed_work(system_percpu_wq, &fqdir_free_work, HZ); } int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ceeeec9b7290..f5826ec4bcaa 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -58,6 +58,14 @@ static u32 sk_ehashfn(const struct sock *sk) sk->sk_daddr, sk->sk_dport); } +static bool sk_is_connect_bind(const struct sock *sk) +{ + if (sk->sk_state == TCP_TIME_WAIT) + return inet_twsk(sk)->tw_connect_bind; + else + return sk->sk_userlocks & SOCK_CONNECT_BIND; +} + /* * Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. 
@@ -87,10 +95,22 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, */ void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) { + const struct inet_bind2_bucket *tb2; + if (hlist_empty(&tb->bhash2)) { hlist_del_rcu(&tb->node); kfree_rcu(tb, rcu); + return; + } + + if (tb->fastreuse == -1 && tb->fastreuseport == -1) + return; + hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) { + if (tb2->fastreuse != -1 || tb2->fastreuseport != -1) + return; } + tb->fastreuse = -1; + tb->fastreuseport = -1; } bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, @@ -121,6 +141,8 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, #else tb2->rcv_saddr = sk->sk_rcv_saddr; #endif + tb2->fastreuse = 0; + tb2->fastreuseport = 0; INIT_HLIST_HEAD(&tb2->owners); hlist_add_head(&tb2->node, &head->chain); hlist_add_head(&tb2->bhash_node, &tb->bhash2); @@ -143,11 +165,23 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, /* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) { + const struct sock *sk; + if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); __hlist_del(&tb->bhash_node); kmem_cache_free(cachep, tb); + return; + } + + if (tb->fastreuse == -1 && tb->fastreuseport == -1) + return; + sk_for_each_bound(sk, &tb->owners) { + if (!sk_is_connect_bind(sk)) + return; } + tb->fastreuse = -1; + tb->fastreuseport = -1; } static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, @@ -191,6 +225,7 @@ static void __inet_put_port(struct sock *sk) tb = inet_csk(sk)->icsk_bind_hash; inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; + sk->sk_userlocks &= ~SOCK_CONNECT_BIND; spin_lock(&head2->lock); if (inet_csk(sk)->icsk_bind2_hash) { @@ -277,7 +312,7 @@ bhash2_find: } } if (update_fastreuse) - inet_csk_update_fastreuse(tb, child); + inet_csk_update_fastreuse(child, tb, tb2); inet_bind_hash(child, tb, tb2, port); spin_unlock(&head2->lock); spin_unlock(&head->lock); @@ -425,19 +460,18 @@ struct sock *inet_lookup_run_sk_lookup(const struct net *net, } struct sock *__inet_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; + struct inet_hashinfo *hashinfo; struct sock *result = NULL; unsigned int hash2; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled) && - hashinfo == net->ipv4.tcp_death_row.hashinfo) { + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, saddr, sport, daddr, hnum, dif, inet_ehashfn); @@ -445,6 +479,7 @@ struct sock *__inet_lookup_listener(const struct net *net, goto done; } + hashinfo = net->ipv4.tcp_death_row.hashinfo; hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); @@ -490,21 +525,22 @@ void sock_edemux(struct sk_buff *skb) EXPORT_SYMBOL(sock_edemux); struct sock *__inet_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, - const __be32 saddr, const __be16 sport, - const __be32 daddr, const u16 hnum, - const int dif, const int sdif) + const __be32 saddr, const __be16 sport, + const __be32 daddr, const u16 hnum, + const int dif, const int sdif) { - INET_ADDR_COOKIE(acookie, saddr, daddr); 
const __portpair ports = INET_COMBINED_PORTS(sport, hnum); - struct sock *sk; + INET_ADDR_COOKIE(acookie, saddr, daddr); const struct hlist_nulls_node *node; - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); - unsigned int slot = hash & hashinfo->ehash_mask; - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + struct inet_ehash_bucket *head; + struct inet_hashinfo *hashinfo; + unsigned int hash, slot; + struct sock *sk; + + hashinfo = net->ipv4.tcp_death_row.hashinfo; + hash = inet_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash & hashinfo->ehash_mask; + head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { @@ -579,8 +615,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); - if (sk->sk_protocol == IPPROTO_TCP && - tcp_twsk_unique(sk, sk2, twp)) + if (tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; @@ -685,8 +720,11 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) spin_lock(lock); if (osk) { WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); - ret = sk_nulls_del_node_init_rcu(osk); - } else if (found_dup_sk) { + ret = sk_nulls_replace_node_init_rcu(osk, sk); + goto unlock; + } + + if (found_dup_sk) { *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); if (*found_dup_sk) ret = false; @@ -695,6 +733,7 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) if (ret) __sk_nulls_add_node_rcu(sk, list); +unlock: spin_unlock(lock); return ret; @@ -707,7 +746,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { - this_cpu_inc(*sk->sk_prot->orphan_count); + tcp_orphan_count_inc(); inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); @@ -739,15 +778,18 @@ static int inet_reuseport_add_sock(struct sock *sk, return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } -int __inet_hash(struct sock *sk, struct sock *osk) +int inet_hash(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; int err = 0; + if (sk->sk_state == TCP_CLOSE) + return 0; + if (sk->sk_state != TCP_LISTEN) { local_bh_disable(); - inet_ehash_nolisten(sk, osk, NULL); + inet_ehash_nolisten(sk, NULL, NULL); local_bh_enable(); return 0; } @@ -772,17 +814,7 @@ unlock: return err; } -EXPORT_IPV6_MOD(__inet_hash); - -int inet_hash(struct sock *sk) -{ - int err = 0; - - if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk, NULL); - - return err; -} +EXPORT_IPV6_MOD(inet_hash); void inet_unhash(struct sock *sk) { @@ -800,11 +832,6 @@ void inet_unhash(struct sock *sk) * avoid circular locking dependency on PREEMPT_RT. 
*/ spin_lock(&ilb2->lock); - if (sk_unhashed(sk)) { - spin_unlock(&ilb2->lock); - return; - } - if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_stop_listen_sock(sk); @@ -815,10 +842,6 @@ void inet_unhash(struct sock *sk) spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); - if (sk_unhashed(sk)) { - spin_unlock_bh(lock); - return; - } __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); @@ -966,6 +989,10 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, if (!tb2) { tb2 = new_tb2; inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); + if (sk_is_connect_bind(sk)) { + tb2->fastreuse = -1; + tb2->fastreuseport = -1; + } } inet_csk(sk)->icsk_bind2_hash = tb2; sk_add_bind_node(sk, &tb2->owners); @@ -1136,6 +1163,8 @@ ok: head2, tb, sk); if (!tb2) goto error; + tb2->fastreuse = -1; + tb2->fastreuseport = -1; } /* Here we want to add a little bit of randomness to the next source @@ -1148,6 +1177,7 @@ ok: /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port); + sk->sk_userlocks |= SOCK_CONNECT_BIND; if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 875ff923a8ed..d4c781a0667f 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -15,7 +15,8 @@ #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> - +#include <net/tcp.h> +#include <net/psp.h> /** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash @@ -74,7 +75,8 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) void inet_twsk_free(struct inet_timewait_sock *tw) { struct module *owner = tw->tw_prot->owner; - twsk_destructor((struct sock *)tw); + + tcp_twsk_destructor((struct sock *)tw); kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } @@ -86,12 +88,6 @@ void inet_twsk_put(struct inet_timewait_sock *tw) } EXPORT_SYMBOL_GPL(inet_twsk_put); -static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, - struct hlist_nulls_head *list) -{ - hlist_nulls_add_head_rcu(&tw->tw_node, list); -} - static void inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) { __inet_twsk_schedule(tw, timeo, false); @@ -111,13 +107,12 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, { const struct inet_sock *inet = inet_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); - struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash); spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); struct inet_bind_hashbucket *bhead, *bhead2; - /* Step 1: Put TW into bind hash. Original socket stays there too. - Note, that any socket with inet->num != 0 MUST be bound in - binding cache, even if it is closed. + /* Put TW into bind hash. Original socket stays there too. + * Note, that any socket with inet->num != 0 MUST be bound in + * binding cache, even if it is closed. 
*/ bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num, hashinfo->bhash_size)]; @@ -139,19 +134,6 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, spin_lock(lock); - /* Step 2: Hash TW into tcp ehash chain */ - inet_twsk_add_node_rcu(tw, &ehead->chain); - - /* Step 3: Remove SK from hash chain */ - if (__sk_nulls_del_node_init_rcu(sk)) - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); - - - /* Ensure above writes are committed into memory before updating the - * refcount. - * Provides ordering vs later refcount_inc(). - */ - smp_wmb(); /* tw_refcnt is set to 3 because we have : * - one reference for bhash chain. * - one reference for ehash chain. @@ -161,6 +143,15 @@ void inet_twsk_hashdance_schedule(struct inet_timewait_sock *tw, */ refcount_set(&tw->tw_refcnt, 3); + /* Ensure tw_refcnt has been set before tw is published. + * smp_wmb() provides the necessary memory barrier to enforce this + * ordering. + */ + smp_wmb(); + + hlist_nulls_replace_init_rcu(&sk->sk_nulls_node, &tw->tw_node); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); + inet_twsk_schedule(tw, timeo); spin_unlock(lock); @@ -206,10 +197,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); + tw->tw_connect_bind = !!(sk->sk_userlocks & SOCK_CONNECT_BIND); tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); timer_setup(&tw->tw_timer, tw_timer_handler, 0); +#ifdef CONFIG_SOCK_VALIDATE_XMIT + tw->tw_validate_xmit_skb = NULL; +#endif /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this @@ -218,6 +213,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, refcount_set(&tw->tw_refcnt, 0); __module_get(tw->tw_prot->owner); + psp_twsk_init(tw, sk); } return tw; @@ -329,13 +325,13 @@ restart: TCPF_NEW_SYN_RECV)) continue; - if (refcount_read(&sock_net(sk)->ns.count)) + if (check_net(sock_net(sk))) continue; if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt))) continue; - if (refcount_read(&sock_net(sk)->ns.count)) { + if (check_net(sock_net(sk))) { sock_gen_put(sk); goto restart; } diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b2584cce90ae..f7012479713b 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -476,14 +476,16 @@ out_fail: /* Process an incoming IP datagram fragment. */ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { - struct net_device *dev = skb->dev ? : skb_dst_dev(skb); - int vif = l3mdev_master_ifindex_rcu(dev); + struct net_device *dev; struct ipq *qp; + int vif; __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); /* Lookup (or create) queue header */ rcu_read_lock(); + dev = skb->dev ? 
: skb_dst_dev_rcu(skb); + vif = l3mdev_master_ifindex_rcu(dev); qp = ip_find(net, ip_hdr(skb), user, vif); if (qp) { int ret, refs = 0; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f5b9004d6938..761a53c6a89a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -28,6 +28,7 @@ #include <linux/etherdevice.h> #include <linux/if_ether.h> +#include <net/flow.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> @@ -44,7 +45,6 @@ #include <net/gre.h> #include <net/dst_metadata.h> #include <net/erspan.h> -#include <net/inet_dscp.h> /* Problems & solutions @@ -930,7 +930,7 @@ static int ipgre_open(struct net_device *dev) if (ipv4_is_multicast(t->parms.iph.daddr)) { struct flowi4 fl4 = { .flowi4_oif = t->parms.link, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(&t->parms.iph)), + .flowi4_dscp = ip4h_dscp(&t->parms.iph), .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_proto = IPPROTO_GRE, .saddr = t->parms.iph.saddr, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index fc323994b1fa..19d3141dad1f 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -141,6 +141,8 @@ #include <linux/mroute.h> #include <linux/netlink.h> #include <net/dst_metadata.h> +#include <net/udp.h> +#include <net/tcp.h> /* * Process Router Attention IP option (RFC 2113) @@ -263,10 +265,11 @@ int ip_local_deliver(struct sk_buff *skb) } EXPORT_SYMBOL(ip_local_deliver); -static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) +static inline enum skb_drop_reason +ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { - struct ip_options *opt; const struct iphdr *iph; + struct ip_options *opt; /* It looks as overkill, because not all IP options require packet mangling. @@ -277,7 +280,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) */ if (skb_cow(skb, skb_headroom(skb))) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS); - goto drop; + return SKB_DROP_REASON_NOMEM; } iph = ip_hdr(skb); @@ -286,7 +289,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) if (ip_options_compile(dev_net(dev), opt, skb)) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS); - goto drop; + return SKB_DROP_REASON_IP_INHDR; } if (unlikely(opt->srr)) { @@ -298,17 +301,15 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) net_info_ratelimited("source route option %pI4 -> %pI4\n", &iph->saddr, &iph->daddr); - goto drop; + return SKB_DROP_REASON_NOT_SPECIFIED; } } if (ip_options_rcv_srr(skb, dev)) - goto drop; + return SKB_DROP_REASON_NOT_SPECIFIED; } - return false; -drop: - return true; + return SKB_NOT_DROPPED_YET; } static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, @@ -318,8 +319,6 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, ip_hdr(hint)->tos == iph->tos; } -int tcp_v4_early_demux(struct sk_buff *skb); -int udp_v4_early_demux(struct sk_buff *skb); static int ip_rcv_finish_core(struct net *net, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) @@ -335,7 +334,6 @@ static int ip_rcv_finish_core(struct net *net, goto drop_error; } - drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && @@ -354,7 +352,6 @@ static int ip_rcv_finish_core(struct net *net, drop_reason = udp_v4_early_demux(skb); if (unlikely(drop_reason)) goto drop_error; - drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; /* must reload iph, skb->head might have changed */ 
iph = ip_hdr(skb); @@ -372,7 +369,6 @@ static int ip_rcv_finish_core(struct net *net, ip4h_dscp(iph), dev); if (unlikely(drop_reason)) goto drop_error; - drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; } else { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -391,8 +387,11 @@ static int ip_rcv_finish_core(struct net *net, } #endif - if (iph->ihl > 5 && ip_rcv_options(skb, dev)) - goto drop; + if (iph->ihl > 5) { + drop_reason = ip_rcv_options(skb, dev); + if (drop_reason) + goto drop; + } rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { @@ -587,9 +586,13 @@ static void ip_sublist_rcv_finish(struct list_head *head) } static struct sk_buff *ip_extract_route_hint(const struct net *net, - struct sk_buff *skb, int rt_type) + struct sk_buff *skb) { - if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST || + const struct iphdr *iph = ip_hdr(skb); + + if (fib4_has_custom_rules(net) || + ipv4_is_lbcast(iph->daddr) || + ipv4_is_zeronet(iph->daddr) || IPCB(skb)->flags & IPSKB_MULTIPATH) return NULL; @@ -618,8 +621,7 @@ static void ip_list_rcv_finish(struct net *net, struct list_head *head) dst = skb_dst(skb); if (curr_dst != dst) { - hint = ip_extract_route_hint(net, skb, - dst_rtable(dst)->rt_type); + hint = ip_extract_route_hint(net, skb); /* dispatch old sublist */ if (!list_empty(&sublist)) diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index e3321932bec0..be8815ce3ac2 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -615,14 +615,13 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev) } memcpy(&nexthop, &optptr[srrptr-1], 4); - orefdst = skb->_skb_refdst; - skb_dst_set(skb, NULL); + orefdst = skb_dstref_steal(skb); err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph), dev) ? -EINVAL : 0; rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { skb_dst_drop(skb); - skb->_skb_refdst = orefdst; + skb_dstref_restore(skb, orefdst); return -EINVAL; } refdst_drop(orefdst); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 84e7f8a2f50f..ff11d3a85a36 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -63,6 +63,7 @@ #include <linux/stat.h> #include <linux/init.h> +#include <net/flow.h> #include <net/snmp.h> #include <net/ip.h> #include <net/protocol.h> @@ -83,6 +84,7 @@ #include <linux/netfilter_bridge.h> #include <linux/netlink.h> #include <linux/tcp.h> +#include <net/psp.h> static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, @@ -485,7 +487,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, inet_sk_init_flowi4(inet, fl4); /* sctp_v4_xmit() uses its own DSCP value */ - fl4->flowi4_tos = tos & INET_DSCP_MASK; + fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times @@ -1664,8 +1666,10 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - if (orig_sk) + if (orig_sk) { skb_set_owner_edemux(nskb, (struct sock *)orig_sk); + psp_reply_set_decrypted(orig_sk, nskb); + } if (transmit_time) nskb->tstamp_type = SKB_CLOCK_MONOTONIC; if (txhash) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index aaeb5d16f0c9..158a30ae7c5f 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -568,20 +568,6 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb, return 0; } 
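The ip_rcv_options() and ip_rcv_finish_core() hunks above replace the old bool "should drop?" convention with propagating an enum skb_drop_reason: SKB_NOT_DROPPED_YET (value 0) means the packet survives, and any non-zero value names the precise drop cause for the caller to report. A minimal sketch of that calling convention, assuming kernel context; example_check_hdr() and example_rcv() are hypothetical names, not functions from this patch:

#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <net/dropreason.h>

/* Validate something about the packet; 0 (SKB_NOT_DROPPED_YET) means OK,
 * anything else is the reason to hand to the drop path. Sketch only.
 */
static enum skb_drop_reason example_check_hdr(struct sk_buff *skb)
{
	if (skb_cow(skb, skb_headroom(skb)))
		return SKB_DROP_REASON_NOMEM;

	return SKB_NOT_DROPPED_YET;
}

static int example_rcv(struct sk_buff *skb)
{
	enum skb_drop_reason reason;

	reason = example_check_hdr(skb);
	if (reason) {				/* any non-zero reason drops */
		kfree_skb_reason(skb, reason);
		return NET_RX_DROP;
	}
	return NET_RX_SUCCESS;
}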
-static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom) -{ - /* we must cap headroom to some upperlimit, else pskb_expand_head - * will overflow header offsets in skb_headers_offset_update(). - */ - static const unsigned int max_allowed = 512; - - if (headroom > max_allowed) - headroom = max_allowed; - - if (headroom > READ_ONCE(dev->needed_headroom)) - WRITE_ONCE(dev->needed_headroom, headroom); -} - void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto, int tunnel_hlen) { diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index cc9915543637..2e61ac137128 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -206,6 +206,9 @@ static int iptunnel_pmtud_build_icmp(struct sk_buff *skb, int mtu) if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct iphdr))) return -EINVAL; + if (skb_is_gso(skb)) + skb_gso_reset(skb); + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); @@ -300,6 +303,9 @@ static int iptunnel_pmtud_build_icmpv6(struct sk_buff *skb, int mtu) if (!pskb_may_pull(skb, ETH_HLEN + sizeof(struct ipv6hdr))) return -EINVAL; + if (skb_is_gso(skb)) + skb_gso_reset(skb); + skb_copy_bits(skb, skb_mac_offset(skb), &eh, ETH_HLEN); pskb_pull(skb, ETH_HLEN); skb_reset_network_header(skb); diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 22a7889876c1..019408d3ca2c 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1690,7 +1690,8 @@ static int __init ic_proto_name(char *name) *v = 0; if (kstrtou8(client_id, 0, dhcp_client_identifier)) pr_debug("DHCP: Invalid client identifier type\n"); - strncpy(dhcp_client_identifier + 1, v + 1, 251); + strscpy(dhcp_client_identifier + 1, v + 1, + sizeof(dhcp_client_identifier) - 1); *v = ','; } return 1; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 3e03af073a1c..ff95b1b9908e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -353,6 +353,30 @@ ipip_tunnel_ctl(struct net_device *dev, struct ip_tunnel_parm_kern *p, int cmd) return ip_tunnel_ctl(dev, p, cmd); } +static int ipip_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct ip_tunnel *tunnel = netdev_priv(ctx->dev); + const struct iphdr *tiph = &tunnel->parms.iph; + struct rtable *rt; + + rt = ip_route_output(dev_net(ctx->dev), tiph->daddr, 0, 0, 0, + RT_SCOPE_UNIVERSE); + if (IS_ERR(rt)) + return PTR_ERR(rt); + + path->type = DEV_PATH_TUN; + path->tun.src_v4.s_addr = tiph->saddr; + path->tun.dst_v4.s_addr = tiph->daddr; + path->tun.l3_proto = IPPROTO_IPIP; + path->dev = ctx->dev; + + ctx->dev = rt->dst.dev; + ip_rt_put(rt); + + return 0; +} + static const struct net_device_ops ipip_netdev_ops = { .ndo_init = ipip_tunnel_init, .ndo_uninit = ip_tunnel_uninit, @@ -362,6 +386,7 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip_tunnel_get_iflink, .ndo_tunnel_ctl = ipip_tunnel_ctl, + .ndo_fill_forward_path = ipip_fill_forward_path, }; #define IPIP_FEATURES (NETIF_F_SG | \ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e86a8a862c41..ca9eaee4c2ef 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -42,6 +42,7 @@ #include <linux/init.h> #include <linux/if_ether.h> #include <linux/slab.h> +#include <net/flow.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> @@ -1904,7 +1905,7 @@ static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt, return -1; } - encap += LL_RESERVED_SPACE(rt->dst.dev) 
+ rt->dst.header_len; + encap += LL_RESERVED_SPACE(dst_dev_rcu(&rt->dst)) + rt->dst.header_len; if (skb_cow(skb, encap)) { ip_rt_put(rt); @@ -1957,7 +1958,7 @@ static void ipmr_queue_fwd_xmit(struct net *net, struct mr_table *mrt, * result in receiving multiple packets. */ NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, - net, NULL, skb, skb->dev, rt->dst.dev, + net, NULL, skb, skb->dev, dst_dev_rcu(&rt->dst), ipmr_forward_finish); return; @@ -2120,7 +2121,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), + .flowi4_dscp = ip4h_dscp(iph), .flowi4_oif = (rt_is_output_route(rt) ? skb->dev->ifindex : 0), .flowi4_iif = (rt_is_output_route(rt) ? @@ -2301,7 +2302,7 @@ int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb) guard(rcu)(); - dev = rt->dst.dev; + dev = dst_dev_rcu(&rt->dst); if (IPCB(skb)->flags & IPSKB_FORWARDED) goto mc_output; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 0565f001120d..ce310eb779e0 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -11,10 +11,10 @@ #include <linux/skbuff.h> #include <linux/gfp.h> #include <linux/export.h> +#include <net/flow.h> #include <net/route.h> #include <net/xfrm.h> #include <net/ip.h> -#include <net/inet_dscp.h> #include <net/netfilter/nf_queue.h> /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ @@ -44,7 +44,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un */ fl4.daddr = iph->daddr; fl4.saddr = saddr; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev); fl4.flowi4_mark = skb->mark; @@ -65,7 +65,10 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && xfrm_decode_session(net, skb, flowi4_to_flowi(&fl4), AF_INET) == 0) { struct dst_entry *dst = skb_dst(skb); - skb_dst_set(skb, NULL); + /* ignore return value from skb_dstref_steal, xfrm_lookup takes + * care of dropping the refcnt if needed. + */ + skb_dstref_steal(skb); dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 2c438b140e88..7dc9772fe2d8 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -14,6 +14,7 @@ config NF_DEFRAG_IPV4 config IP_NF_IPTABLES_LEGACY tristate "Legacy IP tables support" depends on NETFILTER_XTABLES_LEGACY + depends on NETFILTER_XTABLES default m if NETFILTER_XTABLES_LEGACY help iptables is a legacy packet classifier. @@ -326,6 +327,7 @@ endif # IP_NF_IPTABLES config IP_NF_ARPTABLES tristate "Legacy ARPTABLES support" depends on NETFILTER_XTABLES_LEGACY + depends on NETFILTER_XTABLES default n help arptables is a legacy packet classifier. 
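A pattern that recurs across the ip_gre, ip_output, ipmr and netfilter hunks above: route lookups stop converting the header's dscp_t back into a raw TOS byte (flowi4_tos = inet_dscp_to_dsfield(...)) and instead carry the type-safe value straight into the flow key via flowi4_dscp. A minimal before/after sketch, assuming kernel context; example_fill_flow() is a hypothetical helper, not part of this patch:

#include <linux/ip.h>
#include <linux/string.h>
#include <net/flow.h>
#include <net/inet_dscp.h>
#include <net/ip.h>

static void example_fill_flow(struct flowi4 *fl4, const struct iphdr *iph)
{
	memset(fl4, 0, sizeof(*fl4));

	/* Before this series:
	 *	fl4->flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
	 * squashes the dscp_t back into an untyped dsfield byte.
	 */
	fl4->flowi4_dscp = ip4h_dscp(iph);	/* keep the dscp_t end to end */
	fl4->daddr = iph->daddr;
	fl4->saddr = iph->saddr;
}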
@@ -343,6 +345,7 @@ config IP_NF_ARPFILTER select IP_NF_ARPTABLES select NETFILTER_FAMILY_ARP depends on NETFILTER_XTABLES_LEGACY + depends on NETFILTER_XTABLES help ARP packet filtering defines a table `filter', which has a series of rules for simple ARP packet filtering at local input and diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index a27782d7653e..6d9bf5106868 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -8,8 +8,8 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netdevice.h> -#include <net/inet_dscp.h> #include <linux/ip.h> +#include <net/flow.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/route.h> @@ -76,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; - flow.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + flow.flowi4_dscp = ip4h_dscp(iph); flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par)); flow.flowi4_uid = sock_net_uid(xt_net(par), NULL); diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index ed08fb78cfa8..9a773502f10a 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -12,10 +12,10 @@ #include <linux/skbuff.h> #include <linux/netfilter.h> #include <net/checksum.h> +#include <net/flow.h> #include <net/icmp.h> #include <net/ip.h> #include <net/route.h> -#include <net/inet_dscp.h> #include <net/netfilter/ipv4/nf_dup_ipv4.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> @@ -33,7 +33,7 @@ static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb, fl4.flowi4_oif = oif; fl4.daddr = gw->s_addr; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; rt = ip_route_output_key(net, &fl4); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 87fd945a0d27..fae4aa4a5f09 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -12,6 +12,15 @@ #include <linux/netfilter_ipv4.h> #include <linux/netfilter_bridge.h> +static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int ttl); +static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, + const struct tcphdr *oth); +static const struct tcphdr * +nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *_oth, int hook); + static int nf_reject_iphdr_validate(struct sk_buff *skb) { struct iphdr *iph; @@ -71,6 +80,27 @@ struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v4_tcp_reset); +static bool nf_skb_is_icmp_unreach(const struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + u8 *tp, _type; + int thoff; + + if (iph->protocol != IPPROTO_ICMP) + return false; + + thoff = skb_network_offset(skb) + sizeof(*iph); + + tp = skb_header_pointer(skb, + thoff + offsetof(struct icmphdr, type), + sizeof(_type), &_type); + + if (!tp) + return false; + + return *tp == ICMP_DEST_UNREACH; +} + struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, @@ -91,6 +121,10 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct 
net *net, if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) return NULL; + /* don't reply to ICMP_DEST_UNREACH with ICMP_DEST_UNREACH. */ + if (nf_skb_is_icmp_unreach(oldskb)) + return NULL; + /* RFC says return as much as we can without exceeding 576 bytes. */ len = min_t(unsigned int, 536, oldskb->len); @@ -136,8 +170,9 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v4_unreach); -const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *_oth, int hook) +static const struct tcphdr * +nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *_oth, int hook) { const struct tcphdr *oth; @@ -163,11 +198,10 @@ const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, return oth; } -EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get); -struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int ttl) +static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int ttl) { struct iphdr *niph, *oiph = ip_hdr(oldskb); @@ -188,10 +222,9 @@ struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, return niph; } -EXPORT_SYMBOL_GPL(nf_reject_iphdr_put); -void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, - const struct tcphdr *oth) +static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, + const struct tcphdr *oth) { struct iphdr *niph = ip_hdr(nskb); struct tcphdr *tcph; @@ -218,7 +251,6 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, nskb->csum_start = (unsigned char *)tcph - nskb->head; nskb->csum_offset = offsetof(struct tcphdr, check); } -EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) { @@ -247,8 +279,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, if (!oth) return; - if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) && - nf_reject_fill_skb_dst(oldskb) < 0) + if (!skb_dst(oldskb) && nf_reject_fill_skb_dst(oldskb) < 0) return; if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -321,8 +352,7 @@ void nf_send_unreach(struct sk_buff *skb_in, int code, int hook) if (iph->frag_off & htons(IP_OFFSET)) return; - if ((hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) && - nf_reject_fill_skb_dst(skb_in) < 0) + if (!skb_dst(skb_in) && nf_reject_fill_skb_dst(skb_in) < 0) return; if (skb_csum_unnecessary(skb_in) || diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index a1350fc25838..5080fa5fbf6a 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -71,8 +71,7 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, { switch (protocol) { case IPPROTO_TCP: - return inet_lookup(net, net->ipv4.tcp_death_row.hashinfo, - skb, doff, saddr, sport, daddr, dport, + return inet_lookup(net, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: return udp4_lib_lookup(net, saddr, sport, daddr, dport, diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index 73e66a088e25..041c3f37f237 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -81,7 +81,6 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { - struct inet_hashinfo *hinfo = 
net->ipv4.tcp_death_row.hashinfo; struct sock *sk; switch (protocol) { @@ -95,7 +94,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - sk = inet_lookup_listener(net, hinfo, skb, + sk = inet_lookup_listener(net, skb, ip_hdrlen(skb) + __tcp_hdrlen(hp), saddr, sport, daddr, dport, in->ifindex, 0); @@ -109,7 +108,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, */ break; case NF_TPROXY_LOOKUP_ESTABLISHED: - sk = inet_lookup_established(net, hinfo, saddr, sport, + sk = inet_lookup_established(net, saddr, sport, daddr, dport, in->ifindex); break; default: diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 7e7c49535e3f..82af6cd76d13 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -10,7 +10,7 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_fib.h> -#include <net/inet_dscp.h> +#include <net/flow.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/route.h> @@ -114,7 +114,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (priv->flags & NFTA_FIB_F_MARK) fl4.flowi4_mark = pkt->skb->mark; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); if (priv->flags & NFTA_FIB_F_DADDR) { fl4.daddr = iph->daddr; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 29118c43ebf5..7b9d70f9b31c 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -2087,6 +2087,12 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, { struct nh_grp_entry *nhge, *tmp; + /* If there is nothing to do, let's avoid the costly call to + * synchronize_net() + */ + if (list_empty(&nh->grp_list)) + return; + list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) remove_nh_grp_entry(net, nhge, nlinfo); @@ -2399,6 +2405,13 @@ static int replace_nexthop_single(struct net *net, struct nexthop *old, return -EINVAL; } + if (!list_empty(&old->grp_list) && + rtnl_dereference(new->nh_info)->fdb_nh != + rtnl_dereference(old->nh_info)->fdb_nh) { + NL_SET_ERR_MSG(extack, "Cannot change nexthop FDB status while in a group"); + return -EINVAL; + } + err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; @@ -3511,12 +3524,42 @@ static int rtm_dump_walk_nexthops(struct sk_buff *skb, int err; s_idx = ctx->idx; - for (node = rb_first(root); node; node = rb_next(node)) { + + /* If this is not the first invocation, ctx->idx will contain the id of + * the last nexthop we processed. Instead of starting from the very + * first element of the red/black tree again and linearly skipping the + * (potentially large) set of nodes with an id smaller than s_idx, walk + * the tree and find the left-most node whose id is >= s_idx. This + * provides an efficient O(log n) starting point for the dump + * continuation. + */ + if (s_idx != 0) { + struct rb_node *tmp = root->rb_node; + + node = NULL; + while (tmp) { + struct nexthop *nh; + + nh = rb_entry(tmp, struct nexthop, rb_node); + if (nh->id < s_idx) { + tmp = tmp->rb_right; + } else { + /* Track current candidate and keep looking on + * the left side to find the left-most + * (smallest id) that is still >= s_idx. 
+ */ + node = tmp; + tmp = tmp->rb_left; + } + } + } else { + node = rb_first(root); + } + + for (; node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); - if (nh->id < s_idx) - continue; ctx->idx = nh->id; err = nh_cb(skb, cb, nh, data); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 031df4c19fcc..ad56588107cc 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -56,9 +56,7 @@ struct ping_table { static struct ping_table ping_table; struct pingv6_ops pingv6_ops; -EXPORT_SYMBOL_GPL(pingv6_ops); - -static u16 ping_port_rover; +EXPORT_IPV6_MOD_GPL(pingv6_ops); static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) { @@ -67,7 +65,6 @@ static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) pr_debug("hash(%u) = %u\n", num, res); return res; } -EXPORT_SYMBOL_GPL(ping_hash); static inline struct hlist_head *ping_hashslot(struct ping_table *table, struct net *net, unsigned int num) @@ -77,6 +74,7 @@ static inline struct hlist_head *ping_hashslot(struct ping_table *table, int ping_get_port(struct sock *sk, unsigned short ident) { + struct net *net = sock_net(sk); struct inet_sock *isk, *isk2; struct hlist_head *hlist; struct sock *sk2 = NULL; @@ -84,15 +82,16 @@ int ping_get_port(struct sock *sk, unsigned short ident) isk = inet_sk(sk); spin_lock(&ping_table.lock); if (ident == 0) { + u16 result = net->ipv4.ping_port_rover + 1; u32 i; - u16 result = ping_port_rover + 1; for (i = 0; i < (1L << 16); i++, result++) { if (!result) - result++; /* avoid zero */ - hlist = ping_hashslot(&ping_table, sock_net(sk), - result); + continue; /* avoid zero */ + hlist = ping_hashslot(&ping_table, net, result); sk_for_each(sk2, hlist) { + if (!net_eq(sock_net(sk2), net)) + continue; isk2 = inet_sk(sk2); if (isk2->inet_num == result) @@ -100,7 +99,7 @@ int ping_get_port(struct sock *sk, unsigned short ident) } /* found */ - ping_port_rover = ident = result; + net->ipv4.ping_port_rover = ident = result; break; next_port: ; @@ -108,8 +107,10 @@ next_port: if (i >= (1L << 16)) goto fail; } else { - hlist = ping_hashslot(&ping_table, sock_net(sk), ident); + hlist = ping_hashslot(&ping_table, net, ident); sk_for_each(sk2, hlist) { + if (!net_eq(sock_net(sk2), net)) + continue; isk2 = inet_sk(sk2); /* BUG? Why is this reuse and not reuseaddr? ping.c @@ -129,7 +130,7 @@ next_port: pr_debug("was not hashed\n"); sk_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + sock_prot_inuse_add(net, sk->sk_prot, 1); } spin_unlock(&ping_table.lock); return 0; @@ -138,15 +139,7 @@ fail: spin_unlock(&ping_table.lock); return -EADDRINUSE; } -EXPORT_SYMBOL_GPL(ping_get_port); - -int ping_hash(struct sock *sk) -{ - pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); - BUG(); /* "Please do not press this button again." 
*/ - - return 0; -} +EXPORT_IPV6_MOD_GPL(ping_get_port); void ping_unhash(struct sock *sk) { @@ -161,7 +154,7 @@ void ping_unhash(struct sock *sk) } spin_unlock(&ping_table.lock); } -EXPORT_SYMBOL_GPL(ping_unhash); +EXPORT_IPV6_MOD_GPL(ping_unhash); /* Called under rcu_read_lock() */ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) @@ -188,6 +181,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) } sk_for_each_rcu(sk, hslot) { + if (!net_eq(sock_net(sk), net)) + continue; isk = inet_sk(sk); pr_debug("iterate\n"); @@ -279,7 +274,7 @@ out_release_group: put_group_info(group_info); return ret; } -EXPORT_SYMBOL_GPL(ping_init_sock); +EXPORT_IPV6_MOD_GPL(ping_init_sock); void ping_close(struct sock *sk, long timeout) { @@ -289,9 +284,9 @@ void ping_close(struct sock *sk, long timeout) sk_common_release(sk); } -EXPORT_SYMBOL_GPL(ping_close); +EXPORT_IPV6_MOD_GPL(ping_close); -static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr, +static int ping_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { /* This check is replicated from __ip4_datagram_connect() and @@ -306,7 +301,7 @@ static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr, /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, - struct sockaddr *uaddr, int addr_len) + struct sockaddr_unsized *uaddr, int addr_len) { struct net *net = sock_net(sk); if (sk->sk_family == AF_INET) { @@ -392,7 +387,7 @@ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, return 0; } -static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr) +static void ping_set_saddr(struct sock *sk, struct sockaddr_unsized *saddr) { if (saddr->sa_family == AF_INET) { struct inet_sock *isk = inet_sk(sk); @@ -412,7 +407,7 @@ static void ping_set_saddr(struct sock *sk, struct sockaddr *saddr) * Moreover, we don't allow binding to multi- and broadcast addresses. */ -int ping_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int ping_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct inet_sock *isk = inet_sk(sk); unsigned short snum; @@ -467,7 +462,7 @@ out: pr_debug("ping_v4_bind -> %d\n", err); return err; } -EXPORT_SYMBOL_GPL(ping_bind); +EXPORT_IPV6_MOD_GPL(ping_bind); /* * Is this a supported type of ICMP message? @@ -600,7 +595,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info) out: return; } -EXPORT_SYMBOL_GPL(ping_err); +EXPORT_IPV6_MOD_GPL(ping_err); /* * Copy and checksum an ICMP Echo packet from user space into a buffer @@ -630,7 +625,7 @@ int ping_getfrag(void *from, char *to, return 0; } -EXPORT_SYMBOL_GPL(ping_getfrag); +EXPORT_IPV6_MOD_GPL(ping_getfrag); static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, struct flowi4 *fl4) @@ -691,7 +686,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, return 0; } -EXPORT_SYMBOL_GPL(ping_common_sendmsg); +EXPORT_IPV6_MOD_GPL(ping_common_sendmsg); static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { @@ -936,7 +931,7 @@ out: pr_debug("ping_recvmsg -> %d\n", err); return err; } -EXPORT_SYMBOL_GPL(ping_recvmsg); +EXPORT_IPV6_MOD_GPL(ping_recvmsg); static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) @@ -957,7 +952,7 @@ int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { return __ping_queue_rcv_skb(sk, skb) ? 
-1 : 0; } -EXPORT_SYMBOL_GPL(ping_queue_rcv_skb); +EXPORT_IPV6_MOD_GPL(ping_queue_rcv_skb); /* @@ -985,7 +980,7 @@ enum skb_drop_reason ping_rcv(struct sk_buff *skb) kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET); return SKB_DROP_REASON_NO_SOCKET; } -EXPORT_SYMBOL_GPL(ping_rcv); +EXPORT_IPV6_MOD_GPL(ping_rcv); struct proto ping_prot = { .name = "PING", @@ -1002,13 +997,12 @@ struct proto ping_prot = { .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, .release_cb = ip4_datagram_release_cb, - .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, .obj_size = sizeof(struct inet_sock), }; -EXPORT_SYMBOL(ping_prot); +EXPORT_IPV6_MOD(ping_prot); #ifdef CONFIG_PROC_FS @@ -1073,7 +1067,7 @@ void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } -EXPORT_SYMBOL_GPL(ping_seq_start); +EXPORT_IPV6_MOD_GPL(ping_seq_start); static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos) { @@ -1092,14 +1086,14 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; return sk; } -EXPORT_SYMBOL_GPL(ping_seq_next); +EXPORT_IPV6_MOD_GPL(ping_seq_next); void ping_seq_stop(struct seq_file *seq, void *v) __releases(ping_table.lock) { spin_unlock(&ping_table.lock); } -EXPORT_SYMBOL_GPL(ping_seq_stop); +EXPORT_IPV6_MOD_GPL(ping_seq_stop); static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, int bucket) @@ -1119,7 +1113,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } static int ping_v4_seq_show(struct seq_file *seq, void *v) @@ -1150,6 +1144,8 @@ static int __net_init ping_v4_proc_init_net(struct net *net) if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops, sizeof(struct ping_iter_state))) return -ENOMEM; + + net->ipv4.ping_port_rover = get_random_u16(); return 0; } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 65b0d0ab0084..974afc4ecbe2 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -95,7 +95,6 @@ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS), SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES), SNMP_MIB_ITEM("OutTransmits", IPSTATS_MIB_OUTPKTS), - SNMP_MIB_SENTINEL }; /* Following items are displayed in /proc/net/netstat */ @@ -119,7 +118,6 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS), - SNMP_MIB_SENTINEL }; static const struct { @@ -157,7 +155,6 @@ static const struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS), SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS), SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp4_udp_list[] = { @@ -170,7 +167,6 @@ static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp4_net_list[] = { @@ -309,7 +305,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND), SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD), SNMP_MIB_ITEM("TCPAODroppedIcmps", 
LINUX_MIB_TCPAODROPPEDICMPS), - SNMP_MIB_SENTINEL }; static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals, @@ -389,14 +384,15 @@ static void icmp_put(struct seq_file *seq) */ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) { + const int cnt = ARRAY_SIZE(snmp4_ipstats_list); + u64 buff64[ARRAY_SIZE(snmp4_ipstats_list)]; struct net *net = seq->private; - u64 buff64[IPSTATS_MIB_MAX]; int i; - memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64)); + memset(buff64, 0, sizeof(buff64)); seq_puts(seq, "Ip: Forwarding DefaultTTL"); - for (i = 0; snmp4_ipstats_list[i].name; i++) + for (i = 0; i < cnt; i++) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", @@ -404,10 +400,10 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); - snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list, - net->mib.ip_statistics, - offsetof(struct ipstats_mib, syncp)); - for (i = 0; snmp4_ipstats_list[i].name; i++) + snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipstats_list, cnt, + net->mib.ip_statistics, + offsetof(struct ipstats_mib, syncp)); + for (i = 0; i < cnt; i++) seq_printf(seq, " %llu", buff64[i]); return 0; @@ -415,20 +411,23 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) { + const int udp_cnt = ARRAY_SIZE(snmp4_udp_list); + const int tcp_cnt = ARRAY_SIZE(snmp4_tcp_list); unsigned long buff[TCPUDP_MIB_MAX]; struct net *net = seq->private; int i; - memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); + memset(buff, 0, tcp_cnt * sizeof(unsigned long)); seq_puts(seq, "\nTcp:"); - for (i = 0; snmp4_tcp_list[i].name; i++) + for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %s", snmp4_tcp_list[i].name); seq_puts(seq, "\nTcp:"); - snmp_get_cpu_field_batch(buff, snmp4_tcp_list, - net->mib.tcp_statistics); - for (i = 0; snmp4_tcp_list[i].name; i++) { + snmp_get_cpu_field_batch_cnt(buff, snmp4_tcp_list, + tcp_cnt, + net->mib.tcp_statistics); + for (i = 0; i < tcp_cnt; i++) { /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", buff[i]); @@ -436,27 +435,29 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) seq_printf(seq, " %lu", buff[i]); } - memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); + memset(buff, 0, udp_cnt * sizeof(unsigned long)); - snmp_get_cpu_field_batch(buff, snmp4_udp_list, - net->mib.udp_statistics); + snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list, + udp_cnt, + net->mib.udp_statistics); seq_puts(seq, "\nUdp:"); - for (i = 0; snmp4_udp_list[i].name; i++) + for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdp:"); - for (i = 0; snmp4_udp_list[i].name; i++) + for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %lu", buff[i]); - memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); + memset(buff, 0, udp_cnt * sizeof(unsigned long)); /* the UDP and UDP-Lite MIBs are the same */ seq_puts(seq, "\nUdpLite:"); - snmp_get_cpu_field_batch(buff, snmp4_udp_list, - net->mib.udplite_statistics); - for (i = 0; snmp4_udp_list[i].name; i++) + snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list, + udp_cnt, + net->mib.udplite_statistics); + for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdpLite:"); - for (i = 0; snmp4_udp_list[i].name; i++) + for (i = 0; i < udp_cnt; i++) seq_printf(seq, " 
%lu", buff[i]); seq_putc(seq, '\n'); @@ -480,8 +481,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v) */ static int netstat_seq_show(struct seq_file *seq, void *v) { - const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list) - 1; - const int tcp_cnt = ARRAY_SIZE(snmp4_net_list) - 1; + const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list); + const int tcp_cnt = ARRAY_SIZE(snmp4_net_list); struct net *net = seq->private; unsigned long *buff; int i; @@ -494,8 +495,8 @@ static int netstat_seq_show(struct seq_file *seq, void *v) buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)), GFP_KERNEL); if (buff) { - snmp_get_cpu_field_batch(buff, snmp4_net_list, - net->mib.net_statistics); + snmp_get_cpu_field_batch_cnt(buff, snmp4_net_list, tcp_cnt, + net->mib.net_statistics); for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %lu", buff[i]); } else { @@ -513,7 +514,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) u64 *buff64 = (u64 *)buff; memset(buff64, 0, ip_cnt * sizeof(u64)); - snmp_get_cpu_field64_batch(buff64, snmp4_ipextstats_list, + snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipextstats_list, ip_cnt, net->mib.ip_statistics, offsetof(struct ipstats_mib, syncp)); for (i = 0; i < ip_cnt; i++) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 1d2c89d63cc7..5998c4cc6f47 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -178,7 +178,7 @@ static int raw_v4_input(struct net *net, struct sk_buff *skb, if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); continue; } @@ -311,7 +311,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } @@ -697,7 +697,8 @@ static void raw_destroy(struct sock *sk) } /* This gets rid of all the nasties in af_inet. 
-DaveM */ -static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int raw_bind(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; @@ -793,6 +794,7 @@ static int raw_sk_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); + sk->sk_drop_counters = &rp->drop_counters; if (inet_sk(sk)->inet_num == IPPROTO_ICMP) memset(&rp->filter, 0, sizeof(rp->filter)); return 0; @@ -1045,7 +1047,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), 0, sock_i_ino(sp), - refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); + refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp)); } static int raw_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index cc793bd8de25..943e5998e0ad 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -126,9 +126,9 @@ static int raw_diag_dump_one(struct netlink_callback *cb, static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, - struct nlattr *bc, bool net_admin) + bool net_admin) { - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb->data, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin); @@ -140,17 +140,13 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); - struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct hlist_head *hlist; struct sock *sk = NULL; - struct nlattr *bc; if (IS_ERR(hashinfo)) return; - cb_data = cb->data; - bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -174,7 +170,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; - if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) + if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) goto out_unlock; next: num++; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f639a2ae881a..b549d6a57307 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -84,6 +84,7 @@ #include <linux/jhash.h> #include <net/dst.h> #include <net/dst_metadata.h> +#include <net/flow.h> #include <net/inet_dscp.h> #include <net/net_namespace.h> #include <net/ip.h> @@ -413,11 +414,11 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); - struct net_device *dev = dst_dev(dst); + struct net_device *dev; struct neighbour *n; rcu_read_lock(); - + dev = dst_dev_rcu(dst); if (likely(rt->rt_gw_family == AF_INET)) { n = ip_neigh_gw4(dev, rt->rt_gw4); } else if (rt->rt_gw_family == AF_INET6) { @@ -606,6 +607,11 @@ static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash) oldest_p = fnhe_p; } } + + /* Clear oldest->fnhe_daddr to prevent this fnhe from being + * rebound with new dsts in rt_bind_exception(). 
+ */ + oldest->fnhe_daddr = 0; fnhe_flush_routes(oldest); *oldest_p = oldest->fnhe_next; kfree_rcu(oldest, rcu); @@ -1026,7 +1032,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) return; rcu_read_lock(); - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); @@ -1221,8 +1227,8 @@ EXPORT_INDIRECT_CALLABLE(ipv4_dst_check); static void ipv4_send_dest_unreach(struct sk_buff *skb) { + struct inet_skb_parm parm; struct net_device *dev; - struct ip_options opt; int res; /* Recompile ip options since IPCB may not be valid anymore. @@ -1232,21 +1238,21 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb) ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5) return; - memset(&opt, 0, sizeof(opt)); + memset(&parm, 0, sizeof(parm)); if (ip_hdr(skb)->ihl > 5) { if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4)) return; - opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); + parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev; - res = __ip_options_compile(dev_net(dev), &opt, skb, NULL); + res = __ip_options_compile(dev_net(dev), &parm.opt, skb, NULL); rcu_read_unlock(); if (res) return; } - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &parm); } static void ipv4_link_failure(struct sk_buff *skb) @@ -1291,7 +1297,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), + .flowi4_dscp = ip4h_dscp(iph), .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, @@ -1326,7 +1332,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, net->ipv4.ip_rt_min_advmss); rcu_read_unlock(); @@ -2210,7 +2216,7 @@ ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto martian_source; } - if (rt->rt_type != RTN_LOCAL) + if (!(rt->rt_flags & RTCF_LOCAL)) goto skip_validate_source; reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, @@ -2331,7 +2337,7 @@ ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_oif = 0; fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; - fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_dscp = dscp; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.daddr = daddr; @@ -2575,12 +2581,16 @@ static struct rtable *__mkroute_output(const struct fib_result *res, !netif_is_l3_master(dev_out)) return ERR_PTR(-EINVAL); - if (ipv4_is_lbcast(fl4->daddr)) + if (ipv4_is_lbcast(fl4->daddr)) { type = RTN_BROADCAST; - else if (ipv4_is_multicast(fl4->daddr)) + + /* reset fi to prevent gateway resolution */ + fi = NULL; + } else if (ipv4_is_multicast(fl4->daddr)) { type = RTN_MULTICAST; - else if (ipv4_is_zeronet(fl4->daddr)) + } else if (ipv4_is_zeronet(fl4->daddr)) { return ERR_PTR(-EINVAL); + } if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; @@ -2690,7 +2700,6 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; - fl4->flowi4_tos &= INET_DSCP_MASK; rcu_read_lock(); rth = 
ip_route_output_key_hash_rcu(net, fl4, &res, skb); @@ -3333,7 +3342,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.daddr = dst; fl4.saddr = src; - fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_dscp = dscp; fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0); fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index eb0819463fae..569befcf021b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -12,6 +12,7 @@ #include <linux/export.h> #include <net/secure_seq.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/route.h> static siphash_aligned_key_t syncookie_secret[2]; @@ -403,6 +404,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct inet_request_sock *ireq; struct net *net = sock_net(sk); + struct tcp_request_sock *treq; struct request_sock *req; struct sock *ret = sk; struct flowi4 fl4; @@ -428,6 +430,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) } ireq = inet_rsk(req); + treq = tcp_rsk(req); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); @@ -483,6 +486,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst); + treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3a43010d726f..a1a50a5c80dc 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -47,6 +47,9 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; +static int tcp_ecn_mode_max = 2; +static u32 icmp_errors_extension_mask_all = + GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0); /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -674,6 +677,15 @@ static struct ctl_table ipv4_net_table[] = { .extra2 = SYSCTL_ONE }, { + .procname = "icmp_errors_extension_mask", + .data = &init_net.ipv4.sysctl_icmp_errors_extension_mask, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &icmp_errors_extension_mask_all, + }, + { .procname = "icmp_ratelimit", .data = &init_net.ipv4.sysctl_icmp_ratelimit, .maxlen = sizeof(int), @@ -728,9 +740,27 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_ecn_mode_max, + }, + { + .procname = "tcp_ecn_option", + .data = &init_net.ipv4.sysctl_tcp_ecn_option, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { + .procname = "tcp_ecn_option_beacon", + .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, + { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, .maxlen = sizeof(u8), @@ -1313,6 +1343,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_dou8vec_minmax, }, { + .procname = "tcp_rcvbuf_low_rtt", + .data = &init_net.ipv4.sysctl_tcp_rcvbuf_low_rtt, + 
.maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, + }, + { .procname = "tcp_tso_win_divisor", .data = &init_net.ipv4.sysctl_tcp_tso_win_divisor, .maxlen = sizeof(u8), @@ -1422,6 +1461,15 @@ static struct ctl_table ipv4_net_table[] = { .proc_handler = proc_doulongvec_minmax, }, { + .procname = "tcp_comp_sack_rtt_percent", + .data = &init_net.ipv4.sysctl_tcp_comp_sack_rtt_percent, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_ONE_THOUSAND, + }, + { .procname = "tcp_comp_sack_slack_ns", .data = &init_net.ipv4.sysctl_tcp_comp_sack_slack_ns, .maxlen = sizeof(unsigned long), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 71a956fbfc55..f035440c475a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -243,7 +243,7 @@ #define pr_fmt(fmt) "TCP: " fmt -#include <crypto/hash.h> +#include <crypto/md5.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> @@ -253,7 +253,6 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/skbuff.h> -#include <linux/scatterlist.h> #include <linux/splice.h> #include <linux/net.h> #include <linux/socket.h> @@ -270,11 +269,14 @@ #include <net/icmp.h> #include <net/inet_common.h> +#include <net/inet_ecn.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/mptcp.h> #include <net/proto_memory.h> #include <net/xfrm.h> #include <net/ip.h> +#include <net/psp.h> #include <net/sock.h> #include <net/rstreason.h> @@ -412,6 +414,21 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) return rate64; } +#ifdef CONFIG_TCP_MD5SIG +void tcp_md5_destruct_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->md5sig_info) { + + tcp_clear_md5_list(sk); + kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1)); + static_branch_slow_dec_deferred(&tcp_md5_needed); + } +} +EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock); +#endif + /* Address-family independent initialization for a tcp_sock. * * NOTE: A lot of things set to zero explicitly by call to @@ -687,6 +704,7 @@ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; __skb_header_release(skb); + psp_enqueue_set_decrypted(sk, skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); @@ -908,7 +926,8 @@ struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, } __kfree_skb(skb); } else { - sk->sk_prot->enter_memory_pressure(sk); + if (!sk->sk_bypass_prot_mem) + tcp_enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); } return NULL; @@ -1042,7 +1061,7 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, } } flags = (msg->msg_flags & MSG_DONTWAIT) ? 
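
Aside (editorial sketch, not part of the patch): the new ipv4_net_table entries above (icmp_errors_extension_mask, tcp_ecn_option, tcp_ecn_option_beacon, tcp_rcvbuf_low_rtt, tcp_comp_sack_rtt_percent) all point their .data at init_net fields; per-namespace registration retargets those pointers. Roughly the long-standing pattern in sysctl_net_ipv4.c looks like the sketch below; clone_ipv4_table_for() is a hypothetical name and this is not a verbatim copy of the registration code.

	/* Sketch: how a "&init_net.ipv4.sysctl_foo" .data pointer in the
	 * template table becomes a per-namespace pointer at registration time.
	 */
	static struct ctl_table *clone_ipv4_table_for(struct net *net)
	{
		struct ctl_table *table;
		int i;

		if (net_eq(net, &init_net))
			return ipv4_net_table;	/* template already targets init_net */

		table = kmemdup(ipv4_net_table, sizeof(ipv4_net_table), GFP_KERNEL);
		if (!table)
			return NULL;

		/* struct net instances share one layout, so shifting each .data
		 * pointer by (net - &init_net) retargets it to this namespace's
		 * copy of the same field.
		 */
		for (i = 0; i < ARRAY_SIZE(ipv4_net_table); i++)
			table[i].data += (void *)net - (void *)&init_net;

		return table;
	}
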
O_NONBLOCK : 0; - err = __inet_stream_connect(sk->sk_socket, uaddr, + err = __inet_stream_connect(sk->sk_socket, (struct sockaddr_unsized *)uaddr, msg->msg_namelen, flags, 1); /* fastopen_req could already be freed in __inet_stream_connect * if the connection times out or gets rst @@ -1537,8 +1556,10 @@ void __tcp_cleanup_rbuf(struct sock *sk, int copied) time_to_ack = true; } } - if (time_to_ack) + if (time_to_ack) { + tcp_mstamp_refresh(tp); tcp_send_ack(sk); + } } void tcp_cleanup_rbuf(struct sock *sk, int copied) @@ -1771,6 +1792,7 @@ EXPORT_IPV6_MOD(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) { + struct tcp_sock *tp = tcp_sk(sk); int space, cap; if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) @@ -1789,7 +1811,9 @@ int tcp_set_rcvlowat(struct sock *sk, int val) space = tcp_space_from_win(sk, val); if (space > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, space); - WRITE_ONCE(tcp_sk(sk)->window_clamp, val); + + if (tp->window_clamp && tp->window_clamp < val) + WRITE_ONCE(tp->window_clamp, val); } return 0; } @@ -2563,7 +2587,7 @@ static int tcp_recvmsg_dmabuf(struct sock *sk, const struct sk_buff *skb, if (err) goto out; - atomic_long_inc(&niov->pp_ref_count); + atomic_long_inc(&niov->desc.pp_ref_count); tcp_xa_pool.netmems[tcp_xa_pool.idx++] = skb_frag_netmem(frag); sent += copy; @@ -2818,9 +2842,9 @@ found_ok_skb: err = tcp_recvmsg_dmabuf(sk, skb, offset, msg, used); - if (err <= 0) { + if (err < 0) { if (!copied) - copied = -EFAULT; + copied = err; break; } @@ -3099,8 +3123,8 @@ bool tcp_check_oom(const struct sock *sk, int shift) void __tcp_close(struct sock *sk, long timeout) { + bool data_was_unread = false; struct sk_buff *skb; - int data_was_unread = 0; int state; WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); @@ -3118,13 +3142,14 @@ void __tcp_close(struct sock *sk, long timeout) * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! */ - while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; + while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { + u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) - len--; - data_was_unread += len; - __kfree_skb(skb); + end_seq--; + if (after(end_seq, tcp_sk(sk)->copied_seq)) + data_was_unread = true; + tcp_eat_recv_skb(sk, skb); } /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ @@ -3195,7 +3220,7 @@ adjudge_to_death: /* remove backlog if any, without releasing ownership. */ __release_sock(sk); - this_cpu_inc(tcp_orphan_count); + tcp_orphan_count_inc(); /* Have we already been destroyed by a softirq or backlog? 
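
Aside (editorial sketch, not part of the patch): the rewritten receive-queue loop in __tcp_close() above no longer counts unread bytes; it only needs a yes/no answer, which falls out of sequence numbers alone. A minimal sketch of that predicate, using the kernel's wrap-safe after() comparison; skb_holds_unread_data() is a hypothetical helper name.

	/* An skb still holds data the application never read iff its end
	 * sequence, with a trailing FIN not counted as payload, lies beyond
	 * copied_seq (the next byte the application would have read).
	 */
	static bool skb_holds_unread_data(u32 end_seq, bool has_fin, u32 copied_seq)
	{
		if (has_fin)
			end_seq--;	/* FIN consumes a sequence number, not data */
		return after(end_seq, copied_seq);
	}
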
*/ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) @@ -3327,6 +3352,7 @@ int tcp_disconnect(struct sock *sk, int flags) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; + struct request_sock *req; u32 seq; if (old_state != TCP_CLOSE) @@ -3376,7 +3402,7 @@ int tcp_disconnect(struct sock *sk, int flags) WRITE_ONCE(tp->write_seq, seq); icsk->icsk_backoff = 0; - icsk->icsk_probes_out = 0; + WRITE_ONCE(icsk->icsk_probes_out, 0); icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN); @@ -3389,6 +3415,11 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; + tp->accecn_fail_mode = 0; + tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; + tcp_accecn_init_counters(tp); + tp->prev_ecnfield = 0; + tp->accecn_opt_tstamp = 0; if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -3442,6 +3473,10 @@ int tcp_disconnect(struct sock *sk, int flags) /* Clean up fastopen related fields */ + req = rcu_dereference_protected(tp->fastopen_rsk, + lockdep_sock_is_held(sk)); + if (req) + reqsk_fastopen_remove(sk, req, false); tcp_free_fastopen_req(tp); inet_clear_bit(DEFER_CONNECT, sk); tp->fastopen_client_fail = 0; @@ -3549,9 +3584,12 @@ static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf, DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); EXPORT_IPV6_MOD(tcp_tx_delay_enabled); -static void tcp_enable_tx_delay(void) +static void tcp_enable_tx_delay(struct sock *sk, int val) { - if (!static_branch_unlikely(&tcp_tx_delay_enabled)) { + struct tcp_sock *tp = tcp_sk(sk); + s32 delta = (val - tp->tcp_tx_delay) << 3; + + if (val && !static_branch_unlikely(&tcp_tx_delay_enabled)) { static int __tcp_tx_delay_enabled = 0; if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { @@ -3559,6 +3597,22 @@ static void tcp_enable_tx_delay(void) pr_info("TCP_TX_DELAY enabled\n"); } } + /* If we change tcp_tx_delay on a live flow, adjust tp->srtt_us, + * tp->rtt_min, icsk_rto and sk->sk_pacing_rate. + * This is best effort. + */ + if (delta && sk->sk_state == TCP_ESTABLISHED) { + s64 srtt = (s64)tp->srtt_us + delta; + + tp->srtt_us = clamp_t(s64, srtt, 1, ~0U); + + /* Note: does not deal with non zero icsk_backoff */ + tcp_set_rto(sk); + + minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); + + tcp_update_pacing_rate(sk); + } } /* When set indicates to always queue non-full frames. 
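
Aside (editorial sketch, not part of the patch): the "<< 3" in the delta computed by tcp_enable_tx_delay() above is a units conversion, not a magic factor; tp->srtt_us keeps the smoothed RTT left-shifted by 3 (8 * SRTT in microseconds) while TCP_TX_DELAY is a plain microsecond value. A worked instance with illustrative numbers:

	/* tcp_tx_delay changes from 1000 us to 3000 us on an established flow:
	 *
	 *     delta   = (3000 - 1000) << 3 = 16000        srtt_us units (8 * usec)
	 *     srtt_us = clamp(srtt_us + 16000, 1, U32_MAX)   i.e. SRTT += 2000 us
	 *
	 * tcp_set_rto() then rebuilds the RTO from the new SRTT/mdev, rtt_min is
	 * reset so stale (smaller) samples stop defining the minimum, and
	 * tcp_update_pacing_rate() rescales sk_pacing_rate, which is derived
	 * from cwnd * mss / srtt.
	 */
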
Later the user clears @@ -3760,7 +3814,7 @@ int tcp_sock_set_maxseg(struct sock *sk, int val) if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) return -EINVAL; - tcp_sk(sk)->rx_opt.user_mss = val; + WRITE_ONCE(tcp_sk(sk)->rx_opt.user_mss, val); return 0; } @@ -3890,15 +3944,13 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max); return 0; } + case TCP_MAXSEG: + return tcp_sock_set_maxseg(sk, val); } sockopt_lock_sock(sk); switch (optname) { - case TCP_MAXSEG: - err = tcp_sock_set_maxseg(sk, val); - break; - case TCP_NODELAY: __tcp_sock_set_nodelay(sk, val); break; @@ -4087,8 +4139,12 @@ ao_parse: tp->recvmsg_inq = val; break; case TCP_TX_DELAY: - if (val) - tcp_enable_tx_delay(); + /* tp->srtt_us is u32, and is shifted by 3 */ + if (val < 0 || val >= (1U << (31 - 3))) { + err = -EINVAL; + break; + } + tcp_enable_tx_delay(sk, val); WRITE_ONCE(tp->tcp_tx_delay, val); break; default: @@ -4137,6 +4193,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ce_idx = INET_ECN_CE - 1; unsigned long rate; u32 now; u64 rate64; @@ -4263,6 +4322,16 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; + info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; + info->tcpi_received_ce = tp->received_ce; + info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; + info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; + info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx]; + info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx]; + info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx]; + info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx]; + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -4348,7 +4417,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); - nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); + nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, + READ_ONCE(inet_csk(sk)->icsk_retransmits)); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); @@ -4383,6 +4453,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); + int user_mss; int val, len; if (copy_from_sockptr(&len, optlen, sizeof(int))) @@ -4396,9 +4467,10 @@ int do_tcp_getsockopt(struct sock *sk, int level, switch (optname) { case TCP_MAXSEG: val = tp->mss_cache; - if (tp->rx_opt.user_mss && + user_mss = READ_ONCE(tp->rx_opt.user_mss); + if (user_mss && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) - val = tp->rx_opt.user_mss; + val = user_mss; if (tp->repair) val = tp->rx_opt.mss_clamp; break; @@ -4767,52 +4839,45 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, EXPORT_IPV6_MOD(tcp_getsockopt); #ifdef CONFIG_TCP_MD5SIG -int tcp_md5_sigpool_id = -1; -EXPORT_IPV6_MOD_GPL(tcp_md5_sigpool_id); - 
-int tcp_md5_alloc_sigpool(void) +void tcp_md5_hash_skb_data(struct md5_ctx *ctx, const struct sk_buff *skb, + unsigned int header_len) { - size_t scratch_size; - int ret; + const unsigned int head_data_len = skb_headlen(skb) > header_len ? + skb_headlen(skb) - header_len : 0; + const struct skb_shared_info *shi = skb_shinfo(skb); + struct sk_buff *frag_iter; + unsigned int i; - scratch_size = sizeof(union tcp_md5sum_block) + sizeof(struct tcphdr); - ret = tcp_sigpool_alloc_ahash("md5", scratch_size); - if (ret >= 0) { - /* As long as any md5 sigpool was allocated, the return - * id would stay the same. Re-write the id only for the case - * when previously all MD5 keys were deleted and this call - * allocates the first MD5 key, which may return a different - * sigpool id than was used previously. - */ - WRITE_ONCE(tcp_md5_sigpool_id, ret); /* Avoids the compiler potentially being smart here */ - return 0; - } - return ret; -} + md5_update(ctx, (const u8 *)tcp_hdr(skb) + header_len, head_data_len); -void tcp_md5_release_sigpool(void) -{ - tcp_sigpool_release(READ_ONCE(tcp_md5_sigpool_id)); -} + for (i = 0; i < shi->nr_frags; ++i) { + const skb_frag_t *f = &shi->frags[i]; + u32 p_off, p_len, copied; + const void *vaddr; + struct page *p; -void tcp_md5_add_sigpool(void) -{ - tcp_sigpool_get(READ_ONCE(tcp_md5_sigpool_id)); + skb_frag_foreach_page(f, skb_frag_off(f), skb_frag_size(f), + p, p_off, p_len, copied) { + vaddr = kmap_local_page(p); + md5_update(ctx, vaddr + p_off, p_len); + kunmap_local(vaddr); + } + } + + skb_walk_frags(skb, frag_iter) + tcp_md5_hash_skb_data(ctx, frag_iter, 0); } +EXPORT_IPV6_MOD(tcp_md5_hash_skb_data); -int tcp_md5_hash_key(struct tcp_sigpool *hp, - const struct tcp_md5sig_key *key) +void tcp_md5_hash_key(struct md5_ctx *ctx, + const struct tcp_md5sig_key *key) { u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ - struct scatterlist sg; - - sg_init_one(&sg, key->key, keylen); - ahash_request_set_crypt(hp->req, &sg, NULL, keylen); /* We use data_race() because tcp_md5_do_add() might change * key->key under us */ - return data_race(crypto_ahash_update(hp->req)); + data_race(({ md5_update(ctx, key->key, keylen), 0; })); } EXPORT_IPV6_MOD(tcp_md5_hash_key); @@ -4823,19 +4888,16 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, int family, int l3index, const __u8 *hash_location) { /* This gets called for each TCP segment that has TCP-MD5 option. - * We have 3 drop cases: - * o No MD5 hash and one expected. - * o MD5 hash and we're not expecting one. - * o MD5 hash and its wrong. + * We have 2 drop cases: + * o An MD5 signature is present, but we're not expecting one. + * o The MD5 signature is wrong. */ const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; u8 newhash[16]; - int genhash; key = tcp_md5_do_lookup(sk, l3index, saddr, family); - - if (!key && hash_location) { + if (!key) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); trace_tcp_hash_md5_unexpected(sk, skb); return SKB_DROP_REASON_TCP_MD5UNEXPECTED; @@ -4846,11 +4908,10 @@ tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, * IPv4-mapped case. 
*/ if (family == AF_INET) - genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); + tcp_v4_md5_hash_skb(newhash, key, NULL, skb); else - genhash = tp->af_specific->calc_md5_hash(newhash, key, - NULL, skb); - if (genhash || memcmp(hash_location, newhash, 16) != 0) { + tp->af_specific->calc_md5_hash(newhash, key, NULL, skb); + if (memcmp(hash_location, newhash, 16) != 0) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); trace_tcp_hash_md5_mismatch(sk, skb); return SKB_DROP_REASON_TCP_MD5FAILURE; @@ -5056,7 +5117,9 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 32); +#if IS_ENABLED(CONFIG_TLS_DEVICE) + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, tcp_clean_acked); +#endif /* TXRX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset); @@ -5067,11 +5130,9 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_txrx, 32); /* RX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq); - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us); @@ -5082,12 +5143,6 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh); -#if IS_ENABLED(CONFIG_TLS_DEVICE) - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tcp_clean_acked); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 77); -#else - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69); -#endif /* TX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out); @@ -5101,11 +5156,11 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89); /* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); @@ 
-5120,15 +5175,13 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); - /* 32bit arches with 8byte alignment on u64 fields might need padding - * before tcp_clock_cache. - */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 92 + 4); - /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in); @@ -5139,12 +5192,12 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 99); } void __init tcp_init(void) diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index bbb8d5f0eae7..34b8450829d0 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -268,9 +268,8 @@ static void tcp_ao_key_free_rcu(struct rcu_head *head) kfree_sensitive(key); } -static void tcp_ao_info_free_rcu(struct rcu_head *head) +static void tcp_ao_info_free(struct tcp_ao_info *ao) { - struct tcp_ao_info *ao = container_of(head, struct tcp_ao_info, rcu); struct tcp_ao_key *key; struct hlist_node *n; @@ -310,7 +309,7 @@ void tcp_ao_destroy_sock(struct sock *sk, bool twsk) if (!twsk) tcp_ao_sk_omem_free(sk, ao); - call_rcu(&ao->rcu, tcp_ao_info_free_rcu); + tcp_ao_info_free(ao); } void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp) @@ -1178,7 +1177,9 @@ void tcp_ao_finish_connect(struct sock *sk, struct sk_buff *skb) if (!ao) return; - WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq); + /* sk with TCP_REPAIR_ON does not have skb in tcp_finish_connect */ + if (skb) + WRITE_ONCE(ao->risn, tcp_hdr(skb)->seq); ao->rcv_sne = 0; hlist_for_each_entry_rcu(key, &ao->head, node, lockdep_sock_is_held(sk)) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ba581785adb4..a268e1595b22 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -408,8 +408,11 @@ more_data: if (!psock->cork) { psock->cork = kzalloc(sizeof(*psock->cork), GFP_ATOMIC | __GFP_NOWARN); - if (!psock->cork) + if (!psock->cork) { + sk_msg_free(sk, msg); + *copied = 0; return -ENOMEM; + 
} } memcpy(psock->cork, msg, sizeof(*msg)); return 0; diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index ba4d98e510e0..fbad6c35dee9 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -379,7 +379,7 @@ static void tcp_cdg_init(struct sock *sk) /* We silently fall back to window = 1 if allocation fails. */ if (window > 1) ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); ca->rtt_seq = tp->snd_nxt; ca->shadow_wnd = tcp_snd_cwnd(tp); } diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 45e174b8cd22..d83efd91f461 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -12,6 +12,9 @@ #include <linux/tcp.h> +#include <net/inet_hashtables.h> +#include <net/inet6_hashtables.h> +#include <net/inet_timewait_sock.h> #include <net/netlink.h> #include <net/tcp.h> @@ -174,27 +177,465 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) size += ulp_ops->get_info_size(sk, net_admin); } } - return size; + + return size + + nla_total_size(sizeof(struct tcp_info)) + + nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct inet_diag_meminfo)) + + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + + nla_total_size(TCP_CA_NAME_MAX) + + nla_total_size(sizeof(struct tcpvegas_info)) + + 64; +} + +static int tcp_twsk_diag_fill(struct sock *sk, + struct sk_buff *skb, + struct netlink_callback *cb, + u16 nlmsg_flags, bool net_admin) +{ + struct inet_timewait_sock *tw = inet_twsk(sk); + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, + sizeof(*r), nlmsg_flags); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + DEBUG_NET_WARN_ON_ONCE(tw->tw_state != TCP_TIME_WAIT); + + inet_diag_msg_common_fill(r, sk); + r->idiag_retrans = 0; + + r->idiag_state = READ_ONCE(tw->tw_substate); + r->idiag_timer = 3; + tmo = tw->tw_timer.expires - jiffies; + r->idiag_expires = jiffies_delta_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; + + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + tw->tw_mark)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + nlmsg_end(skb, nlh); + return 0; +} + +static int tcp_req_diag_fill(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + u16 nlmsg_flags, bool net_admin) +{ + struct request_sock *reqsk = inet_reqsk(sk); + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + inet_diag_msg_common_fill(r, sk); + r->idiag_state = TCP_SYN_RECV; + r->idiag_timer = 1; + r->idiag_retrans = READ_ONCE(reqsk->num_retrans); + + BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != + offsetof(struct sock, sk_cookie)); + + tmo = READ_ONCE(inet_reqsk(sk)->rsk_timer.expires) - jiffies; + r->idiag_expires = jiffies_delta_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; + + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + inet_rsk(reqsk)->ir_mark)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + nlmsg_end(skb, nlh); + return 0; +} + +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, + u16 nlmsg_flags, bool net_admin) +{ + if (sk->sk_state == 
TCP_TIME_WAIT) + return tcp_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); + + if (sk->sk_state == TCP_NEW_SYN_RECV) + return tcp_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); + + return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags, + net_admin); +} + +static void twsk_build_assert(void) +{ + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != + offsetof(struct sock, sk_family)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) != + offsetof(struct inet_sock, inet_num)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) != + offsetof(struct inet_sock, inet_dport)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) != + offsetof(struct inet_sock, inet_rcv_saddr)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) != + offsetof(struct inet_sock, inet_daddr)); + +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) != + offsetof(struct sock, sk_v6_rcv_saddr)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) != + offsetof(struct sock, sk_v6_daddr)); +#endif } static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { - struct inet_hashinfo *hinfo; + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct inet_diag_dump_data *cb_data = cb->data; + struct net *net = sock_net(skb->sk); + u32 idiag_states = r->idiag_states; + struct inet_hashinfo *hashinfo; + int i, num, s_i, s_num; + struct sock *sk; - hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; + hashinfo = net->ipv4.tcp_death_row.hashinfo; + if (idiag_states & TCPF_SYN_RECV) + idiag_states |= TCPF_NEW_SYN_RECV; + s_i = cb->args[1]; + s_num = num = cb->args[2]; + + if (cb->args[0] == 0) { + if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) + goto skip_listen_ht; + + for (i = s_i; i <= hashinfo->lhash2_mask; i++) { + struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; + + num = 0; + ilb = &hashinfo->lhash2[i]; + + if (hlist_nulls_empty(&ilb->nulls_head)) { + s_num = 0; + continue; + } + spin_lock(&ilb->lock); + sk_nulls_for_each(sk, node, &ilb->nulls_head) { + struct inet_sock *inet = inet_sk(sk); + + if (!net_eq(sock_net(sk), net)) + continue; + + if (num < s_num) { + num++; + continue; + } + + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next_listen; + + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next_listen; + + if (!inet_diag_bc_sk(cb_data, sk)) + goto next_listen; + + if (inet_sk_diag_fill(sk, inet_csk(sk), skb, + cb, r, NLM_F_MULTI, + net_admin) < 0) { + spin_unlock(&ilb->lock); + goto done; + } + +next_listen: + ++num; + } + spin_unlock(&ilb->lock); + + s_num = 0; + } +skip_listen_ht: + cb->args[0] = 1; + s_i = num = s_num = 0; + } + +/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets + * with bh disabled. + */ +#define SKARR_SZ 16 + + /* Dump bound but inactive (not listening, connecting, etc.) 
sockets */ + if (cb->args[0] == 1) { + if (!(idiag_states & TCPF_BOUND_INACTIVE)) + goto skip_bind_ht; + + for (i = s_i; i < hashinfo->bhash_size; i++) { + struct inet_bind_hashbucket *ibb; + struct inet_bind2_bucket *tb2; + struct sock *sk_arr[SKARR_SZ]; + int num_arr[SKARR_SZ]; + int idx, accum, res; + +resume_bind_walk: + num = 0; + accum = 0; + ibb = &hashinfo->bhash2[i]; + + if (hlist_empty(&ibb->chain)) { + s_num = 0; + continue; + } + spin_lock_bh(&ibb->lock); + inet_bind_bucket_for_each(tb2, &ibb->chain) { + if (!net_eq(ib2_net(tb2), net)) + continue; + + sk_for_each_bound(sk, &tb2->owners) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_bind; + + if (sk->sk_state != TCP_CLOSE || + !inet->inet_num) + goto next_bind; + + if (r->sdiag_family != AF_UNSPEC && + r->sdiag_family != sk->sk_family) + goto next_bind; + + if (!inet_diag_bc_sk(cb_data, sk)) + goto next_bind; + + sock_hold(sk); + num_arr[accum] = num; + sk_arr[accum] = sk; + if (++accum == SKARR_SZ) + goto pause_bind_walk; +next_bind: + num++; + } + } +pause_bind_walk: + spin_unlock_bh(&ibb->lock); + + res = 0; + for (idx = 0; idx < accum; idx++) { + if (res >= 0) { + res = inet_sk_diag_fill(sk_arr[idx], + NULL, skb, cb, + r, NLM_F_MULTI, + net_admin); + if (res < 0) + num = num_arr[idx]; + } + sock_put(sk_arr[idx]); + } + if (res < 0) + goto done; + + cond_resched(); + + if (accum == SKARR_SZ) { + s_num = num + 1; + goto resume_bind_walk; + } + + s_num = 0; + } +skip_bind_ht: + cb->args[0] = 2; + s_i = num = s_num = 0; + } - inet_diag_dump_icsk(hinfo, skb, cb, r); + if (!(idiag_states & ~TCPF_LISTEN)) + goto out; + + for (i = s_i; i <= hashinfo->ehash_mask; i++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[i]; + spinlock_t *lock = inet_ehash_lockp(hashinfo, i); + struct hlist_nulls_node *node; + struct sock *sk_arr[SKARR_SZ]; + int num_arr[SKARR_SZ]; + int idx, accum, res; + + if (hlist_nulls_empty(&head->chain)) + continue; + + if (i > s_i) + s_num = 0; + +next_chunk: + num = 0; + accum = 0; + spin_lock_bh(lock); + sk_nulls_for_each(sk, node, &head->chain) { + int state; + + if (!net_eq(sock_net(sk), net)) + continue; + if (num < s_num) + goto next_normal; + state = (sk->sk_state == TCP_TIME_WAIT) ? 
+ READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state; + if (!(idiag_states & (1 << state))) + goto next_normal; + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next_normal; + if (r->id.idiag_sport != htons(sk->sk_num) && + r->id.idiag_sport) + goto next_normal; + if (r->id.idiag_dport != sk->sk_dport && + r->id.idiag_dport) + goto next_normal; + twsk_build_assert(); + + if (!inet_diag_bc_sk(cb_data, sk)) + goto next_normal; + + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + goto next_normal; + + num_arr[accum] = num; + sk_arr[accum] = sk; + if (++accum == SKARR_SZ) + break; +next_normal: + ++num; + } + spin_unlock_bh(lock); + + res = 0; + for (idx = 0; idx < accum; idx++) { + if (res >= 0) { + res = sk_diag_fill(sk_arr[idx], skb, cb, r, + NLM_F_MULTI, net_admin); + if (res < 0) + num = num_arr[idx]; + } + sock_gen_put(sk_arr[idx]); + } + if (res < 0) + break; + + cond_resched(); + + if (accum == SKARR_SZ) { + s_num = num + 1; + goto next_chunk; + } + } + +done: + cb->args[1] = i; + cb->args[2] = num; +out: + ; +} + +static struct sock *tcp_diag_find_one_icsk(struct net *net, + const struct inet_diag_req_v2 *req) +{ + struct sock *sk; + + rcu_read_lock(); + if (req->sdiag_family == AF_INET) { + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[0], + req->id.idiag_dport, req->id.idiag_src[0], + req->id.idiag_sport, req->id.idiag_if); +#if IS_ENABLED(CONFIG_IPV6) + } else if (req->sdiag_family == AF_INET6) { + if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && + ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3], + req->id.idiag_dport, req->id.idiag_src[3], + req->id.idiag_sport, req->id.idiag_if); + else + sk = inet6_lookup(net, NULL, 0, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if); +#endif + } else { + rcu_read_unlock(); + return ERR_PTR(-EINVAL); + } + rcu_read_unlock(); + if (!sk) + return ERR_PTR(-ENOENT); + + if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { + sock_gen_put(sk); + return ERR_PTR(-ENOENT); + } + + return sk; } static int tcp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - struct inet_hashinfo *hinfo; + struct sk_buff *in_skb = cb->skb; + struct sk_buff *rep; + struct sock *sk; + struct net *net; + bool net_admin; + int err; - hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; + net = sock_net(in_skb->sk); + sk = tcp_diag_find_one_icsk(net, req); + if (IS_ERR(sk)) + return PTR_ERR(sk); - return inet_diag_dump_one_icsk(hinfo, cb, req); + net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); + rep = nlmsg_new(tcp_diag_get_aux_size(sk, net_admin), GFP_KERNEL); + if (!rep) { + err = -ENOMEM; + goto out; + } + + err = sk_diag_fill(sk, rep, cb, req, 0, net_admin); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + nlmsg_free(rep); + goto out; + } + err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); + +out: + if (sk) + sock_gen_put(sk); + + return err; } #ifdef CONFIG_INET_DIAG_DESTROY @@ -202,13 +643,10 @@ static int tcp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { struct net *net = sock_net(in_skb->sk); - struct inet_hashinfo *hinfo; struct sock *sk; int err; - hinfo = net->ipv4.tcp_death_row.hashinfo; - sk = inet_diag_find_one_icsk(net, hinfo, req); - + sk = tcp_diag_find_one_icsk(net, req); if (IS_ERR(sk)) return PTR_ERR(sk); @@ -226,7 +664,6 @@ static const struct 
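
Aside (editorial sketch, not part of the patch): both hash walks above (bhash2 and ehash) use the same collect-then-fill shape: take the bucket lock with BH disabled, grab at most SKARR_SZ socket references, drop the lock, and only then build the netlink records. A minimal sketch of that shape; struct bucket, struct dump_state, for_each_sock_in_bucket() and fill_one() are hypothetical, and bucket iteration by the caller is elided.

	#define BATCH 16				/* mirrors SKARR_SZ */

	static int dump_one_bucket(struct bucket *b, struct dump_state *st)
	{
		struct sock *batch[BATCH];
		int pos[BATCH];
		struct sock *sk;
		int n = 0, i, err = 0;

		st->num = 0;				/* position within this bucket */

		spin_lock_bh(&b->lock);
		for_each_sock_in_bucket(sk, b) {	/* hypothetical iterator */
			if (st->num < st->skip) {
				st->num++;
				continue;		/* already dumped on a previous pass */
			}
			if (!refcount_inc_not_zero(&sk->sk_refcnt)) {
				st->num++;
				continue;		/* socket is being freed */
			}
			pos[n] = st->num++;
			batch[n++] = sk;
			if (n == BATCH)
				break;			/* fill what we have, then resume */
		}
		spin_unlock_bh(&b->lock);

		for (i = 0; i < n; i++) {
			if (err >= 0) {
				err = fill_one(batch[i], st);	/* hypothetical netlink fill */
				if (err < 0)
					st->skip = pos[i];	/* restart from this socket */
			}
			sock_gen_put(batch[i]);		/* drop the ref taken under the lock */
		}
		cond_resched();

		if (err >= 0 && n == BATCH) {
			st->skip = st->num;
			return -EAGAIN;			/* bucket not finished yet */
		}
		return err;
	}
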
inet_diag_handler tcp_diag_handler = { .dump_one = tcp_diag_dump_one, .idiag_get_info = tcp_diag_get_info, .idiag_get_aux = tcp_diag_get_aux, - .idiag_get_aux_size = tcp_diag_get_aux_size, .idiag_type = IPPROTO_TCP, .idiag_info_size = sizeof(struct tcp_info), #ifdef CONFIG_INET_DIAG_DESTROY diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f1884f0c9e52..7d945a527daf 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -576,11 +576,12 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) } } else if (tp->syn_fastopen_ch && atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { - dst = sk_dst_get(sk); - dev = dst ? dst_dev(dst) : NULL; + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? dst_dev_rcu(dst) : NULL; if (!(dev && (dev->flags & IFF_LOOPBACK))) atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); - dst_release(dst); + rcu_read_unlock(); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 71b76e98371a..198f8a0d37be 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -70,8 +70,10 @@ #include <linux/sysctl.h> #include <linux/kernel.h> #include <linux/prefetch.h> +#include <linux/bitops.h> #include <net/dst.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/proto_memory.h> #include <net/inet_common.h> #include <linux/ipsec.h> @@ -339,31 +341,6 @@ static bool tcp_in_quickack_mode(struct sock *sk) (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); } -static void tcp_ecn_queue_cwr(struct tcp_sock *tp) -{ - if (tcp_ecn_mode_rfc3168(tp)) - tp->ecn_flags |= TCP_ECN_QUEUE_CWR; -} - -static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) -{ - if (tcp_hdr(skb)->cwr) { - tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; - - /* If the sender is telling us it has entered CWR, then its - * cwnd may be very low (even just 1 packet), so we should ACK - * immediately. - */ - if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; - } -} - -static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) -{ - tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; -} - static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -384,38 +361,117 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); - if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) && + tcp_ecn_mode_rfc3168(tp)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } + /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by + * tcp_data_ecn_check() when the ECN codepoint of + * received TCP data contains ECT(0), ECT(1), or CE. 
+ */ + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; default: if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; } } -static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) +/* Returns true if the byte counters can be used */ +static bool tcp_accecn_process_option(struct tcp_sock *tp, + const struct sk_buff *skb, + u32 delivered_bytes, int flag) { - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); -} + u8 estimate_ecnfield = tp->est_ecnfield; + bool ambiguous_ecn_bytes_incr = false; + bool first_changed = false; + unsigned int optlen; + bool order1, res; + unsigned int i; + u8 *ptr; -static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) -{ - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); -} + if (tcp_accecn_opt_fail_recv(tp)) + return false; -static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) -{ - if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) - return true; - return false; + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (!tp->saw_accecn_opt) { + /* Too late to enable after this point due to + * potential counter wraps + */ + if (tp->bytes_sent >= (1 << 23) - 1) { + u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN; + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + } + return false; + } + + if (estimate_ecnfield) { + u8 ecnfield = estimate_ecnfield - 1; + + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; + return true; + } + return false; + } + + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; + optlen = ptr[1] - 2; + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) + return false; + order1 = (ptr[0] == TCPOPT_ACCECN1); + ptr += 2; + + if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + tp->saw_accecn_opt = tcp_accecn_option_init(skb, + tp->rx_opt.accecn); + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); + } + + res = !!estimate_ecnfield; + for (i = 0; i < 3; i++) { + u32 init_offset; + u8 ecnfield; + s32 delta; + u32 *cnt; + + if (optlen < TCPOLEN_ACCECN_PERFIELD) + break; + + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); + init_offset = tcp_accecn_field_init_offset(ecnfield); + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); + if (delta && delta < 0) { + res = false; + ambiguous_ecn_bytes_incr = true; + } + if (delta && ecnfield != estimate_ecnfield) { + if (!first_changed) { + tp->est_ecnfield = ecnfield; + first_changed = true; + } else { + res = false; + ambiguous_ecn_bytes_incr = true; + } + } + + optlen -= TCPOLEN_ACCECN_PERFIELD; + ptr += TCPOLEN_ACCECN_PERFIELD; + } + if (ambiguous_ecn_bytes_incr) + tp->est_ecnfield = 0; + + return res; } static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) @@ -428,10 +484,101 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, bool ece_ack) { tp->delivered += delivered; - if (ece_ack) + if (tcp_ecn_mode_rfc3168(tp) && ece_ack) tcp_count_delivered_ce(tp, delivered); } +/* Returns the ECN CE delta */ +static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, u32 delivered_bytes, + int flag) +{ + u32 old_ceb = tcp_sk(sk)->delivered_ecn_bytes[INET_ECN_CE - 1]; + const struct tcphdr *th = tcp_hdr(skb); + struct 
tcp_sock *tp = tcp_sk(sk); + u32 delta, safe_delta, d_ceb; + bool opt_deltas_valid; + u32 corrected_ace; + + /* Reordered ACK or uncertain due to lack of data to send and ts */ + if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) + return 0; + + opt_deltas_valid = tcp_accecn_process_option(tp, skb, + delivered_bytes, flag); + + if (!(flag & FLAG_SLOWPATH)) { + /* AccECN counter might overflow on large ACKs */ + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return 0; + } + + /* ACE field is not available during handshake */ + if (flag & FLAG_SYN_ACKED) + return 0; + + if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + + corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET; + delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK; + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return delta; + + safe_delta = delivered_pkts - + ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK); + + if (opt_deltas_valid) { + d_ceb = tp->delivered_ecn_bytes[INET_ECN_CE - 1] - old_ceb; + if (!d_ceb) + return delta; + + if ((delivered_pkts >= (TCP_ACCECN_CEP_ACE_MASK + 1) * 2) && + (tcp_is_sack(tp) || + ((1 << inet_csk(sk)->icsk_ca_state) & + (TCPF_CA_Open | TCPF_CA_CWR)))) { + u32 est_d_cep; + + if (delivered_bytes <= d_ceb) + return safe_delta; + + est_d_cep = DIV_ROUND_UP_ULL((u64)d_ceb * + delivered_pkts, + delivered_bytes); + return min(safe_delta, + delta + + (est_d_cep & ~TCP_ACCECN_CEP_ACE_MASK)); + } + + if (d_ceb > delta * tp->mss_cache) + return safe_delta; + if (d_ceb < + safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT) + return delta; + } + + return safe_delta; +} + +static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, u32 delivered_bytes, + int *flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 delta; + + delta = __tcp_accecn_process(sk, skb, delivered_pkts, + delivered_bytes, *flag); + if (delta > 0) { + tcp_count_delivered_ce(tp, delta); + *flag |= FLAG_ECE; + /* Recalculate header predictor */ + if (tp->pred_flags) + tcp_fast_path_on(tp); + } + return delta; +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. @@ -744,18 +891,37 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, } } -static void tcp_rcvbuf_grow(struct sock *sk) +void tcp_rcvbuf_grow(struct sock *sk, u32 newval) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); - int rcvwin, rcvbuf, cap; + u32 rcvwin, rcvbuf, cap, oldval; + u32 rtt_threshold, rtt_us; + u64 grow; + + oldval = tp->rcvq_space.space; + tp->rcvq_space.space = newval; if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) return; - /* slow start: allow the sender to double its rate. */ - rcvwin = tp->rcvq_space.space << 1; + /* DRS is always one RTT late. */ + rcvwin = newval << 1; + + rtt_us = tp->rcv_rtt_est.rtt_us >> 3; + rtt_threshold = READ_ONCE(net->ipv4.sysctl_tcp_rcvbuf_low_rtt); + if (rtt_us < rtt_threshold) { + /* For small RTT, we set @grow to rcvwin * rtt_us/rtt_threshold. + * It might take few additional ms to reach 'line rate', + * but will avoid sk_rcvbuf inflation and poor cache use. + */ + grow = div_u64((u64)rcvwin * rtt_us, rtt_threshold); + } else { + /* slow start: allow the sender to double its rate. 
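
Aside (editorial sketch, not part of the patch): the arithmetic in __tcp_accecn_process() above is all modulo 8, because the ACE field (AE/CWR/ECE bits) is only 3 bits wide; TCP_ACCECN_CEP_ACE_MASK is 7 and the init offset is the 0b101 = 5 that AccECN starts its CE counter at. A worked instance of the delta / safe_delta split with illustrative numbers:

	/* ACK carries ACE = 6, tp->delivered_ce = 13, and this ACK newly acks
	 * delivered_pkts = 30 segments:
	 *
	 *     corrected_ace = 6 - 5                  = 1
	 *     delta         = (1 - 13) & 7           = 4    (minimum consistent CE increment)
	 *     safe_delta    = 30 - ((30 - 4) & 7)
	 *                   = 30 - (26 & 7) = 30 - 2 = 28   (maximum consistent CE increment)
	 *
	 * The true increment is congruent to 4 (mod 8), i.e. one of {4, 12, 20, 28};
	 * the AccECN option's per-ECT/CE byte counters, when usable, let the code
	 * choose between the optimistic and the safe estimate.
	 */
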
*/ + grow = div_u64(((u64)rcvwin << 1) * (newval - oldval), oldval); + } + rcvwin += grow; if (!RB_EMPTY_ROOT(&tp->out_of_order_queue)) rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt; @@ -781,9 +947,15 @@ void tcp_rcv_space_adjust(struct sock *sk) trace_tcp_rcv_space_adjust(sk); - tcp_mstamp_refresh(tp); + if (unlikely(!tp->rcv_rtt_est.rtt_us)) + return; + + /* We do not refresh tp->tcp_mstamp here. + * Some platforms have expensive ktime_get() implementations. + * Using the last cached value is enough for DRS. + */ time = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcvq_space.time); - if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) + if (time < (tp->rcv_rtt_est.rtt_us >> 3)) return; /* Number of bytes copied to user in last RTT */ @@ -796,9 +968,7 @@ void tcp_rcv_space_adjust(struct sock *sk) trace_tcp_rcvbuf_grow(sk, time); - tp->rcvq_space.space = copied; - - tcp_rcvbuf_grow(sk); + tcp_rcvbuf_grow(sk, copied); new_measure: tp->rcvq_space.seq = tp->copied_seq; @@ -948,7 +1118,7 @@ static void tcp_rtt_estimator(struct sock *sk, long mrtt_us) tp->srtt_us = max(1U, srtt); } -static void tcp_update_pacing_rate(struct sock *sk) +void tcp_update_pacing_rate(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); u64 rate; @@ -985,7 +1155,7 @@ static void tcp_update_pacing_rate(struct sock *sk) /* Calculate rto without backoff. This is the second half of Van Jacobson's * routine referred to above. */ -static void tcp_set_rto(struct sock *sk) +void tcp_set_rto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* Old crap is replaced with new one. 8) @@ -1030,6 +1200,7 @@ struct tcp_sacktag_state { u64 last_sackt; u32 reord; u32 sack_delivered; + u32 delivered_bytes; int flag; unsigned int mss_now; struct rate_sample *rate; @@ -1391,7 +1562,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, - int dup_sack, int pcount, + int dup_sack, int pcount, u32 plen, u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); @@ -1451,6 +1622,7 @@ static u8 tcp_sacktag_one(struct sock *sk, tp->sacked_out += pcount; /* Out-of-order packets delivered */ state->sack_delivered += pcount; + state->delivered_bytes += plen; } /* D-SACK. We can detect redundant retransmission in S|R and plain R @@ -1487,7 +1659,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, * tcp_highest_sack_seq() when skb is highest_sack. 
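
Aside (editorial sketch, not part of the patch): the two branches of tcp_rcvbuf_grow() above give receivers on short paths a gentler autotuning curve. Worked instances with illustrative numbers, treating sysctl_tcp_rcvbuf_low_rtt as 1000 us (an example value, not a claimed default):

	/* newval = bytes copied to userspace during the last RTT.
	 *
	 * Short path (rtt_us = 200 < 1000):
	 *     rcvwin = 2 * newval
	 *     grow   = rcvwin * 200 / 1000 = 0.2 * rcvwin     -> rcvwin *= 1.2
	 *
	 * Long path (rtt_us >= 1000), oldval = 1 MB, newval = 1.5 MB:
	 *     rcvwin = 2 * 1.5 MB = 3 MB
	 *     grow   = (2 * rcvwin) * (newval - oldval) / oldval
	 *            = 6 MB * 0.5 = 3 MB                       -> rcvwin = 6 MB
	 *
	 * On long-RTT paths the growth term keeps letting the sender double its
	 * rate (the classic DRS slow start), while on sub-threshold RTTs growth
	 * is scaled by rtt/threshold, avoiding oversized sk_rcvbuf for flows
	 * that do not need it.
	 */
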
*/ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, - start_seq, end_seq, dup_sack, pcount, + start_seq, end_seq, dup_sack, pcount, skb->len, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); @@ -1772,6 +1944,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), + skb->len, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) @@ -2569,7 +2742,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) if (frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); - inet_csk(sk)->icsk_retransmits = 0; + WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0); if (tcp_is_non_sack_preventing_reopen(sk)) return true; if (frto_undo || tcp_is_sack(tp)) { @@ -3280,6 +3453,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; + /* snd_una delta covers these skbs */ + sack->delivered_bytes -= skb->len; } else if (tcp_is_sack(tp)) { tcp_count_delivered(tp, acked_pcount, ece_ack); if (!tcp_skb_spurious_retrans(tp, skb)) @@ -3376,6 +3551,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, if (before(reord, prior_fack)) tcp_check_sack_reordering(sk, reord, 0); } + + sack->delivered_bytes = (skb ? + TCP_SKB_CB(skb)->seq : tp->snd_una) - + prior_snd_una; } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb))) { @@ -3645,8 +3824,18 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } +static void tcp_send_ack_reflect_ect(struct sock *sk, bool accecn_reflector) +{ + struct tcp_sock *tp = tcp_sk(sk); + u16 flags = 0; + + if (accecn_reflector) + flags = tcp_accecn_reflector_flags(tp->syn_ect_rcv); + __tcp_send_ack(sk, tp->rcv_nxt, flags); +} + /* RFC 5961 7 [ACK Throttling] */ -static void tcp_send_challenge_ack(struct sock *sk) +static void tcp_send_challenge_ack(struct sock *sk, bool accecn_reflector) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -3676,7 +3865,7 @@ static void tcp_send_challenge_ack(struct sock *sk) WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1); send_ack: NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); - tcp_send_ack(sk); + tcp_send_ack_reflect_ect(sk, accecn_reflector); } } @@ -3787,7 +3976,8 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) } /* Returns the number of packets newly acked or sacked by the current ACK */ -static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) +static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, + u32 ecn_count, int flag) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -3795,8 +3985,12 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); - if (flag & FLAG_ECE) - NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); + + if (flag & FLAG_ECE) { + if (tcp_ecn_mode_rfc3168(tp)) + ecn_count = delivered; + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count); + } return delivered; } @@ -3817,11 +4011,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 delivered = tp->delivered; u32 lost = tp->lost; int rexmit = 
REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? */ u32 prior_fack; sack_state.first_sackt = 0; sack_state.rate = &rs; sack_state.sack_delivered = 0; + sack_state.delivered_bytes = 0; /* We very likely will need to access rtx queue. */ prefetch(sk->tcp_rtx_queue.rb_node); @@ -3837,7 +4033,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - max_window)) { if (!(flag & FLAG_NO_CHALLENGE_ACK)) - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); return -SKB_DROP_REASON_TCP_TOO_OLD_ACK; } goto old_ack; @@ -3851,7 +4047,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; - icsk->icsk_retransmits = 0; + WRITE_ONCE(icsk->icsk_retransmits, 0); #if IS_ENABLED(CONFIG_TLS_DEVICE) if (static_branch_unlikely(&clean_acked_data_enabled.key)) @@ -3912,8 +4108,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* We passed data and got it acked, remove any soft error * log. Something worked... */ - WRITE_ONCE(sk->sk_err_soft, 0); - icsk->icsk_probes_out = 0; + if (READ_ONCE(sk->sk_err_soft)) + WRITE_ONCE(sk->sk_err_soft, 0); + WRITE_ONCE(icsk->icsk_probes_out, 0); tp->rcv_tstamp = tcp_jiffies32; if (!prior_packets) goto no_queue; @@ -3924,6 +4121,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + sack_state.delivered_bytes, + &flag); + tcp_in_ack_event(sk, flag); if (tp->tlp_high_seq) @@ -3948,7 +4151,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); - delivered = tcp_newly_delivered(sk, delivered, flag); + delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag); + lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); @@ -3957,12 +4161,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) return 1; no_queue: + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + sack_state.delivered_bytes, + &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); } /* If this ack opens up a zero window, clear backoff. 
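
Aside (editorial sketch, not part of the patch): rx_opt.accecn, set by the new TCPOPT_ACCECN0/1 case in tcp_parse_options() above, stores a byte offset rather than a pointer or a flag. A short note on why 0 works as the "no option seen" sentinel:

	/* opt_rx->accecn = (ptr - 2) - (__u8 *)th;
	 *
	 * (ptr - 2) points at the option's kind byte, so the stored value is the
	 * option's offset from the start of the TCP header.  Options begin after
	 * the 20-byte fixed header, so a real AccECN option always yields an
	 * offset in [20, 60); 0 therefore unambiguously means "not present", and
	 * both tcp_parse_options() and the fast path reset it per segment.
	 * tcp_accecn_process_option() later rebuilds the pointer with
	 * skb_transport_header(skb) + tp->rx_opt.accecn and re-reads the
	 * kind/length bytes from ptr[0]/ptr[1].
	 */
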
It was * being used to time the probes, and is probably far higher than @@ -3983,7 +4192,7 @@ old_ack: &sack_state); tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); tcp_xmit_recovery(sk, rexmit); } @@ -4083,6 +4292,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; + opt_rx->accecn = 0; opt_rx->saw_unknown = 0; while (length > 0) { @@ -4174,6 +4384,12 @@ void tcp_parse_options(const struct net *net, ptr, th->syn, foc, false); break; + case TCPOPT_ACCECN0: + case TCPOPT_ACCECN1: + /* Save offset of AccECN option in TCP header */ + opt_rx->accecn = (ptr - 2) - (__u8 *)th; + break; + case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number. @@ -4234,11 +4450,14 @@ static bool tcp_fast_parse_options(const struct net *net, */ if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; return false; } else if (tp->rx_opt.tstamp_ok && th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { - if (tcp_parse_aligned_timestamp(tp, th)) + if (tcp_parse_aligned_timestamp(tp, th)) { + tp->rx_opt.accecn = 0; return true; + } } tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); @@ -4830,7 +5049,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk, noinline_for_tracing static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); sk_skb_reason_drop(sk, skb, reason); } @@ -4890,12 +5109,23 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb); /* Check if this incoming skb can be added to socket receive queues * while satisfying sk->sk_rcvbuf limit. + * + * In theory we should use skb->truesize, but this can cause problems + * when applications use too small SO_RCVBUF values. + * When LRO / hw gro is used, the socket might have a high tp->scaling_ratio, + * allowing RWIN to be close to available space. + * Whenever the receive queue gets full, we can receive a small packet + * filling RWIN, but with a high skb->truesize, because most NIC use 4K page + * plus sk_buff metadata even when receiving less than 1500 bytes of payload. + * + * Note that we use skb->len to decide to accept or drop this packet, + * but sk->sk_rmem_alloc is the sum of all skb->truesize. */ static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb) { - unsigned int new_mem = atomic_read(&sk->sk_rmem_alloc) + skb->truesize; + unsigned int rmem = atomic_read(&sk->sk_rmem_alloc); - return new_mem <= sk->sk_rcvbuf; + return rmem + skb->len <= sk->sk_rcvbuf; } static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb, @@ -5063,7 +5293,7 @@ end: } /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */ if (sk->sk_socket) - tcp_rcvbuf_grow(sk); + tcp_rcvbuf_grow(sk, tp->rcvq_space.space); } static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, @@ -5673,7 +5903,9 @@ static inline void tcp_data_snd_check(struct sock *sk) static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) { struct tcp_sock *tp = tcp_sk(sk); - unsigned long rtt, delay; + struct net *net = sock_net(sk); + unsigned long rtt; + u64 delay; /* More than one full frame received... 
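
Aside (editorial sketch, not part of the patch): a worked instance of the truesize-versus-len distinction that the tcp_can_ingest() comment above describes, with illustrative numbers:

	/* A NIC that attaches a full 4 KiB page per received frame can hand us a
	 * segment with skb->len = 1200 bytes but skb->truesize of roughly
	 * 4 KiB plus sk_buff metadata.
	 *
	 * With SO_RCVBUF = 64 KiB and sk_rmem_alloc already at 62 KiB:
	 *     by truesize:  62 KiB + ~4.3 KiB > 64 KiB  -> drop, even though the
	 *                                                  data fits the advertised window
	 *     by skb->len:  62 KiB + 1.2 KiB <= 64 KiB  -> accept
	 *
	 * The cost is that sk_rmem_alloc (which still sums truesize) may
	 * overshoot sk_rcvbuf; queue collapsing/pruning and receive-buffer
	 * autotuning absorb that overshoot.
	 */
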
*/ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && @@ -5692,7 +5924,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) * Defer the ack until tcp_release_cb(). */ if (sock_owned_by_user_nocheck(sk) && - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_backlog_ack_defer)) { + READ_ONCE(net->ipv4.sysctl_tcp_backlog_ack_defer)) { set_bit(TCP_ACK_DEFERRED, &sk->sk_tsq_flags); return; } @@ -5707,7 +5939,7 @@ send_now: } if (!tcp_is_sack(tp) || - tp->compressed_ack >= READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_nr)) + tp->compressed_ack >= READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_nr)) goto send_now; if (tp->compressed_ack_rcv_nxt != tp->rcv_nxt) { @@ -5722,18 +5954,26 @@ send_now: if (hrtimer_is_queued(&tp->compressed_ack_timer)) return; - /* compress ack timer : 5 % of rtt, but no more than tcp_comp_sack_delay_ns */ + /* compress ack timer : comp_sack_rtt_percent of rtt, + * but no more than tcp_comp_sack_delay_ns. + */ rtt = tp->rcv_rtt_est.rtt_us; if (tp->srtt_us && tp->srtt_us < rtt) rtt = tp->srtt_us; - delay = min_t(unsigned long, - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_delay_ns), - rtt * (NSEC_PER_USEC >> 3)/20); + /* delay = (rtt >> 3) * NSEC_PER_USEC * comp_sack_rtt_percent / 100 + * -> + * delay = rtt * 1.25 * comp_sack_rtt_percent + */ + delay = (u64)(rtt + (rtt >> 2)) * + READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_rtt_percent); + + delay = min(delay, READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_delay_ns)); + sock_hold(sk); hrtimer_start_range_ns(&tp->compressed_ack_timer, ns_to_ktime(delay), - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_comp_sack_slack_ns), + READ_ONCE(net->ipv4.sysctl_tcp_comp_sack_slack_ns), HRTIMER_MODE_REL_PINNED_SOFT); } @@ -5871,6 +6111,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { struct tcp_sock *tp = tcp_sk(sk); + bool accecn_reflector = false; SKB_DR(reason); /* RFC1323: H1. Apply PAWS check first. */ @@ -5968,7 +6209,7 @@ step1: if (tp->syn_fastopen && !tp->data_segs_in && sk->sk_state == TCP_ESTABLISHED) tcp_fastopen_active_disable(sk); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); SKB_DR_SET(reason, TCP_RESET); goto discard; } @@ -5979,6 +6220,16 @@ step1: * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { + if (tcp_ecn_mode_accecn(tp)) { + accecn_reflector = true; + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tcp_accecn_opt_demand_min(sk, 1); + } + } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && @@ -5988,7 +6239,7 @@ syn_challenge: if (syn_inerr) TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, accecn_reflector); SKB_DR_SET(reason, TCP_INVALID_SYN); goto discard; } @@ -6060,6 +6311,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) */ tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made @@ -6114,6 +6366,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) flag |= __tcp_replace_ts_recent(tp, delta); + tcp_ecn_received_counters(sk, skb, 0); + /* We know that such packets are checksummed * on entry. 
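The compressed-ACK delay above becomes a percentage of the receive RTT instead of a fixed 5%. Since srtt_us is stored left-shifted by 3 (8x the RTT in microseconds), multiplying by 1.25 and then by the percentage reproduces (srtt >> 3) * 1000 ns * percent / 100. A small sketch of that arithmetic, using the 33% and 1 ms defaults set in the tcp_sk_init() hunks further down; everything else here is illustrative.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

static uint64_t comp_sack_delay_ns(uint32_t rtt_shifted_us,
				   uint32_t rtt_percent, uint64_t cap_ns)
{
	/* (rtt + rtt/4) * percent == (rtt >> 3) * 1000 * percent / 100 */
	uint64_t delay = (uint64_t)(rtt_shifted_us + (rtt_shifted_us >> 2)) *
			 rtt_percent;

	return delay < cap_ns ? delay : cap_ns;
}

int main(void)
{
	/* real srtt = 400 us -> stored srtt_us = 3200 */
	printf("%llu ns\n",
	       (unsigned long long)comp_sack_delay_ns(3200, 33, NSEC_PER_MSEC));
	/* prints 132000 ns, i.e. ~33% of the 400 us RTT, capped at 1 ms */
	return 0;
}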
*/ @@ -6162,6 +6416,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); + tcp_ecn_received_counters(sk, skb, + len - tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); @@ -6202,6 +6458,8 @@ validate: return; step5: + tcp_ecn_received_counters_payload(sk, skb); + reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { reason = -reason; @@ -6297,7 +6555,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; bool syn_drop = false; - if (mss == tp->rx_opt.user_mss) { + if (mss == READ_ONCE(tp->rx_opt.user_mss)) { struct tcp_options_received opt; /* Get original SYNACK MSS value if user MSS sets mss_clamp */ @@ -6452,7 +6710,9 @@ consume: * state to ESTABLISHED..." */ - tcp_ecn_rcv_synack(tp, th); + if (tcp_ecn_mode_any(tp)) + tcp_ecn_rcv_synack(sk, skb, th, + TCP_SKB_CB(skb)->ip_dsfield); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); @@ -6524,7 +6784,7 @@ consume: TCP_DELACK_MAX, false); goto consume; } - tcp_send_ack(sk); + tcp_send_ack_reflect_ect(sk, tcp_ecn_mode_accecn(tp)); return -1; } @@ -6583,7 +6843,7 @@ consume: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; - tcp_ecn_rcv_syn(tp, th); + tcp_ecn_rcv_syn(tp, th, skb); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -6636,7 +6896,7 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) tcp_try_undo_recovery(sk); tcp_update_rto_time(tp); - inet_csk(sk)->icsk_retransmits = 0; + WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0); /* In tcp_fastopen_synack_timer() on the first SYNACK RTO we set * retrans_stamp but don't enter CA_Loss, so in case that happened we * need to zero retrans_stamp here to prevent spurious @@ -6765,7 +7025,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } /* accept old ack during closing */ if ((int)reason < 0) { - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); reason = -reason; goto discard; } @@ -6812,9 +7072,12 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); + if (tcp_ecn_mode_accecn(tp)) + tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt); tcp_fast_path_on(tp); if (sk->sk_shutdown & SEND_SHUTDOWN) tcp_shutdown(sk, SEND_SHUTDOWN); + break; case TCP_FIN_WAIT1: { @@ -6984,6 +7247,15 @@ static void tcp_ecn_create_request(struct request_sock *req, bool ect, ecn_ok; u32 ecn_ok_dst; + if (tcp_accecn_syn_requested(th) && + READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) { + inet_rsk(req)->ecn_ok = 1; + tcp_rsk(req)->accecn_ok = 1; + tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + return; + } + if (!th_ecn) return; @@ -6991,7 +7263,8 @@ static void tcp_ecn_create_request(struct request_sock *req, ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst; - if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || + if (((!ect || th->res1 || th->ae) && ecn_ok) || + tcp_ca_needs_ecn(listen_sk) || (ecn_ok_dst & DST_FEATURE_ECN_CA) || tcp_bpf_ca_needs_ecn((struct sock *)req)) inet_rsk(req)->ecn_ok = 1; @@ -7009,6 +7282,11 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->snt_synack = 0; tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; + tcp_rsk(req)->accecn_ok = 0; + 
tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; + tcp_rsk(req)->accecn_fail_mode = 0; + tcp_rsk(req)->syn_ect_rcv = 0; + tcp_rsk(req)->syn_ect_snt = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; ireq->tstamp_ok = rx_opt->tstamp_ok; @@ -7048,8 +7326,8 @@ static bool tcp_syn_flood_action(struct sock *sk, const char *proto) #endif __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - if (!READ_ONCE(queue->synflood_warned) && syncookies != 2 && - xchg(&queue->synflood_warned, 1) == 0) { + if (syncookies != 2 && !READ_ONCE(queue->synflood_warned)) { + WRITE_ONCE(queue->synflood_warned, 1); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) { net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n", proto, inet6_rcv_saddr(sk), @@ -7117,7 +7395,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, return 0; } - mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss); + mss = tcp_parse_mss_option(th, READ_ONCE(tp->rx_opt.user_mss)); if (!mss) mss = af_ops->mss_clamp; @@ -7131,7 +7409,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, { struct tcp_fastopen_cookie foc = { .len = -1 }; struct tcp_options_received tmp_opt; - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; struct request_sock *req; @@ -7182,7 +7460,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; - tmp_opt.user_mss = tp->rx_opt.user_mss; + tmp_opt.user_mss = READ_ONCE(tp->rx_opt.user_mss); tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, want_cookie ? NULL : &foc); @@ -7264,7 +7542,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, &foc, TCP_SYNACK_FASTOPEN, skb); /* Add the child socket directly into the accept queue */ if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) { - reqsk_fastopen_remove(fastopen_sk, req, false); bh_unlock_sock(fastopen_sk); sock_put(fastopen_sk); goto drop_and_free; @@ -7274,15 +7551,11 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, sock_put(fastopen_sk); } else { tcp_rsk(req)->tfo_listener = false; - if (!want_cookie) { - req->timeout = tcp_timeout_init((struct sock *)req); - if (unlikely(!inet_csk_reqsk_queue_hash_add(sk, req, - req->timeout))) { - reqsk_free(req); - dst_release(dst); - return 0; - } - + if (!want_cookie && + unlikely(!inet_csk_reqsk_queue_hash_add(sk, req))) { + reqsk_free(req); + dst_release(dst); + return 0; } af_ops->send_synack(sk, dst, &fl, req, &foc, !want_cookie ? 
TCP_SYNACK_NORMAL : diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 84d3d556ed80..f8a9596e8f4d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -53,6 +53,7 @@ #include <linux/module.h> #include <linux/random.h> #include <linux/cache.h> +#include <linux/fips.h> #include <linux/jhash.h> #include <linux/init.h> #include <linux/times.h> @@ -65,6 +66,7 @@ #include <net/icmp.h> #include <net/inet_hashtables.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/transp_v6.h> #include <net/ipv6.h> #include <net/inet_common.h> @@ -74,6 +76,7 @@ #include <net/secure_seq.h> #include <net/busy_poll.h> #include <net/rstreason.h> +#include <net/psp.h> #include <linux/inet.h> #include <linux/ipv6.h> @@ -84,14 +87,13 @@ #include <linux/btf_ids.h> #include <linux/skbuff_ref.h> -#include <crypto/hash.h> -#include <linux/scatterlist.h> +#include <crypto/md5.h> #include <trace/events/tcp.h> #ifdef CONFIG_TCP_MD5SIG -static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, - __be32 daddr, __be32 saddr, const struct tcphdr *th); +static void tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, + __be32 daddr, __be32 saddr, const struct tcphdr *th); #endif struct inet_hashinfo tcp_hashinfo; @@ -203,7 +205,7 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) } EXPORT_IPV6_MOD_GPL(tcp_twsk_unique); -static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, +static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { /* This check is replicated from tcp_v4_connect() and intended to @@ -219,7 +221,7 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, } /* This will initiate an outgoing connection. */ -int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int tcp_v4_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct inet_timewait_death_row *tcp_death_row; @@ -292,9 +294,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_dport = usin->sin_port; sk_daddr_set(sk, daddr); - inet_csk(sk)->icsk_ext_hdr_len = 0; + inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk); if (inet_opt) - inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; + inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen; tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; @@ -506,8 +508,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) struct sock *sk; int err; - sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->daddr, th->dest, iph->saddr, + sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr, ntohs(th->source), inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); @@ -753,7 +754,6 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, struct tcp_md5sig_key *key = NULL; unsigned char newhash[16]; struct sock *sk1 = NULL; - int genhash; #endif u64 transmit_time = 0; struct sock *ctl_sk; @@ -823,8 +823,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. 
*/ - sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - NULL, 0, ip_hdr(skb)->saddr, + sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), dif, sdif); /* don't send rst if it can't find key */ @@ -840,11 +839,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, if (!key) goto out; - - genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); - if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) + tcp_v4_md5_hash_skb(newhash, key, NULL, skb); + if (memcmp(md5_hash_location, newhash, 16) != 0) goto out; - } if (key) { @@ -1191,7 +1188,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { - const struct inet_request_sock *ireq = inet_rsk(req); + struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; struct sk_buff *skb; @@ -1204,6 +1201,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { + tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); tos = READ_ONCE(inet_sk(sk)->tos); @@ -1424,13 +1422,13 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, struct tcp_sock *tp = tcp_sk(sk); if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { - if (tcp_md5_alloc_sigpool()) - return -ENOMEM; + if (fips_enabled) { + pr_warn_once("TCP-MD5 support is disabled due to FIPS\n"); + return -EOPNOTSUPP; + } - if (tcp_md5sig_info_add(sk, GFP_KERNEL)) { - tcp_md5_release_sigpool(); + if (tcp_md5sig_info_add(sk, GFP_KERNEL)) return -ENOMEM; - } if (!static_branch_inc(&tcp_md5_needed.key)) { struct tcp_md5sig_info *md5sig; @@ -1438,7 +1436,6 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); rcu_assign_pointer(tp->md5sig_info, NULL); kfree_rcu(md5sig, rcu); - tcp_md5_release_sigpool(); return -EUSERS; } } @@ -1455,12 +1452,9 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, struct tcp_sock *tp = tcp_sk(sk); if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { - tcp_md5_add_sigpool(); - if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) { - tcp_md5_release_sigpool(); + if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) return -ENOMEM; - } if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { struct tcp_md5sig_info *md5sig; @@ -1469,7 +1463,6 @@ int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); rcu_assign_pointer(tp->md5sig_info, NULL); kfree_rcu(md5sig, rcu); - tcp_md5_release_sigpool(); return -EUSERS; } } @@ -1505,9 +1498,9 @@ void tcp_clear_md5_list(struct sock *sk) md5sig = rcu_dereference_protected(tp->md5sig_info, 1); hlist_for_each_entry_safe(key, n, &md5sig->head, node) { - hlist_del_rcu(&key->node); + hlist_del(&key->node); atomic_sub(sizeof(*key), &sk->sk_omem_alloc); - kfree_rcu(key, rcu); + kfree(key); } } @@ -1577,66 +1570,44 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, cmd.tcpm_key, cmd.tcpm_keylen); } -static int tcp_v4_md5_hash_headers(struct tcp_sigpool *hp, - __be32 daddr, __be32 saddr, - const struct tcphdr *th, int nbytes) +static void tcp_v4_md5_hash_headers(struct md5_ctx *ctx, + __be32 
daddr, __be32 saddr, + const struct tcphdr *th, int nbytes) { - struct tcp4_pseudohdr *bp; - struct scatterlist sg; - struct tcphdr *_th; - - bp = hp->scratch; - bp->saddr = saddr; - bp->daddr = daddr; - bp->pad = 0; - bp->protocol = IPPROTO_TCP; - bp->len = cpu_to_be16(nbytes); - - _th = (struct tcphdr *)(bp + 1); - memcpy(_th, th, sizeof(*th)); - _th->check = 0; + struct { + struct tcp4_pseudohdr ip; + struct tcphdr tcp; + } h; - sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); - ahash_request_set_crypt(hp->req, &sg, NULL, - sizeof(*bp) + sizeof(*th)); - return crypto_ahash_update(hp->req); + h.ip.saddr = saddr; + h.ip.daddr = daddr; + h.ip.pad = 0; + h.ip.protocol = IPPROTO_TCP; + h.ip.len = cpu_to_be16(nbytes); + h.tcp = *th; + h.tcp.check = 0; + md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp)); } -static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, - __be32 daddr, __be32 saddr, const struct tcphdr *th) +static noinline_for_stack void +tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, + __be32 daddr, __be32 saddr, const struct tcphdr *th) { - struct tcp_sigpool hp; - - if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) - goto clear_hash_nostart; - - if (crypto_ahash_init(hp.req)) - goto clear_hash; - if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_key(&hp, key)) - goto clear_hash; - ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); - if (crypto_ahash_final(hp.req)) - goto clear_hash; - - tcp_sigpool_end(&hp); - return 0; + struct md5_ctx ctx; -clear_hash: - tcp_sigpool_end(&hp); -clear_hash_nostart: - memset(md5_hash, 0, 16); - return 1; + md5_init(&ctx); + tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2); + tcp_md5_hash_key(&ctx, key); + md5_final(&ctx, md5_hash); } -int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, - const struct sock *sk, - const struct sk_buff *skb) +noinline_for_stack void +tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, + const struct sock *sk, const struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); - struct tcp_sigpool hp; __be32 saddr, daddr; + struct md5_ctx ctx; if (sk) { /* valid for establish/request sockets */ saddr = sk->sk_rcv_saddr; @@ -1647,30 +1618,11 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, daddr = iph->daddr; } - if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) - goto clear_hash_nostart; - - if (crypto_ahash_init(hp.req)) - goto clear_hash; - - if (tcp_v4_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) - goto clear_hash; - if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_key(&hp, key)) - goto clear_hash; - ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); - if (crypto_ahash_final(hp.req)) - goto clear_hash; - - tcp_sigpool_end(&hp); - return 0; - -clear_hash: - tcp_sigpool_end(&hp); -clear_hash_nostart: - memset(md5_hash, 0, 16); - return 1; + md5_init(&ctx); + tcp_v4_md5_hash_headers(&ctx, daddr, saddr, th, skb->len); + tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2); + tcp_md5_hash_key(&ctx, key); + md5_final(&ctx, md5_hash); } EXPORT_IPV6_MOD(tcp_v4_md5_hash_skb); @@ -1708,7 +1660,6 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { .send_ack = tcp_v4_reqsk_send_ack, .destructor = tcp_v4_reqsk_destructor, .send_reset = tcp_v4_send_reset, - .syn_ack_timeout = tcp_syn_ack_timeout, }; const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { @@ -1907,6 
+1858,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) enum skb_drop_reason reason; struct sock *rsk; + reason = psp_sk_rx_policy_check(sk, skb); + if (reason) + goto err_discard; + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; @@ -1968,6 +1923,7 @@ csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); +err_discard: TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } @@ -1992,8 +1948,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) if (th->doff < sizeof(struct tcphdr) / 4) return 0; - sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, + sk = __inet_lookup_established(net, iph->saddr, th->source, iph->daddr, ntohs(th->dest), skb->skb_iif, inet_sdif(skb)); if (sk) { @@ -2070,7 +2025,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || !tcp_skb_can_collapse_rx(tail, skb) || thtail->doff != th->doff || - memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) + memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) || + /* prior to PSP Rx policy check, retain exact PSP metadata */ + psp_skb_coalesce_diff(tail, skb)) goto no_coalesce; __skb_pull(skb, hdrlen); @@ -2236,8 +2193,7 @@ int tcp_v4_rcv(struct sk_buff *skb) th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); lookup: - sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), th->source, + sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, th->dest, sdif, &refcounted); if (!sk) goto no_tcp_socket; @@ -2258,7 +2214,7 @@ lookup: &iph->saddr, &iph->daddr, AF_INET, dif, sdif); if (unlikely(drop_reason)) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } @@ -2403,7 +2359,7 @@ discard_it: return 0; discard_and_relse: - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; @@ -2426,9 +2382,7 @@ do_time_wait: &drop_reason); switch (tw_status) { case TCP_TW_SYN: { - struct sock *sk2 = inet_lookup_listener(net, - net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), + struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb), @@ -2441,6 +2395,10 @@ do_time_wait: __this_cpu_write(tcp_tw_isn, isn); goto process; } + + drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); + if (drop_reason) + break; } /* to ACK */ fallthrough; @@ -2459,7 +2417,6 @@ do_time_wait: static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), - .twsk_destructor= tcp_twsk_destructor, }; void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) @@ -2501,6 +2458,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; + +static void tcp4_destruct_sock(struct sock *sk) +{ + tcp_md5_destruct_sock(sk); + tcp_ao_destroy_sock(sk, false); + inet_sock_destruct(sk); +} #endif /* NOTE: A lot of things set to zero explicitly by call to @@ -2516,23 +2480,12 @@ static int tcp_v4_init_sock(struct sock *sk) #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; + sk->sk_destruct = tcp4_destruct_sock; #endif return 0; } -#ifdef CONFIG_TCP_MD5SIG -static void tcp_md5sig_info_free_rcu(struct rcu_head *head) -{ - struct tcp_md5sig_info *md5sig; - - md5sig = container_of(head, struct tcp_md5sig_info, rcu); - kfree(md5sig); 
- static_branch_slow_dec_deferred(&tcp_md5_needed); - tcp_md5_release_sigpool(); -} -#endif - static void tcp_release_user_frags(struct sock *sk) { #ifdef CONFIG_PAGE_POOL @@ -2569,19 +2522,6 @@ void tcp_v4_destroy_sock(struct sock *sk) /* Cleans up our, hopefully empty, out_of_order_queue. */ skb_rbtree_purge(&tp->out_of_order_queue); -#ifdef CONFIG_TCP_MD5SIG - /* Clean up the MD5 key list, if any */ - if (tp->md5sig_info) { - struct tcp_md5sig_info *md5sig; - - md5sig = rcu_dereference_protected(tp->md5sig_info, 1); - tcp_clear_md5_list(sk); - call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); - rcu_assign_pointer(tp->md5sig_info, NULL); - } -#endif - tcp_ao_destroy_sock(sk, false); - /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); @@ -2929,13 +2869,13 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk_timeout(icsk); + timer_expires = tcp_timeout_expires(sk); } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk_timeout(icsk); - } else if (timer_pending(&sk->sk_timer)) { + timer_expires = tcp_timeout_expires(sk); + } else if (timer_pending(&icsk->icsk_keepalive_timer)) { timer_active = 2; - timer_expires = sk->sk_timer.expires; + timer_expires = icsk->icsk_keepalive_timer.expires; } else { timer_active = 0; timer_expires = jiffies; @@ -2958,9 +2898,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), - icsk->icsk_retransmits, + READ_ONCE(icsk->icsk_retransmits), from_kuid_munged(seq_user_ns(f), sk_uid(sk)), - icsk->icsk_probes_out, + READ_ONCE(icsk->icsk_probes_out), sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, jiffies_to_clock_t(icsk->icsk_rto), @@ -3524,7 +3464,6 @@ struct proto tcp_prot = { .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, - .orphan_count = &tcp_orphan_count, .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, @@ -3583,7 +3522,9 @@ fallback: static int __net_init tcp_sk_init(struct net *net) { - net->ipv4.sysctl_tcp_ecn = 2; + net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; + net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; + net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; @@ -3625,6 +3566,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_adv_win_scale = 1; net->ipv4.sysctl_tcp_frto = 2; net->ipv4.sysctl_tcp_moderate_rcvbuf = 1; + net->ipv4.sysctl_tcp_rcvbuf_low_rtt = USEC_PER_MSEC; /* This limits the percentage of the congestion window which we * will allow a single TSO frame to consume. Building TSO frames * which are too large can cause TCP streams to be bursty. 
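For orientation, the net.ipv4.tcp_ecn default above keeps its historical value (2) under the new TCP_ECN_IN_ECN_OUT_NOECN name, while the tcp_ecn_create_request() hunk earlier in this diff treats values of 3 or more as permission to accept an AccECN request arriving in a client SYN. A hedged sketch of that decision; the enum names are illustrative, not the kernel's, and the 0/1/2 meanings are the long-standing sysctl semantics.

#include <stdio.h>

enum ecn_sysctl_behaviour {
	ECN_NEVER       = 0,	/* neither request nor accept ECN */
	ECN_REQUEST     = 1,	/* request on outgoing, accept on incoming */
	ECN_ACCEPT_ONLY = 2,	/* default: accept incoming requests only */
	ECN_ACCECN      = 3,	/* additionally honour AccECN SYNs */
};

static int server_may_enable_accecn(int sysctl_tcp_ecn, int syn_requests_accecn)
{
	/* mirrors the ">= 3" test in the tcp_ecn_create_request() hunk */
	return syn_requests_accecn && sysctl_tcp_ecn >= ECN_ACCECN;
}

int main(void)
{
	printf("%d %d\n",
	       server_may_enable_accecn(ECN_ACCEPT_ONLY, 1),	/* 0 */
	       server_may_enable_accecn(ECN_ACCECN, 1));	/* 1 */
	return 0;
}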
@@ -3652,8 +3594,9 @@ static int __net_init tcp_sk_init(struct net *net) sizeof(init_net.ipv4.sysctl_tcp_wmem)); } net->ipv4.sysctl_tcp_comp_sack_delay_ns = NSEC_PER_MSEC; - net->ipv4.sysctl_tcp_comp_sack_slack_ns = 100 * NSEC_PER_USEC; + net->ipv4.sysctl_tcp_comp_sack_slack_ns = 10 * NSEC_PER_USEC; net->ipv4.sysctl_tcp_comp_sack_nr = 44; + net->ipv4.sysctl_tcp_comp_sack_rtt_percent = 33; net->ipv4.sysctl_tcp_backlog_ack_defer = 1; net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE; net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 52fe17167460..976b56644a8a 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -23,9 +23,9 @@ * Original Author: * Aleksandar Kuzmanovic <akuzma@northwestern.edu> * Available from: - * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf + * https://users.cs.northwestern.edu/~akuzma/doc/TCP-LP-ToN.pdf * Original implementation for 2.4.19: - * http://www-ece.rice.edu/networks/TCP-LP/ + * https://users.cs.northwestern.edu/~akuzma/rice/TCP-LP/linux/tcp-lp-linux.htm * * 2.6.x module Authors: * Wong Hoi Sing, Edison <hswong3i@gmail.com> @@ -113,6 +113,8 @@ static void tcp_lp_init(struct sock *sk) /** * tcp_lp_cong_avoid * @sk: socket to avoid congesting + * @ack: current ack sequence number + * @acked: number of ACKed packets * * Implementation of cong_avoid. * Will only call newReno CA when away from inference. @@ -261,6 +263,7 @@ static void tcp_lp_rtt_sample(struct sock *sk, u32 rtt) /** * tcp_lp_pkts_acked * @sk: socket requiring congestion avoidance calculations + * @sample: ACK sample containing timing and rate information * * Implementation of pkts_acked. * Deal with active drop under Early Congestion Indication. diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 03c068ea27b6..45b6ecd16412 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -170,7 +170,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, struct net *net; spin_lock_bh(&tcp_metrics_lock); - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); /* While waiting for the spin-lock the cache might have been populated * with this entry and so we have to check again. @@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, return NULL; } - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); @@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, else return NULL; - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); @@ -912,7 +912,7 @@ static void tcp_metrics_flush_all(struct net *net) spin_lock_bh(&tcp_metrics_lock); for (tm = deref_locked(*pp); tm; tm = deref_locked(*pp)) { match = net ? 
net_eq(tm_net(tm), net) : - !refcount_read(&tm_net(tm)->ns.count); + !check_net(tm_net(tm)); if (match) { rcu_assign_pointer(*pp, tm->tcpm_next); kfree_rcu(tm, rcu_head); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 2994c9222c9c..bd5462154f97 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -20,9 +20,11 @@ */ #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/xfrm.h> #include <net/busy_poll.h> #include <net/rstreason.h> +#include <net/psp.h> static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { @@ -103,9 +105,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); struct tcp_options_received tmp_opt; + enum skb_drop_reason psp_drop; bool paws_reject = false; int ts_recent_stamp; + /* Instead of dropping immediately, wait to see what value is + * returned. We will accept a non psp-encapsulated syn in the + * case where TCP_TW_SYN is returned. + */ + psp_drop = psp_twsk_rx_policy_check(tw, skb); + tmp_opt.saw_tstamp = 0; ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) { @@ -123,6 +132,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) { /* Just repeat all the checks of tcp_rcv_state_process() */ + if (psp_drop) + goto out_put; + /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, @@ -193,6 +205,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ + if (psp_drop) + goto out_put; + if (th->rst) { /* This is TIME_WAIT assassination, in two flavors. * Oh well... 
nobody has a sufficient solution to this @@ -246,6 +261,9 @@ kill: return TCP_TW_SYN; } + if (psp_drop) + goto out_put; + if (paws_reject) { *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); @@ -264,6 +282,8 @@ kill: return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } + +out_put: inet_twsk_put(tw); return TCP_TW_SUCCESS; } @@ -292,7 +312,6 @@ static void tcp_time_wait_init(struct sock *sk, struct tcp_timewait_sock *tcptw) return; if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) goto out_free; - tcp_md5_add_sigpool(); } return; out_free: @@ -318,7 +337,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); - tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); tw->tw_mark = sk->sk_mark; tw->tw_priority = READ_ONCE(sk->sk_priority); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; @@ -377,31 +395,21 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } EXPORT_SYMBOL(tcp_time_wait); -#ifdef CONFIG_TCP_MD5SIG -static void tcp_md5_twsk_free_rcu(struct rcu_head *head) -{ - struct tcp_md5sig_key *key; - - key = container_of(head, struct tcp_md5sig_key, rcu); - kfree(key); - static_branch_slow_dec_deferred(&tcp_md5_needed); - tcp_md5_release_sigpool(); -} -#endif - void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG if (static_branch_unlikely(&tcp_md5_needed.key)) { struct tcp_timewait_sock *twsk = tcp_twsk(sk); - if (twsk->tw_md5_key) - call_rcu(&twsk->tw_md5_key->rcu, tcp_md5_twsk_free_rcu); + if (twsk->tw_md5_key) { + kfree(twsk->tw_md5_key); + static_branch_slow_dec_deferred(&tcp_md5_needed); + } } #endif tcp_ao_destroy_sock(sk, true); + psp_twsk_assoc_free(inet_twsk(sk)); } -EXPORT_IPV6_MOD_GPL(tcp_twsk_destructor); void tcp_twsk_purge(struct list_head *net_exit_list) { @@ -461,12 +469,26 @@ void tcp_openreq_init_rwin(struct request_sock *req, ireq->rcv_wscale = rcv_wscale; } -static void tcp_ecn_openreq_child(struct tcp_sock *tp, - const struct request_sock *req) +static void tcp_ecn_openreq_child(struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb) { - tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? - TCP_ECN_MODE_RFC3168 : - TCP_ECN_DISABLED); + const struct tcp_request_sock *treq = tcp_rsk(req); + struct tcp_sock *tp = tcp_sk(sk); + + if (treq->accecn_ok) { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_snt = treq->syn_ect_snt; + tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->saw_accecn_opt = treq->saw_accecn_opt; + tp->prev_ecnfield = treq->syn_ect_rcv; + tp->accecn_opt_demand = 1; + tcp_ecn_received_counters_payload(sk, skb); + } else { + tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? 
+ TCP_ECN_MODE_RFC3168 : + TCP_ECN_DISABLED); + } } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) @@ -631,7 +653,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; - tcp_ecn_openreq_child(newtp, req); + tcp_ecn_openreq_child(newsk, req, skb); newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); @@ -674,6 +696,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, bool own_req; tmp_opt.saw_tstamp = 0; + tmp_opt.accecn = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); @@ -690,7 +713,8 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, * it can be estimated (approximately) * from another data. */ - tmp_opt.ts_recent_stamp = ktime_get_seconds() - reqsk_timeout(req, TCP_RTO_MAX) / HZ; + tmp_opt.ts_recent_stamp = ktime_get_seconds() - + tcp_reqsk_timeout(req) / HZ; paws_reject = tcp_paws_reject(&tmp_opt, th->rst); } } @@ -729,7 +753,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, !tcp_rtx_synack(sk, req)) { unsigned long expires = jiffies; - expires += reqsk_timeout(req, TCP_RTO_MAX); + expires += tcp_reqsk_timeout(req); if (!fastopen) mod_timer_pending(&req->rsk_timer, expires); else @@ -851,6 +875,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; + if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn && + tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn); + + tcp_rsk(req)->saw_accecn_opt = saw_opt; + if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) { + u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV; + + tcp_rsk(req)->accecn_fail_mode |= fail_mode; + } + } + /* For Fast Open no more processing is needed (sk is the * child socket). 
*/ diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index be5c2294610e..fdda18b1abda 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -282,33 +282,6 @@ struct sk_buff *tcp_gro_lookup(struct list_head *head, struct tcphdr *th) return NULL; } -struct tcphdr *tcp_gro_pull_header(struct sk_buff *skb) -{ - unsigned int thlen, hlen, off; - struct tcphdr *th; - - off = skb_gro_offset(skb); - hlen = off + sizeof(*th); - th = skb_gro_header(skb, hlen, off); - if (unlikely(!th)) - return NULL; - - thlen = th->doff * 4; - if (thlen < sizeof(*th)) - return NULL; - - hlen = off + thlen; - if (!skb_gro_may_pull(skb, hlen)) { - th = skb_gro_header_slow(skb, hlen, off); - if (unlikely(!th)) - return NULL; - } - - skb_gro_pull(skb, thlen); - - return th; -} - struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, struct tcphdr *th) { @@ -434,8 +407,7 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet_get_iif_sdif(skb, &iif, &sdif); iph = skb_gro_network_header(skb); net = dev_net_rcu(skb->dev); - sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, + sk = __inet_lookup_established(net, iph->saddr, th->source, iph->daddr, ntohs(th->dest), iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; @@ -485,6 +457,7 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, iph->daddr, 0); + BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID << 1 != SKB_GSO_TCP_FIXEDID_INNER); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 | (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index caf11920a878..479afb714bdf 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -38,8 +38,11 @@ #define pr_fmt(fmt) "TCP: " fmt #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/mptcp.h> +#include <net/smc.h> #include <net/proto_memory.h> +#include <net/psp.h> #include <linux/compiler.h> #include <linux/gfp.h> @@ -319,60 +322,6 @@ static u16 tcp_select_window(struct sock *sk) return new_win; } -/* Packet ECN state for a SYN-ACK */ -static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; - if (tcp_ecn_disabled(tp)) - TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; - else if (tcp_ca_needs_ecn(sk) || - tcp_bpf_ca_needs_ecn(sk)) - INET_ECN_xmit(sk); -} - -/* Packet ECN state for a SYN. */ -static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); - bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || - tcp_ca_needs_ecn(sk) || bpf_needs_ecn; - - if (!use_ecn) { - const struct dst_entry *dst = __sk_dst_get(sk); - - if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) - use_ecn = true; - } - - tp->ecn_flags = 0; - - if (use_ecn) { - TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; - tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); - if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) - INET_ECN_xmit(sk); - } -} - -static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) -{ - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) - /* tp->ecn_flags are cleared at a later point in time when - * SYN ACK is ultimatively being received. 
- */ - TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); -} - -static void -tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) -{ - if (inet_rsk(req)->ecn_ok) - th->ece = 1; -} - /* Set up ECN state for a packet on a ESTABLISHED socket that is about to * be sent. */ @@ -381,7 +330,15 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_ecn_mode_rfc3168(tp)) { + if (!tcp_ecn_mode_any(tp)) + return; + + if (tcp_ecn_mode_accecn(tp)) { + if (!tcp_accecn_ace_fail_recv(tp)) + INET_ECN_xmit(sk); + tcp_accecn_set_ace(tp, skb, th); + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; + } else { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { @@ -403,13 +360,15 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. */ -static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u16 flags) +static void tcp_init_nondata_skb(struct sk_buff *skb, struct sock *sk, + u32 seq, u16 flags) { skb->ip_summed = CHECKSUM_PARTIAL; TCP_SKB_CB(skb)->tcp_flags = flags; tcp_skb_pcount_set(skb, 1); + psp_enqueue_set_decrypted(sk, skb); TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -430,6 +389,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) +#define OPTION_ACCECN BIT(12) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -451,6 +411,8 @@ struct tcp_out_options { u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ + u8 num_accecn_fields:7, /* number of AccECN fields needed */ + use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */ u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ @@ -648,6 +610,11 @@ static __be32 *process_tcp_ao_options(struct tcp_sock *tp, return ptr; } +/* Initial values for AccECN option, ordered is based on ECN field bits + * similar to received_ecn_bytes. Used for SYN/ACK AccECN option. + */ +static const u32 synack_ecn_bytes[3] = { 0, 0, 0 }; + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -666,6 +633,8 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, struct tcp_out_options *opts, struct tcp_key *key) { + u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */ + u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */ __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ @@ -701,15 +670,75 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } + if (OPTION_ACCECN & options) { + const u32 *ecn_bytes = opts->use_synack_ecn_bytes ? 
+ synack_ecn_bytes : + tp->received_ecn_bytes; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ce_idx = INET_ECN_CE - 1; + u32 e0b; + u32 e1b; + u32 ceb; + u8 len; + + e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; + e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; + ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; + len = TCPOLEN_ACCECN_BASE + + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; + + if (opts->num_accecn_fields == 2) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + } else if (opts->num_accecn_fields == 1) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + leftover_highbyte = e1b & 0xff; + leftover_lowbyte = TCPOPT_NOP; + } else if (opts->num_accecn_fields == 0) { + leftover_highbyte = TCPOPT_ACCECN1; + leftover_lowbyte = len; + } else if (opts->num_accecn_fields == 3) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + *ptr++ = htonl(((e0b & 0xffffff) << 8) | + TCPOPT_NOP); + } + if (tp) { + tp->accecn_minlen = 0; + tp->accecn_opt_tstamp = tp->tcp_mstamp; + if (tp->accecn_opt_demand) + tp->accecn_opt_demand--; + } + } + if (unlikely(OPTION_SACK_ADVERTISE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_WSCALE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | + u8 highbyte = TCPOPT_NOP; + + /* Do not split the leftover 2-byte to fit into a single + * NOP, i.e., replace this NOP only when 1 byte is leftover + * within leftover_highbyte. 
+ */ + if (unlikely(leftover_highbyte != TCPOPT_NOP && + leftover_lowbyte == TCPOPT_NOP)) { + highbyte = leftover_highbyte; + leftover_highbyte = TCPOPT_NOP; + } + *ptr++ = htonl((highbyte << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->ws); @@ -720,11 +749,13 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, tp->duplicate_sack : tp->selective_acks; int this_sack; - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK))); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; for (this_sack = 0; this_sack < opts->num_sack_blocks; ++this_sack) { @@ -733,6 +764,14 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, } tp->rx_opt.dsack = 0; + } else if (unlikely(leftover_highbyte != TCPOPT_NOP || + leftover_lowbyte != TCPOPT_NOP)) { + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | + (TCPOPT_NOP << 8) | + TCPOPT_NOP); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { @@ -764,34 +803,36 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, mptcp_options_write(th, ptr, tp, opts); } -static void smc_set_option(const struct tcp_sock *tp, +static void smc_set_option(struct tcp_sock *tp, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) - if (static_branch_unlikely(&tcp_have_smc)) { - if (tp->syn_smc) { - if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { - opts->options |= OPTION_SMC; - *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; - } + if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc) { + tp->syn_smc = !!smc_call_hsbpf(1, tp, syn_option); + /* re-check syn_smc */ + if (tp->syn_smc && + *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } #endif } static void smc_set_option_cond(const struct tcp_sock *tp, - const struct inet_request_sock *ireq, + struct inet_request_sock *ireq, struct tcp_out_options *opts, unsigned int *remaining) { #if IS_ENABLED(CONFIG_SMC) - if (static_branch_unlikely(&tcp_have_smc)) { - if (tp->syn_smc && ireq->smc_ok) { - if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { - opts->options |= OPTION_SMC; - *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; - } + if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc && ireq->smc_ok) { + ireq->smc_ok = !!smc_call_hsbpf(1, tp, synack_option, ireq); + /* re-check smc_ok */ + if (ireq->smc_ok && + *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) { + opts->options |= OPTION_SMC; + *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED; } } #endif @@ -813,6 +854,80 @@ static void mptcp_set_option_cond(const struct request_sock *req, } } +static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts) +{ + /* How much there's room for combining with the alignment padding? */ + if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) == + OPTION_SACK_ADVERTISE) + return 2; + else if (opts->options & OPTION_WSCALE) + return 1; + return 0; +} + +/* Calculates how long AccECN option will fit to @remaining option space. + * + * AccECN option can sometimes replace NOPs used for alignment of other + * TCP options (up to @max_combine_saving available). + * + * Only solutions with at least @required AccECN fields are accepted. 
+ * + * Returns: The size of the AccECN option excluding space repurposed from + * the alignment of the other options. + */ +static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required, + int remaining) +{ + int size = TCP_ACCECN_MAXSIZE; + int sack_blocks_reduce = 0; + int max_combine_saving; + int rem = remaining; + int align_size; + + if (opts->use_synack_ecn_bytes) + max_combine_saving = tcp_synack_options_combine_saving(opts); + else + max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0; + opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS; + while (opts->num_accecn_fields >= required) { + /* Pad to dword if cannot combine */ + if ((size & 0x3) > max_combine_saving) + align_size = ALIGN(size, 4); + else + align_size = ALIGN_DOWN(size, 4); + + if (rem >= align_size) { + size = align_size; + break; + } else if (opts->num_accecn_fields == required && + opts->num_sack_blocks > 2 && + required > 0) { + /* Try to fit the option by removing one SACK block */ + opts->num_sack_blocks--; + sack_blocks_reduce++; + rem = rem + TCPOLEN_SACK_PERBLOCK; + + opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS; + size = TCP_ACCECN_MAXSIZE; + continue; + } + + opts->num_accecn_fields--; + size -= TCPOLEN_ACCECN_PERFIELD; + } + if (sack_blocks_reduce > 0) { + if (opts->num_accecn_fields >= required) + size -= sack_blocks_reduce * TCPOLEN_SACK_PERBLOCK; + else + opts->num_sack_blocks += sack_blocks_reduce; + } + if (opts->num_accecn_fields < required) + return 0; + + opts->options |= OPTION_ACCECN; + return size; +} + /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ @@ -895,6 +1010,20 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + /* Simultaneous open SYN/ACK needs AccECN option but not SYN. + * It is attempted to negotiate the use of AccECN also on the first + * retransmitted SYN, as mentioned in "3.1.4.1. Retransmitted SYNs" + * of AccECN draft. 
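tcp_options_fit_accecn() above sizes the option from a 2-byte kind/length header plus 3-byte counter fields (at most three), padding up to a 4-byte boundary unless the option tail can reuse alignment NOPs of a neighbouring option (the leftover_highbyte/leftover_lowbyte mechanism in the tcp_options_write() hunk earlier). A standalone sketch of that cost calculation, assuming those field sizes from the patch; the helper name is illustrative.

#include <stdio.h>

#define ACCECN_BASE      2	/* option kind + length */
#define ACCECN_PERFIELD  3	/* one 24-bit byte counter */

/* bytes consumed from the option space for n counter fields, where "saving"
 * is how many padding NOP bytes of a neighbouring option the tail may reuse
 */
static int accecn_wire_cost(int nfields, int saving)
{
	int size = ACCECN_BASE + nfields * ACCECN_PERFIELD;

	if ((size & 0x3) > saving)
		return (size + 3) & ~0x3;	/* pad up to a dword */
	return size & ~0x3;			/* tail rides on the NOPs */
}

int main(void)
{
	printf("%d %d %d\n",
	       accecn_wire_cost(3, 0),	/* 11 -> 12 */
	       accecn_wire_cost(2, 0),	/*  8 ->  8 */
	       accecn_wire_cost(1, 2));	/*  5 ->  4, one byte absorbed */
	return 0;
}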
+ */ + if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && + tcp_ecn_mode_accecn(tp) && + inet_csk(sk)->icsk_retransmits < 2 && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && + remaining >= TCPOLEN_ACCECN_BASE)) { + opts->use_synack_ecn_bytes = 1; + remaining -= tcp_options_fit_accecn(opts, 0, remaining); + } + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; @@ -912,6 +1041,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; + struct tcp_request_sock *treq = tcp_rsk(req); if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; @@ -974,6 +1104,13 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + if (treq->accecn_ok && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && + req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) { + opts->use_synack_ecn_bytes = 1; + remaining -= tcp_options_fit_accecn(opts, 0, remaining); + } + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, synack_type, opts, &remaining); @@ -1030,17 +1167,32 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; - if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED + - TCPOLEN_SACK_PERBLOCK)) - return size; + if (likely(remaining >= TCPOLEN_SACK_BASE_ALIGNED + + TCPOLEN_SACK_PERBLOCK)) { + opts->num_sack_blocks = + min_t(unsigned int, eff_sacks, + (remaining - TCPOLEN_SACK_BASE_ALIGNED) / + TCPOLEN_SACK_PERBLOCK); + + size += TCPOLEN_SACK_BASE_ALIGNED + + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + } else { + opts->num_sack_blocks = 0; + } + } else { + opts->num_sack_blocks = 0; + } - opts->num_sack_blocks = - min_t(unsigned int, eff_sacks, - (remaining - TCPOLEN_SACK_BASE_ALIGNED) / - TCPOLEN_SACK_PERBLOCK); + if (tcp_ecn_mode_accecn(tp)) { + int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); - size += TCPOLEN_SACK_BASE_ALIGNED + - opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) && + (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || + tcp_accecn_option_beacon_check(sk))) { + opts->use_synack_ecn_bytes = 0; + size += tcp_options_fit_accecn(opts, tp->accecn_minlen, + MAX_TCP_OPTION_SPACE - size); + } } if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, @@ -1437,7 +1589,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th, opts.hash_location); if (err) { - kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_NOT_SPECIFIED); return -ENOMEM; } } @@ -1510,6 +1662,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Advance write_seq and place onto the write_queue. 
*/ WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); + psp_enqueue_set_decrypted(sk, skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); @@ -2219,7 +2372,8 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, u32 max_segs) { const struct inet_connection_sock *icsk = inet_csk(sk); - u32 send_win, cong_win, limit, in_flight; + u32 send_win, cong_win, limit, in_flight, threshold; + u64 srtt_in_ns, expected_ack, how_far_is_the_ack; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *head; int win_divisor; @@ -2281,9 +2435,19 @@ static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb, head = tcp_rtx_queue_head(sk); if (!head) goto send_now; - delta = tp->tcp_clock_cache - head->tstamp; - /* If next ACK is likely to come too late (half srtt), do not defer */ - if ((s64)(delta - (u64)NSEC_PER_USEC * (tp->srtt_us >> 4)) < 0) + + srtt_in_ns = (u64)(NSEC_PER_USEC >> 3) * tp->srtt_us; + /* When is the ACK expected ? */ + expected_ack = head->tstamp + srtt_in_ns; + /* How far from now is the ACK expected ? */ + how_far_is_the_ack = expected_ack - tp->tcp_clock_cache; + + /* If next ACK is likely to come too late, + * ie in more than min(1ms, half srtt), do not defer. + */ + threshold = min(srtt_in_ns >> 1, NSEC_PER_MSEC); + + if ((s64)(how_far_is_the_ack - threshold) > 0) goto send_now; /* Ok, it looks like it is advisable to defer. @@ -2750,6 +2914,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, sent_pkts = 0; tcp_mstamp_refresh(tp); + + /* AccECN option beacon depends on mstamp, it may change mss */ + if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk)) + mss_now = tcp_current_mss(sk); + if (!push_one) { /* Do MTU probing. */ result = tcp_mtu_probe(sk); @@ -3402,7 +3571,10 @@ start: tcp_retrans_try_collapse(sk, skb, avail_wnd); } - /* RFC3168, section 6.1.1.1. ECN fallback */ + /* RFC3168, section 6.1.1.1. ECN fallback + * As AccECN uses the same SYN flags (+ AE), this check covers both + * cases. + */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) tcp_ecn_clear_syn(sk, skb); @@ -3574,13 +3746,17 @@ void sk_forced_mem_schedule(struct sock *sk, int size) delta = size - sk->sk_forward_alloc; if (delta <= 0) return; + amt = sk_mem_pages(delta); sk_forward_alloc_add(sk, amt << PAGE_SHIFT); - sk_memory_allocated_add(sk, amt); - if (mem_cgroup_sockets_enabled && sk->sk_memcg) - mem_cgroup_charge_skmem(sk->sk_memcg, amt, - gfp_memcg_charge() | __GFP_NOFAIL); + if (mem_cgroup_sk_enabled(sk)) + mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL); + + if (sk->sk_bypass_prot_mem) + return; + + sk_memory_allocated_add(sk, amt); } /* Send a FIN. The caller locks the socket for us. @@ -3625,7 +3801,7 @@ void tcp_send_fin(struct sock *sk) skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ - tcp_init_nondata_skb(skb, tp->write_seq, + tcp_init_nondata_skb(skb, sk, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } @@ -3653,7 +3829,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority, /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); - tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), + tcp_init_nondata_skb(skb, sk, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); tcp_mstamp_refresh(tcp_sk(sk)); /* Send it off. 
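The tcp_tso_should_defer() change above stops deferring once the next ACK is expected more than min(srtt/2, 1 ms) in the future, rather than using the half-srtt test alone. A small sketch of the new test; srtt_us uses the kernel's 8x-scaled encoding (so 125 * srtt_us is the smoothed RTT in nanoseconds), and the timestamps are illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

static bool ack_too_far_to_defer(uint64_t now_ns, uint64_t head_tstamp_ns,
				 uint32_t srtt_us_shifted)
{
	uint64_t srtt_ns = 125ULL * srtt_us_shifted;
	uint64_t expected_ack = head_tstamp_ns + srtt_ns;
	uint64_t threshold = srtt_ns / 2 < NSEC_PER_MSEC ?
			     srtt_ns / 2 : NSEC_PER_MSEC;

	/* true -> transmit now instead of waiting for more data to batch */
	return (int64_t)(expected_ack - now_ns - threshold) > 0;
}

int main(void)
{
	/* real srtt = 4 ms (stored as 32000); oldest rtx skb sent 1 ms ago */
	printf("%d\n", ack_too_far_to_defer(5 * NSEC_PER_MSEC,
					    4 * NSEC_PER_MSEC, 32000));
	/* ACK expected in ~3 ms, beyond the 1 ms clamp -> prints 1 */
	return 0;
}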
*/ @@ -3891,6 +4067,7 @@ static void tcp_connect_init(struct sock *sk) const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; + u16 user_mss; u32 rcv_wnd; /* We'll fix this up when we get a response from the other end. @@ -3903,8 +4080,9 @@ static void tcp_connect_init(struct sock *sk) tcp_ao_connect_init(sk); /* If user gave his TCP_MAXSEG, record it to clamp */ - if (tp->rx_opt.user_mss) - tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; + user_mss = READ_ONCE(tp->rx_opt.user_mss); + if (user_mss) + tp->rx_opt.mss_clamp = user_mss; tp->max_window = 0; tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); @@ -3955,7 +4133,7 @@ static void tcp_connect_init(struct sock *sk) WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); - inet_csk(sk)->icsk_retransmits = 0; + WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0); tcp_clear_retrans(tp); } @@ -4148,7 +4326,7 @@ int tcp_connect(struct sock *sk) /* SYN eats a sequence byte, write_seq updated by * tcp_connect_queue_skb(). */ - tcp_init_nondata_skb(buff, tp->write_seq, TCPHDR_SYN); + tcp_init_nondata_skb(buff, sk, tp->write_seq, TCPHDR_SYN); tcp_mstamp_refresh(tp); tp->retrans_stamp = tcp_time_stamp_ts(tp); tcp_connect_queue_skb(sk, buff); @@ -4273,7 +4451,8 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags) /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK | flags); + tcp_init_nondata_skb(buff, sk, + tcp_acceptable_seq(sk), TCPHDR_ACK | flags); /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. @@ -4319,7 +4498,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) * end to send an ack. Don't queue or clone SKB, just * send it. */ - tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); + tcp_init_nondata_skb(skb, sk, tp->snd_una - !urgent, TCPHDR_ACK); NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); } @@ -4393,13 +4572,13 @@ void tcp_send_probe0(struct sock *sk) if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. 
*/ - icsk->icsk_probes_out = 0; + WRITE_ONCE(icsk->icsk_probes_out, 0); icsk->icsk_backoff = 0; icsk->icsk_probes_tstamp = 0; return; } - icsk->icsk_probes_out++; + WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1); if (err <= 0) { if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; @@ -4437,7 +4616,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) tcp_sk_rw(sk)->total_retrans++; } trace_tcp_retransmit_synack(sk, req); - req->num_retrans++; + WRITE_ONCE(req->num_retrans, req->num_retrans + 1); } return res; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a207877270fb..160080c9021d 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -392,7 +392,7 @@ static void tcp_probe_timer(struct sock *sk) int max_probes; if (tp->packets_out || !skb) { - icsk->icsk_probes_out = 0; + WRITE_ONCE(icsk->icsk_probes_out, 0); icsk->icsk_probes_tstamp = 0; return; } @@ -444,7 +444,7 @@ static void tcp_update_rto_stats(struct sock *sk) tp->total_rto_recoveries++; tp->rto_stamp = tcp_time_stamp_ms(tp); } - icsk->icsk_retransmits++; + WRITE_ONCE(icsk->icsk_retransmits, icsk->icsk_retransmits + 1); tp->total_rto++; } @@ -458,7 +458,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) struct tcp_sock *tp = tcp_sk(sk); int max_retries; - req->rsk_ops->syn_ack_timeout(req); + tcp_syn_ack_timeout(req); /* Add one more retry for fastopen. * Paired with WRITE_ONCE() in tcp_sock_set_syncnt() @@ -510,7 +510,7 @@ static bool tcp_rtx_probe0_timed_out(const struct sock *sk, * and tp->rcv_tstamp might very well have been written recently. * rcv_delta can thus be negative. */ - rcv_delta = icsk_timeout(icsk) - tp->rcv_tstamp; + rcv_delta = tcp_timeout_expires(sk) - tp->rcv_tstamp; if (rcv_delta <= timeout) return false; @@ -697,9 +697,9 @@ void tcp_write_timer_handler(struct sock *sk) !icsk->icsk_pending) return; - if (time_after(icsk_timeout(icsk), jiffies)) { - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, - icsk_timeout(icsk)); + if (time_after(tcp_timeout_expires(sk), jiffies)) { + sk_reset_timer(sk, &sk->tcp_retransmit_timer, + tcp_timeout_expires(sk)); return; } tcp_mstamp_refresh(tcp_sk(sk)); @@ -725,12 +725,10 @@ void tcp_write_timer_handler(struct sock *sk) static void tcp_write_timer(struct timer_list *t) { - struct inet_connection_sock *icsk = - timer_container_of(icsk, t, icsk_retransmit_timer); - struct sock *sk = &icsk->icsk_inet.sk; + struct sock *sk = timer_container_of(sk, t, tcp_retransmit_timer); /* Avoid locking the socket when there is no pending event. 
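Editor's aside: the icsk_probes_out and icsk_retransmits updates above are wrapped in WRITE_ONCE() because those counters are read locklessly (diag and proc paths) while being written under the socket lock. A rough userspace analog of that annotation discipline, using C11 relaxed atomics (illustrative only, not the kernel macros):

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned int probes_out;

static void probe_sent(void)	/* writer: serialized by the socket lock in the kernel */
{
	unsigned int cur = atomic_load_explicit(&probes_out, memory_order_relaxed);

	atomic_store_explicit(&probes_out, cur + 1, memory_order_relaxed);
}

int main(void)
{
	probe_sent();
	/* lockless reader (the diag/procfs side) */
	printf("probes_out=%u\n", atomic_load_explicit(&probes_out, memory_order_relaxed));
	return 0;
}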
*/ - if (!smp_load_acquire(&icsk->icsk_pending)) + if (!smp_load_acquire(&inet_csk(sk)->icsk_pending)) goto out; bh_lock_sock(sk); @@ -752,16 +750,15 @@ void tcp_syn_ack_timeout(const struct request_sock *req) __NET_INC_STATS(net, LINUX_MIB_TCPTIMEOUTS); } -EXPORT_IPV6_MOD(tcp_syn_ack_timeout); void tcp_reset_keepalive_timer(struct sock *sk, unsigned long len) { - sk_reset_timer(sk, &sk->sk_timer, jiffies + len); + sk_reset_timer(sk, &inet_csk(sk)->icsk_keepalive_timer, jiffies + len); } static void tcp_delete_keepalive_timer(struct sock *sk) { - sk_stop_timer(sk, &sk->sk_timer); + sk_stop_timer(sk, &inet_csk(sk)->icsk_keepalive_timer); } void tcp_set_keepalive(struct sock *sk, int val) @@ -778,8 +775,9 @@ EXPORT_IPV6_MOD_GPL(tcp_set_keepalive); static void tcp_keepalive_timer(struct timer_list *t) { - struct sock *sk = timer_container_of(sk, t, sk_timer); - struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_connection_sock *icsk = + timer_container_of(icsk, t, icsk_keepalive_timer); + struct sock *sk = &icsk->icsk_inet.sk; struct tcp_sock *tp = tcp_sk(sk); u32 elapsed; @@ -839,7 +837,7 @@ static void tcp_keepalive_timer(struct timer_list *t) goto out; } if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) { - icsk->icsk_probes_out++; + WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1); elapsed = keepalive_intvl_when(tp); } else { /* If keepalive was lost due to local congestion, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cc3ce0f762ec..ffe074cb5865 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -68,7 +68,7 @@ * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind * a single port at the same time. - * Derek Atkins <derek@ihtfp.com>: Add Encapulation Support + * Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support * James Chapman : Add L2TP encapsulation type. */ @@ -509,7 +509,7 @@ rescore: /* compute_score is too long of a function to be * inlined, and calling it again here yields - * measureable overhead for some + * measurable overhead for some * workloads. Work around it by jumping * backwards to rescore 'result'. */ @@ -1685,31 +1685,6 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) udp_rmem_release(sk, udp_skb_truesize(skb), 1, true); } -/* Idea of busylocks is to let producers grab an extra spinlock - * to relieve pressure on the receive_queue spinlock shared by consumer. - * Under flood, this means that only one producer can be in line - * trying to acquire the receive_queue spinlock. 
- * These busylock can be allocated on a per cpu manner, instead of a - * per socket one (that would consume a cache line per socket) - */ -static int udp_busylocks_log __read_mostly; -static spinlock_t *udp_busylocks __read_mostly; - -static spinlock_t *busylock_acquire(void *ptr) -{ - spinlock_t *busy; - - busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log); - spin_lock(busy); - return busy; -} - -static void busylock_release(spinlock_t *busy) -{ - if (busy) - spin_unlock(busy); -} - static int udp_rmem_schedule(struct sock *sk, int size) { int delta; @@ -1724,14 +1699,24 @@ static int udp_rmem_schedule(struct sock *sk, int size) int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; + struct udp_prod_queue *udp_prod_queue; + struct sk_buff *next, *to_drop = NULL; + struct llist_node *ll_list; unsigned int rmem, rcvbuf; - spinlock_t *busy = NULL; int size, err = -ENOMEM; + int total_size = 0; + int q_size = 0; + int dropcount; + int nb = 0; rmem = atomic_read(&sk->sk_rmem_alloc); rcvbuf = READ_ONCE(sk->sk_rcvbuf); size = skb->truesize; + udp_prod_queue = &udp_sk(sk)->udp_prod_queue[numa_node_id()]; + + rmem += atomic_read(&udp_prod_queue->rmem_alloc); + /* Immediately drop when the receive queue is full. * Cast to unsigned int performs the boundary check for INT_MAX. */ @@ -1739,8 +1724,8 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) if (rcvbuf > INT_MAX >> 1) goto drop; - /* Always allow at least one packet for small buffer. */ - if (rmem > rcvbuf) + /* Accept the packet if queue is empty. */ + if (rmem) goto drop; } @@ -1753,42 +1738,77 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) if (rmem > (rcvbuf >> 1)) { skb_condense(skb); size = skb->truesize; - busy = busylock_acquire(sk); } udp_set_dev_scratch(skb); - atomic_add(size, &sk->sk_rmem_alloc); + atomic_add(size, &udp_prod_queue->rmem_alloc); + + if (!llist_add(&skb->ll_node, &udp_prod_queue->ll_root)) + return 0; + + dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? sk_drops_read(sk) : 0; spin_lock(&list->lock); - err = udp_rmem_schedule(sk, size); - if (err) { - spin_unlock(&list->lock); - goto uncharge_drop; - } - sk_forward_alloc_add(sk, -size); + ll_list = llist_del_all(&udp_prod_queue->ll_root); - /* no need to setup a destructor, we will explicitly release the - * forward allocated memory on dequeue - */ - sock_skb_set_dropcount(sk, skb); + ll_list = llist_reverse_order(ll_list); + + llist_for_each_entry_safe(skb, next, ll_list, ll_node) { + size = udp_skb_truesize(skb); + total_size += size; + err = udp_rmem_schedule(sk, size); + if (unlikely(err)) { + /* Free the skbs outside of locked section. */ + skb->next = to_drop; + to_drop = skb; + continue; + } + + q_size += size; + sk_forward_alloc_add(sk, -size); + + /* no need to setup a destructor, we will explicitly release the + * forward allocated memory on dequeue + */ + SOCK_SKB_CB(skb)->dropcount = dropcount; + nb++; + __skb_queue_tail(list, skb); + } + + atomic_add(q_size, &sk->sk_rmem_alloc); - __skb_queue_tail(list, skb); spin_unlock(&list->lock); - if (!sock_flag(sk, SOCK_DEAD)) - INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); + if (!sock_flag(sk, SOCK_DEAD)) { + /* Multiple threads might be blocked in recvmsg(), + * using prepare_to_wait_exclusive(). 
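Editor's aside on the lock-free enqueue path above: producers push skbs onto udp_prod_queue->ll_root head-first, so llist_del_all() hands the consumer a newest-first chain, and llist_reverse_order() restores arrival order before anything is appended to sk_receive_queue. A toy userspace sketch of that reversal (plain pointers, not the kernel llist API):

#include <stdio.h>
#include <stdlib.h>

struct node { int seq; struct node *next; };

static struct node *push(struct node *head, int seq)	/* producer side: prepend */
{
	struct node *n = malloc(sizeof(*n));

	n->seq = seq;
	n->next = head;
	return n;
}

static struct node *reverse(struct node *head)		/* consumer side: restore order */
{
	struct node *prev = NULL;

	while (head) {
		struct node *next = head->next;

		head->next = prev;
		prev = head;
		head = next;
	}
	return prev;
}

int main(void)
{
	struct node *list = NULL;

	for (int i = 1; i <= 3; i++)
		list = push(list, i);		/* packets arrive as 1, 2, 3 */

	for (struct node *n = reverse(list); n; ) {
		struct node *next = n->next;

		printf("%d ", n->seq);		/* prints 1 2 3, not 3 2 1 */
		free(n);
		n = next;
	}
	printf("\n");
	return 0;
}

The same trade-off drives the design: producers only touch a per-NUMA-node list head, and the receive-queue spinlock is taken once per batch rather than once per packet.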
+ */ + while (nb) { + INDIRECT_CALL_1(sk->sk_data_ready, + sock_def_readable, sk); + nb--; + } + } - busylock_release(busy); - return 0; + if (unlikely(to_drop)) { + for (nb = 0; to_drop != NULL; nb++) { + skb = to_drop; + to_drop = skb->next; + skb_mark_not_on_list(skb); + /* TODO: update SNMP values. */ + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM); + } + numa_drop_add(&udp_sk(sk)->drop_counters, nb); + } -uncharge_drop: - atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + atomic_sub(total_size, &udp_prod_queue->rmem_alloc); + + return 0; drop: - atomic_inc(&sk->sk_drops); - busylock_release(busy); + udp_drops_inc(sk); return err; } EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb); @@ -1806,6 +1826,7 @@ void udp_destruct_common(struct sock *sk) kfree_skb(skb); } udp_rmem_release(sk, total, 0, true); + kfree(up->udp_prod_queue); } EXPORT_IPV6_MOD_GPL(udp_destruct_common); @@ -1817,10 +1838,11 @@ static void udp_destruct_sock(struct sock *sk) int udp_init_sock(struct sock *sk) { - udp_lib_init_sock(sk); + int res = udp_lib_init_sock(sk); + sk->sk_destruct = udp_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); - return 0; + return res; } void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) @@ -1828,6 +1850,11 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset))) sk_peek_offset_bwd(sk, len); + if (!skb_shared(skb)) { + skb_attempt_defer_free(skb); + return; + } + if (!skb_unref(skb)) return; @@ -1852,7 +1879,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, IS_UDPLITE(sk)); __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); __skb_unlink(skb, rcvq); *total += skb->truesize; kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); @@ -2008,7 +2035,7 @@ try_again: __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; } @@ -2078,7 +2105,7 @@ try_again: if (unlikely(err)) { if (!peeking) { - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } @@ -2132,7 +2159,8 @@ csum_copy_err: goto try_again; } -int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int udp_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { /* This check is replicated from __ip4_datagram_connect() and * intended to prevent BPF program called below from accessing bytes @@ -2145,7 +2173,8 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } EXPORT_IPV6_MOD(udp_pre_connect); -static int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int udp_connect(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { int res; @@ -2449,7 +2478,7 @@ csum_error: __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2534,7 +2563,7 @@ start_lookup: nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP_INC_STATS(net, UDP_MIB_INERRORS, @@ -2609,7 +2638,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, return 
0; } -/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and +/* wrapper for udp_queue_rcv_skb taking care of csum conversion and * return code conversion for ip layer consumption */ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, @@ -2807,7 +2836,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, return NULL; } -int udp_v4_early_demux(struct sk_buff *skb) +enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct in_device *in_dev = NULL; @@ -2821,7 +2850,7 @@ int udp_v4_early_demux(struct sk_buff *skb) /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) - return 0; + return SKB_NOT_DROPPED_YET; iph = ip_hdr(skb); uh = udp_hdr(skb); @@ -2830,12 +2859,12 @@ int udp_v4_early_demux(struct sk_buff *skb) in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) - return 0; + return SKB_NOT_DROPPED_YET; ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) - return 0; + return SKB_NOT_DROPPED_YET; sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, @@ -2846,7 +2875,7 @@ int udp_v4_early_demux(struct sk_buff *skb) } if (!sk) - return 0; + return SKB_NOT_DROPPED_YET; skb->sk = sk; DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk)); @@ -2873,7 +2902,7 @@ int udp_v4_early_demux(struct sk_buff *skb) ip4h_dscp(iph), skb->dev, in_dev, &itag); } - return 0; + return SKB_NOT_DROPPED_YET; } int udp_rcv(struct sk_buff *skb) @@ -3386,7 +3415,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } int udp4_seq_show(struct seq_file *seq, void *v) @@ -3994,7 +4023,6 @@ static void __init bpf_iter_register(void) void __init udp_init(void) { unsigned long limit; - unsigned int i; udp_table_init(&udp_table, "UDP"); limit = nr_free_buffer_pages() / 8; @@ -4003,15 +4031,6 @@ void __init udp_init(void) sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; - /* 16 spinlocks per cpu */ - udp_busylocks_log = ilog2(nr_cpu_ids) + 4; - udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log, - GFP_KERNEL); - if (!udp_busylocks) - panic("UDP: failed to alloc udp_busylocks\n"); - for (i = 0; i < (1U << udp_busylocks_log); i++) - spin_lock_init(udp_busylocks + i); - if (register_pernet_subsys(&udp_sysctl_ops)) panic("UDP: failed to init sysctl parameters.\n"); diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 38cb3a28e4ed..6e491c720c90 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -16,9 +16,9 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, - struct nlattr *bc, bool net_admin) + bool net_admin) { - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb->data, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI, @@ -92,12 +92,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); - struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; - struct nlattr *bc; - cb_data = cb->data; - bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -130,7 +126,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, r->id.idiag_dport) goto next; - if 
(sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { + if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 5128e2a5b00a..19d0b5b09ffa 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -217,7 +217,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM); skb->remcsum_offload = remcsum; - need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); + need_ipsec = (skb_dst(skb) && dst_xfrm(skb_dst(skb))) || skb_sec_path(skb); /* Try to offload checksum if possible */ offload_csum = !!(need_csum && !need_ipsec && @@ -891,8 +891,6 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) skb_gro_checksum_try_convert(skb, IPPROTO_UDP, inet_gro_compute_pseudo); skip: - NAPI_GRO_CB(skb)->is_ipv6 = 0; - if (static_branch_unlikely(&udp_encap_needed_key)) sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest); diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index fce945f23069..b1f667c52cb2 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -4,6 +4,7 @@ #include <linux/socket.h> #include <linux/kernel.h> #include <net/dst_metadata.h> +#include <net/flow.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/inet_dscp.h> @@ -28,7 +29,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->local_ip; udp_addr.sin_port = cfg->local_udp_port; - err = kernel_bind(sock, (struct sockaddr *)&udp_addr, + err = kernel_bind(sock, (struct sockaddr_unsized *)&udp_addr, sizeof(udp_addr)); if (err < 0) goto error; @@ -37,7 +38,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, udp_addr.sin_family = AF_INET; udp_addr.sin_addr = cfg->peer_ip; udp_addr.sin_port = cfg->peer_udp_port; - err = kernel_connect(sock, (struct sockaddr *)&udp_addr, + err = kernel_connect(sock, (struct sockaddr_unsized *)&udp_addr, sizeof(udp_addr), 0); if (err < 0) goto error; @@ -253,7 +254,7 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, fl4.saddr = key->u.ipv4.src; fl4.fl4_dport = dport; fl4.fl4_sport = sport; - fl4.flowi4_tos = tos & INET_DSCP_MASK; + fl4.flowi4_dscp = inet_dsfield_to_dscp(tos); fl4.flowi4_flags = key->flow_flags; rt = ip_route_output_key(net, &fl4); diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c index ff66db48453c..944b3cf25468 100644 --- a/net/ipv4/udp_tunnel_nic.c +++ b/net/ipv4/udp_tunnel_nic.c @@ -930,7 +930,7 @@ udp_tunnel_nic_netdevice_event(struct notifier_block *unused, err = udp_tunnel_nic_register(dev); if (err) - netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err); + netdev_warn(dev, "failed to register for UDP tunnel offloads: %d", err); return notifier_from_errno(err); } /* All other events will need the udp_tunnel_nic state */ diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7fb6205619e7..58faf1ddd2b1 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -14,7 +14,7 @@ #include <linux/inetdevice.h> #include <net/dst.h> #include <net/xfrm.h> -#include <net/inet_dscp.h> +#include <net/flow.h> #include <net/ip.h> #include <net/l3mdev.h> @@ -25,7 +25,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = params->daddr->a4; - fl4->flowi4_tos = inet_dscp_to_dsfield(params->dscp); + fl4->flowi4_dscp = params->dscp; 
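Editor's aside: the flowi4_tos to flowi4_dscp conversions above (udp_tunnel_dst_lookup() and __xfrm4_dst_lookup()) lean on the DS-field layout: the top six bits of the TOS/DS byte are the DSCP, the bottom two are ECN, and 0xfc is the mask the dscp_t conversion effectively keeps. A small standalone illustration with an example dsfield value:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint8_t dsfield = 0xb9;			/* DSCP 46 (EF) with ECN ECT(1) */
	uint8_t dscp_bits = dsfield & 0xfc;	/* what the dscp_t conversion keeps */

	printf("dsfield=0x%02x dscp=%u ecn=%u\n",
	       (unsigned int)dsfield,
	       (unsigned int)(dscp_bits >> 2),
	       (unsigned int)(dsfield & 0x3));
	return 0;
}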
fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(params->net, params->oif); fl4->flowi4_mark = params->mark; diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 1c9c686d9522..b8f9a8c0302e 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -304,10 +304,9 @@ config IPV6_SEG6_LWTUNNEL config IPV6_SEG6_HMAC bool "IPv6: Segment Routing HMAC support" depends on IPV6 - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA1 - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA1 + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_UTILS help Support for HMAC signature generation and verification of SR-enabled packets. diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f17a5dd4789f..b66217d1b2f8 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1324,7 +1324,7 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) __in6_ifa_put(ifp); } - if (ifp->flags & IFA_F_PERMANENT && !(ifp->flags & IFA_F_NOPREFIXROUTE)) + if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) action = check_cleanup_prefix_route(ifp, &expires); list_del_rcu(&ifp->if_list); @@ -7238,7 +7238,9 @@ static const struct ctl_table addrconf_sysctl[] = { .data = &ipv6_devconf.rpl_seg_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "ioam6_enabled", diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 1992621e3f3f..b705751eb73c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -277,7 +277,7 @@ out_sk_release: goto out; } -static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, +static int __inet6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len, u32 flags) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; @@ -438,7 +438,7 @@ out_unlock: goto out; } -int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int inet6_bind_sk(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { u32 flags = BIND_WITH_LOCK; const struct proto *prot; @@ -465,7 +465,7 @@ int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) } /* bind for INET6 API */ -int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +int inet6_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { return inet6_bind_sk(sock->sk, uaddr, addr_len); } @@ -857,7 +857,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, false, false); } return 0; @@ -960,6 +960,7 @@ static int __net_init inet6_net_init(struct net *net) net->ipv6.sysctl.icmpv6_echo_ignore_multicast = 0; net->ipv6.sysctl.icmpv6_echo_ignore_anycast = 0; net->ipv6.sysctl.icmpv6_error_anycast_as_unicast = 0; + net->ipv6.sysctl.icmpv6_errors_extension_mask = 0; /* By default, rate limit error messages. * Except for pmtu discovery, it would break it. 
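Editor's aside: the addrconf change above switches rpl_seg_enabled from proc_dointvec to proc_dointvec_minmax with SYSCTL_ZERO/SYSCTL_ONE, so writes outside [0, 1] are refused rather than stored verbatim. A tiny userspace sketch of that validate-then-store behaviour (names and values are illustrative, not the kernel helper itself):

#include <stdio.h>
#include <errno.h>

static int store_bounded(int *slot, int val, int min, int max)
{
	if (val < min || val > max)
		return -EINVAL;		/* proc_dointvec_minmax refuses the write */
	*slot = val;
	return 0;
}

int main(void)
{
	int rpl_seg_enabled = 0;

	printf("%d\n", store_bounded(&rpl_seg_enabled, 1, 0, 1));	/* 0: accepted */
	printf("%d\n", store_bounded(&rpl_seg_enabled, 7, 0, 1));	/* -EINVAL: rejected */
	printf("value=%d\n", rpl_seg_enabled);				/* still 1 */
	return 0;
}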
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index eb474f0987ae..95372e0f1d21 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -46,6 +46,34 @@ struct ah_skb_cb { #define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0])) +/* Helper to save IPv6 addresses and extension headers to temporary storage */ +static inline void ah6_save_hdrs(struct tmp_ext *iph_ext, + struct ipv6hdr *top_iph, int extlen) +{ + if (!extlen) + return; + +#if IS_ENABLED(CONFIG_IPV6_MIP6) + iph_ext->saddr = top_iph->saddr; +#endif + iph_ext->daddr = top_iph->daddr; + memcpy(&iph_ext->hdrs, top_iph + 1, extlen - sizeof(*iph_ext)); +} + +/* Helper to restore IPv6 addresses and extension headers from temporary storage */ +static inline void ah6_restore_hdrs(struct ipv6hdr *top_iph, + struct tmp_ext *iph_ext, int extlen) +{ + if (!extlen) + return; + +#if IS_ENABLED(CONFIG_IPV6_MIP6) + top_iph->saddr = iph_ext->saddr; +#endif + top_iph->daddr = iph_ext->daddr; + memcpy(top_iph + 1, &iph_ext->hdrs, extlen - sizeof(*iph_ext)); +} + static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags, unsigned int size) { @@ -301,13 +329,7 @@ static void ah6_output_done(void *data, int err) memcpy(ah->auth_data, icv, ahp->icv_trunc_len); memcpy(top_iph, iph_base, IPV6HDR_BASELEN); - if (extlen) { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - memcpy(&top_iph->saddr, iph_ext, extlen); -#else - memcpy(&top_iph->daddr, iph_ext, extlen); -#endif - } + ah6_restore_hdrs(top_iph, iph_ext, extlen); kfree(AH_SKB_CB(skb)->tmp); xfrm_output_resume(skb->sk, skb, err); @@ -378,12 +400,8 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) */ memcpy(iph_base, top_iph, IPV6HDR_BASELEN); + ah6_save_hdrs(iph_ext, top_iph, extlen); if (extlen) { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - memcpy(iph_ext, &top_iph->saddr, extlen); -#else - memcpy(iph_ext, &top_iph->daddr, extlen); -#endif err = ipv6_clear_mutable_options(top_iph, extlen - sizeof(*iph_ext) + sizeof(*top_iph), @@ -434,13 +452,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(ah->auth_data, icv, ahp->icv_trunc_len); memcpy(top_iph, iph_base, IPV6HDR_BASELEN); - if (extlen) { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - memcpy(&top_iph->saddr, iph_ext, extlen); -#else - memcpy(&top_iph->daddr, iph_ext, extlen); -#endif - } + ah6_restore_hdrs(top_iph, iph_ext, extlen); out_free: kfree(iph_base); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index f8a8e46286b8..52599584422b 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -104,7 +104,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) rcu_read_lock(); rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { - dev = dst_dev(&rt->dst); + dev = dst_dev_rcu(&rt->dst); netdev_hold(dev, &dev_tracker, GFP_ATOMIC); ip6_rt_put(rt); } else if (ishost) { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 972bf0426d59..83e03176819c 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -138,7 +138,7 @@ void ip6_datagram_release_cb(struct sock *sk) } EXPORT_SYMBOL_GPL(ip6_datagram_release_cb); -int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, +int __ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; @@ -194,7 +194,7 @@ int __ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; err = __ip4_datagram_connect(sk, - (struct sockaddr *) &sin, + (struct sockaddr_unsized *)&sin, sizeof(sin)); ipv4_connected: @@ -271,7 
+271,7 @@ out: } EXPORT_SYMBOL_GPL(__ip6_datagram_connect); -int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +int ip6_datagram_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { int res; @@ -282,7 +282,7 @@ int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) } EXPORT_SYMBOL_GPL(ip6_datagram_connect); -int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *uaddr, +int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, uaddr); @@ -1068,5 +1068,5 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 72adfc107b55..e75da98f5283 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -149,8 +149,8 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) dport = encap->encap_dport; spin_unlock_bh(&x->lock); - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6, - dport, &x->props.saddr.in6, ntohs(sport), 0, 0); + sk = __inet6_lookup_established(net, &x->id.daddr.in6, dport, + &x->props.saddr.in6, ntohs(sport), 0, 0); if (!sk) return ERR_PTR(-ENOENT); diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 7b41fb4f00b5..22410243ebe8 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -158,8 +158,10 @@ static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - __be16 type = x->inner_mode.family == AF_INET ? htons(ETH_P_IP) - : htons(ETH_P_IPV6); + const struct xfrm_mode *inner_mode = xfrm_ip2inner_mode(x, + XFRM_MODE_SKB_CB(skb)->protocol); + __be16 type = inner_mode->family == AF_INET ? htons(ETH_P_IP) + : htons(ETH_P_IPV6); return skb_eth_gso_segment(skb, features, type); } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index d1ef9644f826..a23eb8734e15 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -494,10 +494,8 @@ static int ipv6_rpl_srh_rcv(struct sk_buff *skb) idev = __in6_dev_get(skb->dev); - accept_rpl_seg = net->ipv6.devconf_all->rpl_seg_enabled; - if (accept_rpl_seg > idev->cnf.rpl_seg_enabled) - accept_rpl_seg = idev->cnf.rpl_seg_enabled; - + accept_rpl_seg = min(READ_ONCE(net->ipv6.devconf_all->rpl_seg_enabled), + READ_ONCE(idev->cnf.rpl_seg_enabled)); if (!accept_rpl_seg) { kfree_skb(skb); return -1; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 44550957fd4e..5d2f90babaa5 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -209,7 +209,8 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, * this lookup should be more aggressive (not longer than timeout). 
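Editor's aside: the ipv6_rpl_srh_rcv() change above replaces the hand-rolled comparison with min() over the two 0/1 sysctls, which for boolean toggles behaves like a logical AND: RPL segments are accepted only when both the all-devices and the per-interface knobs are enabled. A quick standalone check of that truth table:

#include <stdio.h>

static int min_int(int a, int b) { return a < b ? a : b; }

int main(void)
{
	for (int all = 0; all <= 1; all++)
		for (int dev = 0; dev <= 1; dev++)
			printf("all=%d dev=%d accept=%d\n", all, dev, min_int(all, dev));
	return 0;
}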
*/ dst = ip6_route_output(net, sk, fl6); - dev = dst_dev(dst); + rcu_read_lock(); + dev = dst_dev_rcu(dst); if (dst->error) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); @@ -224,14 +225,12 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - rcu_read_lock(); peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); res = inet_peer_xrlim_allow(peer, tmo); - rcu_read_unlock(); } + rcu_read_unlock(); if (!res) - __ICMP6_INC_STATS(net, ip6_dst_idev(dst), - ICMP6_MIB_RATELIMITHOST); + __ICMP6_INC_STATS(net, NULL, ICMP6_MIB_RATELIMITHOST); else icmp_global_consume(net); dst_release(dst); @@ -445,6 +444,193 @@ static int icmp6_iif(const struct sk_buff *skb) return icmp6_dev(skb)->ifindex; } +struct icmp6_ext_iio_addr6_subobj { + __be16 afi; + __be16 reserved; + struct in6_addr addr6; +}; + +static unsigned int icmp6_ext_iio_len(void) +{ + return sizeof(struct icmp_extobj_hdr) + + /* ifIndex */ + sizeof(__be32) + + /* Interface Address Sub-Object */ + sizeof(struct icmp6_ext_iio_addr6_subobj) + + /* Interface Name Sub-Object. Length must be a multiple of 4 + * bytes. + */ + ALIGN(sizeof(struct icmp_ext_iio_name_subobj), 4) + + /* MTU */ + sizeof(__be32); +} + +static unsigned int icmp6_ext_max_len(u8 ext_objs) +{ + unsigned int ext_max_len; + + ext_max_len = sizeof(struct icmp_ext_hdr); + + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) + ext_max_len += icmp6_ext_iio_len(); + + return ext_max_len; +} + +static struct in6_addr *icmp6_ext_iio_addr6_find(const struct net_device *dev) +{ + struct inet6_dev *in6_dev; + struct inet6_ifaddr *ifa; + + in6_dev = __in6_dev_get(dev); + if (!in6_dev) + return NULL; + + /* It is unclear from RFC 5837 which IP address should be chosen, but + * it makes sense to choose a global unicast address. + */ + list_for_each_entry_rcu(ifa, &in6_dev->addr_list, if_list) { + if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DADFAILED)) + continue; + if (ipv6_addr_type(&ifa->addr) != IPV6_ADDR_UNICAST || + ipv6_addr_src_scope(&ifa->addr) != IPV6_ADDR_SCOPE_GLOBAL) + continue; + return &ifa->addr; + } + + return NULL; +} + +static void icmp6_ext_iio_iif_append(struct net *net, struct sk_buff *skb, + int iif) +{ + struct icmp_ext_iio_name_subobj *name_subobj; + struct icmp_extobj_hdr *objh; + struct net_device *dev; + struct in6_addr *addr6; + __be32 data; + + if (!iif) + return; + + /* Add the fields in the order specified by RFC 5837. 
*/ + objh = skb_put(skb, sizeof(*objh)); + objh->class_num = ICMP_EXT_OBJ_CLASS_IIO; + objh->class_type = ICMP_EXT_CTYPE_IIO_ROLE(ICMP_EXT_CTYPE_IIO_ROLE_IIF); + + data = htonl(iif); + skb_put_data(skb, &data, sizeof(__be32)); + objh->class_type |= ICMP_EXT_CTYPE_IIO_IFINDEX; + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, iif); + if (!dev) + goto out; + + addr6 = icmp6_ext_iio_addr6_find(dev); + if (addr6) { + struct icmp6_ext_iio_addr6_subobj *addr6_subobj; + + addr6_subobj = skb_put_zero(skb, sizeof(*addr6_subobj)); + addr6_subobj->afi = htons(ICMP_AFI_IP6); + addr6_subobj->addr6 = *addr6; + objh->class_type |= ICMP_EXT_CTYPE_IIO_IPADDR; + } + + name_subobj = skb_put_zero(skb, ALIGN(sizeof(*name_subobj), 4)); + name_subobj->len = ALIGN(sizeof(*name_subobj), 4); + netdev_copy_name(dev, name_subobj->name); + objh->class_type |= ICMP_EXT_CTYPE_IIO_NAME; + + data = htonl(READ_ONCE(dev->mtu)); + skb_put_data(skb, &data, sizeof(__be32)); + objh->class_type |= ICMP_EXT_CTYPE_IIO_MTU; + +out: + rcu_read_unlock(); + objh->length = htons(skb_tail_pointer(skb) - (unsigned char *)objh); +} + +static void icmp6_ext_objs_append(struct net *net, struct sk_buff *skb, + u8 ext_objs, int iif) +{ + if (ext_objs & BIT(ICMP_ERR_EXT_IIO_IIF)) + icmp6_ext_iio_iif_append(net, skb, iif); +} + +static struct sk_buff * +icmp6_ext_append(struct net *net, struct sk_buff *skb_in, + struct icmp6hdr *icmp6h, unsigned int room, int iif) +{ + unsigned int payload_len, ext_max_len, ext_len; + struct icmp_ext_hdr *ext_hdr; + struct sk_buff *skb; + u8 ext_objs; + int nhoff; + + switch (icmp6h->icmp6_type) { + case ICMPV6_DEST_UNREACH: + case ICMPV6_TIME_EXCEED: + break; + default: + return NULL; + } + + /* Do not overwrite existing extensions. This can happen when we + * receive an ICMPv4 message with extensions from a tunnel and + * translate it to an ICMPv6 message towards an IPv6 host in the + * overlay network. + */ + if (icmp6h->icmp6_datagram_len) + return NULL; + + ext_objs = READ_ONCE(net->ipv6.sysctl.icmpv6_errors_extension_mask); + if (!ext_objs) + return NULL; + + ext_max_len = icmp6_ext_max_len(ext_objs); + if (ICMP_EXT_ORIG_DGRAM_MIN_LEN + ext_max_len > room) + return NULL; + + skb = skb_clone(skb_in, GFP_ATOMIC); + if (!skb) + return NULL; + + nhoff = skb_network_offset(skb); + payload_len = min(skb->len - nhoff, ICMP_EXT_ORIG_DGRAM_MIN_LEN); + + if (!pskb_network_may_pull(skb, payload_len)) + goto free_skb; + + if (pskb_trim(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN) || + __skb_put_padto(skb, nhoff + ICMP_EXT_ORIG_DGRAM_MIN_LEN, false)) + goto free_skb; + + if (pskb_expand_head(skb, 0, ext_max_len, GFP_ATOMIC)) + goto free_skb; + + ext_hdr = skb_put_zero(skb, sizeof(*ext_hdr)); + ext_hdr->version = ICMP_EXT_VERSION_2; + + icmp6_ext_objs_append(net, skb, ext_objs, iif); + + /* Do not send an empty extension structure. */ + ext_len = skb_tail_pointer(skb) - (unsigned char *)ext_hdr; + if (ext_len == sizeof(*ext_hdr)) + goto free_skb; + + ext_hdr->checksum = ip_compute_csum(ext_hdr, ext_len); + /* The length of the original datagram in 64-bit words (RFC 4884). 
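Editor's aside: the datagram-length field set above follows RFC 4884, which counts the included original datagram in 64-bit words. Assuming ICMP_EXT_ORIG_DGRAM_MIN_LEN is the 128-byte chunk the code trims or pads the invoking packet to, the field comes out as 16. A one-line worked example:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned int orig_dgram_len = 128;	/* bytes of the invoking packet kept (assumed minimum) */
	unsigned int words = orig_dgram_len / sizeof(uint64_t);

	printf("icmp6_datagram_len=%u\n", words);	/* 16 words x 8 bytes = 128 bytes */
	return 0;
}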
*/ + icmp6h->icmp6_datagram_len = ICMP_EXT_ORIG_DGRAM_MIN_LEN / sizeof(u64); + + return skb; + +free_skb: + consume_skb(skb); + return NULL; +} + /* * Send an ICMP message in response to a packet in error */ @@ -459,7 +645,9 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; bool apply_ratelimit = false; + struct sk_buff *ext_skb; struct dst_entry *dst; + unsigned int room; struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; @@ -613,8 +801,13 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, msg.offset = skb_network_offset(skb); msg.type = type; - len = skb->len - msg.offset; - len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr)); + room = IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr); + ext_skb = icmp6_ext_append(net, skb, &tmp_hdr, room, parm->iif); + if (ext_skb) + msg.skb = ext_skb; + + len = msg.skb->len - msg.offset; + len = min_t(unsigned int, len, room); if (len < 0) { net_dbg_ratelimited("icmp: len problem [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); @@ -636,6 +829,8 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, } out_dst_release: + if (ext_skb) + consume_skb(ext_skb); dst_release(dst); out_unlock: icmpv6_xmit_unlock(sk); @@ -1172,6 +1367,10 @@ int icmpv6_err_convert(u8 type, u8 code, int *err) EXPORT_SYMBOL(icmpv6_err_convert); #ifdef CONFIG_SYSCTL + +static u32 icmpv6_errors_extension_mask_all = + GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0); + static struct ctl_table ipv6_icmp_table_template[] = { { .procname = "ratelimit", @@ -1217,6 +1416,15 @@ static struct ctl_table ipv6_icmp_table_template[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "errors_extension_mask", + .data = &init_net.ipv6.sysctl.icmpv6_errors_extension_mask, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &icmpv6_errors_extension_mask_all, + }, }; struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) @@ -1234,6 +1442,7 @@ struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net) table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast; table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr; table[5].data = &net->ipv6.sysctl.icmpv6_error_anycast_as_unicast; + table[6].data = &net->ipv6.sysctl.icmpv6_errors_extension_mask; } return table; } diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 333e43434dd7..ea5cf3fdfdd6 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -91,7 +91,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, false, false); } return dst; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 76ee521189eb..5e1da088d8e1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -47,24 +47,23 @@ EXPORT_SYMBOL_GPL(inet6_ehashfn); * The sockhash lock must be held as a reader here. 
*/ struct sock *__inet6_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, - const struct in6_addr *saddr, - const __be16 sport, - const struct in6_addr *daddr, - const u16 hnum, - const int dif, const int sdif) + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum, + const int dif, const int sdif) { - struct sock *sk; - const struct hlist_nulls_node *node; const __portpair ports = INET_COMBINED_PORTS(sport, hnum); - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); - unsigned int slot = hash & hashinfo->ehash_mask; - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; - + const struct hlist_nulls_node *node; + struct inet_ehash_bucket *head; + struct inet_hashinfo *hashinfo; + unsigned int hash, slot; + struct sock *sk; + hashinfo = net->ipv4.tcp_death_row.hashinfo; + hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash & hashinfo->ehash_mask; + head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) @@ -200,19 +199,20 @@ struct sock *inet6_lookup_run_sk_lookup(const struct net *net, EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); struct sock *inet6_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - const struct in6_addr *saddr, - const __be16 sport, const struct in6_addr *daddr, - const unsigned short hnum, const int dif, const int sdif) + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const unsigned short hnum, + const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; + struct inet_hashinfo *hashinfo; struct sock *result = NULL; unsigned int hash2; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled) && - hashinfo == net->ipv4.tcp_death_row.hashinfo) { + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, saddr, sport, daddr, hnum, dif, inet6_ehashfn); @@ -220,6 +220,7 @@ struct sock *inet6_lookup_listener(const struct net *net, goto done; } + hashinfo = net->ipv4.tcp_death_row.hashinfo; hash2 = ipv6_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); @@ -244,7 +245,6 @@ done: EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, @@ -253,7 +253,7 @@ struct sock *inet6_lookup(const struct net *net, struct sock *sk; bool refcounted; - sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + sk = __inet6_lookup(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; @@ -305,8 +305,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); - if (sk->sk_protocol == IPPROTO_TCP && - tcp_twsk_unique(sk, sk2, twp)) + if (tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; @@ -369,14 +368,3 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); - -int inet6_hash(struct sock *sk) -{ - int 
err = 0; - - if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk, NULL); - - return err; -} -EXPORT_SYMBOL_GPL(inet6_hash); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 02c16909f618..2111af022d94 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1138,6 +1138,10 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, fib6_set_expires(iter, rt->expires); fib6_add_gc_list(iter); } + if (!(rt->fib6_flags & (RTF_ADDRCONF | RTF_PREFIX_RT))) { + iter->fib6_flags &= ~RTF_ADDRCONF; + iter->fib6_flags &= ~RTF_PREFIX_RT; + } if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index a3ff575798dd..60d0be47a9f3 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -66,8 +66,8 @@ EXPORT_SYMBOL(ipv6_flowlabel_exclusive); fl != NULL; \ fl = rcu_dereference(fl->next)) -#define for_each_sk_fl_rcu(np, sfl) \ - for (sfl = rcu_dereference(np->ipv6_fl_list); \ +#define for_each_sk_fl_rcu(sk, sfl) \ + for (sfl = rcu_dereference(inet_sk(sk)->ipv6_fl_list); \ sfl != NULL; \ sfl = rcu_dereference(sfl->next)) @@ -262,12 +262,11 @@ static struct ip6_flowlabel *fl_intern(struct net *net, struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label) { struct ipv6_fl_socklist *sfl; - struct ipv6_pinfo *np = inet6_sk(sk); label &= IPV6_FLOWLABEL_MASK; rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) { + for_each_sk_fl_rcu(sk, sfl) { struct ip6_flowlabel *fl = sfl->fl; if (fl->label == label && atomic_inc_not_zero(&fl->users)) { @@ -283,16 +282,16 @@ EXPORT_SYMBOL_GPL(__fl6_sock_lookup); void fl6_free_socklist(struct sock *sk) { - struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct ipv6_fl_socklist *sfl; - if (!rcu_access_pointer(np->ipv6_fl_list)) + if (!rcu_access_pointer(inet->ipv6_fl_list)) return; spin_lock_bh(&ip6_sk_fl_lock); - while ((sfl = rcu_dereference_protected(np->ipv6_fl_list, + while ((sfl = rcu_dereference_protected(inet->ipv6_fl_list, lockdep_is_held(&ip6_sk_fl_lock))) != NULL) { - np->ipv6_fl_list = sfl->next; + inet->ipv6_fl_list = sfl->next; spin_unlock_bh(&ip6_sk_fl_lock); fl_release(sfl->fl); @@ -470,16 +469,15 @@ done: static int mem_check(struct sock *sk) { - struct ipv6_pinfo *np = inet6_sk(sk); - struct ipv6_fl_socklist *sfl; int room = FL_MAX_SIZE - atomic_read(&fl_size); + struct ipv6_fl_socklist *sfl; int count = 0; if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) return 0; rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) + for_each_sk_fl_rcu(sk, sfl) count++; rcu_read_unlock(); @@ -492,13 +490,15 @@ static int mem_check(struct sock *sk) return 0; } -static inline void fl_link(struct ipv6_pinfo *np, struct ipv6_fl_socklist *sfl, - struct ip6_flowlabel *fl) +static inline void fl_link(struct sock *sk, struct ipv6_fl_socklist *sfl, + struct ip6_flowlabel *fl) { + struct inet_sock *inet = inet_sk(sk); + spin_lock_bh(&ip6_sk_fl_lock); sfl->fl = fl; - sfl->next = np->ipv6_fl_list; - rcu_assign_pointer(np->ipv6_fl_list, sfl); + sfl->next = inet->ipv6_fl_list; + rcu_assign_pointer(inet->ipv6_fl_list, sfl); spin_unlock_bh(&ip6_sk_fl_lock); } @@ -520,7 +520,7 @@ int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) { + for_each_sk_fl_rcu(sk, sfl) { if (sfl->fl->label == (np->flow_label & IPV6_FLOWLABEL_MASK)) { spin_lock_bh(&ip6_fl_lock); freq->flr_label = sfl->fl->label; @@ -559,7 +559,7 @@ static int ipv6_flowlabel_put(struct sock *sk, struct in6_flowlabel_req *freq) } 
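Editor's aside: even after the parameter shuffle, the rewritten __inet6_lookup_established() above still picks its bucket the classic way: the ehash table size is a power of two, so the slot is just the hash ANDed with ehash_mask, with no modulo. A tiny standalone illustration with made-up values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t ehash_mask = 4096 - 1;		/* table size is a power of two */
	uint32_t hash = 0x9e3779b9u;		/* stand-in for inet6_ehashfn() output */
	uint32_t slot = hash & ehash_mask;

	printf("slot=%u of %u\n", slot, ehash_mask + 1);
	return 0;
}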
spin_lock_bh(&ip6_sk_fl_lock); - for (sflp = &np->ipv6_fl_list; + for (sflp = &inet_sk(sk)->ipv6_fl_list; (sfl = socklist_dereference(*sflp)) != NULL; sflp = &sfl->next) { if (sfl->fl->label == freq->flr_label) @@ -579,13 +579,12 @@ found: static int ipv6_flowlabel_renew(struct sock *sk, struct in6_flowlabel_req *freq) { - struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); struct ipv6_fl_socklist *sfl; int err; rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) { + for_each_sk_fl_rcu(sk, sfl) { if (sfl->fl->label == freq->flr_label) { err = fl6_renew(sfl->fl, freq->flr_linger, freq->flr_expires); @@ -614,7 +613,6 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, { struct ipv6_fl_socklist *sfl, *sfl1 = NULL; struct ip6_flowlabel *fl, *fl1 = NULL; - struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); int err; @@ -645,7 +643,7 @@ static int ipv6_flowlabel_get(struct sock *sk, struct in6_flowlabel_req *freq, if (freq->flr_label) { err = -EEXIST; rcu_read_lock(); - for_each_sk_fl_rcu(np, sfl) { + for_each_sk_fl_rcu(sk, sfl) { if (sfl->fl->label == freq->flr_label) { if (freq->flr_flags & IPV6_FL_F_EXCL) { rcu_read_unlock(); @@ -682,7 +680,7 @@ recheck: fl1->linger = fl->linger; if ((long)(fl->expires - fl1->expires) > 0) fl1->expires = fl->expires; - fl_link(np, sfl1, fl1); + fl_link(sk, sfl1, fl1); fl_free(fl); return 0; @@ -716,7 +714,7 @@ release: } } - fl_link(np, sfl1, fl); + fl_link(sk, sfl1, fl); return 0; done: fl_free(fl); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 74d49dd6124d..c82a75510c0e 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -329,9 +329,9 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, if (parms->name[0]) { if (!dev_valid_name(parms->name)) return NULL; - strscpy(name, parms->name, IFNAMSIZ); + strscpy(name, parms->name); } else { - strcpy(name, "ip6gre%d"); + strscpy(name, "ip6gre%d"); } dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6gre_tunnel_setup); @@ -1469,7 +1469,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); + strscpy(tunnel->parms.name, dev->name); ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) @@ -1529,7 +1529,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) tunnel->dev = dev; tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); + strscpy(tunnel->parms.name, dev->name); tunnel->hlen = sizeof(struct ipv6hdr) + 4; } @@ -1842,7 +1842,7 @@ static int ip6erspan_tap_init(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); + strscpy(tunnel->parms.name, dev->name); ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) diff --git a/net/ipv6/ip6_icmp.c b/net/ipv6/ip6_icmp.c index 9e3574880cb0..233914b63bdb 100644 --- a/net/ipv6/ip6_icmp.c +++ b/net/ipv6/ip6_icmp.c @@ -54,11 +54,12 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) struct inet6_skb_parm parm = { 0 }; struct sk_buff *cloned_skb = NULL; enum ip_conntrack_info ctinfo; + enum ip_conntrack_dir dir; struct in6_addr orig_ip; struct nf_conn *ct; ct = nf_ct_get(skb_in, &ctinfo); - if (!ct || !(ct->status & IPS_SRC_NAT)) { + if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { __icmpv6_send(skb_in, type, code, info, &parm); return; } @@ -73,7 +74,8 @@ void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) goto out; orig_ip = 
ipv6_hdr(skb_in)->saddr; - ipv6_hdr(skb_in)->saddr = ct->tuplehash[0].tuple.src.u3.in6; + dir = CTINFO2DIR(ctinfo); + ipv6_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.in6; __icmpv6_send(skb_in, type, code, info, &parm); ipv6_hdr(skb_in)->saddr = orig_ip; out: diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1e1410237b6e..f904739e99b9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -60,7 +60,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst_dev(dst); + struct net_device *dev = dst_dev_rcu(dst); struct inet6_dev *idev = ip6_dst_idev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *daddr, *nexthop; @@ -70,15 +70,12 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * /* Be paranoid, rather than too clever. */ if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { - /* Make sure idev stays alive */ - rcu_read_lock(); + /* idev stays alive because we hold rcu_read_lock(). */ skb = skb_expand_head(skb, hh_len); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); - rcu_read_unlock(); return -ENOMEM; } - rcu_read_unlock(); } hdr = ipv6_hdr(skb); @@ -123,7 +120,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); - rcu_read_lock(); nexthop = rt6_nexthop(dst_rt6_info(dst), daddr); neigh = __ipv6_neigh_lookup_noref(dev, nexthop); @@ -131,7 +127,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dev, false); if (IS_ERR(neigh)) { - rcu_read_unlock(); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return -EINVAL; @@ -139,7 +134,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * } sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); - rcu_read_unlock(); return ret; } @@ -233,22 +227,29 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst_dev(dst), *indev = skb->dev; - struct inet6_dev *idev = ip6_dst_idev(dst); + struct net_device *dev, *indev = skb->dev; + struct inet6_dev *idev; + int ret; skb->protocol = htons(ETH_P_IPV6); + rcu_read_lock(); + dev = dst_dev_rcu(dst); + idev = ip6_dst_idev(dst); skb->dev = dev; if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); return 0; } - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, indev, dev, - ip6_finish_output, - !(IP6CB(skb)->flags & IP6SKB_REROUTED)); + ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, + net, sk, skb, indev, dev, + ip6_finish_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(ip6_output); @@ -268,35 +269,36 @@ bool ip6_autoflowlabel(struct net *net, const struct sock *sk) int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) { - struct net *net = sock_net(sk); const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct 
dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst_dev(dst); struct inet6_dev *idev = ip6_dst_idev(dst); struct hop_jumbo_hdr *hop_jumbo; int hoplen = sizeof(*hop_jumbo); + struct net *net = sock_net(sk); unsigned int head_room; + struct net_device *dev; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; int seg_len = skb->len; - int hlimit = -1; + int ret, hlimit = -1; u32 mtu; + rcu_read_lock(); + + dev = dst_dev_rcu(dst); head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev); if (opt) head_room += opt->opt_nflen + opt->opt_flen; if (unlikely(head_room > skb_headroom(skb))) { - /* Make sure idev stays alive */ - rcu_read_lock(); + /* idev stays alive while we hold rcu_read_lock(). */ skb = skb_expand_head(skb, head_room); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); - rcu_read_unlock(); - return -ENOBUFS; + ret = -ENOBUFS; + goto unlock; } - rcu_read_unlock(); } if (opt) { @@ -358,17 +360,21 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, * skb to its handler for processing */ skb = l3mdev_ip6_out((struct sock *)sk, skb); - if (unlikely(!skb)) - return 0; + if (unlikely(!skb)) { + ret = 0; + goto unlock; + } /* hooks should never assume socket lock is held. * we promote our socket to non const */ - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, - net, (struct sock *)sk, skb, NULL, dev, - dst_output); + ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + net, (struct sock *)sk, skb, NULL, dev, + dst_output); + goto unlock; } + ret = -EMSGSIZE; skb->dev = dev; /* ipv6_local_error() does not require socket lock, * we promote our socket to non const @@ -377,7 +383,9 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); - return -EMSGSIZE; +unlock: + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(ip6_xmit); @@ -1092,9 +1100,11 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, * sockets. * 2. oif also should be the same. */ - if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || + if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, + np->daddr_cache ? &sk->sk_v6_daddr : NULL) || #ifdef CONFIG_IPV6_SUBTREES - ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || + ip6_rt_check(&rt->rt6i_src, &fl6->saddr, + np->saddr_cache ? 
&np->saddr : NULL) || #endif (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) { dst_release(dst); diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 3262e81223df..6405072050e0 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1257,8 +1257,7 @@ route_lookup: */ max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct ipv6hdr) + dst->header_len + t->hlen; - if (max_headroom > READ_ONCE(dev->needed_headroom)) - WRITE_ONCE(dev->needed_headroom, max_headroom); + ip_tunnel_adj_headroom(dev, max_headroom); err = ip6_tnl_encap(skb, t, &proto, fl6); if (err) diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 0ff547a4bff7..cef3e0210744 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -40,7 +40,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->local_udp_port; - err = kernel_bind(sock, (struct sockaddr *)&udp6_addr, + err = kernel_bind(sock, (struct sockaddr_unsized *)&udp6_addr, sizeof(udp6_addr)); if (err < 0) goto error; @@ -52,7 +52,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, sizeof(udp6_addr.sin6_addr)); udp6_addr.sin6_port = cfg->peer_udp_port; err = kernel_connect(sock, - (struct sockaddr *)&udp6_addr, + (struct sockaddr_unsized *)&udp6_addr, sizeof(udp6_addr), 0); } if (err < 0) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index e66ec623972e..a61e742794f9 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -49,6 +49,7 @@ #include <net/xfrm.h> #include <net/compat.h> #include <net/seg6.h> +#include <net/psp.h> #include <linux/uaccess.h> @@ -107,7 +108,10 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; + + icsk->icsk_ext_hdr_len = + psp_sk_overhead(sk) + + opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 36ca27496b3c..016b572e7d6f 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -169,6 +169,29 @@ static int unsolicited_report_interval(struct inet6_dev *idev) return iv > 0 ? 
iv : 1; } +static struct net_device *ip6_mc_find_dev(struct net *net, + const struct in6_addr *group, + int ifindex) +{ + struct net_device *dev = NULL; + struct rt6_info *rt; + + if (ifindex == 0) { + rcu_read_lock(); + rt = rt6_lookup(net, group, NULL, 0, NULL, 0); + if (rt) { + dev = dst_dev_rcu(&rt->dst); + dev_hold(dev); + ip6_rt_put(rt); + } + rcu_read_unlock(); + } else { + dev = dev_get_by_index(net, ifindex); + } + + return dev; +} + /* * socket join on multicast group */ @@ -191,28 +214,13 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, } mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); - if (!mc_lst) return -ENOMEM; mc_lst->next = NULL; mc_lst->addr = *addr; - if (ifindex == 0) { - struct rt6_info *rt; - - rcu_read_lock(); - rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); - if (rt) { - dev = dst_dev(&rt->dst); - dev_hold(dev); - ip6_rt_put(rt); - } - rcu_read_unlock(); - } else { - dev = dev_get_by_index(net, ifindex); - } - + dev = ip6_mc_find_dev(net, addr, ifindex); if (!dev) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; @@ -302,27 +310,14 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) } EXPORT_SYMBOL(ipv6_sock_mc_drop); -static struct inet6_dev *ip6_mc_find_dev(struct net *net, - const struct in6_addr *group, - int ifindex) +static struct inet6_dev *ip6_mc_find_idev(struct net *net, + const struct in6_addr *group, + int ifindex) { - struct net_device *dev = NULL; + struct net_device *dev; struct inet6_dev *idev; - if (ifindex == 0) { - struct rt6_info *rt; - - rcu_read_lock(); - rt = rt6_lookup(net, group, NULL, 0, NULL, 0); - if (rt) { - dev = dst_dev(&rt->dst); - dev_hold(dev); - ip6_rt_put(rt); - } - rcu_read_unlock(); - } else { - dev = dev_get_by_index(net, ifindex); - } + dev = ip6_mc_find_dev(net, group, ifindex); if (!dev) return NULL; @@ -374,7 +369,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (!ipv6_addr_is_multicast(group)) return -EINVAL; - idev = ip6_mc_find_dev(net, group, pgsr->gsr_interface); + idev = ip6_mc_find_idev(net, group, pgsr->gsr_interface); if (!idev) return -ENODEV; @@ -509,7 +504,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, gsf->gf_fmode != MCAST_EXCLUDE) return -EINVAL; - idev = ip6_mc_find_dev(net, group, gsf->gf_interface); + idev = ip6_mc_find_idev(net, group, gsf->gf_interface); if (!idev) return -ENODEV; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 7d5abb3158ec..59d17b6f06bf 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -130,7 +130,7 @@ struct neigh_table nd_tbl = { [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, @@ -505,7 +505,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); - dev = dst_dev(dst); + dev = dst_dev_rcu(dst); idev = __in6_dev_get(dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); @@ -1449,7 +1449,7 @@ skip_defrtr: BASE_REACHABLE_TIME, rtime); NEIGH_VAR_SET(in6_dev->nd_parms, GC_STALETIME, 3 * rtime); - in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); + neigh_set_reach_time(in6_dev->nd_parms); in6_dev->tstamp = jiffies; send_ifinfo_notify = true; } @@ -1948,9 +1948,9 @@ int 
ndisc_ifinfo_sysctl_change(const struct ctl_table *ctl, int write, void *buf ret = -1; if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) { - if (ctl->data == &NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)) - idev->nd_parms->reachable_time = - neigh_rand_reach_time(NEIGH_VAR(idev->nd_parms, BASE_REACHABLE_TIME)); + if (ctl->data == NEIGH_VAR_PTR(idev->nd_parms, BASE_REACHABLE_TIME)) + neigh_set_reach_time(idev->nd_parms); + WRITE_ONCE(idev->tstamp, jiffies); inet6_ifinfo_notify(RTM_NEWLINK, idev); in6_dev_put(idev); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 45f9105f9ac1..46540a5a4331 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -63,7 +63,10 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff #ifdef CONFIG_XFRM if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && xfrm_decode_session(net, skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) { - skb_dst_set(skb, NULL); + /* ignore return value from skb_dstref_steal, xfrm_lookup takes + * care of dropping the refcnt if needed. + */ + skb_dstref_steal(skb); dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 276860f65baa..81daf82ddc2d 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -10,6 +10,7 @@ menu "IPv6: Netfilter Configuration" config IP6_NF_IPTABLES_LEGACY tristate "Legacy IP6 tables support" depends on INET && IPV6 && NETFILTER_XTABLES_LEGACY + depends on NETFILTER_XTABLES default m if NETFILTER_XTABLES_LEGACY help ip6tables is a legacy packet classifier. diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index 838295fa32e3..ef5b7e85cffa 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -12,6 +12,19 @@ #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> +static struct ipv6hdr * +nf_reject_ip6hdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int hoplimit); +static void +nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + const struct tcphdr *oth, unsigned int otcplen); +static const struct tcphdr * +nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *otcph, + unsigned int *otcplen, int hook); + static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -91,6 +104,32 @@ struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v6_tcp_reset); +static bool nf_skb_is_icmp6_unreach(const struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + u8 proto = ip6h->nexthdr; + u8 _type, *tp; + int thoff; + __be16 fo; + + thoff = ipv6_skip_exthdr(skb, ((u8 *)(ip6h + 1) - skb->data), &proto, &fo); + + if (thoff < 0 || thoff >= skb->len || fo != 0) + return false; + + if (proto != IPPROTO_ICMPV6) + return false; + + tp = skb_header_pointer(skb, + thoff + offsetof(struct icmp6hdr, icmp6_type), + sizeof(_type), &_type); + + if (!tp) + return false; + + return *tp == ICMPV6_DEST_UNREACH; +} + struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, @@ -104,6 +143,10 @@ struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, if (!nf_reject_ip6hdr_validate(oldskb)) return NULL; + /* Don't reply to ICMPV6_DEST_UNREACH with ICMPV6_DEST_UNREACH */ + if (nf_skb_is_icmp6_unreach(oldskb)) + return NULL; + /* 
Include "As much of invoking packet as possible without the ICMPv6 * packet exceeding the minimum IPv6 MTU" in the ICMP payload. */ @@ -146,9 +189,10 @@ struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach); -const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *otcph, - unsigned int *otcplen, int hook) +static const struct tcphdr * +nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *otcph, + unsigned int *otcplen, int hook) { const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); u8 proto; @@ -192,11 +236,11 @@ const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, return otcph; } -EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_get); -struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int hoplimit) +static struct ipv6hdr * +nf_reject_ip6hdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int hoplimit) { struct ipv6hdr *ip6h; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); @@ -216,11 +260,11 @@ struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, return ip6h; } -EXPORT_SYMBOL_GPL(nf_reject_ip6hdr_put); -void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - const struct tcphdr *oth, unsigned int otcplen) +static void +nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + const struct tcphdr *oth, unsigned int otcplen) { struct tcphdr *tcph; @@ -248,7 +292,6 @@ void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, csum_partial(tcph, sizeof(struct tcphdr), 0)); } -EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put); static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) { @@ -293,7 +336,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb, fl6.fl6_sport = otcph->dest; fl6.fl6_dport = otcph->source; - if (hook == NF_INET_PRE_ROUTING || hook == NF_INET_INGRESS) { + if (!skb_dst(oldskb)) { nf_ip6_route(net, &dst, flowi6_to_flowi(&fl6), false); if (!dst) return; @@ -397,8 +440,7 @@ void nf_send_unreach6(struct net *net, struct sk_buff *skb_in, if (hooknum == NF_INET_LOCAL_OUT && skb_in->dev == NULL) skb_in->dev = net->loopback_dev; - if ((hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_INGRESS) && - nf_reject6_fill_skb_dst(skb_in) < 0) + if (!skb_dst(skb_in) && nf_reject6_fill_skb_dst(skb_in) < 0) return; icmpv6_send(skb_in, ICMPV6_DEST_UNREACH, code, 0); diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index 9ea5ef56cb27..ced8bd44828e 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -83,8 +83,7 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff, { switch (protocol) { case IPPROTO_TCP: - return inet6_lookup(net, net->ipv4.tcp_death_row.hashinfo, - skb, doff, saddr, sport, daddr, dport, + return inet6_lookup(net, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: return udp6_lib_lookup(net, saddr, sport, daddr, dport, diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index 52f828bb5a83..b2f59ed9d7cc 100644 --- a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -80,7 +80,6 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; struct sock *sk; switch (protocol) { @@ -94,7 +93,7 @@ 
nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - sk = inet6_lookup_listener(net, hinfo, skb, + sk = inet6_lookup_listener(net, skb, thoff + __tcp_hdrlen(hp), saddr, sport, daddr, ntohs(dport), @@ -109,7 +108,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, */ break; case NF_TPROXY_LOOKUP_ESTABLISHED: - sk = __inet6_lookup_established(net, hinfo, saddr, sport, daddr, + sk = __inet6_lookup_established(net, saddr, sport, daddr, ntohs(dport), in->ifindex, 0); break; default: diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index d21fe27fe21e..1c9b283a4132 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -104,18 +104,20 @@ EXPORT_SYMBOL(ip6_find_1stfragopt); int ip6_dst_hoplimit(struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); + + rcu_read_lock(); if (hoplimit == 0) { - struct net_device *dev = dst_dev(dst); + struct net_device *dev = dst_dev_rcu(dst); struct inet6_dev *idev; - rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) hoplimit = READ_ONCE(idev->cnf.hop_limit); else hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit); - rcu_read_unlock(); } + rcu_read_unlock(); + return hoplimit; } EXPORT_SYMBOL(ip6_dst_hoplimit); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 82b0492923d4..e4afc651731a 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -45,7 +45,7 @@ static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr, return 0; } -static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, +static int ping_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { /* This check is replicated from __ip6_datagram_connect() and @@ -208,7 +208,6 @@ struct proto pingv6_prot = { .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, - .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 752327b10dde..73296f38c252 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -85,7 +85,6 @@ static const struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("Ip6OutTransmits", IPSTATS_MIB_OUTPKTS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_icmp6_list[] = { @@ -95,30 +94,10 @@ static const struct snmp_mib snmp6_icmp6_list[] = { SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), +/* ICMP6_MIB_RATELIMITHOST needs to be last, see snmp6_dev_seq_show(). 
*/ SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST), - SNMP_MIB_SENTINEL }; -/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */ -static const char *const icmp6type2name[256] = { - [ICMPV6_DEST_UNREACH] = "DestUnreachs", - [ICMPV6_PKT_TOOBIG] = "PktTooBigs", - [ICMPV6_TIME_EXCEED] = "TimeExcds", - [ICMPV6_PARAMPROB] = "ParmProblems", - [ICMPV6_ECHO_REQUEST] = "Echos", - [ICMPV6_ECHO_REPLY] = "EchoReplies", - [ICMPV6_MGM_QUERY] = "GroupMembQueries", - [ICMPV6_MGM_REPORT] = "GroupMembResponses", - [ICMPV6_MGM_REDUCTION] = "GroupMembReductions", - [ICMPV6_MLD2_REPORT] = "MLDv2Reports", - [NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements", - [NDISC_ROUTER_SOLICITATION] = "RouterSolicits", - [NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements", - [NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits", - [NDISC_REDIRECT] = "Redirects", -}; - - static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS), @@ -129,7 +108,6 @@ static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_udplite6_list[] = { @@ -141,7 +119,6 @@ static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS), - SNMP_MIB_SENTINEL }; static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) @@ -151,11 +128,31 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) /* print by name -- deprecated items */ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { + const char *p = NULL; int icmptype; - const char *p; + +#define CASE(TYP, STR) case TYP: p = STR; break; icmptype = i & 0xff; - p = icmp6type2name[icmptype]; + switch (icmptype) { +/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */ + CASE(ICMPV6_DEST_UNREACH, "DestUnreachs") + CASE(ICMPV6_PKT_TOOBIG, "PktTooBigs") + CASE(ICMPV6_TIME_EXCEED, "TimeExcds") + CASE(ICMPV6_PARAMPROB, "ParmProblems") + CASE(ICMPV6_ECHO_REQUEST, "Echos") + CASE(ICMPV6_ECHO_REPLY, "EchoReplies") + CASE(ICMPV6_MGM_QUERY, "GroupMembQueries") + CASE(ICMPV6_MGM_REPORT, "GroupMembResponses") + CASE(ICMPV6_MGM_REDUCTION, "GroupMembReductions") + CASE(ICMPV6_MLD2_REPORT, "MLDv2Reports") + CASE(NDISC_ROUTER_ADVERTISEMENT, "RouterAdvertisements") + CASE(NDISC_ROUTER_SOLICITATION, "RouterSolicits") + CASE(NDISC_NEIGHBOUR_ADVERTISEMENT, "NeighborAdvertisements") + CASE(NDISC_NEIGHBOUR_SOLICITATION, "NeighborSolicits") + CASE(NDISC_REDIRECT, "Redirects") + } +#undef CASE if (!p) /* don't print un-named types here */ continue; snprintf(name, sizeof(name), "Icmp6%s%s", @@ -182,35 +179,37 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) */ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib, atomic_long_t *smib, - const struct snmp_mib *itemlist) + const struct snmp_mib *itemlist, + int cnt) { unsigned long buff[SNMP_MIB_MAX]; int i; if (pcpumib) { - memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX); + memset(buff, 0, sizeof(unsigned long) * cnt); - snmp_get_cpu_field_batch(buff, itemlist, pcpumib); - for (i = 0; itemlist[i].name; i++) + snmp_get_cpu_field_batch_cnt(buff, itemlist, 
cnt, pcpumib); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, buff[i]); } else { - for (i = 0; itemlist[i].name; i++) + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, atomic_long_read(smib + itemlist[i].entry)); } } static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, - const struct snmp_mib *itemlist, size_t syncpoff) + const struct snmp_mib *itemlist, + int cnt, size_t syncpoff) { u64 buff64[SNMP_MIB_MAX]; int i; - memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX); + memset(buff64, 0, sizeof(u64) * cnt); - snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff); - for (i = 0; itemlist[i].name; i++) + snmp_get_cpu_field64_batch_cnt(buff64, itemlist, cnt, mib, syncpoff); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]); } @@ -219,14 +218,19 @@ static int snmp6_seq_show(struct seq_file *seq, void *v) struct net *net = (struct net *)seq->private; snmp6_seq_show_item64(seq, net->mib.ipv6_statistics, - snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_ipstats_list, + ARRAY_SIZE(snmp6_ipstats_list), + offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, net->mib.icmpv6_statistics, - NULL, snmp6_icmp6_list); + NULL, snmp6_icmp6_list, + ARRAY_SIZE(snmp6_icmp6_list)); snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs); snmp6_seq_show_item(seq, net->mib.udp_stats_in6, - NULL, snmp6_udp6_list); + NULL, snmp6_udp6_list, + ARRAY_SIZE(snmp6_udp6_list)); snmp6_seq_show_item(seq, net->mib.udplite_stats_in6, - NULL, snmp6_udplite6_list); + NULL, snmp6_udplite6_list, + ARRAY_SIZE(snmp6_udplite6_list)); return 0; } @@ -236,9 +240,14 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); snmp6_seq_show_item64(seq, idev->stats.ipv6, - snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_ipstats_list, + ARRAY_SIZE(snmp6_ipstats_list), + offsetof(struct ipstats_mib, syncp)); + + /* Per idev icmp stats do not have ICMP6_MIB_RATELIMITHOST */ snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, - snmp6_icmp6_list); + snmp6_icmp6_list, ARRAY_SIZE(snmp6_icmp6_list) - 1); + snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs); return 0; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4c3f8245c40f..b4cd05dba9b6 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -163,7 +163,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); continue; } @@ -214,7 +214,8 @@ bool raw6_local_deliver(struct sk_buff *skb, int nexthdr) } /* This cleans up af_inet6 a bit. 
-DaveM */ -static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int rawv6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -361,7 +362,7 @@ static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } @@ -389,7 +390,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) struct raw6_sock *rp = raw6_sk(sk); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } @@ -414,7 +415,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (inet_test_bit(HDRINCL, sk)) { if (skb_checksum_complete(skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } @@ -445,7 +446,7 @@ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); - if (np->rxpmtu && np->rxopt.bits.rxpmtu) + if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) return ipv6_recv_rxpmtu(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, &err); @@ -1175,6 +1176,7 @@ static int rawv6_init_sk(struct sock *sk) { struct raw6_sock *rp = raw6_sk(sk); + sk->sk_drop_counters = &rp->drop_counters; switch (inet_sk(sk)->inet_num) { case IPPROTO_ICMPV6: rp->checksum = 1; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3299cfa12e21..aee6a10b112a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2943,7 +2943,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (res.f6i->nh) { struct fib6_nh_match_arg arg = { - .dev = dst_dev(dst), + .dev = dst_dev_rcu(dst), .gw = &rt6->rt6i_gateway, }; @@ -3032,13 +3032,12 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, #endif ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? - &sk->sk_v6_daddr : NULL, + ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr), #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? 
- &np->saddr : + true : #endif - NULL); + false); } static bool ip6_redirect_nh_match(const struct fib6_result *res, @@ -3238,7 +3237,6 @@ EXPORT_SYMBOL_GPL(ip6_sk_redirect); static unsigned int ip6_default_advmss(const struct dst_entry *dst) { - struct net_device *dev = dst_dev(dst); unsigned int mtu = dst_mtu(dst); struct net *net; @@ -3246,7 +3244,7 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) rcu_read_lock(); - net = dev_net_rcu(dev); + net = dst_dev_net_rcu(dst); if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) mtu = net->ipv6.sysctl.ip6_rt_min_advmss; @@ -4301,7 +4299,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (res.f6i->nh) { struct fib6_nh_match_arg arg = { - .dev = dst_dev(dst), + .dev = dst_dev_rcu(dst), .gw = &rt->rt6i_gateway, }; diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 180da19c148c..a5c4c629b788 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -522,16 +522,10 @@ int __init seg6_init(void) if (err) goto out_unregister_iptun; - err = seg6_hmac_init(); - if (err) - goto out_unregister_seg6; - pr_info("Segment Routing with IPv6\n"); out: return err; -out_unregister_seg6: - seg6_local_exit(); out_unregister_iptun: seg6_iptunnel_exit(); out_unregister_genl: @@ -543,7 +537,6 @@ out_unregister_pernet: void seg6_exit(void) { - seg6_hmac_exit(); seg6_local_exit(); seg6_iptunnel_exit(); genl_unregister_family(&seg6_genl_family); diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index f78ecb6ad838..ee6bac0160ac 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -16,7 +16,6 @@ #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/mroute6.h> -#include <linux/slab.h> #include <linux/rhashtable.h> #include <linux/netfilter.h> @@ -34,7 +33,9 @@ #include <net/addrconf.h> #include <net/xfrm.h> -#include <crypto/hash.h> +#include <crypto/sha1.h> +#include <crypto/sha2.h> +#include <crypto/utils.h> #include <net/seg6.h> #include <net/genetlink.h> #include <net/seg6_hmac.h> @@ -77,17 +78,6 @@ static const struct rhashtable_params rht_params = { .obj_cmpfn = seg6_hmac_cmpfn, }; -static struct seg6_hmac_algo hmac_algos[] = { - { - .alg_id = SEG6_HMAC_ALGO_SHA1, - .name = "hmac(sha1)", - }, - { - .alg_id = SEG6_HMAC_ALGO_SHA256, - .name = "hmac(sha256)", - }, -}; - static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) { struct sr6_tlv_hmac *tlv; @@ -107,75 +97,13 @@ static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) return tlv; } -static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id) -{ - struct seg6_hmac_algo *algo; - int i, alg_count; - - alg_count = ARRAY_SIZE(hmac_algos); - for (i = 0; i < alg_count; i++) { - algo = &hmac_algos[i]; - if (algo->alg_id == alg_id) - return algo; - } - - return NULL; -} - -static int __do_hmac(struct seg6_hmac_info *hinfo, const char *text, u8 psize, - u8 *output, int outlen) -{ - struct seg6_hmac_algo *algo; - struct crypto_shash *tfm; - struct shash_desc *shash; - int ret, dgsize; - - algo = __hmac_get_algo(hinfo->alg_id); - if (!algo) - return -ENOENT; - - tfm = *this_cpu_ptr(algo->tfms); - - dgsize = crypto_shash_digestsize(tfm); - if (dgsize > outlen) { - pr_debug("sr-ipv6: __do_hmac: digest size too big (%d / %d)\n", - dgsize, outlen); - return -ENOMEM; - } - - ret = crypto_shash_setkey(tfm, hinfo->secret, hinfo->slen); - if (ret < 0) { - pr_debug("sr-ipv6: crypto_shash_setkey failed: err %d\n", ret); - goto failed; - } - - shash = *this_cpu_ptr(algo->shashs); - shash->tfm = tfm; - - ret = 
crypto_shash_digest(shash, text, psize, output); - if (ret < 0) { - pr_debug("sr-ipv6: crypto_shash_digest failed: err %d\n", ret); - goto failed; - } - - return dgsize; - -failed: - return ret; -} - int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, struct in6_addr *saddr, u8 *output) { __be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid); - u8 tmp_out[SEG6_HMAC_MAX_DIGESTSIZE]; - int plen, i, dgsize, wrsize; + int plen, i, ret = 0; char *ring, *off; - /* a 160-byte buffer for digest output allows to store highest known - * hash function (RadioGatun) with up to 1216 bits - */ - /* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */ plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16; @@ -218,22 +146,25 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, off += 16; } - dgsize = __do_hmac(hinfo, ring, plen, tmp_out, - SEG6_HMAC_MAX_DIGESTSIZE); + switch (hinfo->alg_id) { + case SEG6_HMAC_ALGO_SHA1: + hmac_sha1(&hinfo->key.sha1, ring, plen, output); + static_assert(SEG6_HMAC_FIELD_LEN > SHA1_DIGEST_SIZE); + memset(&output[SHA1_DIGEST_SIZE], 0, + SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE); + break; + case SEG6_HMAC_ALGO_SHA256: + hmac_sha256(&hinfo->key.sha256, ring, plen, output); + static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE); + break; + default: + WARN_ON_ONCE(1); + ret = -EINVAL; + break; + } local_unlock_nested_bh(&hmac_storage.bh_lock); local_bh_enable(); - - if (dgsize < 0) - return dgsize; - - wrsize = SEG6_HMAC_FIELD_LEN; - if (wrsize > dgsize) - wrsize = dgsize; - - memset(output, 0, SEG6_HMAC_FIELD_LEN); - memcpy(output, tmp_out, wrsize); - - return 0; + return ret; } EXPORT_SYMBOL(seg6_hmac_compute); @@ -280,7 +211,7 @@ bool seg6_hmac_validate_skb(struct sk_buff *skb) if (seg6_hmac_compute(hinfo, srh, &ipv6_hdr(skb)->saddr, hmac_output)) return false; - if (memcmp(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN) != 0) + if (crypto_memneq(hmac_output, tlv->hmac, SEG6_HMAC_FIELD_LEN)) return false; return true; @@ -304,6 +235,19 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) struct seg6_pernet_data *sdata = seg6_pernet(net); int err; + switch (hinfo->alg_id) { + case SEG6_HMAC_ALGO_SHA1: + hmac_sha1_preparekey(&hinfo->key.sha1, + hinfo->secret, hinfo->slen); + break; + case SEG6_HMAC_ALGO_SHA256: + hmac_sha256_preparekey(&hinfo->key.sha256, + hinfo->secret, hinfo->slen); + break; + default: + return -EINVAL; + } + err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node, rht_params); @@ -359,65 +303,6 @@ out: } EXPORT_SYMBOL(seg6_push_hmac); -static int seg6_hmac_init_algo(void) -{ - struct seg6_hmac_algo *algo; - struct crypto_shash *tfm; - struct shash_desc *shash; - int i, alg_count, cpu; - int ret = -ENOMEM; - - alg_count = ARRAY_SIZE(hmac_algos); - - for (i = 0; i < alg_count; i++) { - struct crypto_shash **p_tfm; - int shsize; - - algo = &hmac_algos[i]; - algo->tfms = alloc_percpu(struct crypto_shash *); - if (!algo->tfms) - goto error_out; - - for_each_possible_cpu(cpu) { - tfm = crypto_alloc_shash(algo->name, 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto error_out; - } - p_tfm = per_cpu_ptr(algo->tfms, cpu); - *p_tfm = tfm; - } - - p_tfm = raw_cpu_ptr(algo->tfms); - tfm = *p_tfm; - - shsize = sizeof(*shash) + crypto_shash_descsize(tfm); - - algo->shashs = alloc_percpu(struct shash_desc *); - if (!algo->shashs) - goto error_out; - - for_each_possible_cpu(cpu) { - shash = kzalloc_node(shsize, GFP_KERNEL, - cpu_to_node(cpu)); - if (!shash) - goto 
error_out; - *per_cpu_ptr(algo->shashs, cpu) = shash; - } - } - - return 0; - -error_out: - seg6_hmac_exit(); - return ret; -} - -int __init seg6_hmac_init(void) -{ - return seg6_hmac_init_algo(); -} - int __net_init seg6_hmac_net_init(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); @@ -425,36 +310,6 @@ int __net_init seg6_hmac_net_init(struct net *net) return rhashtable_init(&sdata->hmac_infos, &rht_params); } -void seg6_hmac_exit(void) -{ - struct seg6_hmac_algo *algo = NULL; - struct crypto_shash *tfm; - struct shash_desc *shash; - int i, alg_count, cpu; - - alg_count = ARRAY_SIZE(hmac_algos); - for (i = 0; i < alg_count; i++) { - algo = &hmac_algos[i]; - - if (algo->shashs) { - for_each_possible_cpu(cpu) { - shash = *per_cpu_ptr(algo->shashs, cpu); - kfree(shash); - } - free_percpu(algo->shashs); - } - - if (algo->tfms) { - for_each_possible_cpu(cpu) { - tfm = *per_cpu_ptr(algo->tfms, cpu); - crypto_free_shash(tfm); - } - free_percpu(algo->tfms); - } - } -} -EXPORT_SYMBOL(seg6_hmac_exit); - void __net_exit seg6_hmac_net_exit(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 12496ba1b7d4..cf37ad9686e6 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -848,6 +848,49 @@ static inline __be32 try_6rd(struct ip_tunnel *tunnel, return dst; } +static bool ipip6_tunnel_dst_find(struct sk_buff *skb, __be32 *dst, + bool is_isatap) +{ + const struct ipv6hdr *iph6 = ipv6_hdr(skb); + struct neighbour *neigh = NULL; + const struct in6_addr *addr6; + bool found = false; + int addr_type; + + if (skb_dst(skb)) + neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); + + if (!neigh) { + net_dbg_ratelimited("nexthop == NULL\n"); + return false; + } + + addr6 = (const struct in6_addr *)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (is_isatap) { + if ((addr_type & IPV6_ADDR_UNICAST) && + ipv6_addr_is_isatap(addr6)) { + *dst = addr6->s6_addr32[3]; + found = true; + } + } else { + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &ipv6_hdr(skb)->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) != 0) { + *dst = addr6->s6_addr32[3]; + found = true; + } + } + + neigh_release(neigh); + + return found; +} + /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. 
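The ipip6_tunnel_dst_find() helper added above resolves the IPv6 next hop via the neighbour cache and, for ISATAP or IPv4-compatible destinations, takes the IPv4 tunnel endpoint from the low 32 bits of that address (s6_addr32[3]); the xmit hunks that switch over to it follow. A minimal userspace sketch of just that address extraction, using portable socket types rather than the kernel helper:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

/* An IPv4-compatible IPv6 address (::a.b.c.d) carries the IPv4 address in
 * its last 32 bits -- the field the kernel code reads as s6_addr32[3].
 */
static struct in_addr embedded_ipv4(const struct in6_addr *a6)
{
        struct in_addr a4;

        memcpy(&a4, &a6->s6_addr[12], sizeof(a4));
        return a4;
}

int main(void)
{
        struct in6_addr a6;
        struct in_addr a4;
        char buf[INET_ADDRSTRLEN];

        inet_pton(AF_INET6, "::192.0.2.1", &a6);
        a4 = embedded_ipv4(&a6);
        printf("%s\n", inet_ntop(AF_INET, &a4, buf, sizeof(buf))); /* 192.0.2.1 */
        return 0;
}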
@@ -867,8 +910,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, __be32 dst = tiph->daddr; struct flowi4 fl4; int mtu; - const struct in6_addr *addr6; - int addr_type; u8 ttl; u8 protocol = IPPROTO_IPV6; int t_hlen = tunnel->hlen + sizeof(struct iphdr); @@ -877,64 +918,15 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, tos = ipv6_get_dsfield(iph6); /* ISATAP (RFC4214) - must come before 6to4 */ - if (dev->priv_flags & IFF_ISATAP) { - struct neighbour *neigh = NULL; - bool do_tx_error = false; - - if (skb_dst(skb)) - neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); - - if (!neigh) { - net_dbg_ratelimited("nexthop == NULL\n"); - goto tx_error; - } - - addr6 = (const struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); - - if ((addr_type & IPV6_ADDR_UNICAST) && - ipv6_addr_is_isatap(addr6)) - dst = addr6->s6_addr32[3]; - else - do_tx_error = true; - - neigh_release(neigh); - if (do_tx_error) - goto tx_error; - } + if ((dev->priv_flags & IFF_ISATAP) && + !ipip6_tunnel_dst_find(skb, &dst, true)) + goto tx_error; if (!dst) dst = try_6rd(tunnel, &iph6->daddr); - if (!dst) { - struct neighbour *neigh = NULL; - bool do_tx_error = false; - - if (skb_dst(skb)) - neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); - - if (!neigh) { - net_dbg_ratelimited("nexthop == NULL\n"); - goto tx_error; - } - - addr6 = (const struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); - - if (addr_type == IPV6_ADDR_ANY) { - addr6 = &ipv6_hdr(skb)->daddr; - addr_type = ipv6_addr_type(addr6); - } - - if ((addr_type & IPV6_ADDR_COMPATv4) != 0) - dst = addr6->s6_addr32[3]; - else - do_tx_error = true; - - neigh_release(neigh); - if (do_tx_error) - goto tx_error; - } + if (!dst && !ipip6_tunnel_dst_find(skb, &dst, false)) + goto tx_error; flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark, tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index f0ee1a909771..7e007f013ec8 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -16,6 +16,7 @@ #include <net/secure_seq.h> #include <net/ipv6.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) @@ -264,6 +265,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, dst); + tcp_rsk(req)->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, dst); if (!ret) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7577e7eb2c97..280fe5978559 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -62,12 +62,12 @@ #include <net/hotdata.h> #include <net/busy_poll.h> #include <net/rstreason.h> +#include <net/psp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <crypto/hash.h> -#include <linux/scatterlist.h> +#include <crypto/md5.h> #include <trace/events/tcp.h> @@ -118,7 +118,7 @@ static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) ipv6_hdr(skb)->saddr.s6_addr32); } -static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, +static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { /* This check is replicated from tcp_v6_connect() and intended to @@ -133,7 +133,7 @@ static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len); } 
-static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, +static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; @@ -238,7 +238,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif - err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); + err = tcp_v4_connect(sk, (struct sockaddr_unsized *)&sin, sizeof(sin)); if (err) { icsk->icsk_ext_hdr_len = exthdrlen; @@ -299,12 +299,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, false, false); - icsk->icsk_ext_hdr_len = 0; + icsk->icsk_ext_hdr_len = psp_sk_overhead(sk); if (opt) - icsk->icsk_ext_hdr_len = opt->opt_flen + - opt->opt_nflen; + icsk->icsk_ext_hdr_len += opt->opt_flen + + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); @@ -388,8 +388,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, bool fatal; int err; - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->daddr, th->dest, + sk = __inet6_lookup_established(net, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), skb->dev->ifindex, inet6_sdif(skb)); @@ -545,6 +544,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { + tcp_rsk(req)->syn_ect_snt = np->tclass & INET_ECN_MASK; __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); @@ -690,69 +690,45 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, cmd.tcpm_key, cmd.tcpm_keylen); } -static int tcp_v6_md5_hash_headers(struct tcp_sigpool *hp, - const struct in6_addr *daddr, - const struct in6_addr *saddr, - const struct tcphdr *th, int nbytes) +static void tcp_v6_md5_hash_headers(struct md5_ctx *ctx, + const struct in6_addr *daddr, + const struct in6_addr *saddr, + const struct tcphdr *th, int nbytes) { - struct tcp6_pseudohdr *bp; - struct scatterlist sg; - struct tcphdr *_th; - - bp = hp->scratch; - /* 1. 
TCP pseudo-header (RFC2460) */ - bp->saddr = *saddr; - bp->daddr = *daddr; - bp->protocol = cpu_to_be32(IPPROTO_TCP); - bp->len = cpu_to_be32(nbytes); - - _th = (struct tcphdr *)(bp + 1); - memcpy(_th, th, sizeof(*th)); - _th->check = 0; - - sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); - ahash_request_set_crypt(hp->req, &sg, NULL, - sizeof(*bp) + sizeof(*th)); - return crypto_ahash_update(hp->req); + struct { + struct tcp6_pseudohdr ip; /* TCP pseudo-header (RFC2460) */ + struct tcphdr tcp; + } h; + + h.ip.saddr = *saddr; + h.ip.daddr = *daddr; + h.ip.protocol = cpu_to_be32(IPPROTO_TCP); + h.ip.len = cpu_to_be32(nbytes); + h.tcp = *th; + h.tcp.check = 0; + md5_update(ctx, (const u8 *)&h, sizeof(h.ip) + sizeof(h.tcp)); } -static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, - const struct in6_addr *daddr, struct in6_addr *saddr, - const struct tcphdr *th) +static noinline_for_stack void +tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, + const struct in6_addr *daddr, struct in6_addr *saddr, + const struct tcphdr *th) { - struct tcp_sigpool hp; - - if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) - goto clear_hash_nostart; - - if (crypto_ahash_init(hp.req)) - goto clear_hash; - if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_key(&hp, key)) - goto clear_hash; - ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); - if (crypto_ahash_final(hp.req)) - goto clear_hash; - - tcp_sigpool_end(&hp); - return 0; + struct md5_ctx ctx; -clear_hash: - tcp_sigpool_end(&hp); -clear_hash_nostart: - memset(md5_hash, 0, 16); - return 1; + md5_init(&ctx); + tcp_v6_md5_hash_headers(&ctx, daddr, saddr, th, th->doff << 2); + tcp_md5_hash_key(&ctx, key); + md5_final(&ctx, md5_hash); } -static int tcp_v6_md5_hash_skb(char *md5_hash, - const struct tcp_md5sig_key *key, - const struct sock *sk, - const struct sk_buff *skb) +static noinline_for_stack void +tcp_v6_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, + const struct sock *sk, const struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); const struct in6_addr *saddr, *daddr; - struct tcp_sigpool hp; + struct md5_ctx ctx; if (sk) { /* valid for establish/request sockets */ saddr = &sk->sk_v6_rcv_saddr; @@ -763,30 +739,11 @@ static int tcp_v6_md5_hash_skb(char *md5_hash, daddr = &ip6h->daddr; } - if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) - goto clear_hash_nostart; - - if (crypto_ahash_init(hp.req)) - goto clear_hash; - - if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) - goto clear_hash; - if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) - goto clear_hash; - if (tcp_md5_hash_key(&hp, key)) - goto clear_hash; - ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); - if (crypto_ahash_final(hp.req)) - goto clear_hash; - - tcp_sigpool_end(&hp); - return 0; - -clear_hash: - tcp_sigpool_end(&hp); -clear_hash_nostart: - memset(md5_hash, 0, 16); - return 1; + md5_init(&ctx); + tcp_v6_md5_hash_headers(&ctx, daddr, saddr, th, skb->len); + tcp_md5_hash_skb_data(&ctx, skb, th->doff << 2); + tcp_md5_hash_key(&ctx, key); + md5_final(&ctx, md5_hash); } #endif @@ -839,7 +796,6 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset, - .syn_ack_timeout = tcp_syn_ack_timeout, }; const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { @@ -973,6 +929,7 @@ static void tcp_v6_send_response(const struct sock 
*sk, struct sk_buff *skb, u32 if (sk) { /* unconstify the socket only to attach it to buff with care. */ skb_set_owner_edemux(buff, (struct sock *)sk); + psp_reply_set_decrypted(sk, buff); if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; @@ -1030,7 +987,6 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, int oif = 0; #ifdef CONFIG_TCP_MD5SIG unsigned char newhash[16]; - int genhash; struct sock *sk1 = NULL; #endif @@ -1073,8 +1029,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - NULL, 0, &ipv6h->saddr, th->source, + sk1 = inet6_lookup_listener(net, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), dif, sdif); if (!sk1) @@ -1090,8 +1045,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, goto out; key.type = TCP_KEY_MD5; - genhash = tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb); - if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) + tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb); + if (memcmp(md5_hash_location, newhash, 16) != 0) goto out; } #endif @@ -1385,7 +1340,9 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * if (!newsk) return NULL; - inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); + newinet = inet_sk(newsk); + newinet->pinet6 = tcp_inet6_sk(newsk); + newinet->ipv6_fl_list = NULL; newnp = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); @@ -1404,7 +1361,6 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = inet_iif(skb); @@ -1431,17 +1387,17 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * ireq = inet_rsk(req); if (sk_acceptq_is_full(sk)) - goto out_overflow; + goto exit_overflow; if (!dst) { dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP); if (!dst) - goto out; + goto exit; } newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) - goto out_nonewsk; + goto exit_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks @@ -1452,15 +1408,17 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * newsk->sk_gso_type = SKB_GSO_TCPV6; inet6_sk_rx_dst_set(newsk, skb); - inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); + newinet = inet_sk(newsk); + newinet->pinet6 = tcp_inet6_sk(newsk); + newinet->ipv6_fl_list = NULL; + newinet->inet_opt = NULL; newtp = tcp_sk(newsk); - newinet = inet_sk(newsk); newnp = tcp_inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, false, false); newnp->saddr = ireq->ir_v6_loc_addr; @@ -1468,10 +1426,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * First: no IPv4 options. 
*/ - newinet->inet_opt = NULL; newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; /* Clone RX bits */ newnp->rxopt.all = np->rxopt.all; @@ -1525,25 +1481,19 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * const union tcp_md5_addr *addr; addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr; - if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) { - inet_csk_prepare_forced_close(newsk); - tcp_done(newsk); - goto out; - } + if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) + goto put_and_exit; } } #endif #ifdef CONFIG_TCP_AO /* Copy over tcp_ao_info if any */ if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6)) - goto out; /* OOM */ + goto put_and_exit; /* OOM */ #endif - if (__inet_inherit_port(sk, newsk) < 0) { - inet_csk_prepare_forced_close(newsk); - tcp_done(newsk); - goto out; - } + if (__inet_inherit_port(sk, newsk) < 0) + goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), &found_dup_sk); if (*own_req) { @@ -1570,13 +1520,17 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * return newsk; -out_overflow: +exit_overflow: __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -out_nonewsk: +exit_nonewsk: dst_release(dst); -out: +exit: tcp_listendrop(sk); return NULL; +put_and_exit: + inet_csk_prepare_forced_close(newsk); + tcp_done(newsk); + goto exit; } INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, @@ -1608,6 +1562,10 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); + reason = psp_sk_rx_policy_check(sk, skb); + if (reason) + goto err_discard; + /* * socket locking is here for SMP purposes as backlog rcv * is currently called with bh processing disabled. 
@@ -1687,6 +1645,7 @@ csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); +err_discard: TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1789,7 +1748,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) hdr = ipv6_hdr(skb); lookup: - sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), + sk = __inet6_lookup_skb(skb, __tcp_hdrlen(th), th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) @@ -1811,7 +1770,7 @@ lookup: &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } @@ -1950,7 +1909,7 @@ discard_it: return 0; discard_and_relse: - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; @@ -1976,8 +1935,7 @@ do_time_wait: { struct sock *sk2; - sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), + sk2 = inet6_lookup_listener(net, skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), @@ -1992,6 +1950,10 @@ do_time_wait: __this_cpu_write(tcp_tw_isn, isn); goto process; } + + drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); + if (drop_reason) + break; } /* to ACK */ fallthrough; @@ -2029,8 +1991,7 @@ void tcp_v6_early_demux(struct sk_buff *skb) return; /* Note : We use inet6_iif() here, not tcp_v6_iif() */ - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->saddr, th->source, + sk = __inet6_lookup_established(net, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), inet6_iif(skb), inet6_sdif(skb)); if (sk) { @@ -2050,7 +2011,6 @@ void tcp_v6_early_demux(struct sk_buff *skb) static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), - .twsk_destructor = tcp_twsk_destructor, }; INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) @@ -2117,6 +2077,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; + +static void tcp6_destruct_sock(struct sock *sk) +{ + tcp_md5_destruct_sock(sk); + tcp_ao_destroy_sock(sk, false); + inet6_sock_destruct(sk); +} #endif /* NOTE: A lot of things set to zero explicitly by call to @@ -2132,6 +2099,7 @@ static int tcp_v6_init_sock(struct sock *sk) #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; + sk->sk_destruct = tcp6_destruct_sock; #endif return 0; @@ -2195,13 +2163,13 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; - timer_expires = icsk_timeout(icsk); + timer_expires = tcp_timeout_expires(sp); } else if (icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; - timer_expires = icsk_timeout(icsk); - } else if (timer_pending(&sp->sk_timer)) { + timer_expires = tcp_timeout_expires(sp); + } else if (timer_pending(&icsk->icsk_keepalive_timer)) { timer_active = 2; - timer_expires = sp->sk_timer.expires; + timer_expires = icsk->icsk_keepalive_timer.expires; } else { timer_active = 0; timer_expires = jiffies; @@ -2230,9 +2198,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), - icsk->icsk_retransmits, + READ_ONCE(icsk->icsk_retransmits), 
from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), - icsk->icsk_probes_out, + READ_ONCE(icsk->icsk_probes_out), sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, jiffies_to_clock_t(icsk->icsk_rto), @@ -2342,7 +2310,7 @@ struct proto tcpv6_prot = { .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .hash = inet6_hash, + .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .put_port = inet_put_port, @@ -2358,7 +2326,6 @@ struct proto tcpv6_prot = { .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, - .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index a8a04f441e78..effeba58630b 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -36,8 +36,7 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet6_get_iif_sdif(skb, &iif, &sdif); hdr = skb_gro_network_header(skb); net = dev_net_rcu(skb->dev); - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->saddr, th->source, + sk = __inet6_lookup_established(net, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6a68f77da44b..794c13674e8a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -67,10 +67,11 @@ static void udpv6_destruct_sock(struct sock *sk) int udpv6_init_sock(struct sock *sk) { - udp_lib_init_sock(sk); + int res = udp_lib_init_sock(sk); + sk->sk_destruct = udpv6_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); - return 0; + return res; } INDIRECT_CALLABLE_SCOPE @@ -260,7 +261,7 @@ rescore: /* compute_score is too long of a function to be * inlined, and calling it again here yields - * measureable overhead for some + * measurable overhead for some * workloads. Work around it by jumping * backwards to rescore 'result'. 
*/ @@ -449,7 +450,7 @@ struct sock *udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr EXPORT_SYMBOL_GPL(udp6_lib_lookup); #endif -/* do not use the scratch area len for jumbogram: their length execeeds the +/* do not use the scratch area len for jumbogram: their length exceeds the * scratch area space; note that the IP6CB flags is still in the first * cacheline, so checking for jumbograms is cheap */ @@ -479,7 +480,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); - if (np->rxpmtu && np->rxopt.bits.rxpmtu) + if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: @@ -524,7 +525,7 @@ try_again: } if (unlikely(err)) { if (!peeking) { - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); @@ -908,7 +909,7 @@ csum_error: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -1013,7 +1014,7 @@ start_lookup: } nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP6_INC_STATS(net, UDP_MIB_INERRORS, @@ -1048,7 +1049,7 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } -/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and +/* wrapper for udp_queue_rcv_skb taking care of csum conversion and * return code conversion for ip layer consumption */ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, @@ -1281,7 +1282,7 @@ static void udp_v6_flush_pending_frames(struct sock *sk) } } -static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, +static int udpv6_pre_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { if (addr_len < offsetofend(struct sockaddr, sa_family)) @@ -1302,7 +1303,8 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len); } -static int udpv6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int udpv6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { int res; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index d8445ac1b2e4..046f13b1d77a 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -154,8 +154,6 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) ip6_gro_compute_pseudo); skip: - NAPI_GRO_CB(skb)->is_ipv6 = 1; - if (static_branch_unlikely(&udpv6_encap_needed_key)) sk = udp6_gro_lookup_skb(skb, uh->source, uh->dest); diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 5120a763da0d..0a0eeaed0591 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -334,7 +334,7 @@ static void __net_exit xfrm6_tunnel_net_exit(struct net *net) struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); unsigned int i; - xfrm_state_flush(net, IPSEC_PROTO_ANY, false); + xfrm_state_flush(net, 0, false); xfrm_flush_gc(); for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index cc2b3c44bc05..1e62fbc22cb7 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ 
-10,8 +10,7 @@ * Ursula Braun <ursula.braun@de.ibm.com> */ -#define KMSG_COMPONENT "af_iucv" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "af_iucv: " fmt #include <linux/filter.h> #include <linux/module.h> @@ -553,16 +552,17 @@ static void __iucv_auto_name(struct iucv_sock *iucv) { char name[12]; - sprintf(name, "%08x", atomic_inc_return(&iucv_sk_list.autobind_name)); + scnprintf(name, sizeof(name), + "%08x", atomic_inc_return(&iucv_sk_list.autobind_name)); while (__iucv_get_sock_by_name(name)) { - sprintf(name, "%08x", - atomic_inc_return(&iucv_sk_list.autobind_name)); + scnprintf(name, sizeof(name), "%08x", + atomic_inc_return(&iucv_sk_list.autobind_name)); } memcpy(iucv->src_name, name, 8); } /* Bind an unbound socket */ -static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr, +static int iucv_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr); @@ -667,7 +667,7 @@ static int iucv_sock_autobind(struct sock *sk) return err; } -static int afiucv_path_connect(struct socket *sock, struct sockaddr *addr) +static int afiucv_path_connect(struct socket *sock, struct sockaddr_unsized *addr) { DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr); struct sock *sk = sock->sk; @@ -713,7 +713,7 @@ done: } /* Connect an unconnected socket */ -static int iucv_sock_connect(struct socket *sock, struct sockaddr *addr, +static int iucv_sock_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { DECLARE_SOCKADDR(struct sockaddr_iucv *, sa, addr); @@ -1187,7 +1187,7 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb, IUCV_SKB_CB(skb)->offset = 0; if (sk_filter(sk, skb)) { - atomic_inc(&sk->sk_drops); /* skb rejected by filter */ + sk_drops_inc(sk); /* skb rejected by filter */ kfree_skb(skb); return; } @@ -2011,7 +2011,7 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb) skb_reset_network_header(skb); IUCV_SKB_CB(skb)->offset = 0; if (sk_filter(sk, skb)) { - atomic_inc(&sk->sk_drops); /* skb rejected by filter */ + sk_drops_inc(sk); /* skb rejected by filter */ kfree_skb(skb); return NET_RX_SUCCESS; } diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 473a7847d80b..da2af413c89d 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -20,8 +20,7 @@ * CP Programming Service, IBM document # SC24-5760 */ -#define KMSG_COMPONENT "iucv" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "iucv: " fmt #include <linux/kernel_stat.h> #include <linux/export.h> @@ -95,7 +94,7 @@ struct device *iucv_alloc_device(const struct attribute_group **attrs, if (!dev) goto out_error; va_start(vargs, fmt); - vsnprintf(buf, sizeof(buf), fmt, vargs); + vscnprintf(buf, sizeof(buf), fmt, vargs); rc = dev_set_name(dev, "%s", buf); va_end(vargs); if (rc) diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig index 16f39f2565d9..66660a06cacf 100644 --- a/net/kcm/Kconfig +++ b/net/kcm/Kconfig @@ -7,5 +7,5 @@ config AF_KCM select STREAM_PARSER help KCM (Kernel Connection Multiplexor) sockets provide a method - for multiplexing messages of a message based application - protocol over kernel connectons (e.g. TCP connections). + for multiplexing messages of a message-based application + protocol over kernel connections (e.g. TCP connections). 
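The af_iucv hunk above replaces sprintf() with scnprintf() when generating "%08x" autobind names into a 12-byte buffer; a 32-bit counter formatted that way needs at most 8 characters plus the NUL, so the bounded write always fits. A small userspace sketch of the same pattern -- the helper name below is illustrative, not part of the patch, and plain snprintf() stands in for the kernel's scnprintf():

#include <stdint.h>
#include <stdio.h>

/* Format a 32-bit counter as zero-padded hex, never writing past the
 * buffer -- the bounded behaviour the patch gets by moving to scnprintf().
 */
static void make_autobind_name(char *name, size_t len, uint32_t counter)
{
        snprintf(name, len, "%08x", counter);
}

int main(void)
{
        char name[12];

        make_autobind_name(name, sizeof(name), 0xdeadbeef);
        printf("%s\n", name);   /* prints "deadbeef" */
        return 0;
}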
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index a4971e6fa943..5dd7e0509a48 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -430,7 +430,7 @@ static void psock_write_space(struct sock *sk) /* Check if the socket is reserved so someone is waiting for sending. */ kcm = psock->tx_kcm; - if (kcm && !unlikely(kcm->tx_stopped)) + if (kcm) queue_work(kcm_wq, &kcm->tx_work); spin_unlock_bh(&mux->lock); @@ -1560,24 +1560,16 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } case SIOCKCMCLONE: { struct kcm_clone info; - struct file *file; - info.fd = get_unused_fd_flags(0); - if (unlikely(info.fd < 0)) - return info.fd; + FD_PREPARE(fdf, 0, kcm_clone(sock)); + if (fdf.err) + return fdf.err; - file = kcm_clone(sock); - if (IS_ERR(file)) { - put_unused_fd(info.fd); - return PTR_ERR(file); - } - if (copy_to_user((void __user *)arg, &info, - sizeof(info))) { - put_unused_fd(info.fd); - fput(file); + info.fd = fd_prepare_fd(fdf); + if (copy_to_user((void __user *)arg, &info, sizeof(info))) return -EFAULT; - } - fd_install(info.fd, file); + + fd_publish(fdf); err = 0; break; } @@ -1693,12 +1685,6 @@ static int kcm_release(struct socket *sock) */ __skb_queue_purge(&sk->sk_write_queue); - /* Set tx_stopped. This is checked when psock is bound to a kcm and we - * get a writespace callback. This prevents further work being queued - * from the callback (unbinding the psock occurs after canceling work. - */ - kcm->tx_stopped = 1; - release_sock(sk); spin_lock_bh(&mux->lock); @@ -1714,7 +1700,7 @@ static int kcm_release(struct socket *sock) /* Cancel work. After this point there should be no outside references * to the kcm socket. */ - cancel_work_sync(&kcm->tx_work); + disable_work_sync(&kcm->tx_work); lock_sock(sk); psock = kcm->tx_psock; diff --git a/net/key/af_key.c b/net/key/af_key.c index 2ebde0352245..571200433aa9 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3903,6 +3903,8 @@ static int __init ipsec_pfkey_init(void) { int err = proto_register(&key_proto, 0); + pr_warn_once("PFKEY is deprecated and scheduled to be removed in 2027, " + "please contact the netdev mailing list\n"); if (err != 0) goto out; diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 369a2f2e459c..687c1366a4d0 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1246,9 +1246,9 @@ static int l2tp_xmit_core(struct l2tp_session *session, struct sk_buff *skb, uns else l2tp_build_l2tpv3_header(session, __skb_push(skb, session->hdr_len)); - /* Reset skb netfilter state */ - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED | IPSKB_REROUTED); + /* Reset control buffer */ + memset(skb->cb, 0, sizeof(skb->cb)); + nf_reset_ct(skb); /* L2TP uses its own lockdep subclass to avoid lockdep splats caused by @@ -1503,7 +1503,7 @@ static int l2tp_tunnel_sock_create(struct net *net, memcpy(&ip6_addr.l2tp_addr, cfg->local_ip6, sizeof(ip6_addr.l2tp_addr)); ip6_addr.l2tp_conn_id = tunnel_id; - err = kernel_bind(sock, (struct sockaddr *)&ip6_addr, + err = kernel_bind(sock, (struct sockaddr_unsized *)&ip6_addr, sizeof(ip6_addr)); if (err < 0) goto out; @@ -1513,7 +1513,7 @@ static int l2tp_tunnel_sock_create(struct net *net, sizeof(ip6_addr.l2tp_addr)); ip6_addr.l2tp_conn_id = peer_tunnel_id; err = kernel_connect(sock, - (struct sockaddr *)&ip6_addr, + (struct sockaddr_unsized *)&ip6_addr, sizeof(ip6_addr), 0); if (err < 0) goto out; @@ -1530,7 +1530,7 @@ static int l2tp_tunnel_sock_create(struct net 
*net, ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->local_ip; ip_addr.l2tp_conn_id = tunnel_id; - err = kernel_bind(sock, (struct sockaddr *)&ip_addr, + err = kernel_bind(sock, (struct sockaddr_unsized *)&ip_addr, sizeof(ip_addr)); if (err < 0) goto out; @@ -1538,7 +1538,7 @@ static int l2tp_tunnel_sock_create(struct net *net, ip_addr.l2tp_family = AF_INET; ip_addr.l2tp_addr = cfg->peer_ip; ip_addr.l2tp_conn_id = peer_tunnel_id; - err = kernel_connect(sock, (struct sockaddr *)&ip_addr, + err = kernel_connect(sock, (struct sockaddr_unsized *)&ip_addr, sizeof(ip_addr), 0); if (err < 0) goto out; diff --git a/net/l2tp/l2tp_debugfs.c b/net/l2tp/l2tp_debugfs.c index 2d0c8275a3a8..5cfaab7d0890 100644 --- a/net/l2tp/l2tp_debugfs.c +++ b/net/l2tp/l2tp_debugfs.c @@ -163,7 +163,7 @@ static void l2tp_dfs_seq_tunnel_show(struct seq_file *m, void *v) seq_printf(m, " %d sessions, refcnt %d/%d\n", session_count, tunnel->sock ? refcount_read(&tunnel->sock->sk_refcnt) : 0, refcount_read(&tunnel->ref_count)); - seq_printf(m, " %08x rx %ld/%ld/%ld rx %ld/%ld/%ld\n", + seq_printf(m, " %08x tx %ld/%ld/%ld rx %ld/%ld/%ld\n", 0, atomic_long_read(&tunnel->stats.tx_packets), atomic_long_read(&tunnel->stats.tx_bytes), diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 29795d2839e8..cac1ff59cb83 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -267,7 +267,8 @@ static void l2tp_ip_destroy_sock(struct sock *sk) } } -static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int l2tp_ip_bind(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *)uaddr; @@ -328,7 +329,8 @@ out: return ret; } -static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int l2tp_ip_connect(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr; struct l2tp_ip_net *pn = l2tp_ip_pernet(sock_net(sk)); diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index ea232f338dcb..05a396ba6a3e 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -280,7 +280,8 @@ static void l2tp_ip6_destroy_sock(struct sock *sk) } } -static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int l2tp_ip6_bind(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); @@ -383,7 +384,7 @@ out_unlock: return err; } -static int l2tp_ip6_connect(struct sock *sk, struct sockaddr *uaddr, +static int l2tp_ip6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_l2tpip6 *lsa = (struct sockaddr_l2tpip6 *)uaddr; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index fc5c2fd8f34c..ae4543d5597b 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -129,22 +129,12 @@ static const struct ppp_channel_ops pppol2tp_chan_ops = { static const struct proto_ops pppol2tp_ops; -/* Retrieves the pppol2tp socket associated to a session. - * A reference is held on the returned socket, so this function must be paired - * with sock_put(). - */ +/* Retrieves the pppol2tp socket associated to a session. 
*/ static struct sock *pppol2tp_session_get_sock(struct l2tp_session *session) { struct pppol2tp_session *ps = l2tp_session_priv(session); - struct sock *sk; - - rcu_read_lock(); - sk = rcu_dereference(ps->sk); - if (sk) - sock_hold(sk); - rcu_read_unlock(); - return sk; + return rcu_dereference(ps->sk); } /* Helpers to obtain tunnel/session contexts from sockets. @@ -206,14 +196,13 @@ end: static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int data_len) { - struct pppol2tp_session *ps = l2tp_session_priv(session); - struct sock *sk = NULL; + struct sock *sk; /* If the socket is bound, send it in to PPP's input queue. Otherwise * queue it on the session socket. */ rcu_read_lock(); - sk = rcu_dereference(ps->sk); + sk = pppol2tp_session_get_sock(session); if (!sk) goto no_sock; @@ -510,13 +499,14 @@ static void pppol2tp_show(struct seq_file *m, void *arg) struct l2tp_session *session = arg; struct sock *sk; + rcu_read_lock(); sk = pppol2tp_session_get_sock(session); if (sk) { struct pppox_sock *po = pppox_sk(sk); seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); - sock_put(sk); } + rcu_read_unlock(); } static void pppol2tp_session_init(struct l2tp_session *session) @@ -694,7 +684,7 @@ static struct l2tp_tunnel *pppol2tp_tunnel_get(struct net *net, /* connect() handler. Attach a PPPoX socket to a tunnel UDP socket */ -static int pppol2tp_connect(struct socket *sock, struct sockaddr *uservaddr, +static int pppol2tp_connect(struct socket *sock, struct sockaddr_unsized *uservaddr, int sockaddr_len, int flags) { struct sock *sk = sock->sk; @@ -1530,6 +1520,7 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) port = ntohs(inet->inet_sport); } + rcu_read_lock(); sk = pppol2tp_session_get_sock(session); if (sk) { state = sk->sk_state; @@ -1565,8 +1556,8 @@ static void pppol2tp_seq_session_show(struct seq_file *m, void *v) struct pppox_sock *po = pppox_sk(sk); seq_printf(m, " interface %s\n", ppp_dev_name(&po->chan)); - sock_put(sk); } + rcu_read_unlock(); } static int pppol2tp_seq_show(struct seq_file *m, void *v) diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 5958a80fe14c..59d593bb5d18 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -337,7 +337,7 @@ out: * otherwise all hell will break loose. * Returns: 0 upon success, negative otherwise. */ -static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) +static int llc_ui_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addrlen) { struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; struct sock *sk = sock->sk; @@ -477,7 +477,7 @@ out: * This function will autobind if user did not previously call bind. * Returns: 0 upon success, negative otherwise. 
*/ -static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr, +static int llc_ui_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addrlen, int flags) { struct sock *sk = sock->sk; diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c index 48c04f89de20..0827965455dc 100644 --- a/net/mac80211/aes_cmac.c +++ b/net/mac80211/aes_cmac.c @@ -16,56 +16,48 @@ #include "key.h" #include "aes_cmac.h" -#define CMAC_TLEN 8 /* CMAC TLen = 64 bits (8 octets) */ -#define CMAC_TLEN_256 16 /* CMAC TLen = 128 bits (16 octets) */ #define AAD_LEN 20 -static const u8 zero[CMAC_TLEN_256]; +static const u8 zero[IEEE80211_CMAC_256_MIC_LEN]; -void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad, - const u8 *data, size_t data_len, u8 *mic) +int ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad, + const u8 *data, size_t data_len, u8 *mic, + unsigned int mic_len) { + int err; SHASH_DESC_ON_STACK(desc, tfm); u8 out[AES_BLOCK_SIZE]; const __le16 *fc; desc->tfm = tfm; - crypto_shash_init(desc); - crypto_shash_update(desc, aad, AAD_LEN); + err = crypto_shash_init(desc); + if (err) + return err; + err = crypto_shash_update(desc, aad, AAD_LEN); + if (err) + return err; fc = (const __le16 *)aad; if (ieee80211_is_beacon(*fc)) { /* mask Timestamp field to zero */ - crypto_shash_update(desc, zero, 8); - crypto_shash_update(desc, data + 8, data_len - 8 - CMAC_TLEN); + err = crypto_shash_update(desc, zero, 8); + if (err) + return err; + err = crypto_shash_update(desc, data + 8, + data_len - 8 - mic_len); + if (err) + return err; } else { - crypto_shash_update(desc, data, data_len - CMAC_TLEN); + err = crypto_shash_update(desc, data, data_len - mic_len); + if (err) + return err; } - crypto_shash_finup(desc, zero, CMAC_TLEN, out); + err = crypto_shash_finup(desc, zero, mic_len, out); + if (err) + return err; + memcpy(mic, out, mic_len); - memcpy(mic, out, CMAC_TLEN); -} - -void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad, - const u8 *data, size_t data_len, u8 *mic) -{ - SHASH_DESC_ON_STACK(desc, tfm); - const __le16 *fc; - - desc->tfm = tfm; - - crypto_shash_init(desc); - crypto_shash_update(desc, aad, AAD_LEN); - fc = (const __le16 *)aad; - if (ieee80211_is_beacon(*fc)) { - /* mask Timestamp field to zero */ - crypto_shash_update(desc, zero, 8); - crypto_shash_update(desc, data + 8, - data_len - 8 - CMAC_TLEN_256); - } else { - crypto_shash_update(desc, data, data_len - CMAC_TLEN_256); - } - crypto_shash_finup(desc, zero, CMAC_TLEN_256, mic); + return 0; } struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[], diff --git a/net/mac80211/aes_cmac.h b/net/mac80211/aes_cmac.h index 76817446fb83..5f971a8298cb 100644 --- a/net/mac80211/aes_cmac.h +++ b/net/mac80211/aes_cmac.h @@ -11,10 +11,9 @@ struct crypto_shash *ieee80211_aes_cmac_key_setup(const u8 key[], size_t key_len); -void ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad, - const u8 *data, size_t data_len, u8 *mic); -void ieee80211_aes_cmac_256(struct crypto_shash *tfm, const u8 *aad, - const u8 *data, size_t data_len, u8 *mic); +int ieee80211_aes_cmac(struct crypto_shash *tfm, const u8 *aad, + const u8 *data, size_t data_len, u8 *mic, + unsigned int mic_len); void ieee80211_aes_cmac_key_free(struct crypto_shash *tfm); #endif /* AES_CMAC_H */ diff --git a/net/mac80211/aes_gmac.c b/net/mac80211/aes_gmac.c index 512cab073f2e..811a83d8d525 100644 --- a/net/mac80211/aes_gmac.c +++ b/net/mac80211/aes_gmac.c @@ -24,15 +24,16 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 
*aad, u8 *nonce, const __le16 *fc; int ret; - if (data_len < GMAC_MIC_LEN) + if (data_len < IEEE80211_GMAC_MIC_LEN) return -EINVAL; - aead_req = kzalloc(reqsize + GMAC_MIC_LEN + GMAC_AAD_LEN, GFP_ATOMIC); + aead_req = kzalloc(reqsize + IEEE80211_GMAC_MIC_LEN + GMAC_AAD_LEN, + GFP_ATOMIC); if (!aead_req) return -ENOMEM; zero = (u8 *)aead_req + reqsize; - __aad = zero + GMAC_MIC_LEN; + __aad = zero + IEEE80211_GMAC_MIC_LEN; memcpy(__aad, aad, GMAC_AAD_LEN); fc = (const __le16 *)aad; @@ -41,15 +42,16 @@ int ieee80211_aes_gmac(struct crypto_aead *tfm, const u8 *aad, u8 *nonce, sg_init_table(sg, 5); sg_set_buf(&sg[0], __aad, GMAC_AAD_LEN); sg_set_buf(&sg[1], zero, 8); - sg_set_buf(&sg[2], data + 8, data_len - 8 - GMAC_MIC_LEN); - sg_set_buf(&sg[3], zero, GMAC_MIC_LEN); - sg_set_buf(&sg[4], mic, GMAC_MIC_LEN); + sg_set_buf(&sg[2], data + 8, + data_len - 8 - IEEE80211_GMAC_MIC_LEN); + sg_set_buf(&sg[3], zero, IEEE80211_GMAC_MIC_LEN); + sg_set_buf(&sg[4], mic, IEEE80211_GMAC_MIC_LEN); } else { sg_init_table(sg, 4); sg_set_buf(&sg[0], __aad, GMAC_AAD_LEN); - sg_set_buf(&sg[1], data, data_len - GMAC_MIC_LEN); - sg_set_buf(&sg[2], zero, GMAC_MIC_LEN); - sg_set_buf(&sg[3], mic, GMAC_MIC_LEN); + sg_set_buf(&sg[1], data, data_len - IEEE80211_GMAC_MIC_LEN); + sg_set_buf(&sg[2], zero, IEEE80211_GMAC_MIC_LEN); + sg_set_buf(&sg[3], mic, IEEE80211_GMAC_MIC_LEN); } memcpy(iv, nonce, GMAC_NONCE_LEN); @@ -78,7 +80,7 @@ struct crypto_aead *ieee80211_aes_gmac_key_setup(const u8 key[], err = crypto_aead_setkey(tfm, key, key_len); if (!err) - err = crypto_aead_setauthsize(tfm, GMAC_MIC_LEN); + err = crypto_aead_setauthsize(tfm, IEEE80211_GMAC_MIC_LEN); if (!err) return tfm; diff --git a/net/mac80211/aes_gmac.h b/net/mac80211/aes_gmac.h index c739356bae2a..206136b60bca 100644 --- a/net/mac80211/aes_gmac.h +++ b/net/mac80211/aes_gmac.h @@ -9,7 +9,6 @@ #include <linux/crypto.h> #define GMAC_AAD_LEN 20 -#define GMAC_MIC_LEN 16 #define GMAC_NONCE_LEN 12 struct crypto_aead *ieee80211_aes_gmac_key_setup(const u8 key[], diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index e38f46ffebfa..7da909d78c68 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -9,7 +9,7 @@ * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2007-2010, Intel Corporation * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation */ /** @@ -206,7 +206,10 @@ u8 ieee80211_retrieve_addba_ext_data(struct sta_info *sta, if (elem_len <= 0) return 0; - elems = ieee802_11_parse_elems(elem_data, elem_len, true, NULL); + elems = ieee802_11_parse_elems(elem_data, elem_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems || elems->parse_error || !elems->addba_ext_ie) goto free; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 2ed07fa121ab..b51c2c8584ae 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -63,12 +63,14 @@ static void ieee80211_set_mu_mimo_follow(struct ieee80211_sub_if_data *sdata, memcpy(sdata->vif.bss_conf.mu_group.position, params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN, WLAN_USER_POSITION_LEN); - ieee80211_link_info_change_notify(sdata, &sdata->deflink, - BSS_CHANGED_MU_GROUPS); + /* don't care about endianness - just check for 0 */ memcpy(&membership, params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN); mu_mimo_groups = membership != 0; + + /* Unset following if configured explicitly */ + eth_broadcast_addr(sdata->u.mntr.mu_follow_addr); } if (params->vht_mumimo_follow_addr) { @@ -76,16 
+78,26 @@ static void ieee80211_set_mu_mimo_follow(struct ieee80211_sub_if_data *sdata, is_valid_ether_addr(params->vht_mumimo_follow_addr); ether_addr_copy(sdata->u.mntr.mu_follow_addr, params->vht_mumimo_follow_addr); + + /* Unset current membership until a management frame is RXed */ + memset(sdata->vif.bss_conf.mu_group.membership, 0, + WLAN_MEMBERSHIP_LEN); } sdata->vif.bss_conf.mu_mimo_owner = mu_mimo_groups || mu_mimo_follow; + + /* Notify only after setting mu_mimo_owner */ + if (sdata->vif.bss_conf.mu_mimo_owner && + sdata->flags & IEEE80211_SDATA_IN_DRIVER) + ieee80211_link_info_change_notify(sdata, &sdata->deflink, + BSS_CHANGED_MU_GROUPS); } static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, struct vif_params *params) { struct ieee80211_local *local = sdata->local; - struct ieee80211_sub_if_data *monitor_sdata; + struct ieee80211_sub_if_data *monitor_sdata = NULL; /* check flags first */ if (params->flags && ieee80211_sdata_running(sdata)) { @@ -103,23 +115,28 @@ static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, return -EBUSY; } - /* also validate MU-MIMO change */ - if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) - monitor_sdata = sdata; - else - monitor_sdata = wiphy_dereference(local->hw.wiphy, - local->monitor_sdata); - - if (!monitor_sdata && + /* validate whether MU-MIMO can be configured */ + if (!ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR) && (params->vht_mumimo_groups || params->vht_mumimo_follow_addr)) return -EOPNOTSUPP; + /* Also update dependent monitor_sdata if required */ + if (test_bit(SDATA_STATE_RUNNING, &sdata->state) && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + monitor_sdata = wiphy_dereference(local->hw.wiphy, + local->monitor_sdata); + /* apply all changes now - no failures allowed */ - if (monitor_sdata && - (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) || - ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR))) - ieee80211_set_mu_mimo_follow(monitor_sdata, params); + if (ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) || + ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) { + /* This is copied in when the VIF is activated */ + ieee80211_set_mu_mimo_follow(sdata, params); + + if (monitor_sdata) + ieee80211_set_mu_mimo_follow(monitor_sdata, params); + } if (params->flags) { if (ieee80211_sdata_running(sdata)) { @@ -311,6 +328,96 @@ static void ieee80211_stop_p2p_device(struct wiphy *wiphy, ieee80211_sdata_stop(IEEE80211_WDEV_TO_SUB_IF(wdev)); } +static void ieee80211_nan_conf_free(struct cfg80211_nan_conf *conf) +{ + kfree(conf->cluster_id); + kfree(conf->extra_nan_attrs); + kfree(conf->vendor_elems); + memset(conf, 0, sizeof(*conf)); +} + +static void ieee80211_stop_nan(struct wiphy *wiphy, + struct wireless_dev *wdev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + + if (!sdata->u.nan.started) + return; + + drv_stop_nan(sdata->local, sdata); + sdata->u.nan.started = false; + + ieee80211_nan_conf_free(&sdata->u.nan.conf); + + ieee80211_sdata_stop(sdata); + ieee80211_recalc_idle(sdata->local); +} + +static int ieee80211_nan_conf_copy(struct cfg80211_nan_conf *dst, + struct cfg80211_nan_conf *src, + u32 changes) +{ + if (changes & CFG80211_NAN_CONF_CHANGED_PREF) + dst->master_pref = src->master_pref; + + if (changes & CFG80211_NAN_CONF_CHANGED_BANDS) + dst->bands = src->bands; + + if (changes & CFG80211_NAN_CONF_CHANGED_CONFIG) { + dst->scan_period = src->scan_period; + dst->scan_dwell_time = 
src->scan_dwell_time; + dst->discovery_beacon_interval = + src->discovery_beacon_interval; + dst->enable_dw_notification = src->enable_dw_notification; + memcpy(&dst->band_cfgs, &src->band_cfgs, + sizeof(dst->band_cfgs)); + + kfree(dst->cluster_id); + dst->cluster_id = NULL; + + kfree(dst->extra_nan_attrs); + dst->extra_nan_attrs = NULL; + dst->extra_nan_attrs_len = 0; + + kfree(dst->vendor_elems); + dst->vendor_elems = NULL; + dst->vendor_elems_len = 0; + + if (src->cluster_id) { + dst->cluster_id = kmemdup(src->cluster_id, ETH_ALEN, + GFP_KERNEL); + if (!dst->cluster_id) + goto no_mem; + } + + if (src->extra_nan_attrs && src->extra_nan_attrs_len) { + dst->extra_nan_attrs = kmemdup(src->extra_nan_attrs, + src->extra_nan_attrs_len, + GFP_KERNEL); + if (!dst->extra_nan_attrs) + goto no_mem; + + dst->extra_nan_attrs_len = src->extra_nan_attrs_len; + } + + if (src->vendor_elems && src->vendor_elems_len) { + dst->vendor_elems = kmemdup(src->vendor_elems, + src->vendor_elems_len, + GFP_KERNEL); + if (!dst->vendor_elems) + goto no_mem; + + dst->vendor_elems_len = src->vendor_elems_len; + } + } + + return 0; + +no_mem: + ieee80211_nan_conf_free(dst); + return -ENOMEM; +} + static int ieee80211_start_nan(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf) @@ -320,6 +427,9 @@ static int ieee80211_start_nan(struct wiphy *wiphy, lockdep_assert_wiphy(sdata->local->hw.wiphy); + if (sdata->u.nan.started) + return -EALREADY; + ret = ieee80211_check_combinations(sdata, NULL, 0, 0, -1); if (ret < 0) return ret; @@ -329,21 +439,21 @@ static int ieee80211_start_nan(struct wiphy *wiphy, return ret; ret = drv_start_nan(sdata->local, sdata, conf); - if (ret) + if (ret) { ieee80211_sdata_stop(sdata); + return ret; + } - sdata->u.nan.conf = *conf; - - return ret; -} + sdata->u.nan.started = true; + ieee80211_recalc_idle(sdata->local); -static void ieee80211_stop_nan(struct wiphy *wiphy, - struct wireless_dev *wdev) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + ret = ieee80211_nan_conf_copy(&sdata->u.nan.conf, conf, 0xFFFFFFFF); + if (ret) { + ieee80211_stop_nan(wiphy, wdev); + return ret; + } - drv_stop_nan(sdata->local, sdata); - ieee80211_sdata_stop(sdata); + return 0; } static int ieee80211_nan_change_conf(struct wiphy *wiphy, @@ -352,7 +462,7 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy, u32 changes) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); - struct cfg80211_nan_conf new_conf; + struct cfg80211_nan_conf new_conf = {}; int ret = 0; if (sdata->vif.type != NL80211_IFTYPE_NAN) @@ -361,17 +471,28 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy, if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; - new_conf = sdata->u.nan.conf; + if (!changes) + return 0; - if (changes & CFG80211_NAN_CONF_CHANGED_PREF) - new_conf.master_pref = conf->master_pref; + /* First make a full copy of the previous configuration and then apply + * the changes. This might be a little wasteful, but it is simpler. 
+ */ + ret = ieee80211_nan_conf_copy(&new_conf, &sdata->u.nan.conf, + 0xFFFFFFFF); + if (ret < 0) + return ret; - if (changes & CFG80211_NAN_CONF_CHANGED_BANDS) - new_conf.bands = conf->bands; + ret = ieee80211_nan_conf_copy(&new_conf, conf, changes); + if (ret < 0) + return ret; ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes); - if (!ret) + if (ret) { + ieee80211_nan_conf_free(&new_conf); + } else { + ieee80211_nan_conf_free(&sdata->u.nan.conf); sdata->u.nan.conf = new_conf; + } return ret; } @@ -1772,6 +1893,9 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, link_conf->nontransmitted = false; link_conf->ema_ap = false; link_conf->bssid_indicator = 0; + link_conf->fils_discovery.min_interval = 0; + link_conf->fils_discovery.max_interval = 0; + link_conf->unsol_bcast_probe_resp_interval = 0; __sta_info_flush(sdata, true, link_id, NULL); @@ -2171,10 +2295,16 @@ static int sta_apply_parameters(struct ieee80211_local *local, /* * cfg80211 validates this (1-2007) and allows setting the AID - * only when creating a new station entry + * only when creating a new station entry. For S1G APs, the current + * implementation supports a maximum of 1600 AIDs. */ - if (params->aid) + if (params->aid) { + if (sdata->vif.cfg.s1g && + params->aid > IEEE80211_MAX_SUPPORTED_S1G_AID) + return -EINVAL; + sta->sta.aid = params->aid; + } /* * Some of the following updates would be racy if called on an @@ -3001,6 +3131,9 @@ static int ieee80211_scan(struct wiphy *wiphy, struct cfg80211_scan_request *req) { struct ieee80211_sub_if_data *sdata; + struct ieee80211_link_data *link; + struct ieee80211_channel *chan; + int radio_idx; sdata = IEEE80211_WDEV_TO_SUB_IF(req->wdev); @@ -3028,10 +3161,20 @@ static int ieee80211_scan(struct wiphy *wiphy, * the frames sent while scanning on other channel will be * lost) */ - if (ieee80211_num_beaconing_links(sdata) && - (!(wiphy->features & NL80211_FEATURE_AP_SCAN) || - !(req->flags & NL80211_SCAN_FLAG_AP))) - return -EOPNOTSUPP; + for_each_link_data(sdata, link) { + /* if the link is not beaconing, ignore it */ + if (!sdata_dereference(link->u.ap.beacon, sdata)) + continue; + + chan = link->conf->chanreq.oper.chan; + radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan); + + if (ieee80211_is_radio_idx_in_scan_req(wiphy, req, + radio_idx) && + (!(wiphy->features & NL80211_FEATURE_AP_SCAN) || + !(req->flags & NL80211_SCAN_FLAG_AP))) + return -EOPNOTSUPP; + } break; case NL80211_IFTYPE_NAN: default: @@ -3677,12 +3820,7 @@ static bool ieee80211_is_scan_ongoing(struct wiphy *wiphy, if (list_empty(&local->roc_list) && !local->scanning) return false; - if (wiphy->n_radio < 2) - return true; - req_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chandef->chan); - if (req_radio_idx < 0) - return true; if (local->scanning) { scan_req = wiphy_dereference(wiphy, local->scan_req); @@ -3701,14 +3839,6 @@ static bool ieee80211_is_scan_ongoing(struct wiphy *wiphy, list_for_each_entry(roc, &local->roc_list, list) { chan_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, roc->chan); - /* - * The roc work is added but chan_radio_idx is invalid. - * Should not happen but if it does, let's not take - * risk and return true. 
- */ - if (chan_radio_idx < 0) - return true; - if (chan_radio_idx == req_radio_idx) return true; } @@ -4825,7 +4955,6 @@ static int ieee80211_get_txq_stats(struct wiphy *wiphy, int ret = 0; spin_lock_bh(&local->fq.lock); - rcu_read_lock(); if (wdev) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); @@ -4851,7 +4980,6 @@ static int ieee80211_get_txq_stats(struct wiphy *wiphy, } out: - rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); return ret; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index c9cea0e7ac16..d0bfb1216401 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -12,15 +12,131 @@ #include "driver-ops.h" #include "rate.h" +struct ieee80211_chanctx_user_iter { + struct ieee80211_chan_req *chanreq; + struct ieee80211_sub_if_data *sdata; + struct ieee80211_link_data *link; + enum nl80211_iftype iftype; + bool reserved, radar_required, done; + enum { + CHANCTX_ITER_POS_ASSIGNED, + CHANCTX_ITER_POS_RESERVED, + CHANCTX_ITER_POS_DONE, + } per_link; +}; + +enum ieee80211_chanctx_iter_type { + CHANCTX_ITER_ALL, + CHANCTX_ITER_RESERVED, + CHANCTX_ITER_ASSIGNED, +}; + +static void ieee80211_chanctx_user_iter_next(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + struct ieee80211_chanctx_user_iter *iter, + enum ieee80211_chanctx_iter_type type, + bool start) +{ + lockdep_assert_wiphy(local->hw.wiphy); + + if (start) { + memset(iter, 0, sizeof(*iter)); + goto next_interface; + } + +next_link: + for (int link_id = iter->link ? iter->link->link_id : 0; + link_id < ARRAY_SIZE(iter->sdata->link); + link_id++) { + struct ieee80211_link_data *link; + + link = sdata_dereference(iter->sdata->link[link_id], + iter->sdata); + if (!link) + continue; + + switch (iter->per_link) { + case CHANCTX_ITER_POS_ASSIGNED: + iter->per_link = CHANCTX_ITER_POS_RESERVED; + if (type != CHANCTX_ITER_RESERVED && + rcu_access_pointer(link->conf->chanctx_conf) == &ctx->conf) { + iter->link = link; + iter->reserved = false; + iter->radar_required = link->radar_required; + iter->chanreq = &link->conf->chanreq; + return; + } + fallthrough; + case CHANCTX_ITER_POS_RESERVED: + iter->per_link = CHANCTX_ITER_POS_DONE; + if (type != CHANCTX_ITER_ASSIGNED && + link->reserved_chanctx == ctx) { + iter->link = link; + iter->reserved = true; + iter->radar_required = + link->reserved_radar_required; + + iter->chanreq = &link->reserved; + return; + } + fallthrough; + case CHANCTX_ITER_POS_DONE: + iter->per_link = CHANCTX_ITER_POS_ASSIGNED; + continue; + } + } + +next_interface: + /* next (or first) interface */ + iter->sdata = list_prepare_entry(iter->sdata, &local->interfaces, list); + list_for_each_entry_continue(iter->sdata, &local->interfaces, list) { + /* AP_VLAN has a chanctx pointer but follows AP */ + if (iter->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + continue; + + iter->link = NULL; + iter->per_link = CHANCTX_ITER_POS_ASSIGNED; + iter->iftype = iter->sdata->vif.type; + goto next_link; + } + + iter->done = true; +} + +#define for_each_chanctx_user_assigned(local, ctx, iter) \ + for (ieee80211_chanctx_user_iter_next(local, ctx, iter, \ + CHANCTX_ITER_ASSIGNED, \ + true); \ + !((iter)->done); \ + ieee80211_chanctx_user_iter_next(local, ctx, iter, \ + CHANCTX_ITER_ASSIGNED, \ + false)) + +#define for_each_chanctx_user_reserved(local, ctx, iter) \ + for (ieee80211_chanctx_user_iter_next(local, ctx, iter, \ + CHANCTX_ITER_RESERVED, \ + true); \ + !((iter)->done); \ + ieee80211_chanctx_user_iter_next(local, ctx, iter, \ + CHANCTX_ITER_RESERVED, \ + false)) + +#define for_each_chanctx_user_all(local, 
ctx, iter) \ + for (ieee80211_chanctx_user_iter_next(local, ctx, iter, \ + CHANCTX_ITER_ALL, \ + true); \ + !((iter)->done); \ + ieee80211_chanctx_user_iter_next(local, ctx, iter, \ + CHANCTX_ITER_ALL, \ + false)) + static int ieee80211_chanctx_num_assigned(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; int num = 0; - lockdep_assert_wiphy(local->hw.wiphy); - - list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) + for_each_chanctx_user_assigned(local, ctx, &iter) num++; return num; @@ -29,12 +145,10 @@ static int ieee80211_chanctx_num_assigned(struct ieee80211_local *local, static int ieee80211_chanctx_num_reserved(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; int num = 0; - lockdep_assert_wiphy(local->hw.wiphy); - - list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) + for_each_chanctx_user_reserved(local, ctx, &iter) num++; return num; @@ -43,8 +157,13 @@ static int ieee80211_chanctx_num_reserved(struct ieee80211_local *local, int ieee80211_chanctx_refcount(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { - return ieee80211_chanctx_num_assigned(local, ctx) + - ieee80211_chanctx_num_reserved(local, ctx); + struct ieee80211_chanctx_user_iter iter; + int num = 0; + + for_each_chanctx_user_all(local, ctx, &iter) + num++; + + return num; } static int ieee80211_num_chanctx(struct ieee80211_local *local, int radio_idx) @@ -143,15 +262,15 @@ ieee80211_chanctx_reserved_chanreq(struct ieee80211_local *local, const struct ieee80211_chan_req *req, struct ieee80211_chan_req *tmp) { - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; lockdep_assert_wiphy(local->hw.wiphy); if (WARN_ON(!req)) return NULL; - list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) { - req = ieee80211_chanreq_compatible(&link->reserved, req, tmp); + for_each_chanctx_user_reserved(local, ctx, &iter) { + req = ieee80211_chanreq_compatible(iter.chanreq, req, tmp); if (!req) break; } @@ -165,18 +284,16 @@ ieee80211_chanctx_non_reserved_chandef(struct ieee80211_local *local, const struct ieee80211_chan_req *compat, struct ieee80211_chan_req *tmp) { - struct ieee80211_link_data *link; const struct ieee80211_chan_req *comp_def = compat; + struct ieee80211_chanctx_user_iter iter; lockdep_assert_wiphy(local->hw.wiphy); - list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) { - struct ieee80211_bss_conf *link_conf = link->conf; - - if (link->reserved_chanctx) + for_each_chanctx_user_assigned(local, ctx, &iter) { + if (iter.link->reserved_chanctx) continue; - comp_def = ieee80211_chanreq_compatible(&link_conf->chanreq, + comp_def = ieee80211_chanreq_compatible(iter.chanreq, comp_def, tmp); if (!comp_def) break; @@ -200,7 +317,7 @@ ieee80211_chanctx_can_reserve(struct ieee80211_local *local, if (!ieee80211_chanctx_non_reserved_chandef(local, ctx, req, &tmp)) return false; - if (!list_empty(&ctx->reserved_links) && + if (ieee80211_chanctx_num_reserved(local, ctx) != 0 && ieee80211_chanctx_reserved_chanreq(local, ctx, req, &tmp)) return true; @@ -389,10 +506,10 @@ ieee80211_get_chanctx_max_required_bw(struct ieee80211_local *local, * channel context. 
*/ static u32 -_ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx, - struct ieee80211_link_data *rsvd_for, - bool check_reserved) +__ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + struct ieee80211_link_data *rsvd_for, + bool check_reserved) { enum nl80211_chan_width max_bw; struct cfg80211_chan_def min_def; @@ -497,13 +614,14 @@ static void ieee80211_chan_bw_change(struct ieee80211_local *local, * the max of min required widths of all the interfaces bound to this * channel context. */ -void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx, - struct ieee80211_link_data *rsvd_for, - bool check_reserved) +static void +_ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx, + struct ieee80211_link_data *rsvd_for, + bool check_reserved) { - u32 changed = _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for, - check_reserved); + u32 changed = __ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for, + check_reserved); if (!changed) return; @@ -517,6 +635,12 @@ void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, ieee80211_chan_bw_change(local, ctx, false, false); } +void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, + struct ieee80211_chanctx *ctx) +{ + _ieee80211_recalc_chanctx_min_def(local, ctx, NULL, false); +} + static void _ieee80211_change_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *ctx, struct ieee80211_chanctx *old_ctx, @@ -530,8 +654,19 @@ static void _ieee80211_change_chanctx(struct ieee80211_local *local, }; u32 changed = 0; - /* expected to handle only 20/40/80/160/320 channel widths */ + /* 5/10 MHz not handled here */ switch (chandef->width) { + case NL80211_CHAN_WIDTH_1: + case NL80211_CHAN_WIDTH_2: + case NL80211_CHAN_WIDTH_4: + case NL80211_CHAN_WIDTH_8: + case NL80211_CHAN_WIDTH_16: + /* + * mac80211 currently only supports sharing identical + * chanctx's for S1G interfaces. + */ + WARN_ON(!ieee80211_chanreq_identical(&ctx_req, chanreq)); + return; case NL80211_CHAN_WIDTH_20_NOHT: case NL80211_CHAN_WIDTH_20: case NL80211_CHAN_WIDTH_40: @@ -551,7 +686,7 @@ static void _ieee80211_change_chanctx(struct ieee80211_local *local, ieee80211_chan_bw_change(local, old_ctx, false, true); if (ieee80211_chanreq_identical(&ctx_req, chanreq)) { - ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for, false); + _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for, false); return; } @@ -572,7 +707,8 @@ static void _ieee80211_change_chanctx(struct ieee80211_local *local, ctx->conf.ap = chanreq->ap; /* check if min chanctx also changed */ - changed |= _ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for, false); + changed |= __ieee80211_recalc_chanctx_min_def(local, ctx, rsvd_for, + false); ieee80211_add_wbrf(local, &ctx->conf.def); @@ -633,8 +769,6 @@ ieee80211_find_chanctx(struct ieee80211_local *local, * context to actually be removed. 
*/ link->reserved_chanctx = ctx; - list_add(&link->reserved_chanctx_list, - &ctx->reserved_links); ieee80211_change_chanctx(local, ctx, ctx, compat); @@ -659,19 +793,8 @@ bool ieee80211_is_radar_required(struct ieee80211_local *local, for_each_sdata_link(local, link) { if (link->radar_required) { - if (wiphy->n_radio < 2) - return true; - chan = link->conf->chanreq.oper.chan; radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan); - /* - * The radio index (radio_idx) is expected to be valid, - * as it's derived from a channel tied to a link. If - * it's invalid (i.e., negative), return true to avoid - * potential issues with radar-sensitive operations. - */ - if (radio_idx < 0) - return true; if (ieee80211_is_radio_idx_in_scan_req(wiphy, req, radio_idx)) @@ -686,17 +809,13 @@ static bool ieee80211_chanctx_radar_required(struct ieee80211_local *local, struct ieee80211_chanctx *ctx) { - struct ieee80211_chanctx_conf *conf = &ctx->conf; - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; lockdep_assert_wiphy(local->hw.wiphy); - for_each_sdata_link(local, link) { - if (rcu_access_pointer(link->conf->chanctx_conf) != conf) - continue; - if (!link->radar_required) - continue; - return true; + for_each_chanctx_user_assigned(local, ctx, &iter) { + if (iter.radar_required) + return true; } return false; @@ -716,8 +835,6 @@ ieee80211_alloc_chanctx(struct ieee80211_local *local, if (!ctx) return NULL; - INIT_LIST_HEAD(&ctx->assigned_links); - INIT_LIST_HEAD(&ctx->reserved_links); ctx->conf.def = chanreq->oper; ctx->conf.ap = chanreq->ap; ctx->conf.rx_chains_static = 1; @@ -726,7 +843,7 @@ ieee80211_alloc_chanctx(struct ieee80211_local *local, ctx->conf.radar_enabled = false; ctx->conf.radio_idx = radio_idx; ctx->radar_detected = false; - _ieee80211_recalc_chanctx_min_def(local, ctx, NULL, false); + __ieee80211_recalc_chanctx_min_def(local, ctx, NULL, false); return ctx; } @@ -815,27 +932,17 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, { struct ieee80211_chanctx_conf *conf = &ctx->conf; const struct ieee80211_chan_req *compat = NULL; - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; struct ieee80211_chan_req tmp; struct sta_info *sta; lockdep_assert_wiphy(local->hw.wiphy); - for_each_sdata_link(local, link) { - struct ieee80211_bss_conf *link_conf; - - if (link->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) - continue; - - link_conf = link->conf; - - if (rcu_access_pointer(link_conf->chanctx_conf) != conf) - continue; - + for_each_chanctx_user_assigned(local, ctx, &iter) { if (!compat) - compat = &link_conf->chanreq; + compat = iter.chanreq; - compat = ieee80211_chanreq_compatible(&link_conf->chanreq, + compat = ieee80211_chanreq_compatible(iter.chanreq, compat, &tmp); if (WARN_ON_ONCE(!compat)) return; @@ -848,6 +955,7 @@ void ieee80211_recalc_chanctx_chantype(struct ieee80211_local *local, list_for_each_entry(sta, &local->sta_list, list) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_chan_req tdls_chanreq = {}; + struct ieee80211_link_data *link; int tdls_link_id; if (!sta->uploaded || @@ -915,12 +1023,11 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, drv_unassign_vif_chanctx(local, sdata, link->conf, curr_ctx); conf = NULL; - list_del(&link->assigned_chanctx_list); } if (new_ctx) { /* recalc considering the link we'll use it for now */ - ieee80211_recalc_chanctx_min_def(local, new_ctx, link, false); + _ieee80211_recalc_chanctx_min_def(local, new_ctx, link, false); 
ret = drv_assign_vif_chanctx(local, sdata, link->conf, new_ctx); if (assign_on_failure || !ret) { @@ -930,9 +1037,6 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, /* succeeded, so commit it to the data structures */ conf = &new_ctx->conf; - if (!local->in_reconfig) - list_add(&link->assigned_chanctx_list, - &new_ctx->assigned_links); } } else { ret = 0; @@ -944,12 +1048,12 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, ieee80211_recalc_chanctx_chantype(local, curr_ctx); ieee80211_recalc_smps_chanctx(local, curr_ctx); ieee80211_recalc_radar_chanctx(local, curr_ctx); - ieee80211_recalc_chanctx_min_def(local, curr_ctx, NULL, false); + ieee80211_recalc_chanctx_min_def(local, curr_ctx); } if (new_ctx && ieee80211_chanctx_num_assigned(local, new_ctx) > 0) { ieee80211_recalc_txpower(link, false); - ieee80211_recalc_chanctx_min_def(local, new_ctx, NULL, false); + ieee80211_recalc_chanctx_min_def(local, new_ctx); } if (conf) { @@ -982,21 +1086,21 @@ static int ieee80211_assign_link_chanctx(struct ieee80211_link_data *link, void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx) { + struct ieee80211_chanctx_user_iter iter; struct ieee80211_sub_if_data *sdata; u8 rx_chains_static, rx_chains_dynamic; - struct ieee80211_link_data *link; lockdep_assert_wiphy(local->hw.wiphy); rx_chains_static = 1; rx_chains_dynamic = 1; - for_each_sdata_link(local, link) { + for_each_chanctx_user_assigned(local, chanctx, &iter) { u8 needed_static, needed_dynamic; - switch (link->sdata->vif.type) { + switch (iter.iftype) { case NL80211_IFTYPE_STATION: - if (!link->sdata->u.mgd.associated) + if (!iter.sdata->u.mgd.associated) continue; break; case NL80211_IFTYPE_MONITOR: @@ -1012,26 +1116,23 @@ void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, continue; } - if (rcu_access_pointer(link->conf->chanctx_conf) != &chanctx->conf) - continue; - - if (link->sdata->vif.type == NL80211_IFTYPE_MONITOR) { + if (iter.iftype == NL80211_IFTYPE_MONITOR) { rx_chains_dynamic = rx_chains_static = local->rx_chains; break; } - switch (link->smps_mode) { + switch (iter.link->smps_mode) { default: WARN_ONCE(1, "Invalid SMPS mode %d\n", - link->smps_mode); + iter.link->smps_mode); fallthrough; case IEEE80211_SMPS_OFF: - needed_static = link->needed_rx_chains; - needed_dynamic = link->needed_rx_chains; + needed_static = iter.link->needed_rx_chains; + needed_dynamic = iter.link->needed_rx_chains; break; case IEEE80211_SMPS_DYNAMIC: needed_static = 1; - needed_dynamic = link->needed_rx_chains; + needed_dynamic = iter.link->needed_rx_chains; break; case IEEE80211_SMPS_STATIC: needed_static = 1; @@ -1119,7 +1220,6 @@ void ieee80211_link_unreserve_chanctx(struct ieee80211_link_data *link) if (WARN_ON(!ctx)) return; - list_del(&link->reserved_chanctx_list); link->reserved_chanctx = NULL; if (ieee80211_chanctx_refcount(sdata->local, ctx) == 0) { @@ -1153,9 +1253,9 @@ ieee80211_replace_chanctx(struct ieee80211_local *local, struct wiphy *wiphy = local->hw.wiphy; const struct wiphy_radio *radio; - if (!curr_ctx || (curr_ctx->replace_state == - IEEE80211_CHANCTX_WILL_BE_REPLACED) || - !list_empty(&curr_ctx->reserved_links)) { + if (!curr_ctx || + curr_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED || + ieee80211_chanctx_num_reserved(local, curr_ctx) != 0) { /* * Another link already requested this context for a * reservation. 
Find another one hoping all links assigned @@ -1178,7 +1278,7 @@ ieee80211_replace_chanctx(struct ieee80211_local *local, IEEE80211_CHANCTX_REPLACE_NONE) continue; - if (!list_empty(&ctx->reserved_links)) + if (ieee80211_chanctx_num_reserved(local, ctx) != 0) continue; if (ctx->conf.radio_idx >= 0) { @@ -1196,9 +1296,9 @@ ieee80211_replace_chanctx(struct ieee80211_local *local, * If that's true then all available contexts already have reservations * and cannot be used. */ - if (!curr_ctx || (curr_ctx->replace_state == - IEEE80211_CHANCTX_WILL_BE_REPLACED) || - !list_empty(&curr_ctx->reserved_links)) + if (!curr_ctx || + curr_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED || + ieee80211_chanctx_num_reserved(local, curr_ctx) != 0) return ERR_PTR(-EBUSY); new_ctx = ieee80211_alloc_chanctx(local, chanreq, mode, -1); @@ -1278,7 +1378,6 @@ int ieee80211_link_reserve_chanctx(struct ieee80211_link_data *link, return PTR_ERR(new_ctx); } - list_add(&link->reserved_chanctx_list, &new_ctx->reserved_links); link->reserved_chanctx = new_ctx; link->reserved = *chanreq; link->reserved_radar_required = radar_required; @@ -1301,7 +1400,7 @@ ieee80211_link_chanctx_reservation_complete(struct ieee80211_link_data *link) &link->csa.finalize_work); break; case NL80211_IFTYPE_STATION: - wiphy_delayed_work_queue(sdata->local->hw.wiphy, + wiphy_hrtimer_work_queue(sdata->local->hw.wiphy, &link->u.mgd.csa.switch_work, 0); break; case NL80211_IFTYPE_UNSPECIFIED: @@ -1392,7 +1491,6 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) vif_chsw[0].new_ctx = &new_ctx->conf; vif_chsw[0].link_conf = link->conf; - list_del(&link->reserved_chanctx_list); link->reserved_chanctx = NULL; err = drv_switch_vif_chanctx(local, vif_chsw, 1, @@ -1405,7 +1503,6 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) } link->radar_required = link->reserved_radar_required; - list_move(&link->assigned_chanctx_list, &new_ctx->assigned_links); rcu_assign_pointer(link_conf->chanctx_conf, &new_ctx->conf); if (sdata->vif.type == NL80211_IFTYPE_AP) @@ -1416,7 +1513,7 @@ ieee80211_link_use_reserved_reassign(struct ieee80211_link_data *link) if (ieee80211_chanctx_refcount(local, old_ctx) == 0) ieee80211_free_chanctx(local, old_ctx, false); - ieee80211_recalc_chanctx_min_def(local, new_ctx, NULL, false); + ieee80211_recalc_chanctx_min_def(local, new_ctx); ieee80211_recalc_smps_chanctx(local, new_ctx); ieee80211_recalc_radar_chanctx(local, new_ctx); @@ -1462,7 +1559,6 @@ ieee80211_link_use_reserved_assign(struct ieee80211_link_data *link) ieee80211_change_chanctx(local, new_ctx, new_ctx, chanreq); - list_del(&link->reserved_chanctx_list); link->reserved_chanctx = NULL; err = ieee80211_assign_link_chanctx(link, new_ctx, false); @@ -1508,7 +1604,6 @@ static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local, int n_vifs) { struct ieee80211_vif_chanctx_switch *vif_chsw; - struct ieee80211_link_data *link; struct ieee80211_chanctx *ctx, *old_ctx; int i, err; @@ -1520,6 +1615,8 @@ static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local, i = 0; list_for_each_entry(ctx, &local->chanctx_list, list) { + struct ieee80211_chanctx_user_iter iter; + if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; @@ -1528,16 +1625,15 @@ static int ieee80211_chsw_switch_vifs(struct ieee80211_local *local, goto out; } - list_for_each_entry(link, &ctx->reserved_links, - reserved_chanctx_list) { - if (!ieee80211_link_has_in_place_reservation(link)) + for_each_chanctx_user_reserved(local, 
ctx, &iter) { + if (!ieee80211_link_has_in_place_reservation(iter.link)) continue; - old_ctx = ieee80211_link_get_chanctx(link); - vif_chsw[i].vif = &link->sdata->vif; + old_ctx = ieee80211_link_get_chanctx(iter.link); + vif_chsw[i].vif = &iter.sdata->vif; vif_chsw[i].old_ctx = &old_ctx->conf; vif_chsw[i].new_ctx = &ctx->conf; - vif_chsw[i].link_conf = link->conf; + vif_chsw[i].link_conf = iter.link->conf; i++; } @@ -1562,7 +1658,7 @@ static int ieee80211_chsw_switch_ctxs(struct ieee80211_local *local) if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; - if (!list_empty(&ctx->replace_ctx->assigned_links)) + if (ieee80211_chanctx_num_assigned(local, ctx) != 0) continue; ieee80211_del_chanctx(local, ctx->replace_ctx, false); @@ -1579,7 +1675,7 @@ err: if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; - if (!list_empty(&ctx->replace_ctx->assigned_links)) + if (ieee80211_chanctx_num_assigned(local, ctx) != 0) continue; ieee80211_del_chanctx(local, ctx, false); @@ -1614,7 +1710,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) */ list_for_each_entry(ctx, &local->chanctx_list, list) { - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; @@ -1630,12 +1726,11 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) n_reserved = 0; n_ready = 0; - list_for_each_entry(link, &ctx->replace_ctx->assigned_links, - assigned_chanctx_list) { + for_each_chanctx_user_assigned(local, ctx->replace_ctx, &iter) { n_assigned++; - if (link->reserved_chanctx) { + if (iter.link->reserved_chanctx) { n_reserved++; - if (link->reserved_ready) + if (iter.link->reserved_ready) n_ready++; } } @@ -1652,13 +1747,12 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) } ctx->conf.radar_enabled = false; - list_for_each_entry(link, &ctx->reserved_links, - reserved_chanctx_list) { - if (ieee80211_link_has_in_place_reservation(link) && - !link->reserved_ready) + for_each_chanctx_user_reserved(local, ctx, &iter) { + if (ieee80211_link_has_in_place_reservation(iter.link) && + !iter.link->reserved_ready) return -EAGAIN; - old_ctx = ieee80211_link_get_chanctx(link); + old_ctx = ieee80211_link_get_chanctx(iter.link); if (old_ctx) { if (old_ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED) @@ -1669,7 +1763,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) n_vifs_ctxless++; } - if (link->reserved_radar_required) + if (iter.radar_required) ctx->conf.radar_enabled = true; } } @@ -1684,7 +1778,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) /* update station rate control and min width before switch */ list_for_each_entry(ctx, &local->chanctx_list, list) { - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; @@ -1694,17 +1788,16 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) goto err; } - list_for_each_entry(link, &ctx->reserved_links, - reserved_chanctx_list) { - if (!ieee80211_link_has_in_place_reservation(link)) + for_each_chanctx_user_reserved(local, ctx, &iter) { + if (!ieee80211_link_has_in_place_reservation(iter.link)) continue; ieee80211_chan_bw_change(local, - ieee80211_link_get_chanctx(link), + ieee80211_link_get_chanctx(iter.link), true, true); } - ieee80211_recalc_chanctx_min_def(local, ctx, NULL, true); + 
_ieee80211_recalc_chanctx_min_def(local, ctx, NULL, true); } /* @@ -1729,7 +1822,7 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) * context(s). */ list_for_each_entry(ctx, &local->chanctx_list, list) { - struct ieee80211_link_data *link, *link_tmp; + struct ieee80211_chanctx_user_iter iter; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; @@ -1739,9 +1832,9 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) goto err; } - list_for_each_entry(link, &ctx->reserved_links, - reserved_chanctx_list) { - struct ieee80211_sub_if_data *sdata = link->sdata; + for_each_chanctx_user_reserved(local, ctx, &iter) { + struct ieee80211_link_data *link = iter.link; + struct ieee80211_sub_if_data *sdata = iter.sdata; struct ieee80211_bss_conf *link_conf = link->conf; u64 changed = 0; @@ -1757,9 +1850,9 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) ieee80211_check_fast_xmit_iface(sdata); - link->radar_required = link->reserved_radar_required; + link->radar_required = iter.radar_required; - if (link_conf->chanreq.oper.width != link->reserved.oper.width) + if (link_conf->chanreq.oper.width != iter.chanreq->oper.width) changed = BSS_CHANGED_BANDWIDTH; ieee80211_link_update_chanreq(link, &link->reserved); @@ -1774,19 +1867,15 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) ieee80211_recalc_chanctx_chantype(local, ctx); ieee80211_recalc_smps_chanctx(local, ctx); ieee80211_recalc_radar_chanctx(local, ctx); - ieee80211_recalc_chanctx_min_def(local, ctx, NULL, false); + ieee80211_recalc_chanctx_min_def(local, ctx); - list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links, - reserved_chanctx_list) { - if (ieee80211_link_get_chanctx(link) != ctx) + for_each_chanctx_user_reserved(local, ctx, &iter) { + if (ieee80211_link_get_chanctx(iter.link) != ctx) continue; - list_del(&link->reserved_chanctx_list); - list_move(&link->assigned_chanctx_list, - &ctx->assigned_links); - link->reserved_chanctx = NULL; + iter.link->reserved_chanctx = NULL; - ieee80211_link_chanctx_reservation_complete(link); + ieee80211_link_chanctx_reservation_complete(iter.link); ieee80211_chan_bw_change(local, ctx, false, false); } @@ -1797,12 +1886,10 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) * reservation for originally requested interface has already * succeeded at this point. 
*/ - list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links, - reserved_chanctx_list) { - if (WARN_ON(ieee80211_link_has_in_place_reservation(link))) - continue; + for_each_chanctx_user_reserved(local, ctx, &iter) { + struct ieee80211_link_data *link = iter.link; - if (WARN_ON(link->reserved_chanctx != ctx)) + if (WARN_ON(ieee80211_link_has_in_place_reservation(link))) continue; if (!link->reserved_ready) @@ -1845,15 +1932,14 @@ static int ieee80211_vif_use_reserved_switch(struct ieee80211_local *local) err: list_for_each_entry(ctx, &local->chanctx_list, list) { - struct ieee80211_link_data *link, *link_tmp; + struct ieee80211_chanctx_user_iter iter; if (ctx->replace_state != IEEE80211_CHANCTX_REPLACES_OTHER) continue; - list_for_each_entry_safe(link, link_tmp, &ctx->reserved_links, - reserved_chanctx_list) { - ieee80211_link_unreserve_chanctx(link); - ieee80211_link_chanctx_reservation_complete(link); + for_each_chanctx_user_reserved(local, ctx, &iter) { + ieee80211_link_unreserve_chanctx(iter.link); + ieee80211_link_chanctx_reservation_complete(iter.link); } } @@ -1960,7 +2046,6 @@ int _ieee80211_link_use_channel(struct ieee80211_link_data *link, /* remove reservation */ WARN_ON(link->reserved_chanctx != ctx); link->reserved_chanctx = NULL; - list_del(&link->reserved_chanctx_list); } if (ret) { @@ -2057,29 +2142,17 @@ ieee80211_chanctx_recheck(struct ieee80211_local *local, struct ieee80211_chan_req *tmp) { const struct ieee80211_chan_req *ret = req; - struct ieee80211_link_data *link; + struct ieee80211_chanctx_user_iter iter; lockdep_assert_wiphy(local->hw.wiphy); - for_each_sdata_link(local, link) { - if (link == skip_link) + for_each_chanctx_user_all(local, ctx, &iter) { + if (iter.link == skip_link) continue; - if (rcu_access_pointer(link->conf->chanctx_conf) == &ctx->conf) { - ret = ieee80211_chanreq_compatible(ret, - &link->conf->chanreq, - tmp); - if (!ret) - return NULL; - } - - if (link->reserved_chanctx == ctx) { - ret = ieee80211_chanreq_compatible(ret, - &link->reserved, - tmp); - if (!ret) - return NULL; - } + ret = ieee80211_chanreq_compatible(ret, iter.chanreq, tmp); + if (!ret) + return NULL; } *tmp = *ret; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index e8b78ec682da..d02f07368c51 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -82,7 +82,6 @@ static ssize_t aqm_read(struct file *file, int len = 0; spin_lock_bh(&local->fq.lock); - rcu_read_lock(); len = scnprintf(buf, sizeof(buf), "access name value\n" @@ -105,7 +104,6 @@ static ssize_t aqm_read(struct file *file, fq->limit, fq->quantum); - rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); return simple_read_from_buffer(user_buf, count, ppos, @@ -717,7 +715,6 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_STATS_ADD(dot11ReceivedFragmentCount); DEBUGFS_STATS_ADD(dot11MulticastReceivedFrameCount); DEBUGFS_STATS_ADD(dot11TransmittedFrameCount); - DEBUGFS_STATS_ADD(tx_handlers_drop); DEBUGFS_STATS_ADD(tx_handlers_queued); DEBUGFS_STATS_ADD(tx_handlers_drop_wep); DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc); diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 1dac78271045..30a5a978a678 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -625,7 +625,6 @@ static ssize_t ieee80211_if_fmt_aqm( txqi = to_txq_info(sdata->vif.txq); spin_lock_bh(&local->fq.lock); - rcu_read_lock(); len = scnprintf(buf, buflen, @@ -642,7 +641,6 @@ static ssize_t ieee80211_if_fmt_aqm( txqi->tin.tx_bytes, txqi->tin.tx_packets); - 
rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); return len; diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 49061bd4151b..ef75255d47d5 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -148,7 +148,6 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, return -ENOMEM; spin_lock_bh(&local->fq.lock); - rcu_read_lock(); p += scnprintf(p, bufsz + buf - p, @@ -178,7 +177,6 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, test_bit(IEEE80211_TXQ_DIRTY, &txqi->flags) ? " DIRTY" : ""); } - rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index ba9fba165926..49753b73aba2 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -476,8 +476,12 @@ void drv_link_info_changed(struct ieee80211_local *local, if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || sdata->vif.type == NL80211_IFTYPE_NAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && - !sdata->vif.bss_conf.mu_mimo_owner && - !(changed & BSS_CHANGED_TXPOWER)))) + changed & ~(BSS_CHANGED_TXPOWER | + BSS_CHANGED_MU_GROUPS)))) + return; + + if (WARN_ON_ONCE(changed & BSS_CHANGED_MU_GROUPS && + !sdata->vif.bss_conf.mu_mimo_owner)) return; if (!check_sdata_in_driver(sdata)) diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 181bcb34b795..55105d238d6b 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1416,7 +1416,7 @@ drv_get_ftm_responder_stats(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct cfg80211_ftm_responder_stats *ftm_stats) { - u32 ret = -EOPNOTSUPP; + int ret = -EOPNOTSUPP; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c index 0397755a3bd1..3d365626faa4 100644 --- a/net/mac80211/ethtool.c +++ b/net/mac80211/ethtool.c @@ -48,8 +48,8 @@ static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { "rx_duplicates", "rx_fragments", "rx_dropped", "tx_packets", "tx_bytes", "tx_filtered", "tx_retry_failed", "tx_retries", - "sta_state", "txrate", "rxrate", "signal", - "channel", "noise", "ch_time", "ch_time_busy", + "tx_handlers_drop", "sta_state", "txrate", "rxrate", + "signal", "channel", "noise", "ch_time", "ch_time_busy", "ch_time_ext_busy", "ch_time_rx", "ch_time_tx" }; #define STA_STATS_LEN ARRAY_SIZE(ieee80211_gstrings_sta_stats) @@ -120,6 +120,7 @@ static void ieee80211_get_stats(struct net_device *dev, i = 0; ADD_STA_STATS(&sta->deflink); + data[i++] = sdata->tx_handlers_drop; data[i++] = sta->sta_state; @@ -145,6 +146,7 @@ static void ieee80211_get_stats(struct net_device *dev, sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(&sta->deflink); + data[i++] = sdata->tx_handlers_drop; } } diff --git a/net/mac80211/he.c b/net/mac80211/he.c index 5792ef77e986..f7b05e59374c 100644 --- a/net/mac80211/he.c +++ b/net/mac80211/he.c @@ -3,7 +3,7 @@ * HE handling * * Copyright(c) 2017 Intel Deutschland GmbH - * Copyright(c) 2019 - 2024 Intel Corporation + * Copyright(c) 2019-2025 Intel Corporation */ #include "ieee80211_i.h" @@ -313,7 +313,7 @@ bool ieee80211_prepare_rx_omi_bw(struct ieee80211_link_sta *pub_link_sta, ieee80211_link_sta_rc_update_omi(link, link_sta); } else { link_sta->rx_omi_bw_rx = bw; - ieee80211_recalc_chanctx_min_def(local, chanctx, NULL, false); + ieee80211_recalc_chanctx_min_def(local, chanctx); } 
link_sta->rx_omi_bw_staging = bw; @@ -359,7 +359,7 @@ void ieee80211_finalize_rx_omi_bw(struct ieee80211_link_sta *pub_link_sta) /* channel context in finalize only when narrowing bandwidth */ WARN_ON(link_sta->rx_omi_bw_rx < link_sta->rx_omi_bw_staging); link_sta->rx_omi_bw_rx = link_sta->rx_omi_bw_staging; - ieee80211_recalc_chanctx_min_def(local, chanctx, NULL, false); + ieee80211_recalc_chanctx_min_def(local, chanctx); } trace_api_return_void(local); diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 6e36b09fe97f..168f84a1353b 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -9,7 +9,7 @@ * Copyright 2009, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright(c) 2018-2024 Intel Corporation + * Copyright(c) 2018-2025 Intel Corporation */ #include <linux/delay.h> @@ -1554,6 +1554,7 @@ void ieee80211_rx_mgmt_probe_beacon(struct ieee80211_sub_if_data *sdata, { size_t baselen; struct ieee802_11_elems *elems; + u16 type; BUILD_BUG_ON(offsetof(typeof(mgmt->u.probe_resp), variable) != offsetof(typeof(mgmt->u.beacon), variable)); @@ -1566,8 +1567,9 @@ void ieee80211_rx_mgmt_probe_beacon(struct ieee80211_sub_if_data *sdata, if (baselen > len) return; + type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE; elems = ieee802_11_parse_elems(mgmt->u.probe_resp.variable, - len - baselen, false, NULL); + len - baselen, type, NULL); if (elems) { ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, elems); @@ -1616,9 +1618,11 @@ void ieee80211_ibss_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, if (ies_len < 0) break; - elems = ieee802_11_parse_elems( - mgmt->u.action.u.chan_switch.variable, - ies_len, true, NULL); + elems = ieee802_11_parse_elems(mgmt->u.action.u.chan_switch.variable, + ies_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (elems && !elems->parse_error) ieee80211_rx_mgmt_spectrum_mgmt(sdata, mgmt, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8afa2404eaa8..9d9313eee59f 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -86,6 +86,14 @@ extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS]; #define IEEE80211_MAX_NAN_INSTANCE_ID 255 +/* + * Current mac80211 implementation supports a maximum of 1600 AIDS + * for S1G interfaces. With regards to an S1G TIM, this covers 25 blocks + * as each block is 64 AIDs. 
+ */ +#define IEEE80211_MAX_SUPPORTED_S1G_AID 1600 +#define IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS 25 + enum ieee80211_status_data { IEEE80211_STATUS_TYPE_MASK = 0x00f, IEEE80211_STATUS_TYPE_INVALID = 0, @@ -604,11 +612,11 @@ struct ieee80211_if_managed { u8 *assoc_req_ies; size_t assoc_req_ies_len; - struct wiphy_delayed_work ml_reconf_work; + struct wiphy_hrtimer_work ml_reconf_work; u16 removed_links; /* TID-to-link mapping support */ - struct wiphy_delayed_work ttlm_work; + struct wiphy_hrtimer_work ttlm_work; struct ieee80211_adv_ttlm_info ttlm_info; struct wiphy_work teardown_ttlm_work; @@ -908,9 +916,6 @@ struct ieee80211_chanctx { struct list_head list; struct rcu_head rcu_head; - struct list_head assigned_links; - struct list_head reserved_links; - enum ieee80211_chanctx_replace_state replace_state; struct ieee80211_chanctx *replace_ctx; @@ -977,11 +982,13 @@ struct ieee80211_if_mntr { * struct ieee80211_if_nan - NAN state * * @conf: current NAN configuration + * @started: true iff NAN is started * @func_lock: lock for @func_inst_ids * @function_inst_ids: a bitmap of available instance_id's */ struct ieee80211_if_nan { struct cfg80211_nan_conf conf; + bool started; /* protects function_inst_ids */ spinlock_t func_lock; @@ -1007,10 +1014,10 @@ struct ieee80211_link_data_managed { bool operating_11g_mode; struct { - struct wiphy_delayed_work switch_work; + struct wiphy_hrtimer_work switch_work; struct cfg80211_chan_def ap_chandef; struct ieee80211_parsed_tpe tpe; - unsigned long time; + ktime_t time; bool waiting_bcn; bool ignored_same_chan; bool blocked_tx; @@ -1061,9 +1068,6 @@ struct ieee80211_link_data { struct ieee80211_sub_if_data *sdata; unsigned int link_id; - struct list_head assigned_chanctx_list; /* protected by wiphy mutex */ - struct list_head reserved_chanctx_list; /* protected by wiphy mutex */ - /* multicast keys only */ struct ieee80211_key __rcu *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + @@ -1210,6 +1214,7 @@ struct ieee80211_sub_if_data { } debugfs; #endif + u32 tx_handlers_drop; /* must be last, dynamically sized area in this! 
*/ struct ieee80211_vif vif; }; @@ -1228,9 +1233,12 @@ struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p) for (struct ieee80211_sub_if_data *___sdata = NULL; \ !___sdata; \ ___sdata = (void *)~0 /* always stop */) \ + for (int ___link_id = ARRAY_SIZE(___sdata->link); \ + ___link_id; ___link_id = 0 /* always stop */) \ list_for_each_entry(___sdata, &(_local)->interfaces, list) \ - if (ieee80211_sdata_running(___sdata)) \ - for (int ___link_id = 0; \ + if (___link_id == ARRAY_SIZE(___sdata->link) && \ + ieee80211_sdata_running(___sdata)) \ + for (___link_id = 0; \ ___link_id < ARRAY_SIZE(___sdata->link); \ ___link_id++) \ if ((_link = wiphy_dereference((_local)->hw.wiphy, \ @@ -1244,9 +1252,12 @@ struct ieee80211_sub_if_data *vif_to_sdata(struct ieee80211_vif *p) for (struct ieee80211_sub_if_data *___sdata = NULL; \ !___sdata; \ ___sdata = (void *)~0 /* always stop */) \ - list_for_each_entry_rcu(___sdata, &(_local)->interfaces, list) \ - if (ieee80211_sdata_running(___sdata)) \ - for (int ___link_id = 0; \ + for (int ___link_id = ARRAY_SIZE(___sdata->link); \ + ___link_id; ___link_id = 0 /* always stop */) \ + list_for_each_entry(___sdata, &(_local)->interfaces, list) \ + if (___link_id == ARRAY_SIZE(___sdata->link) && \ + ieee80211_sdata_running(___sdata)) \ + for (___link_id = 0; \ ___link_id < ARRAY_SIZE((___sdata)->link); \ ___link_id++) \ if ((_link = rcu_dereference((___sdata)->link[___link_id]))) @@ -1599,7 +1610,6 @@ struct ieee80211_local { u32 dot11TransmittedFrameCount; /* TX/RX handler statistics */ - unsigned int tx_handlers_drop; unsigned int tx_handlers_queued; unsigned int tx_handlers_drop_wep; unsigned int tx_handlers_drop_not_assoc; @@ -1665,8 +1675,6 @@ struct ieee80211_local { struct idr ack_status_frames; spinlock_t ack_status_lock; - struct ieee80211_sub_if_data __rcu *p2p_sdata; - /* virtual monitor interface */ struct ieee80211_sub_if_data __rcu *monitor_sdata; struct ieee80211_chan_req monitor_chanreq; @@ -2099,7 +2107,8 @@ void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, const int offset); int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up); void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata); -int ieee80211_add_virtual_monitor(struct ieee80211_local *local); +int ieee80211_add_virtual_monitor(struct ieee80211_local *local, + struct ieee80211_sub_if_data *creator_sdata); void ieee80211_del_virtual_monitor(struct ieee80211_local *local); bool __ieee80211_recalc_txpower(struct ieee80211_link_data *link); @@ -2414,7 +2423,8 @@ static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, * @mode: connection mode for parsing * @start: pointer to the elements * @len: length of the elements - * @action: %true if the elements came from an action frame + * @type: type of the frame the elements came from + * (action, probe response, beacon, etc.) 
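Illustrative aside (not part of the patch): the parser's @type field replaces the old boolean @action, and callers throughout this series derive it by masking frame_control with IEEE80211_FCTL_TYPE, as seen in the ibss.c and mesh.c hunks. A self-contained sketch of that derivation; the local constants mirror the standard 802.11 frame-control layout and are not the kernel's header definitions:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Frame-control masks/values as laid out by the 802.11 frame format. */
#define FCTL_FTYPE        0x000c
#define FCTL_STYPE        0x00f0
#define FCTL_TYPE         (FCTL_FTYPE | FCTL_STYPE)

#define FTYPE_MGMT        0x0000
#define STYPE_BEACON      0x0080
#define STYPE_PROBE_RESP  0x0050
#define STYPE_ACTION      0x00d0

/* frame_control is little-endian on the air; assume it has already been
 * converted to host order here (the kernel uses le16_to_cpu()). */
static uint8_t frame_type(uint16_t frame_control)
{
	return frame_control & FCTL_TYPE;	/* fits in u8: mask is 0xfc */
}

int main(void)
{
	uint16_t beacon_fc = FTYPE_MGMT | STYPE_BEACON;
	uint16_t action_fc = FTYPE_MGMT | STYPE_ACTION;

	assert(frame_type(beacon_fc) == (FTYPE_MGMT | STYPE_BEACON));
	assert(frame_type(action_fc) != (FTYPE_MGMT | STYPE_PROBE_RESP));
	printf("beacon type 0x%02x, action type 0x%02x\n",
	       frame_type(beacon_fc), frame_type(action_fc));
	return 0;
}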
* @filter: bitmap of element IDs to filter out while calculating * the element CRC * @crc: CRC starting value @@ -2432,7 +2442,7 @@ struct ieee80211_elems_parse_params { enum ieee80211_conn_mode mode; const u8 *start; size_t len; - bool action; + u8 type; u64 filter; u32 crc; struct cfg80211_bss *bss; @@ -2444,17 +2454,14 @@ struct ieee802_11_elems * ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params); static inline struct ieee802_11_elems * -ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, - u64 filter, u32 crc, - struct cfg80211_bss *bss) +ieee802_11_parse_elems(const u8 *start, size_t len, u8 type, + struct cfg80211_bss *bss) { struct ieee80211_elems_parse_params params = { .mode = IEEE80211_CONN_MODE_HIGHEST, .start = start, .len = len, - .action = action, - .filter = filter, - .crc = crc, + .type = type, .bss = bss, .link_id = -1, }; @@ -2462,13 +2469,6 @@ ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, return ieee802_11_parse_elems_full(¶ms); } -static inline struct ieee802_11_elems * -ieee802_11_parse_elems(const u8 *start, size_t len, bool action, - struct cfg80211_bss *bss) -{ - return ieee802_11_parse_elems_crc(start, len, action, 0, 0, bss); -} - extern const int ieee802_1d_to_ac[8]; static inline int ieee80211_ac_from_tid(int tid) @@ -2702,7 +2702,8 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, const struct ieee80211_he_operation *he_oper, const struct ieee80211_eht_operation *eht_oper, struct cfg80211_chan_def *chandef); -bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, +bool ieee80211_chandef_s1g_oper(struct ieee80211_local *local, + const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef); void ieee80211_chandef_downgrade(struct cfg80211_chan_def *chandef, struct ieee80211_conn_settings *conn); @@ -2759,9 +2760,7 @@ int ieee80211_chanctx_refcount(struct ieee80211_local *local, void ieee80211_recalc_smps_chanctx(struct ieee80211_local *local, struct ieee80211_chanctx *chanctx); void ieee80211_recalc_chanctx_min_def(struct ieee80211_local *local, - struct ieee80211_chanctx *ctx, - struct ieee80211_link_data *rsvd_for, - bool check_reserved); + struct ieee80211_chanctx *ctx); bool ieee80211_is_radar_required(struct ieee80211_local *local, struct cfg80211_scan_request *req); bool ieee80211_is_radio_idx_in_scan_req(struct wiphy *wiphy, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 07ba68f7cd81..4f04d95c19d4 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -107,6 +107,7 @@ static u32 __ieee80211_recalc_idle(struct ieee80211_local *local, { bool working, scanning, active; unsigned int led_trig_start = 0, led_trig_stop = 0; + struct ieee80211_sub_if_data *iter; lockdep_assert_wiphy(local->hw.wiphy); @@ -117,6 +118,14 @@ static u32 __ieee80211_recalc_idle(struct ieee80211_local *local, working = !local->ops->remain_on_channel && !list_empty(&local->roc_list); + list_for_each_entry(iter, &local->interfaces, list) { + if (iter->vif.type == NL80211_IFTYPE_NAN && + iter->u.nan.started) { + working = true; + break; + } + } + scanning = test_bit(SCAN_SW_SCANNING, &local->scanning) || test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning); @@ -214,6 +223,10 @@ static int ieee80211_can_powered_addr_change(struct ieee80211_sub_if_data *sdata if (netif_carrier_ok(sdata->dev)) return -EBUSY; + /* if any stations are set known (so they know this vif too), reject */ + if (sta_info_get_by_idx(sdata, 0)) + return -EBUSY; + /* First check no 
ROC work is happening on this iface */ list_for_each_entry(roc, &local->roc_list, list) { if (roc->sdata != sdata) @@ -233,12 +246,16 @@ static int ieee80211_can_powered_addr_change(struct ieee80211_sub_if_data *sdata ret = -EBUSY; } + /* + * More interface types could be added here but changing the + * address while powered makes the most sense in client modes. + */ switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - /* More interface types could be added here but changing the - * address while powered makes the most sense in client modes. - */ + /* refuse while connecting */ + if (sdata->u.mgd.auth_data || sdata->u.mgd.assoc_data) + return -EBUSY; break; default: ret = -EOPNOTSUPP; @@ -611,10 +628,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do spin_unlock_bh(&sdata->u.nan.func_lock); break; - case NL80211_IFTYPE_P2P_DEVICE: - /* relies on synchronize_rcu() below */ - RCU_INIT_POINTER(local->p2p_sdata, NULL); - fallthrough; default: wiphy_work_cancel(sdata->local->hw.wiphy, &sdata->work); /* @@ -728,8 +741,9 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do ieee80211_configure_filter(local); ieee80211_hw_config(local, -1, hw_reconf_flags); + /* Passing NULL means an interface is picked for configuration */ if (local->virt_monitors == local->open_count) - ieee80211_add_virtual_monitor(local); + ieee80211_add_virtual_monitor(local, NULL); } void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata) @@ -1163,7 +1177,8 @@ static void ieee80211_sdata_init(struct ieee80211_local *local, ieee80211_link_init(sdata, -1, &sdata->deflink, &sdata->vif.bss_conf); } -int ieee80211_add_virtual_monitor(struct ieee80211_local *local) +int ieee80211_add_virtual_monitor(struct ieee80211_local *local, + struct ieee80211_sub_if_data *creator_sdata) { struct ieee80211_sub_if_data *sdata; int ret; @@ -1171,10 +1186,14 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) ASSERT_RTNL(); lockdep_assert_wiphy(local->hw.wiphy); - if (local->monitor_sdata || - ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) return 0; + /* Already have a monitor set up, configure it */ + sdata = wiphy_dereference(local->hw.wiphy, local->monitor_sdata); + if (sdata) + goto configure_monitor; + sdata = kzalloc(sizeof(*sdata) + local->hw.vif_data_size, GFP_KERNEL); if (!sdata) return -ENOMEM; @@ -1227,6 +1246,32 @@ int ieee80211_add_virtual_monitor(struct ieee80211_local *local) skb_queue_head_init(&sdata->status_queue); wiphy_work_init(&sdata->work, ieee80211_iface_work); +configure_monitor: + /* Copy in the MU-MIMO configuration if set */ + if (!creator_sdata) { + struct ieee80211_sub_if_data *other; + + list_for_each_entry(other, &local->mon_list, list) { + if (!other->vif.bss_conf.mu_mimo_owner) + continue; + + creator_sdata = other; + break; + } + } + + if (creator_sdata && creator_sdata->vif.bss_conf.mu_mimo_owner) { + sdata->vif.bss_conf.mu_mimo_owner = true; + memcpy(&sdata->vif.bss_conf.mu_group, + &creator_sdata->vif.bss_conf.mu_group, + sizeof(sdata->vif.bss_conf.mu_group)); + memcpy(&sdata->u.mntr.mu_follow_addr, + creator_sdata->u.mntr.mu_follow_addr, ETH_ALEN); + + ieee80211_link_info_change_notify(sdata, &sdata->deflink, + BSS_CHANGED_MU_GROUPS); + } + return 0; } @@ -1383,11 +1428,13 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) if (res) goto err_stop; } else { - if (local->virt_monitors == 0 && 
local->open_count == 0) { - res = ieee80211_add_virtual_monitor(local); + /* add/configure if there is no non-monitor interface */ + if (local->virt_monitors == local->open_count) { + res = ieee80211_add_virtual_monitor(local, sdata); if (res) goto err_stop; } + local->virt_monitors++; /* must be before the call to ieee80211_configure_filter */ @@ -1405,6 +1452,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) ieee80211_recalc_idle(local); netif_carrier_on(dev); + list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list); break; default: if (coming_up) { @@ -1468,17 +1516,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) sdata->vif.type != NL80211_IFTYPE_STATION); } - switch (sdata->vif.type) { - case NL80211_IFTYPE_P2P_DEVICE: - rcu_assign_pointer(local->p2p_sdata, sdata); - break; - case NL80211_IFTYPE_MONITOR: - list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list); - break; - default: - break; - } - /* * set_multicast_list will be invoked by the networking core * which will check whether any increments here were done in diff --git a/net/mac80211/key.c b/net/mac80211/key.c index b14e9cd9713f..d5da7ccea66e 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -508,11 +508,16 @@ static int ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, ret = ieee80211_key_enable_hw_accel(new); } } else { - if (!new->local->wowlan) + if (!new->local->wowlan) { ret = ieee80211_key_enable_hw_accel(new); - else if (link_id < 0 || !sdata->vif.active_links || - BIT(link_id) & sdata->vif.active_links) + } else if (link_id < 0 || !sdata->vif.active_links || + BIT(link_id) & sdata->vif.active_links) { new->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; + if (!(new->conf.flags & (IEEE80211_KEY_FLAG_GENERATE_MMIC | + IEEE80211_KEY_FLAG_PUT_MIC_SPACE | + IEEE80211_KEY_FLAG_RESERVE_TAILROOM))) + decrease_tailroom_need_count(sdata, 1); + } } if (ret) diff --git a/net/mac80211/link.c b/net/mac80211/link.c index d71eabe5abf8..1e05845872af 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -23,9 +23,6 @@ static void ieee80211_update_apvlan_links(struct ieee80211_sub_if_data *sdata) list_for_each_entry(vlan, &sdata->u.ap.vlans, u.vlan.list) { int link_id; - if (!vlan) - continue; - /* No support for 4addr with MLO yet */ if (vlan->wdev.use_4addr) return; @@ -119,8 +116,6 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, ieee80211_color_change_finalize_work); wiphy_delayed_work_init(&link->color_collision_detect_work, ieee80211_color_collision_detection_work); - INIT_LIST_HEAD(&link->assigned_chanctx_list); - INIT_LIST_HEAD(&link->reserved_chanctx_list); wiphy_delayed_work_init(&link->dfs_cac_timer_work, ieee80211_dfs_cac_timer_work); @@ -472,10 +467,10 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata, * from there. 
*/ if (link->conf->csa_active) - wiphy_delayed_work_queue(local->hw.wiphy, + wiphy_hrtimer_work_queue(local->hw.wiphy, &link->u.mgd.csa.switch_work, link->u.mgd.csa.time - - jiffies); + ktime_get_boottime()); } for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) { diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 9c8f18b258a6..b05e313c7f17 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -356,8 +356,7 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, if (WARN_ON_ONCE(sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE || sdata->vif.type == NL80211_IFTYPE_NAN || (sdata->vif.type == NL80211_IFTYPE_MONITOR && - !sdata->vif.bss_conf.mu_mimo_owner && - !(changed & BSS_CHANGED_TXPOWER)))) + changed & ~BSS_CHANGED_TXPOWER))) return; if (!check_sdata_in_driver(sdata)) @@ -746,6 +745,11 @@ ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = { BIT(IEEE80211_STYPE_PROBE_REQ >> 4) | BIT(IEEE80211_STYPE_AUTH >> 4), }, + [NL80211_IFTYPE_NAN] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ACTION >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4), + }, }; static const struct ieee80211_ht_cap mac80211_ht_capa_mod_mask = { @@ -862,6 +866,14 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, if (emulate_chanctx || ops->remain_on_channel) wiphy->flags |= WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL; + wiphy->bss_param_support = WIPHY_BSS_PARAM_CTS_PROT | + WIPHY_BSS_PARAM_SHORT_PREAMBLE | + WIPHY_BSS_PARAM_SHORT_SLOT_TIME | + WIPHY_BSS_PARAM_BASIC_RATES | + WIPHY_BSS_PARAM_AP_ISOLATE | + WIPHY_BSS_PARAM_HT_OPMODE | + WIPHY_BSS_PARAM_P2P_CTWINDOW | + WIPHY_BSS_PARAM_P2P_OPPPS; wiphy->features |= NL80211_FEATURE_SK_TX_STATUS | NL80211_FEATURE_SAE | NL80211_FEATURE_HT_IBSS | @@ -1111,7 +1123,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) int result, i; enum nl80211_band band; int channels, max_bitrates; - bool supp_ht, supp_vht, supp_he, supp_eht; + bool supp_ht, supp_vht, supp_he, supp_eht, supp_s1g; struct cfg80211_chan_def dflt_chandef = {}; if (ieee80211_hw_check(hw, QUEUE_CONTROL) && @@ -1164,9 +1176,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (WARN_ON(!ieee80211_hw_check(hw, MFP_CAPABLE))) return -EINVAL; - if (WARN_ON(!ieee80211_hw_check(hw, CONNECTION_MONITOR))) - return -EINVAL; - if (WARN_ON(ieee80211_hw_check(hw, NEED_DTIM_BEFORE_ASSOC))) return -EINVAL; @@ -1227,6 +1236,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) supp_vht = false; supp_he = false; supp_eht = false; + supp_s1g = false; for (band = 0; band < NUM_NL80211_BANDS; band++) { const struct ieee80211_sband_iftype_data *iftd; struct ieee80211_supported_band *sband; @@ -1238,11 +1248,13 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (!dflt_chandef.chan) { /* * Assign the first enabled channel to dflt_chandef - * from the list of channels + * from the list of channels. For S1G interfaces + * ensure it can be used as a primary. 
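Illustrative aside (not part of the patch): the default-chandef selection that follows now also skips channels flagged as unusable as an S1G primary, falling back to the first channel when nothing qualifies. A small sketch of that first-usable-else-first scan; the flag names are stand-ins for the IEEE80211_CHAN_* bits:

#include <stdint.h>
#include <stdio.h>

#define CHAN_DISABLED        0x1u
#define CHAN_S1G_NO_PRIMARY  0x2u	/* stand-in for the new channel flag */

/* Return the index of the first channel not carrying any excluded flag,
 * or fall back to index 0 if every channel is excluded. */
static int pick_default_channel(const uint32_t *flags, int n_channels)
{
	int i;

	for (i = 0; i < n_channels; i++)
		if (!(flags[i] & (CHAN_DISABLED | CHAN_S1G_NO_PRIMARY)))
			break;
	return i == n_channels ? 0 : i;
}

int main(void)
{
	uint32_t flags[] = { CHAN_DISABLED, CHAN_S1G_NO_PRIMARY, 0, 0 };
	uint32_t all_bad[] = { CHAN_DISABLED, CHAN_DISABLED };

	printf("picked index %d (expect 2)\n", pick_default_channel(flags, 4));
	printf("picked index %d (expect 0)\n", pick_default_channel(all_bad, 2));
	return 0;
}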
*/ for (i = 0; i < sband->n_channels; i++) if (!(sband->channels[i].flags & - IEEE80211_CHAN_DISABLED)) + (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_S1G_NO_PRIMARY))) break; /* if none found then use the first anyway */ if (i == sband->n_channels) @@ -1274,6 +1286,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) max_bitrates = sband->n_bitrates; supp_ht = supp_ht || sband->ht_cap.ht_supported; supp_vht = supp_vht || sband->vht_cap.vht_supported; + supp_s1g = supp_s1g || sband->s1g_cap.s1g; for_each_sband_iftype_data(sband, i, iftd) { u8 he_40_mhz_cap; @@ -1406,6 +1419,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->scan_ies_len += 2 + sizeof(struct ieee80211_vht_cap); + if (supp_s1g) + local->scan_ies_len += 2 + sizeof(struct ieee80211_s1g_cap); + /* * HE cap element is variable in size - set len to allow max size */ if (supp_he) { diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index a4a715f6f1c3..68901f1def0d 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2018 - 2024 Intel Corporation + * Copyright (C) 2018 - 2025 Intel Corporation * Authors: Luis Carlos Cobo <luisca@cozybit.com> * Javier Cardona <javier@cozybit.com> */ @@ -624,6 +624,9 @@ int mesh_add_he_6ghz_cap_ie(struct ieee80211_sub_if_data *sdata, if (!sband) return -EINVAL; + if (sband->band != NL80211_BAND_6GHZ) + return 0; + iftd = ieee80211_get_sband_iftype_data(sband, NL80211_IFTYPE_MESH_POINT); /* The device doesn't support HE in mesh mode or at all */ @@ -1407,7 +1410,10 @@ ieee80211_mesh_rx_probe_req(struct ieee80211_sub_if_data *sdata, if (baselen > len) return; - elems = ieee802_11_parse_elems(pos, len - baselen, false, NULL); + elems = ieee802_11_parse_elems(pos, len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_PROBE_REQ, + NULL); if (!elems) return; @@ -1452,11 +1458,11 @@ free: } static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, - u16 stype, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_rx_status *rx_status) { + u16 type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE; struct ieee80211_local *local = sdata->local; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; struct ieee802_11_elems *elems; @@ -1466,7 +1472,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, enum nl80211_band band = rx_status->band; /* ignore ProbeResp to foreign address */ - if (stype == IEEE80211_STYPE_PROBE_RESP && + if (type == (IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP) && !ether_addr_equal(mgmt->da, sdata->vif.addr)) return; @@ -1475,8 +1481,7 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, return; elems = ieee802_11_parse_elems(mgmt->u.probe_resp.variable, - len - baselen, - false, NULL); + len - baselen, type, NULL); if (!elems) return; @@ -1511,7 +1516,9 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, } if (ifmsh->sync_ops) - ifmsh->sync_ops->rx_bcn_presp(sdata, stype, mgmt, len, + ifmsh->sync_ops->rx_bcn_presp(sdata, + type & IEEE80211_FCTL_STYPE, + mgmt, len, elems->mesh_config, rx_status); free: kfree(elems); @@ -1619,7 +1626,10 @@ static void mesh_rx_csa_frame(struct ieee80211_sub_if_data *sdata, pos = mgmt->u.action.u.chan_switch.variable; baselen = offsetof(struct ieee80211_mgmt, u.action.u.chan_switch.variable); - elems = ieee802_11_parse_elems(pos, len - baselen, true, NULL); + elems = ieee802_11_parse_elems(pos, 
len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) return; @@ -1696,8 +1706,7 @@ void ieee80211_mesh_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, switch (stype) { case IEEE80211_STYPE_PROBE_RESP: case IEEE80211_STYPE_BEACON: - ieee80211_mesh_rx_bcn_presp(sdata, stype, mgmt, skb->len, - rx_status); + ieee80211_mesh_rx_bcn_presp(sdata, mgmt, skb->len, rx_status); break; case IEEE80211_STYPE_PROBE_REQ: ieee80211_mesh_rx_probe_req(sdata, mgmt, skb->len); diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 9101858525dd..a41b57bd11ff 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2019, 2021-2023 Intel Corporation + * Copyright (C) 2019, 2021-2023, 2025 Intel Corporation * Author: Luis Carlos Cobo <luisca@cozybit.com> */ @@ -951,7 +951,10 @@ void mesh_rx_path_sel_frame(struct ieee80211_sub_if_data *sdata, baselen = (u8 *) mgmt->u.action.u.mesh_action.variable - (u8 *) mgmt; elems = ieee802_11_parse_elems(mgmt->u.action.u.mesh_action.variable, - len - baselen, false, NULL); + len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) return; diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index cb45a5d2009d..04c931cd2063 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2008, 2009 open80211s Ltd. - * Copyright (C) 2019, 2021-2024 Intel Corporation + * Copyright (C) 2019, 2021-2025 Intel Corporation * Author: Luis Carlos Cobo <luisca@cozybit.com> */ #include <linux/gfp.h> @@ -1248,7 +1248,10 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, if (baselen > len) return; } - elems = ieee802_11_parse_elems(baseaddr, len - baselen, true, NULL); + elems = ieee802_11_parse_elems(baseaddr, len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (elems) { mesh_process_plink_frame(sdata, mgmt, elems, rx_status); kfree(elems); diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c index 20e022a03933..ebab1f0a0138 100644 --- a/net/mac80211/mesh_ps.c +++ b/net/mac80211/mesh_ps.c @@ -586,7 +586,7 @@ void ieee80211_mps_frame_release(struct sta_info *sta, if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) has_buffered = ieee80211_check_tim(elems->tim, elems->tim_len, - sta->mesh->aid); + sta->mesh->aid, false); if (has_buffered) mps_dbg(sta->sdata, "%pM indicates buffered frames\n", diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 1008eb8e9b13..e56ad4b9330f 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -45,7 +45,7 @@ #define IEEE80211_ASSOC_TIMEOUT_SHORT (HZ / 10) #define IEEE80211_ASSOC_MAX_TRIES 3 -#define IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS msecs_to_jiffies(100) +#define IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS (100 * USEC_PER_MSEC) #define IEEE80211_ADV_TTLM_ST_UNDERFLOW 0xff00 #define IEEE80211_NEG_TTLM_REQ_TIMEOUT (HZ / 5) @@ -180,10 +180,11 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, /* get special S1G case out of the way */ if (sband->band == NL80211_BAND_S1GHZ) { - if (!ieee80211_chandef_s1g_oper(elems->s1g_oper, chandef)) { - sdata_info(sdata, - "Missing S1G Operation Element? 
Trying operating == primary\n"); - chandef->width = ieee80211_s1g_channel_width(channel); + if (!ieee80211_chandef_s1g_oper(sdata->local, elems->s1g_oper, + chandef)) { + /* Fallback to default 1MHz */ + chandef->width = NL80211_CHAN_WIDTH_1; + chandef->s1g_primary_2mhz = false; } return IEEE80211_CONN_MODE_S1G; @@ -275,11 +276,8 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, return IEEE80211_CONN_MODE_VHT; } } else if (!vht_oper || !elems->vht_cap_elem) { - if (sband->band == NL80211_BAND_5GHZ) { - sdata_info(sdata, - "VHT information is missing, disabling VHT\n"); + if (sband->band == NL80211_BAND_5GHZ) return IEEE80211_CONN_MODE_HT; - } no_vht = true; } else if (sband->band == NL80211_BAND_2GHZ) { no_vht = true; @@ -1001,6 +999,9 @@ ieee80211_determine_chan_mode(struct ieee80211_sub_if_data *sdata, .from_ap = true, .start = ies->data, .len = ies->len, + .type = ies->from_beacon ? + IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON : + IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP, }; struct ieee802_11_elems *elems; struct ieee80211_supported_band *sband; @@ -1046,6 +1047,14 @@ again: ret = -EINVAL; goto free; } + + chanreq->oper = *ap_chandef; + if (!cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, + IEEE80211_CHAN_DISABLED)) { + ret = -EINVAL; + goto free; + } + return elems; case NL80211_BAND_6GHZ: if (ap_mode < IEEE80211_CONN_MODE_HE) { @@ -1189,6 +1198,14 @@ again: "required MCSes not supported, disabling EHT\n"); } + if (conn->mode >= IEEE80211_CONN_MODE_EHT && + channel->band != NL80211_BAND_2GHZ && + conn->bw_limit == IEEE80211_CONN_BW_LIMIT_40) { + conn->mode = IEEE80211_CONN_MODE_HE; + link_id_info(sdata, link_id, + "required bandwidth not supported, disabling EHT\n"); + } + /* the mode can only decrease, so this must terminate */ if (ap_mode != conn->mode) { kfree(elems); @@ -1842,7 +1859,8 @@ ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata, ieee80211_put_he_cap(skb, sdata, sband, &assoc_data->link[link_id].conn); ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_HE_CAPABILITY); - ieee80211_put_he_6ghz_cap(skb, sdata, smps_mode); + if (sband->band == NL80211_BAND_6GHZ) + ieee80211_put_he_6ghz_cap(skb, sdata, smps_mode); } /* @@ -2104,8 +2122,11 @@ ieee80211_link_common_elems_size(struct ieee80211_sub_if_data *sdata, sizeof(struct ieee80211_he_mcs_nss_supp) + IEEE80211_HE_PPE_THRES_MAX_LEN; - if (sband->band == NL80211_BAND_6GHZ) + if (sband->band == NL80211_BAND_6GHZ) { size += 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa); + /* reg connection */ + size += 4; + } size += 2 + 1 + sizeof(struct ieee80211_eht_cap_elem) + sizeof(struct ieee80211_eht_mcs_nss_supp) + @@ -2179,11 +2200,7 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) 2 + /* ext capa & op */ 2; /* EML capa */ - /* - * The capability elements were already considered above; - * note this over-estimates a bit because there's no - * STA profile for the assoc link. - */ + /* The capability elements were already considered above */ size += (n_links - 1) * (1 + 1 + /* subelement ID/length */ 2 + /* STA control */ @@ -2491,6 +2508,16 @@ static void ieee80211_csa_switch_work(struct wiphy *wiphy, link->u.mgd.csa.waiting_bcn = true; + /* + * The next beacon really should always be different, so this should + * have no effect whatsoever. However, some APs (we observed this in + * an Asus AXE11000), the beacon after the CSA might be identical to + * the last beacon on the old channel - in this case we'd ignore it. 
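Illustrative aside (not part of the patch): with the switch work converted to wiphy_hrtimer_work, the CSA deadline is kept as an absolute boottime timestamp rather than a jiffies delay, (count - 1) beacon intervals are converted from TUs to microseconds (1 TU = 1024 us) and added to the current time, and the queueing delay is then simply target minus now. A userspace sketch of the same arithmetic using plain 64-bit microsecond counters in place of ktime_t:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define USEC_PER_TU 1024ULL	/* one 802.11 time unit is 1024 microseconds */

/* Absolute switch time: now + (count - 1) beacon intervals, all in usec. */
static uint64_t csa_switch_time_us(uint64_t now_us, int csa_count,
				   unsigned int beacon_int_tu)
{
	unsigned int count = csa_count > 1 ? csa_count : 1;
	uint64_t delay_tu = (uint64_t)(count - 1) * beacon_int_tu;

	return now_us + delay_tu * USEC_PER_TU;
}

int main(void)
{
	uint64_t now = 1000000;			/* pretend boottime, usec */
	uint64_t target = csa_switch_time_us(now, 5, 100);

	/* 4 remaining beacon intervals of 100 TU each = 409600 usec */
	printf("switch in %" PRIu64 " usec\n", target - now);
	return 0;
}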
+ * Resetting the CRC will lead us to handle it better (albeit with a + * disconnect, but clearly the AP is broken.) + */ + link->u.mgd.beacon_crc_valid = false; + /* apply new TPE restrictions immediately on the new channel */ if (link->u.mgd.csa.ap_chandef.chan->band == NL80211_BAND_6GHZ && link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_HE) { @@ -2577,7 +2604,7 @@ void ieee80211_chswitch_done(struct ieee80211_vif *vif, bool success, return; } - wiphy_delayed_work_queue(sdata->local->hw.wiphy, + wiphy_hrtimer_work_queue(sdata->local->hw.wiphy, &link->u.mgd.csa.switch_work, 0); } @@ -2736,7 +2763,8 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, .timestamp = timestamp, .device_timestamp = device_timestamp, }; - unsigned long now; + u32 csa_time_tu; + ktime_t now; int res; lockdep_assert_wiphy(local->hw.wiphy); @@ -2966,10 +2994,9 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, csa_ie.mode); /* we may have to handle timeout for deactivated link in software */ - now = jiffies; - link->u.mgd.csa.time = now + - TU_TO_JIFFIES((max_t(int, csa_ie.count, 1) - 1) * - link->conf->beacon_int); + now = ktime_get_boottime(); + csa_time_tu = (max_t(int, csa_ie.count, 1) - 1) * link->conf->beacon_int; + link->u.mgd.csa.time = now + us_to_ktime(ieee80211_tu_to_usec(csa_time_tu)); if (ieee80211_vif_link_active(&sdata->vif, link->link_id) && local->ops->channel_switch) { @@ -2984,7 +3011,7 @@ ieee80211_sta_process_chanswitch(struct ieee80211_link_data *link, } /* channel switch handled in software */ - wiphy_delayed_work_queue(local->hw.wiphy, + wiphy_hrtimer_work_queue(local->hw.wiphy, &link->u.mgd.csa.switch_work, link->u.mgd.csa.time - now); return; @@ -4225,14 +4252,14 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, memset(&sdata->u.mgd.ttlm_info, 0, sizeof(sdata->u.mgd.ttlm_info)); - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->ttlm_work); + wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &ifmgd->ttlm_work); memset(&sdata->vif.neg_ttlm, 0, sizeof(sdata->vif.neg_ttlm)); wiphy_delayed_work_cancel(sdata->local->hw.wiphy, &ifmgd->neg_ttlm_timeout_work); sdata->u.mgd.removed_links = 0; - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work); wiphy_work_cancel(sdata->local->hw.wiphy, @@ -5153,7 +5180,9 @@ static void ieee80211_epcs_teardown(struct ieee80211_sub_if_data *sdata) continue; } - elems = ieee802_11_parse_elems(ies->data, ies->len, false, + elems = ieee802_11_parse_elems(ies->data, ies->len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_BEACON, NULL); if (!elems) { rcu_read_unlock(); @@ -5199,6 +5228,7 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, .len = elem_len, .link_id = link_id == assoc_data->assoc_link_id ? 
-1 : link_id, .from_ap = true, + .type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE, }; bool is_5ghz = cbss->channel->band == NL80211_BAND_5GHZ; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; @@ -5721,7 +5751,7 @@ static u8 ieee80211_max_rx_chains(struct ieee80211_link_data *link, he_cap_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, ies->data, ies->len); - if (!he_cap_elem || he_cap_elem->datalen < sizeof(*he_cap)) + if (!he_cap_elem || he_cap_elem->datalen < sizeof(*he_cap) + 1) return chains; /* skip one byte ext_tag_id */ @@ -6004,24 +6034,6 @@ ieee80211_determine_our_sta_mode_assoc(struct ieee80211_sub_if_data *sdata, conn->bw_limit, tmp.bw_limit); } -static enum ieee80211_ap_reg_power -ieee80211_ap_power_type(u8 control) -{ - switch (u8_get_bits(control, IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { - case IEEE80211_6GHZ_CTRL_REG_LPI_AP: - case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: - return IEEE80211_REG_LPI_AP; - case IEEE80211_6GHZ_CTRL_REG_SP_AP: - case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: - case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD: - return IEEE80211_REG_SP_AP; - case IEEE80211_6GHZ_CTRL_REG_VLP_AP: - return IEEE80211_REG_VLP_AP; - default: - return IEEE80211_REG_UNSET_AP; - } -} - static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, int link_id, @@ -6064,7 +6076,8 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, he_6ghz_oper = ieee80211_he_6ghz_oper(elems->he_operation); if (he_6ghz_oper) link->conf->power_type = - ieee80211_ap_power_type(he_6ghz_oper->control); + cfg80211_6ghz_power_type(he_6ghz_oper->control, + cbss->channel->flags); else link_info(link, "HE 6 GHz operation missing (on %d MHz), expect issues\n", @@ -6095,9 +6108,10 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, ret = ieee80211_link_use_channel(link, &chanreq, IEEE80211_CHANCTX_SHARED); - /* don't downgrade for 5 and 10 MHz channels, though. */ + /* don't downgrade for 5/10/S1G MHz channels, though. 
*/ if (chanreq.oper.width == NL80211_CHAN_WIDTH_5 || - chanreq.oper.width == NL80211_CHAN_WIDTH_10) + chanreq.oper.width == NL80211_CHAN_WIDTH_10 || + cfg80211_chandef_is_s1g(&chanreq.oper)) return ret; while (ret && chanreq.oper.width != NL80211_CHAN_WIDTH_20_NOHT) { @@ -6332,6 +6346,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, .bss = NULL, .link_id = -1, .from_ap = true, + .type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE, }; struct ieee802_11_elems *elems; int ac; @@ -6348,6 +6363,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, }; u8 ap_mld_addr[ETH_ALEN] __aligned(2); unsigned int link_id; + u16 max_aid = IEEE80211_MAX_AID; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -6374,10 +6390,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, reassoc = ieee80211_is_reassoc_resp(mgmt->frame_control); capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); - if (assoc_data->s1g) + if (assoc_data->s1g) { elem_start = mgmt->u.s1g_assoc_resp.variable; - else + max_aid = IEEE80211_MAX_SUPPORTED_S1G_AID; + } else { elem_start = mgmt->u.assoc_resp.variable; + } /* * Note: this may not be perfect, AP might misbehave - if @@ -6401,16 +6419,15 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, if (elems->aid_resp) aid = le16_to_cpu(elems->aid_resp->aid); - else if (assoc_data->s1g) - aid = 0; /* TODO */ else aid = le16_to_cpu(mgmt->u.assoc_resp.aid); /* - * The 5 MSB of the AID field are reserved - * (802.11-2016 9.4.1.8 AID field) + * The 5 MSB of the AID field are reserved for a non-S1G STA. For + * an S1G STA the 3 MSBs are reserved. + * (802.11-2016 9.4.1.8 AID field). */ - aid &= 0x7ff; + aid &= assoc_data->s1g ? 0x1fff : 0x7ff; sdata_info(sdata, "RX %sssocResp from %pM (capab=0x%x status=%d aid=%d)\n", @@ -6447,7 +6464,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); } else { - if (aid == 0 || aid > IEEE80211_MAX_AID) { + if (aid == 0 || aid > max_aid) { sdata_info(sdata, "invalid AID value %d (out of range), turn off PS\n", aid); @@ -6485,6 +6502,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, } sdata->vif.cfg.aid = aid; + sdata->vif.cfg.s1g = assoc_data->s1g; if (!ieee80211_assoc_success(sdata, mgmt, elems, elem_start, elem_len)) { @@ -6590,8 +6608,8 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_link_data *link, * Response frame shall be set to the broadcast address [..]" * So, on 6GHz band we should also accept broadcast responses. 
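Illustrative aside (not part of the patch): the association-response path above masks and bounds the AID differently for S1G, only the top three bits are reserved (mask 0x1fff) and the value is capped at the 1600 AIDs mac80211 supports, versus mask 0x7ff and IEEE80211_MAX_AID (2007) for a non-S1G STA. A minimal sketch of that validation; the limits are taken from the hunk and the well-known non-S1G maximum:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_AID     2007	/* non-S1G limit (IEEE80211_MAX_AID) */
#define MAX_S1G_AID 1600	/* mac80211's supported S1G AID space */

/* Strip reserved MSBs and check the AID is in range for the STA type. */
static bool aid_valid(uint16_t raw_aid, bool s1g, uint16_t *aid_out)
{
	uint16_t aid = raw_aid & (s1g ? 0x1fff : 0x7ff);
	uint16_t max = s1g ? MAX_S1G_AID : MAX_AID;

	*aid_out = aid;
	return aid != 0 && aid <= max;
}

int main(void)
{
	uint16_t aid;
	bool ok;

	/* 0xc001: the set MSBs are reserved and masked off -> AID 1 */
	ok = aid_valid(0xc001, false, &aid);
	printf("non-S1G raw 0xc001 -> aid %u, valid=%d\n", aid, ok);

	/* 1700 fits in the S1G mask but exceeds the supported 1600 AIDs */
	ok = aid_valid(1700, true, &aid);
	printf("S1G raw 1700 -> aid %u, valid=%d\n", aid, ok);
	return 0;
}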
*/ - channel = ieee80211_get_channel(sdata->local->hw.wiphy, - rx_status->freq); + channel = ieee80211_get_channel_khz(sdata->local->hw.wiphy, + ieee80211_rx_status_to_khz(rx_status)); if (!channel) return; @@ -6856,7 +6874,7 @@ static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata, /* In case the removal was cancelled, abort it */ if (sdata->u.mgd.removed_links) { sdata->u.mgd.removed_links = 0; - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work); } return; @@ -6886,9 +6904,9 @@ static void ieee80211_ml_reconfiguration(struct ieee80211_sub_if_data *sdata, } sdata->u.mgd.removed_links = removed_links; - wiphy_delayed_work_queue(sdata->local->hw.wiphy, + wiphy_hrtimer_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.ml_reconf_work, - TU_TO_JIFFIES(delay)); + us_to_ktime(ieee80211_tu_to_usec(delay))); } static int ieee80211_ttlm_set_links(struct ieee80211_sub_if_data *sdata, @@ -7075,7 +7093,7 @@ static void ieee80211_process_adv_ttlm(struct ieee80211_sub_if_data *sdata, /* if a planned TID-to-link mapping was cancelled - * abort it */ - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work); } else if (sdata->u.mgd.ttlm_info.active) { /* if no TID-to-link element, set to default mapping in @@ -7110,7 +7128,7 @@ static void ieee80211_process_adv_ttlm(struct ieee80211_sub_if_data *sdata, if (ttlm_info.switch_time) { u16 beacon_ts_tu, st_tu, delay; - u32 delay_jiffies; + u64 delay_usec; u64 mask; /* The t2l map switch time is indicated with a partial @@ -7132,23 +7150,23 @@ static void ieee80211_process_adv_ttlm(struct ieee80211_sub_if_data *sdata, if (delay > IEEE80211_ADV_TTLM_ST_UNDERFLOW) return; - delay_jiffies = TU_TO_JIFFIES(delay); + delay_usec = ieee80211_tu_to_usec(delay); /* Link switching can take time, so schedule it * 100ms before to be ready on time */ - if (delay_jiffies > IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS) - delay_jiffies -= + if (delay_usec > IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS) + delay_usec -= IEEE80211_ADV_TTLM_SAFETY_BUFFER_MS; else - delay_jiffies = 0; + delay_usec = 0; sdata->u.mgd.ttlm_info = ttlm_info; - wiphy_delayed_work_cancel(sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work); - wiphy_delayed_work_queue(sdata->local->hw.wiphy, + wiphy_hrtimer_work_queue(sdata->local->hw.wiphy, &sdata->u.mgd.ttlm_work, - delay_jiffies); + us_to_ktime(delay_usec)); return; } } @@ -7237,7 +7255,9 @@ ieee80211_mgd_check_cross_link_csa(struct ieee80211_sub_if_data *sdata, (prof->sta_info_len - 1), len - (prof->sta_info_len - 1), - false, NULL); + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_BEACON, + NULL); /* memory allocation failed - let's hope that's transient */ if (!prof_elems) @@ -7281,6 +7301,38 @@ static bool ieee80211_mgd_ssid_mismatch(struct ieee80211_sub_if_data *sdata, return memcmp(elems->ssid, cfg->ssid, cfg->ssid_len); } +static bool +ieee80211_rx_beacon_freq_valid(struct ieee80211_local *local, + struct ieee80211_mgmt *mgmt, + struct ieee80211_rx_status *rx_status, + struct ieee80211_chanctx_conf *chanctx) +{ + u32 pri_2mhz_khz; + struct ieee80211_channel *s1g_sibling_1mhz; + u32 pri_khz = ieee80211_channel_to_khz(chanctx->def.chan); + u32 rx_khz = ieee80211_rx_status_to_khz(rx_status); + + if (rx_khz == pri_khz) + return true; + + if (!chanctx->def.s1g_primary_2mhz) + return false; + + /* + * If we have an S1G interface with a 2MHz primary, beacons 
are + * sent on the center frequency of the 2MHz primary. Find the sibling + * 1MHz channel and calculate the 2MHz primary center frequency. + */ + s1g_sibling_1mhz = cfg80211_s1g_get_primary_sibling(local->hw.wiphy, + &chanctx->def); + if (!s1g_sibling_1mhz) + return false; + + pri_2mhz_khz = + (pri_khz + ieee80211_channel_to_khz(s1g_sibling_1mhz)) / 2; + return rx_khz == pri_2mhz_khz; +} + static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, struct ieee80211_hdr *hdr, size_t len, struct ieee80211_rx_status *rx_status) @@ -7309,6 +7361,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, .mode = link->u.mgd.conn.mode, .link_id = -1, .from_ap = true, + .type = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_TYPE, }; lockdep_assert_wiphy(local->hw.wiphy); @@ -7335,8 +7388,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, return; } - if (ieee80211_rx_status_to_khz(rx_status) != - ieee80211_channel_to_khz(chanctx_conf->def.chan)) { + if (!ieee80211_rx_beacon_freq_valid(local, mgmt, rx_status, + chanctx_conf)) { rcu_read_unlock(); return; } @@ -7432,7 +7485,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, ncrc = elems->crc; if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && - ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid)) { + ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid, + vif_cfg->s1g)) { if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; @@ -7910,7 +7964,10 @@ void ieee80211_process_neg_ttlm_req(struct ieee80211_sub_if_data *sdata, ies_len = len - offsetof(struct ieee80211_mgmt, u.action.u.ttlm_req.variable); elems = ieee802_11_parse_elems(mgmt->u.action.u.ttlm_req.variable, - ies_len, true, NULL); + ies_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) { ttlm_res = NEG_TTLM_RES_REJECT; goto out; @@ -8116,9 +8173,11 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, break; /* CSA IE cannot be overridden, no need for BSSID */ - elems = ieee802_11_parse_elems( - mgmt->u.action.u.chan_switch.variable, - ies_len, true, NULL); + elems = ieee802_11_parse_elems(mgmt->u.action.u.chan_switch.variable, + ies_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (elems && !elems->parse_error) { enum ieee80211_csa_source src = @@ -8145,9 +8204,11 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, * extended CSA IE can't be overridden, no need for * BSSID */ - elems = ieee802_11_parse_elems( - mgmt->u.action.u.ext_chan_switch.variable, - ies_len, true, NULL); + elems = ieee802_11_parse_elems(mgmt->u.action.u.ext_chan_switch.variable, + ies_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (elems && !elems->parse_error) { enum ieee80211_csa_source src; @@ -8740,7 +8801,7 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) ieee80211_csa_connection_drop_work); wiphy_delayed_work_init(&ifmgd->tdls_peer_del_work, ieee80211_tdls_peer_del_work); - wiphy_delayed_work_init(&ifmgd->ml_reconf_work, + wiphy_hrtimer_work_init(&ifmgd->ml_reconf_work, ieee80211_ml_reconf_work); wiphy_delayed_work_init(&ifmgd->reconf.wk, ieee80211_ml_sta_reconf_timeout); @@ -8749,7 +8810,7 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) timer_setup(&ifmgd->conn_mon_timer, ieee80211_sta_conn_mon_timer, 0); wiphy_delayed_work_init(&ifmgd->tx_tspec_wk, 
ieee80211_sta_handle_tspec_ac_params_wk); - wiphy_delayed_work_init(&ifmgd->ttlm_work, + wiphy_hrtimer_work_init(&ifmgd->ttlm_work, ieee80211_tid_to_link_map_work); wiphy_delayed_work_init(&ifmgd->neg_ttlm_timeout_work, ieee80211_neg_ttlm_timeout_work); @@ -8796,7 +8857,7 @@ void ieee80211_mgd_setup_link(struct ieee80211_link_data *link) else link->u.mgd.req_smps = IEEE80211_SMPS_OFF; - wiphy_delayed_work_init(&link->u.mgd.csa.switch_work, + wiphy_hrtimer_work_init(&link->u.mgd.csa.switch_work, ieee80211_csa_switch_work); ieee80211_clear_tpe(&link->conf->tpe); @@ -10011,7 +10072,7 @@ void ieee80211_mgd_stop_link(struct ieee80211_link_data *link) &link->u.mgd.request_smps_work); wiphy_work_cancel(link->sdata->local->hw.wiphy, &link->u.mgd.recalc_smps); - wiphy_delayed_work_cancel(link->sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(link->sdata->local->hw.wiphy, &link->u.mgd.csa.switch_work); } @@ -10925,7 +10986,10 @@ static void ieee80211_ml_epcs(struct ieee80211_sub_if_data *sdata, pos = scratch + sizeof(control); len -= sizeof(control); - link_elems = ieee802_11_parse_elems(pos, len, false, NULL); + link_elems = ieee802_11_parse_elems(pos, len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!link_elems) continue; @@ -10976,7 +11040,10 @@ void ieee80211_process_epcs_ena_resp(struct ieee80211_sub_if_data *sdata, u.action.u.epcs.variable) - IEEE80211_EPCS_ENA_RESP_BODY_LEN; - elems = ieee802_11_parse_elems(pos, ies_len, true, NULL); + elems = ieee802_11_parse_elems(pos, ies_len, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) return; diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 13df6321634d..ae82533e3c02 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -8,7 +8,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2019, 2022-2024 Intel Corporation + * Copyright (C) 2019, 2022-2025 Intel Corporation */ #include <linux/export.h> #include <net/mac80211.h> @@ -897,6 +897,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, need_offchan = true; break; case NL80211_IFTYPE_NAN: + break; default: return -EOPNOTSUPP; } @@ -910,6 +911,8 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, /* Check if the operating channel is the requested channel */ if (!params->chan && mlo_sta) { need_offchan = false; + } else if (sdata->vif.type == NL80211_IFTYPE_NAN) { + /* Frames can be sent during NAN schedule */ } else if (!need_offchan) { struct ieee80211_chanctx_conf *chanctx_conf = NULL; int i; diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c index c5e0f7f46004..bfc4ecb7a048 100644 --- a/net/mac80211/parse.c +++ b/net/mac80211/parse.c @@ -6,7 +6,7 @@ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2024 Intel Corporation + * Copyright (C) 2018-2025 Intel Corporation * * element parsing for mac80211 */ @@ -286,6 +286,24 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, bitmap_zero(seen_elems, 256); + switch (params->type) { + /* we don't need to parse assoc request, luckily (it's value 0) */ + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_REQ: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_REQ: + default: + WARN(1, "invalid frame type 0x%x for element parsing\n", 
+ params->type); + break; + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ASSOC_RESP: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_REASSOC_RESP: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON: + case IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION: + case IEEE80211_FTYPE_EXT | IEEE80211_STYPE_S1G_BEACON: + break; + } + for_each_element(elem, params->start, params->len) { const struct element *subelem; u8 elem_parse_failed; @@ -566,7 +584,8 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, if (params->mode < IEEE80211_CONN_MODE_VHT) break; - if (!params->action) { + if (params->type != (IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION)) { elem_parse_failed = IEEE80211_PARSE_ERR_UNEXPECTED_ELEM; break; @@ -582,7 +601,8 @@ _ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params, case WLAN_EID_CHANNEL_SWITCH_WRAPPER: if (params->mode < IEEE80211_CONN_MODE_VHT) break; - if (params->action) { + if (params->type == (IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION)) { elem_parse_failed = IEEE80211_PARSE_ERR_UNEXPECTED_ELEM; break; @@ -942,7 +962,7 @@ ieee80211_prep_mle_link_parse(struct ieee80211_elems_parse *elems_parse, sub->len = end - sub->start; sub->mode = params->mode; - sub->action = params->action; + sub->type = params->type; sub->from_ap = params->from_ap; sub->link_id = -1; @@ -1041,7 +1061,7 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) sub.start = elems_parse->scratch_pos; sub.mode = params->mode; sub.len = nontx_len; - sub.action = params->action; + sub.type = params->type; sub.link_id = params->link_id; /* consume the space used for non-transmitted profile */ diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 3cb2ad6d0b28..e441f8541603 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -4,7 +4,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. 
* Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2019, 2022-2024 Intel Corporation + * Copyright (C) 2019, 2022-2025 Intel Corporation */ #include <linux/kernel.h> @@ -98,6 +98,9 @@ void rate_control_tx_status(struct ieee80211_local *local, if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) return; + if (st->info->band >= NUM_NL80211_BANDS) + return; + sband = local->hw.wiphy->bands[st->info->band]; spin_lock_bh(&sta->rate_ctrl_lock); @@ -419,6 +422,9 @@ static bool rate_control_send_low(struct ieee80211_sta *pubsta, int mcast_rate; bool use_basicrate = false; + if (!sband) + return false; + if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); @@ -898,6 +904,9 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, return; sdata = vif_to_sdata(vif); + if (info->band >= NUM_NL80211_BANDS) + return; + sband = sdata->local->hw.wiphy->bands[info->band]; if (ieee80211_is_tx_data(skb)) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 4d4ff4d4917a..6a1899512d07 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -59,7 +59,8 @@ static struct sk_buff *ieee80211_clean_skb(struct sk_buff *skb, status->flag &= ~(RX_FLAG_RADIOTAP_TLV_AT_END | RX_FLAG_RADIOTAP_LSIG | RX_FLAG_RADIOTAP_HE_MU | - RX_FLAG_RADIOTAP_HE); + RX_FLAG_RADIOTAP_HE | + RX_FLAG_RADIOTAP_VHT); hdr = (void *)skb->data; fc = hdr->frame_control; @@ -151,8 +152,10 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, } if (status->encoding == RX_ENC_VHT) { + /* Included even if RX_FLAG_RADIOTAP_VHT is not set */ len = ALIGN(len, 2); len += 12; + BUILD_BUG_ON(sizeof(struct ieee80211_radiotap_vht) != 12); } if (local->hw.radiotap_timestamp.units_pos >= 0) { @@ -195,6 +198,9 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, * The position to look at depends on the existence (or non- * existence) of other elements, so take that into account... 
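Illustrative aside (not part of the patch): the radiotap length/offset bookkeeping gains a VHT case, the field is 2-byte aligned and exactly 12 bytes long (hence the BUILD_BUG_ON), and a driver-prepended struct ieee80211_radiotap_vht shifts the TLV offset just like the HE structs do. A tiny sketch of that length accounting with a local 12-byte struct mirroring the radiotap VHT field layout:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define ALIGN2(x) (((x) + 1) & ~(size_t)1)	/* radiotap VHT needs 2-byte alignment */

/* Same layout as the radiotap VHT field: 12 bytes, no padding required. */
struct radiotap_vht {
	uint16_t known;
	uint8_t flags;
	uint8_t bandwidth;
	uint8_t mcs_nss[4];
	uint8_t coding;
	uint8_t group_id;
	uint16_t partial_aid;
} __attribute__((packed));

int main(void)
{
	size_t len = 13;	/* pretend header length so far (odd on purpose) */

	assert(sizeof(struct radiotap_vht) == 12);

	/* align, then reserve room for the VHT field */
	len = ALIGN2(len);
	len += sizeof(struct radiotap_vht);
	printf("header length after VHT field: %zu (13 -> 14 -> 26)\n", len);
	return 0;
}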
*/ + if (status->flag & RX_FLAG_RADIOTAP_VHT) + tlv_offset += + sizeof(struct ieee80211_radiotap_vht); if (status->flag & RX_FLAG_RADIOTAP_HE) tlv_offset += sizeof(struct ieee80211_radiotap_he); @@ -319,10 +325,17 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, u32 tlvs_len = 0; int mpdulen, chain; unsigned long chains = status->chains; + struct ieee80211_radiotap_vht vht = {}; struct ieee80211_radiotap_he he = {}; struct ieee80211_radiotap_he_mu he_mu = {}; struct ieee80211_radiotap_lsig lsig = {}; + if (status->flag & RX_FLAG_RADIOTAP_VHT) { + vht = *(struct ieee80211_radiotap_vht *)skb->data; + skb_pull(skb, sizeof(vht)); + WARN_ON_ONCE(status->encoding != RX_ENC_VHT); + } + if (status->flag & RX_FLAG_RADIOTAP_HE) { he = *(struct ieee80211_radiotap_he *)skb->data; skb_pull(skb, sizeof(he)); @@ -530,45 +543,61 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, } if (status->encoding == RX_ENC_VHT) { - u16 known = local->hw.radiotap_vht_details; + u16 fill = local->hw.radiotap_vht_details; - rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); - put_unaligned_le16(known, pos); - pos += 2; - /* flags */ - if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) - *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; + /* Leave driver filled fields alone */ + fill &= ~le16_to_cpu(vht.known); + vht.known |= cpu_to_le16(fill); + + if (fill & IEEE80211_RADIOTAP_VHT_KNOWN_GI && + status->enc_flags & RX_ENC_FLAG_SHORT_GI) + vht.flags |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; /* in VHT, STBC is binary */ - if (status->enc_flags & RX_ENC_FLAG_STBC_MASK) - *pos |= IEEE80211_RADIOTAP_VHT_FLAG_STBC; - if (status->enc_flags & RX_ENC_FLAG_BF) + if (fill & IEEE80211_RADIOTAP_VHT_KNOWN_STBC && + status->enc_flags & RX_ENC_FLAG_STBC_MASK) + vht.flags |= IEEE80211_RADIOTAP_VHT_FLAG_STBC; + if (fill & IEEE80211_RADIOTAP_VHT_KNOWN_BEAMFORMED && + status->enc_flags & RX_ENC_FLAG_BF) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_BEAMFORMED; - pos++; - /* bandwidth */ - switch (status->bw) { - case RATE_INFO_BW_80: - *pos++ = 4; - break; - case RATE_INFO_BW_160: - *pos++ = 11; - break; - case RATE_INFO_BW_40: - *pos++ = 1; - break; - default: - *pos++ = 0; + + if (fill & IEEE80211_RADIOTAP_VHT_KNOWN_BANDWIDTH) { + switch (status->bw) { + case RATE_INFO_BW_40: + vht.bandwidth = IEEE80211_RADIOTAP_VHT_BW_40; + break; + case RATE_INFO_BW_80: + vht.bandwidth = IEEE80211_RADIOTAP_VHT_BW_80; + break; + case RATE_INFO_BW_160: + vht.bandwidth = IEEE80211_RADIOTAP_VHT_BW_160; + break; + default: + vht.bandwidth = IEEE80211_RADIOTAP_VHT_BW_20; + break; + } } - /* MCS/NSS */ - *pos = (status->rate_idx << 4) | status->nss; - pos += 4; - /* coding field */ - if (status->enc_flags & RX_ENC_FLAG_LDPC) - *pos |= IEEE80211_RADIOTAP_CODING_LDPC_USER0; - pos++; - /* group ID */ - pos++; - /* partial_aid */ - pos += 2; + + /* + * If the driver filled in mcs_nss[0], then do not touch it. + * + * Otherwise, put some information about MCS/NSS into the + * user 0 field. Note that this is not technically correct for + * an MU frame as we might have decoded a different user. 
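Illustrative aside (not part of the patch): the rewritten VHT radiotap fill only supplies details the driver has not already marked known, the locally available bits are stripped of whatever the driver set, OR-ed into vht.known, and only those freshly claimed bits get populated, while mcs_nss[0] and coding stay untouched when the driver filled user 0. A sketch of that merge with illustrative bit names in place of IEEE80211_RADIOTAP_VHT_KNOWN_*:

#include <stdint.h>
#include <stdio.h>

#define KNOWN_GI   0x1u
#define KNOWN_STBC 0x2u
#define KNOWN_BW   0x4u

struct vht_info {
	uint16_t known;
	uint8_t flags;
	uint8_t bandwidth;
};

/* Merge stack-provided details into a (possibly driver-filled) field:
 * bits already claimed by the driver are left alone. */
static void fill_vht(struct vht_info *vht, uint16_t local_details,
		     uint8_t bw_code)
{
	uint16_t fill = local_details & ~vht->known;

	vht->known |= fill;
	if (fill & KNOWN_BW)
		vht->bandwidth = bw_code;
}

int main(void)
{
	/* Driver already reported bandwidth; the stack knows GI + BW. */
	struct vht_info vht = { .known = KNOWN_BW, .bandwidth = 4 /* 80 MHz */ };

	fill_vht(&vht, KNOWN_GI | KNOWN_BW, 1 /* would mean 40 MHz */);
	printf("known=0x%x bandwidth=%u (driver value kept)\n",
	       vht.known, vht.bandwidth);
	return 0;
}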
+ */ + if (!vht.mcs_nss[0]) { + vht.mcs_nss[0] = (status->rate_idx << 4) | status->nss; + + /* coding field */ + if (status->enc_flags & RX_ENC_FLAG_LDPC) + vht.coding |= IEEE80211_RADIOTAP_CODING_LDPC_USER0; + } + + /* ensure 2 byte alignment */ + while ((pos - (u8 *)rthdr) & 1) + pos++; + rthdr->it_present |= cpu_to_le32(BIT(IEEE80211_RADIOTAP_VHT)); + memcpy(pos, &vht, sizeof(vht)); + pos += sizeof(vht); } if (local->hw.radiotap_timestamp.units_pos >= 0) { @@ -763,6 +792,51 @@ ieee80211_make_monitor_skb(struct ieee80211_local *local, return skb; } +static bool +ieee80211_validate_monitor_radio(struct ieee80211_sub_if_data *sdata, + struct ieee80211_local *local, + struct ieee80211_rx_status *status) +{ + struct wiphy *wiphy = local->hw.wiphy; + int i, freq, bw; + + if (!wiphy->n_radio) + return true; + + switch (status->bw) { + case RATE_INFO_BW_20: + bw = 20000; + break; + case RATE_INFO_BW_40: + bw = 40000; + break; + case RATE_INFO_BW_80: + bw = 80000; + break; + case RATE_INFO_BW_160: + bw = 160000; + break; + case RATE_INFO_BW_320: + bw = 320000; + break; + default: + return false; + } + + freq = MHZ_TO_KHZ(status->freq); + + for (i = 0; i < wiphy->n_radio; i++) { + if (!(sdata->wdev.radio_mask & BIT(i))) + continue; + + if (!ieee80211_radio_freq_range_valid(&wiphy->radio[i], freq, bw)) + continue; + + return true; + } + return false; +} + /* * This function copies a received frame to all monitor interfaces and * returns a cleaned-up SKB that no longer includes the FCS nor the @@ -789,6 +863,9 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, return NULL; } + if (status->flag & RX_FLAG_RADIOTAP_VHT) + rtap_space += sizeof(struct ieee80211_radiotap_vht); + if (status->flag & RX_FLAG_RADIOTAP_HE) rtap_space += sizeof(struct ieee80211_radiotap_he); @@ -855,6 +932,10 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, chandef->chan->center_freq != status->freq) continue; + if (ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR) && + !ieee80211_validate_monitor_radio(sdata, local, status)) + continue; + if (!prev_sdata) { prev_sdata = sdata; continue; @@ -2134,10 +2215,12 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) rx, IEEE80211_CCMP_256_MIC_LEN); break; case WLAN_CIPHER_SUITE_AES_CMAC: - result = ieee80211_crypto_aes_cmac_decrypt(rx); + result = ieee80211_crypto_aes_cmac_decrypt( + rx, IEEE80211_CMAC_128_MIC_LEN); break; case WLAN_CIPHER_SUITE_BIP_CMAC_256: - result = ieee80211_crypto_aes_cmac_256_decrypt(rx); + result = ieee80211_crypto_aes_cmac_decrypt( + rx, IEEE80211_CMAC_256_MIC_LEN); break; case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: @@ -3521,8 +3604,11 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) switch (mgmt->u.action.category) { case WLAN_CATEGORY_HT: - /* reject HT action frames from stations not supporting HT */ - if (!rx->link_sta->pub->ht_cap.ht_supported) + /* reject HT action frames from stations not supporting HT + * or not HE Capable + */ + if (!rx->link_sta->pub->ht_cap.ht_supported && + !rx->link_sta->pub->he_cap.has_he) goto invalid; if (sdata->vif.type != NL80211_IFTYPE_STATION && @@ -4502,8 +4588,16 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) (ieee80211_is_auth(hdr->frame_control) && ether_addr_equal(sdata->vif.addr, hdr->addr1)); case NL80211_IFTYPE_NAN: - /* Currently no frames on NAN interface are allowed */ - return false; + /* Accept only frames that are addressed to the NAN cluster + * (based on the Cluster ID). 
From these frames, accept only + * action frames or authentication frames that are addressed to + * the local NAN interface. + */ + return memcmp(sdata->wdev.u.nan.cluster_id, + hdr->addr3, ETH_ALEN) == 0 && + (ieee80211_is_public_action(hdr, skb->len) || + (ieee80211_is_auth(hdr->frame_control) && + ether_addr_equal(sdata->vif.addr, hdr->addr1))); default: break; } @@ -4895,6 +4989,11 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, /* after this point, don't punt to the slowpath! */ + if (fast_rx->uses_rss) + stats = this_cpu_ptr(rx->link_sta->pcpu_rx_stats); + else + stats = &rx->link_sta->rx_stats; + if (rx->key && !(status->flag & RX_FLAG_MIC_STRIPPED) && pskb_trim(skb, skb->len - fast_rx->icv_len)) goto drop; @@ -4929,6 +5028,8 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, res = ieee80211_rx_mesh_data(rx->sdata, rx->sta, rx->skb); switch (res) { case RX_QUEUED: + stats->last_rx = jiffies; + stats->last_rate = sta_stats_encode_rate(status); return true; case RX_CONTINUE: break; @@ -4942,11 +5043,6 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, drop: dev_kfree_skb(skb); - if (fast_rx->uses_rss) - stats = this_cpu_ptr(rx->link_sta->pcpu_rx_stats); - else - stats = &rx->link_sta->rx_stats; - stats->dropped++; return true; } @@ -5230,12 +5326,20 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, } rx.sdata = prev_sta->sdata; + if (!status->link_valid && prev_sta->sta.mlo) { + struct link_sta_info *link_sta; + + link_sta = link_sta_info_get_bss(rx.sdata, + hdr->addr2); + if (!link_sta) + continue; + + link_id = link_sta->link_id; + } + if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id)) goto out; - if (!status->link_valid && prev_sta->sta.mlo) - continue; - ieee80211_prepare_and_rx_handle(&rx, skb, false); prev_sta = sta; @@ -5243,10 +5347,18 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, if (prev_sta) { rx.sdata = prev_sta->sdata; - if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id)) - goto out; + if (!status->link_valid && prev_sta->sta.mlo) { + struct link_sta_info *link_sta; + + link_sta = link_sta_info_get_bss(rx.sdata, + hdr->addr2); + if (!link_sta) + goto out; - if (!status->link_valid && prev_sta->sta.mlo) + link_id = link_sta->link_id; + } + + if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id)) goto out; if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) @@ -5336,10 +5448,14 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, if (WARN_ON(!local->started)) goto drop; - if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC))) { + if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC) && + !(status->flag & RX_FLAG_NO_PSDU && + status->zero_length_psdu_type == + IEEE80211_RADIOTAP_ZERO_LEN_PSDU_NOT_CAPTURED))) { /* - * Validate the rate, unless a PLCP error means that - * we probably can't have a valid rate here anyway. + * Validate the rate, unless there was a PLCP error which may + * have an invalid rate or the PSDU was not capture and may be + * missing rate information. */ switch (status->encoding) { diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index dbf98aa4cd67..5ef315ed3b0f 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -76,7 +76,11 @@ void ieee80211_inform_bss(struct wiphy *wiphy, if (!update_data) return; - elems = ieee802_11_parse_elems(ies->data, ies->len, false, NULL); + elems = ieee802_11_parse_elems(ies->data, ies->len, + update_data->beacon ? 
+ IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON : + IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_RESP, + NULL); if (!elems) return; @@ -996,15 +1000,15 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; - /* For scanning on the S1G band, detect the channel width according to - * the channel being scanned. - */ + /* For S1G, only scan the 1MHz primaries. */ if (chan->band == NL80211_BAND_S1GHZ) { - local->scan_chandef.width = ieee80211_s1g_channel_width(chan); + local->scan_chandef.width = NL80211_CHAN_WIDTH_1; + local->scan_chandef.s1g_primary_2mhz = false; goto set_channel; } - /* If scanning on oper channel, use whatever channel-type + /* + * If scanning on oper channel, use whatever channel-type * is currently in use. */ if (chan == local->hw.conf.chandef.chan) @@ -1213,7 +1217,8 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band] || - band == NL80211_BAND_6GHZ) + band == NL80211_BAND_6GHZ || + band == NL80211_BAND_S1GHZ) continue; max_n = local->hw.wiphy->bands[band]->n_channels; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 8c550aab9bdc..f4d3b67fda06 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2637,13 +2637,11 @@ static void sta_set_tidstats(struct sta_info *sta, if (link_id < 0 && tid < IEEE80211_NUM_TIDS) { spin_lock_bh(&local->fq.lock); - rcu_read_lock(); tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS); ieee80211_fill_txq_stats(&tidstats->txq_stats, to_txq_info(sta->sta.txq[tid])); - rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); } } @@ -2962,7 +2960,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; u32 thr = 0; - int i, ac, cpu, link_id; + int i, ac, cpu; struct ieee80211_sta_rx_stats *last_rxstats; last_rxstats = sta_get_last_rx_stats(sta, -1); @@ -3204,18 +3202,23 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, if (sta->sta.valid_links) { struct ieee80211_link_data *link; struct link_sta_info *link_sta; + int link_id; ether_addr_copy(sinfo->mld_addr, sta->addr); + + /* assign valid links first for iteration */ + sinfo->valid_links = sta->sta.valid_links; + for_each_valid_link(sinfo, link_id) { link_sta = wiphy_dereference(sta->local->hw.wiphy, sta->link[link_id]); link = wiphy_dereference(sdata->local->hw.wiphy, sdata->link[link_id]); - if (!link_sta || !sinfo->links[link_id] || !link) + if (!link_sta || !sinfo->links[link_id] || !link) { + sinfo->valid_links &= ~BIT(link_id); continue; - - sinfo->valid_links = sta->sta.valid_links; + } sta_set_link_sinfo(sta, sinfo->links[link_id], link, tidstats); } diff --git a/net/mac80211/status.c b/net/mac80211/status.c index a362254b310c..4b38aa0e902a 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2008-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2021-2024 Intel Corporation + * Copyright 2021-2025 Intel Corporation */ #include <linux/export.h> @@ -572,6 +572,7 @@ static struct ieee80211_sub_if_data * ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata; + struct ieee80211_hdr *hdr = (void 
*)skb->data; if (skb->dev) { list_for_each_entry_rcu(sdata, &local->interfaces, list) { @@ -585,7 +586,23 @@ ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb) return NULL; } - return rcu_dereference(local->p2p_sdata); + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + switch (sdata->vif.type) { + case NL80211_IFTYPE_P2P_DEVICE: + break; + case NL80211_IFTYPE_NAN: + if (sdata->u.nan.started) + break; + fallthrough; + default: + continue; + } + + if (ether_addr_equal(sdata->vif.addr, hdr->addr2)) + return sdata; + } + + return NULL; } static void ieee80211_report_ack_skb(struct ieee80211_local *local, diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index ba5fbacbeeda..dbbfe2d6842f 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -6,7 +6,7 @@ * Copyright 2014, Intel Corporation * Copyright 2014 Intel Mobile Communications GmbH * Copyright 2015 - 2016 Intel Deutschland GmbH - * Copyright (C) 2019, 2021-2024 Intel Corporation + * Copyright (C) 2019, 2021-2025 Intel Corporation */ #include <linux/ieee80211.h> @@ -1783,7 +1783,10 @@ ieee80211_process_tdls_channel_switch_resp(struct ieee80211_sub_if_data *sdata, } elems = ieee802_11_parse_elems(tf->u.chan_switch_resp.variable, - skb->len - baselen, false, NULL); + skb->len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) { ret = -ENOMEM; goto out; @@ -1902,7 +1905,10 @@ ieee80211_process_tdls_channel_switch_req(struct ieee80211_sub_if_data *sdata, } elems = ieee802_11_parse_elems(tf->u.chan_switch_req.variable, - skb->len - baselen, false, NULL); + skb->len - baselen, + IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION, + NULL); if (!elems) return -ENOMEM; diff --git a/net/mac80211/tests/Makefile b/net/mac80211/tests/Makefile index 3b0c08356fc5..3c7f874e5c41 100644 --- a/net/mac80211/tests/Makefile +++ b/net/mac80211/tests/Makefile @@ -1,3 +1,3 @@ -mac80211-tests-y += module.o util.o elems.o mfp.o tpe.o chan-mode.o +mac80211-tests-y += module.o util.o elems.o mfp.o tpe.o chan-mode.o s1g_tim.o obj-$(CONFIG_MAC80211_KUNIT_TEST) += mac80211-tests.o diff --git a/net/mac80211/tests/chan-mode.c b/net/mac80211/tests/chan-mode.c index 96c7b3ab2744..adc069065e73 100644 --- a/net/mac80211/tests/chan-mode.c +++ b/net/mac80211/tests/chan-mode.c @@ -2,7 +2,7 @@ /* * KUnit tests for channel mode functions * - * Copyright (C) 2024 Intel Corporation + * Copyright (C) 2024-2025 Intel Corporation */ #include <net/cfg80211.h> #include <kunit/test.h> @@ -28,6 +28,10 @@ static const struct determine_chan_mode_case { u8 vht_basic_mcs_1_4, vht_basic_mcs_5_8; u8 he_basic_mcs_1_4, he_basic_mcs_5_8; u8 eht_mcs7_min_nss; + u16 eht_disabled_subchannels; + u8 eht_bw; + enum ieee80211_conn_bw_limit conn_bw_limit; + enum ieee80211_conn_bw_limit expected_bw_limit; int error; } determine_chan_mode_cases[] = { { @@ -128,6 +132,14 @@ static const struct determine_chan_mode_case { .conn_mode = IEEE80211_CONN_MODE_EHT, .eht_mcs7_min_nss = 0x15, .error = EINVAL, + }, { + .desc = "80 MHz EHT is downgraded to 40 MHz HE due to puncturing", + .conn_mode = IEEE80211_CONN_MODE_EHT, + .expected_mode = IEEE80211_CONN_MODE_HE, + .conn_bw_limit = IEEE80211_CONN_BW_LIMIT_80, + .expected_bw_limit = IEEE80211_CONN_BW_LIMIT_40, + .eht_disabled_subchannels = 0x08, + .eht_bw = IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ, } }; KUNIT_ARRAY_PARAM_DESC(determine_chan_mode, determine_chan_mode_cases, desc) @@ -138,7 +150,7 @@ static void test_determine_chan_mode(struct kunit *test) struct t_sdata *t_sdata = T_SDATA(test); 
struct ieee80211_conn_settings conn = { .mode = params->conn_mode, - .bw_limit = IEEE80211_CONN_BW_LIMIT_20, + .bw_limit = params->conn_bw_limit, }; struct cfg80211_bss cbss = { .channel = &t_sdata->band_5ghz.channels[0], @@ -191,14 +203,21 @@ static void test_determine_chan_mode(struct kunit *test) 0x7f, 0x01, 0x00, 0x88, 0x88, 0x88, 0x00, 0x00, 0x00, /* EHT Operation */ - WLAN_EID_EXTENSION, 0x09, WLAN_EID_EXT_EHT_OPERATION, - 0x01, params->eht_mcs7_min_nss ? params->eht_mcs7_min_nss : 0x11, - 0x00, 0x00, 0x00, 0x00, 0x24, 0x00, + WLAN_EID_EXTENSION, 0x0b, WLAN_EID_EXT_EHT_OPERATION, + 0x03, params->eht_mcs7_min_nss ? params->eht_mcs7_min_nss : 0x11, + 0x00, 0x00, 0x00, params->eht_bw, + params->eht_bw == IEEE80211_EHT_OPER_CHAN_WIDTH_80MHZ ? 42 : 36, + 0x00, + u16_get_bits(params->eht_disabled_subchannels, 0xff), + u16_get_bits(params->eht_disabled_subchannels, 0xff00), }; struct ieee80211_chan_req chanreq = {}; struct cfg80211_chan_def ap_chandef = {}; struct ieee802_11_elems *elems; + /* To force EHT downgrade to HE on punctured 80 MHz downgraded to 40 MHz */ + set_bit(IEEE80211_HW_DISALLOW_PUNCTURING, t_sdata->local.hw.flags); + if (params->strict) set_bit(IEEE80211_HW_STRICT, t_sdata->local.hw.flags); else @@ -237,6 +256,7 @@ static void test_determine_chan_mode(struct kunit *test) } else { KUNIT_ASSERT_NOT_ERR_OR_NULL(test, elems); KUNIT_ASSERT_EQ(test, conn.mode, params->expected_mode); + KUNIT_ASSERT_EQ(test, conn.bw_limit, params->expected_bw_limit); } } diff --git a/net/mac80211/tests/elems.c b/net/mac80211/tests/elems.c index a53c55a879a8..1039794a0183 100644 --- a/net/mac80211/tests/elems.c +++ b/net/mac80211/tests/elems.c @@ -2,7 +2,7 @@ /* * KUnit tests for element parsing * - * Copyright (C) 2023-2024 Intel Corporation + * Copyright (C) 2023-2025 Intel Corporation */ #include <kunit/test.h> #include "../ieee80211_i.h" @@ -15,6 +15,8 @@ static void mle_defrag(struct kunit *test) .link_id = 12, .from_ap = true, .mode = IEEE80211_CONN_MODE_EHT, + /* type is not really relevant here */ + .type = IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_BEACON, }; struct ieee802_11_elems *parsed; struct sk_buff *skb; diff --git a/net/mac80211/tests/s1g_tim.c b/net/mac80211/tests/s1g_tim.c new file mode 100644 index 000000000000..642fa4ece89f --- /dev/null +++ b/net/mac80211/tests/s1g_tim.c @@ -0,0 +1,356 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KUnit tests for S1G TIM PVB decoding. This test suite covers + * IEEE80211-2024 Annex L figures 8, 9, 10, 12, 13, 14. ADE mode + * is not covered as it is an optional encoding format and is not + * currently supported by mac80211. + * + * Copyright (C) 2025 Morse Micro + */ +#include <linux/ieee80211.h> +#include <kunit/test.h> +#include <kunit/test-bug.h> + +#define MAX_AID 128 + +#define BC(enc_mode, inverse, blk_off) \ + ((((blk_off) & 0x1f) << 3) | ((inverse) ? BIT(2) : 0) | \ + ((enc_mode) & 0x3)) + +static void byte_to_bitstr(u8 v, char *out) +{ + for (int b = 7; b >= 0; b--) + *out++ = (v & BIT(b)) ? '1' : '0'; + *out = '\0'; +} + +static void dump_tim_bits(struct kunit *test, + const struct ieee80211_tim_ie *tim, u8 tim_len) +{ + const u8 *ptr = tim->virtual_map; + const u8 *end = (const u8 *)tim + tim_len; + unsigned int oct = 1; + unsigned int blk = 0; + char bits[9]; + + while (ptr < end) { + u8 ctrl = *ptr++; + u8 mode = ctrl & 0x03; + bool inverse = ctrl & BIT(2); + u8 blk_off = ctrl >> 3; + + kunit_info( + test, "Block %u (ENC=%s, blk_off=%u, inverse=%u)", blk, + (mode == IEEE80211_S1G_TIM_ENC_MODE_BLOCK) ? 
"BLOCK" : + (mode == IEEE80211_S1G_TIM_ENC_MODE_SINGLE) ? "SINGLE" : + "OLB", + blk_off, inverse); + + byte_to_bitstr(ctrl, bits); + kunit_info(test, " octet %2u (ctrl) : %s (0x%02x)", oct, + bits, ctrl); + ++oct; + + switch (mode) { + case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: { + u8 blkmap = *ptr++; + + byte_to_bitstr(blkmap, bits); + kunit_info(test, " octet %2u (blk-map) : %s (0x%02x)", + oct, bits, blkmap); + ++oct; + + for (u8 sb = 0; sb < 8; sb++) { + if (!(blkmap & BIT(sb))) + continue; + u8 sub = *ptr++; + + byte_to_bitstr(sub, bits); + kunit_info( + test, + " octet %2u (SB %2u) : %s (0x%02x)", + oct, sb, bits, sub); + ++oct; + } + break; + } + case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: { + u8 single = *ptr++; + + byte_to_bitstr(single, bits); + kunit_info(test, " octet %2u (single) : %s (0x%02x)", + oct, bits, single); + ++oct; + break; + } + case IEEE80211_S1G_TIM_ENC_MODE_OLB: { + u8 len = *ptr++; + + byte_to_bitstr(len, bits); + kunit_info(test, " octet %2u (len=%2u) : %s (0x%02x)", + oct, len, bits, len); + ++oct; + + for (u8 i = 0; i < len && ptr < end; i++) { + u8 sub = *ptr++; + + byte_to_bitstr(sub, bits); + kunit_info( + test, + " octet %2u (SB %2u) : %s (0x%02x)", + oct, i, bits, sub); + ++oct; + } + break; + } + default: + kunit_info(test, " ** unknown encoding 0x%x **", mode); + return; + } + blk++; + } +} + +static void tim_push(u8 **p, u8 v) +{ + *(*p)++ = v; +} + +static void tim_begin(struct ieee80211_tim_ie *tim, u8 **p) +{ + tim->dtim_count = 0; + tim->dtim_period = 1; + tim->bitmap_ctrl = 0; + *p = tim->virtual_map; +} + +static u8 tim_end(struct ieee80211_tim_ie *tim, u8 *tail) +{ + return tail - (u8 *)tim; +} + +static void pvb_add_block_bitmap(u8 **p, u8 blk_off, bool inverse, u8 blk_bmap, + const u8 *subblocks) +{ + u8 enc = IEEE80211_S1G_TIM_ENC_MODE_BLOCK; + u8 n = hweight8(blk_bmap); + + tim_push(p, BC(enc, inverse, blk_off)); + tim_push(p, blk_bmap); + + for (u8 i = 0; i < n; i++) + tim_push(p, subblocks[i]); +} + +static void pvb_add_single_aid(u8 **p, u8 blk_off, bool inverse, u8 single6) +{ + u8 enc = IEEE80211_S1G_TIM_ENC_MODE_SINGLE; + + tim_push(p, BC(enc, inverse, blk_off)); + tim_push(p, single6 & GENMASK(5, 0)); +} + +static void pvb_add_olb(u8 **p, u8 blk_off, bool inverse, const u8 *subblocks, + u8 len) +{ + u8 enc = IEEE80211_S1G_TIM_ENC_MODE_OLB; + + tim_push(p, BC(enc, inverse, blk_off)); + tim_push(p, len); + for (u8 i = 0; i < len; i++) + tim_push(p, subblocks[i]); +} + +static void check_all_aids(struct kunit *test, + const struct ieee80211_tim_ie *tim, u8 tim_len, + const unsigned long *expected) +{ + for (u16 aid = 1; aid <= MAX_AID; aid++) { + bool want = test_bit(aid, expected); + bool got = ieee80211_s1g_check_tim(tim, tim_len, aid); + + KUNIT_ASSERT_EQ_MSG(test, got, want, + "AID %u mismatch (got=%d want=%d)", aid, + got, want); + } +} + +static void fill_bitmap(unsigned long *bm, const u16 *list, size_t n) +{ + size_t i; + + bitmap_zero(bm, MAX_AID + 1); + for (i = 0; i < n; i++) + __set_bit(list[i], bm); +} + +static void fill_bitmap_inverse(unsigned long *bm, u16 max_aid, + const u16 *except, size_t n_except) +{ + bitmap_zero(bm, MAX_AID + 1); + for (u16 aid = 1; aid <= max_aid; aid++) + __set_bit(aid, bm); + + for (size_t i = 0; i < n_except; i++) + if (except[i] <= max_aid) + __clear_bit(except[i], bm); +} + +static void s1g_tim_block_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + static const u8 subblocks[] = { + 0x42, /* SB m=0: AIDs 1,6 */ + 0xA0, /* SB m=2: AIDs 
21,23 */ + }; + u8 blk_bmap = 0x05; /* bits 0 and 2 set */ + bool inverse = false; + static const u16 set_list[] = { 1, 6, 21, 23 }; + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_block_bitmap(&p, 0, inverse, blk_bmap, subblocks); + tim_len = tim_end(tim, p); + + fill_bitmap(exp, set_list, ARRAY_SIZE(set_list)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static void s1g_tim_single_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + bool inverse = false; + u8 blk_off = 0; + u8 single6 = 0x1f; /* 31 */ + static const u16 set_list[] = { 31 }; + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_single_aid(&p, blk_off, inverse, single6); + tim_len = tim_end(tim, p); + + fill_bitmap(exp, set_list, ARRAY_SIZE(set_list)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static void s1g_tim_olb_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + bool inverse = false; + u8 blk_off = 0; + static const u16 set_list[] = { 1, 6, 13, 15, 17, 22, 29, 31, 33, + 38, 45, 47, 49, 54, 61, 63, 65, 70 }; + static const u8 subblocks[] = { 0x42, 0xA0, 0x42, 0xA0, 0x42, + 0xA0, 0x42, 0xA0, 0x42 }; + u8 len = ARRAY_SIZE(subblocks); + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_olb(&p, blk_off, inverse, subblocks, len); + tim_len = tim_end(tim, p); + + fill_bitmap(exp, set_list, ARRAY_SIZE(set_list)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static void s1g_tim_inverse_block_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + /* Same sub-block content as Figure L-8, but inverse = true */ + static const u8 subblocks[] = { + 0x42, /* SB m=0: AIDs 1,6 */ + 0xA0, /* SB m=2: AIDs 21,23 */ + }; + u8 blk_bmap = 0x05; + bool inverse = true; + /* All AIDs except 1,6,21,23 are set */ + static const u16 except[] = { 1, 6, 21, 23 }; + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_block_bitmap(&p, 0, inverse, blk_bmap, subblocks); + tim_len = tim_end(tim, p); + + fill_bitmap_inverse(exp, 63, except, ARRAY_SIZE(except)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static void s1g_tim_inverse_single_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + bool inverse = true; + u8 blk_off = 0; + u8 single6 = 0x1f; /* 31 */ + /* All AIDs except 31 are set */ + static const u16 except[] = { 31 }; + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_single_aid(&p, blk_off, inverse, single6); + tim_len = tim_end(tim, p); + + fill_bitmap_inverse(exp, 63, except, ARRAY_SIZE(except)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static void s1g_tim_inverse_olb_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + bool inverse = true; + u8 blk_off = 0, len; + /* All AIDs except the list below are set */ + static const u16 except[] = { 1, 6, 13, 15, 17, 22, 29, 31, 33, + 38, 45, 47, 49, 54, 61, 63, 65, 70 }; + static const u8 subblocks[] = { 0x42, 0xA0, 0x42, 0xA0, 0x42, + 0xA0, 0x42, 0xA0, 0x42 }; + len = ARRAY_SIZE(subblocks); + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_olb(&p, blk_off, inverse, 
subblocks, len); + tim_len = tim_end(tim, p); + + fill_bitmap_inverse(exp, 127, except, ARRAY_SIZE(except)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static struct kunit_case s1g_tim_test_cases[] = { + KUNIT_CASE(s1g_tim_block_test), + KUNIT_CASE(s1g_tim_single_test), + KUNIT_CASE(s1g_tim_olb_test), + KUNIT_CASE(s1g_tim_inverse_block_test), + KUNIT_CASE(s1g_tim_inverse_single_test), + KUNIT_CASE(s1g_tim_inverse_olb_test), + {} +}; + +static struct kunit_suite s1g_tim = { + .name = "mac80211-s1g-tim", + .test_cases = s1g_tim_test_cases, +}; + +kunit_test_suite(s1g_tim); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 00671ae45b2f..9d8b0a25f73c 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -59,6 +59,9 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, if (WARN_ON_ONCE(tx->rate.idx < 0)) return 0; + if (info->band >= NUM_NL80211_BANDS) + return 0; + sband = local->hw.wiphy->bands[info->band]; txrate = &sband->bitrates[tx->rate.idx]; @@ -683,7 +686,10 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) memset(&txrc, 0, sizeof(txrc)); - sband = tx->local->hw.wiphy->bands[info->band]; + if (info->band < NUM_NL80211_BANDS) + sband = tx->local->hw.wiphy->bands[info->band]; + else + return TX_CONTINUE; len = min_t(u32, tx->skb->len + FCS_LEN, tx->local->hw.wiphy->frag_threshold); @@ -1056,9 +1062,11 @@ ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx) return ieee80211_crypto_ccmp_encrypt( tx, IEEE80211_CCMP_256_MIC_LEN); case WLAN_CIPHER_SUITE_AES_CMAC: - return ieee80211_crypto_aes_cmac_encrypt(tx); + return ieee80211_crypto_aes_cmac_encrypt( + tx, IEEE80211_CMAC_128_MIC_LEN); case WLAN_CIPHER_SUITE_BIP_CMAC_256: - return ieee80211_crypto_aes_cmac_256_encrypt(tx); + return ieee80211_crypto_aes_cmac_encrypt( + tx, IEEE80211_CMAC_256_MIC_LEN); case WLAN_CIPHER_SUITE_BIP_GMAC_128: case WLAN_CIPHER_SUITE_BIP_GMAC_256: return ieee80211_crypto_aes_gmac_encrypt(tx); @@ -1814,7 +1822,7 @@ static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx) txh_done: if (unlikely(res == TX_DROP)) { - I802_DEBUG_INC(tx->local->tx_handlers_drop); + tx->sdata->tx_handlers_drop++; if (tx->skb) ieee80211_free_txskb(&tx->local->hw, tx->skb); else @@ -1858,7 +1866,7 @@ static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx) txh_done: if (unlikely(res == TX_DROP)) { - I802_DEBUG_INC(tx->local->tx_handlers_drop); + tx->sdata->tx_handlers_drop++; if (tx->skb) ieee80211_free_txskb(&tx->local->hw, tx->skb); else @@ -1942,6 +1950,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, if (unlikely(res_prepare == TX_DROP)) { ieee80211_free_txskb(&local->hw, skb); + tx.sdata->tx_handlers_drop++; return true; } else if (unlikely(res_prepare == TX_QUEUED)) { return true; @@ -3728,8 +3737,10 @@ void __ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, r = ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs, fast_tx->key, &tx); tx.skb = NULL; - if (r == TX_DROP) + if (r == TX_DROP) { + tx.sdata->tx_handlers_drop++; goto free; + } if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, @@ -4882,15 +4893,114 @@ void ieee80211_tx_pending(struct tasklet_struct *t) /* functions for drivers to get certain frames */ +static void ieee80211_beacon_add_tim_pvb(struct ps_data *ps, + struct sk_buff *skb, + bool mcast_traffic) +{ + int i, n1 = 0, n2; + + /* + * Find largest even number N1 so that bits numbered 1 through + * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits + * (N2 + 1) x 
8 through 2007 are 0. + */ + for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { + if (ps->tim[i]) { + n1 = i & 0xfe; + break; + } + } + n2 = n1; + for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { + if (ps->tim[i]) { + n2 = i; + break; + } + } + + /* Bitmap control */ + skb_put_u8(skb, n1 | mcast_traffic); + /* Part Virt Bitmap */ + skb_put_data(skb, ps->tim + n1, n2 - n1 + 1); +} + +/* + * mac80211 currently supports encoding using block bitmap mode, non + * inversed. The current implementation supports up to 1600 AIDs. + * + * Block bitmap encoding breaks down the AID bitmap into blocks of 64 + * AIDs. Each block contains between 0 and 8 subblocks. Each subblock + * describes 8 AIDs and the presence of a subblock is determined by + * the block bitmap. + */ +static void ieee80211_s1g_beacon_add_tim_pvb(struct ps_data *ps, + struct sk_buff *skb, + bool mcast_traffic) +{ + int blk; + + /* + * Emit a bitmap control block with a page slice number of 31 and a + * page index of 0 which indicates as per IEEE80211-2024 9.4.2.5.1 + * that the entire page (2048 bits) indicated by the page index + * is encoded in the partial virtual bitmap. + */ + skb_put_u8(skb, mcast_traffic | (31 << 1)); + + /* Emit an encoded block for each non-zero sub-block */ + for (blk = 0; blk < IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS; blk++) { + u8 blk_bmap = 0; + int sblk; + + for (sblk = 0; sblk < 8; sblk++) { + int sblk_idx = blk * 8 + sblk; + + /* + * If the current subblock is non-zero, increase the + * number of subblocks to emit for the current block. + */ + if (ps->tim[sblk_idx]) + blk_bmap |= BIT(sblk); + } + + /* If the current block contains no non-zero sublocks */ + if (!blk_bmap) + continue; + + /* + * Emit a block control byte for the current encoded block + * with an encoding mode of block bitmap (0x0), not inverse + * (0x0) and the current block offset (5 bits) + */ + skb_put_u8(skb, blk << 3); + + /* + * Emit the block bitmap for the current encoded block which + * contains the present subblocks. + */ + skb_put_u8(skb, blk_bmap); + + /* Emit the present subblocks */ + for (sblk = 0; sblk < 8; sblk++) { + int sblk_idx = blk * 8 + sblk; + + if (!(blk_bmap & BIT(sblk))) + continue; + + skb_put_u8(skb, ps->tim[sblk_idx]); + } + } +} + static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct ps_data *ps, struct sk_buff *skb, bool is_template) { - u8 *pos, *tim; - int aid0 = 0; - int i, have_bits = 0, n1, n2; + struct element *tim; + bool mcast_traffic = false, have_bits = false; struct ieee80211_bss_conf *link_conf = link->conf; + bool s1g = ieee80211_get_link_sband(link)->band == NL80211_BAND_S1GHZ; /* Generate bitmap for TIM only if there are any STAs in power save * mode. 
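A worked example may help with the block-bitmap layout used by ieee80211_s1g_beacon_add_tim_pvb() above (editorial addition, not part of the patch); the numbers line up with the 0xA0 sub-block used in the new s1g_tim.c KUnit test:

/*
 * Assume only AIDs 21 and 23 are buffered:
 *   octet     = 21 / 8       = 2   (ps->tim[2], bit 21 % 8 = 5)
 *   block     = 21 / 64      = 0
 *   sub-block = (21 / 8) % 8 = 2
 * ps->tim[2] is 0xA0 (bits 5 and 7), so block 0 is emitted as:
 *   0x00  block control (block-bitmap mode, not inverse, offset 0)
 *   0x04  block bitmap  (only sub-block 2 present)
 *   0xA0  sub-block 2   (AIDs 21 and 23)
 */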
*/ @@ -4898,7 +5008,8 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, /* in the hope that this is faster than * checking byte-for-byte */ have_bits = !bitmap_empty((unsigned long *)ps->tim, - IEEE80211_MAX_AID+1); + IEEE80211_MAX_AID + 1); + if (!is_template) { if (ps->dtim_count == 0) ps->dtim_count = link_conf->dtim_period - 1; @@ -4906,51 +5017,39 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, ps->dtim_count--; } - tim = pos = skb_put(skb, 5); - *pos++ = WLAN_EID_TIM; - *pos++ = 3; - *pos++ = ps->dtim_count; - *pos++ = link_conf->dtim_period; + /* Length is set after parsing the AID bitmap */ + tim = skb_put(skb, sizeof(struct element)); + tim->id = WLAN_EID_TIM; + skb_put_u8(skb, ps->dtim_count); + skb_put_u8(skb, link_conf->dtim_period); if (ps->dtim_count == 0 && !skb_queue_empty(&ps->bc_buf)) - aid0 = 1; + mcast_traffic = true; - ps->dtim_bc_mc = aid0 == 1; + ps->dtim_bc_mc = mcast_traffic; if (have_bits) { - /* Find largest even number N1 so that bits numbered 1 through - * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits - * (N2 + 1) x 8 through 2007 are 0. */ - n1 = 0; - for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { - if (ps->tim[i]) { - n1 = i & 0xfe; - break; - } - } - n2 = n1; - for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { - if (ps->tim[i]) { - n2 = i; - break; - } - } - - /* Bitmap control */ - *pos++ = n1 | aid0; - /* Part Virt Bitmap */ - skb_put_data(skb, ps->tim + n1, n2 - n1 + 1); - - tim[1] = n2 - n1 + 4; + if (s1g) + ieee80211_s1g_beacon_add_tim_pvb(ps, skb, + mcast_traffic); + else + ieee80211_beacon_add_tim_pvb(ps, skb, mcast_traffic); } else { - *pos++ = aid0; /* Bitmap control */ - - if (ieee80211_get_link_sband(link)->band != NL80211_BAND_S1GHZ) { - tim[1] = 4; + /* + * If there is no buffered unicast traffic for an S1G + * interface, we can exclude the bitmap control. This is in + * contrast to other phy types as they do include the bitmap + * control and pvb even when there is no buffered traffic. 
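For the non-S1G path selected above, a small numeric example of the N1/N2 trimming done by ieee80211_beacon_add_tim_pvb() (editorial addition, not part of the patch):

/*
 * Assume only AIDs 13 and 200 are buffered:
 *   first non-zero octet of ps->tim = 1  -> n1 = 1 & 0xfe = 0
 *   last  non-zero octet of ps->tim = 25 -> n2 = 25
 * The bitmap control octet is n1 | mcast_traffic, and octets 0..25 of
 * ps->tim (26 octets) are copied out as the partial virtual bitmap.
 */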
+ */ + if (!s1g) { + /* Bitmap control */ + skb_put_u8(skb, mcast_traffic); /* Part Virt Bitmap */ skb_put_u8(skb, 0); } } + + tim->datalen = skb_tail_pointer(skb) - tim->data; } static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, @@ -6195,7 +6294,9 @@ void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata, enum nl80211_band band; rcu_read_lock(); - if (!ieee80211_vif_is_mld(&sdata->vif)) { + if (sdata->vif.type == NL80211_IFTYPE_NAN) { + band = NUM_NL80211_BANDS; + } else if (!ieee80211_vif_is_mld(&sdata->vif)) { WARN_ON(link_id >= 0); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 32f1bc5908c5..0c46009a3d63 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1756,7 +1756,6 @@ int ieee80211_reconfig(struct ieee80211_local *local) bool sched_scan_stopped = false; bool suspended = local->suspended; bool in_reconfig = false; - u32 rts_threshold; lockdep_assert_wiphy(local->hw.wiphy); @@ -1832,7 +1831,9 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* setup RTS threshold */ if (hw->wiphy->n_radio > 0) { for (i = 0; i < hw->wiphy->n_radio; i++) { - rts_threshold = hw->wiphy->radio_cfg[i].rts_threshold; + u32 rts_threshold = + hw->wiphy->radio_cfg[i].rts_threshold; + drv_set_rts_threshold(local, i, rts_threshold); } } else { @@ -2205,9 +2206,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) } } + /* Passing NULL means an interface is picked for configuration */ if (local->virt_monitors > 0 && local->virt_monitors == local->open_count) - ieee80211_add_virtual_monitor(local); + ieee80211_add_virtual_monitor(local, NULL); if (!suspended) return 0; @@ -2346,7 +2348,7 @@ void ieee80211_recalc_min_chandef(struct ieee80211_sub_if_data *sdata, chanctx = container_of(chanctx_conf, struct ieee80211_chanctx, conf); - ieee80211_recalc_chanctx_min_def(local, chanctx, NULL, false); + ieee80211_recalc_chanctx_min_def(local, chanctx); } } @@ -3198,10 +3200,11 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, return true; } -bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, +bool ieee80211_chandef_s1g_oper(struct ieee80211_local *local, + const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef) { - u32 oper_freq; + u32 oper_khz, pri_1mhz_khz, pri_2mhz_khz; if (!oper) return false; @@ -3226,12 +3229,36 @@ bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, return false; } - oper_freq = ieee80211_channel_to_freq_khz(oper->oper_ch, - NL80211_BAND_S1GHZ); - chandef->center_freq1 = KHZ_TO_MHZ(oper_freq); - chandef->freq1_offset = oper_freq % 1000; + chandef->s1g_primary_2mhz = false; - return true; + switch (u8_get_bits(oper->ch_width, S1G_OPER_CH_WIDTH_PRIMARY)) { + case IEEE80211_S1G_PRI_CHANWIDTH_1MHZ: + pri_1mhz_khz = ieee80211_channel_to_freq_khz( + oper->primary_ch, NL80211_BAND_S1GHZ); + break; + case IEEE80211_S1G_PRI_CHANWIDTH_2MHZ: + chandef->s1g_primary_2mhz = true; + pri_2mhz_khz = ieee80211_channel_to_freq_khz( + oper->primary_ch, NL80211_BAND_S1GHZ); + + if (u8_get_bits(oper->ch_width, S1G_OPER_CH_PRIMARY_LOCATION) == + S1G_2M_PRIMARY_LOCATION_LOWER) + pri_1mhz_khz = pri_2mhz_khz - 500; + else + pri_1mhz_khz = pri_2mhz_khz + 500; + break; + default: + return false; + } + + oper_khz = ieee80211_channel_to_freq_khz(oper->oper_ch, + NL80211_BAND_S1GHZ); + chandef->center_freq1 = KHZ_TO_MHZ(oper_khz); + chandef->freq1_offset = oper_khz % 1000; + chandef->chan = + 
ieee80211_get_channel_khz(local->hw.wiphy, pri_1mhz_khz); + + return chandef->chan; } int ieee80211_put_srates_elem(struct sk_buff *skb, @@ -3990,23 +4017,23 @@ static u8 ieee80211_chanctx_radar_detect(struct ieee80211_local *local, if (WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_WILL_BE_REPLACED)) return 0; - list_for_each_entry(link, &ctx->reserved_links, reserved_chanctx_list) - if (link->reserved_radar_required) - radar_detect |= BIT(link->reserved.oper.width); - - /* - * An in-place reservation context should not have any assigned vifs - * until it replaces the other context. - */ - WARN_ON(ctx->replace_state == IEEE80211_CHANCTX_REPLACES_OTHER && - !list_empty(&ctx->assigned_links)); + for_each_sdata_link(local, link) { + if (rcu_access_pointer(link->conf->chanctx_conf) == &ctx->conf) { + /* + * An in-place reservation context should not have any + * assigned links until it replaces the other context. + */ + WARN_ON(ctx->replace_state == + IEEE80211_CHANCTX_REPLACES_OTHER); - list_for_each_entry(link, &ctx->assigned_links, assigned_chanctx_list) { - if (!link->radar_required) - continue; + if (link->radar_required) + radar_detect |= + BIT(link->conf->chanreq.oper.width); + } - radar_detect |= - BIT(link->conf->chanreq.oper.width); + if (link->reserved_chanctx == ctx && + link->reserved_radar_required) + radar_detect |= BIT(link->reserved.oper.width); } return radar_detect; @@ -4022,16 +4049,13 @@ bool ieee80211_is_radio_idx_in_scan_req(struct wiphy *wiphy, for (i = 0; i < scan_req->n_channels; i++) { chan = scan_req->channels[i]; chan_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan); - /* - * The chan_radio_idx should be valid since it's taken from a - * valid scan request. - * However, if chan_radio_idx is unexpectedly invalid (negative), - * we take a conservative approach and assume the scan request - * might use the specified radio_idx. Hence, return true. - */ - if (WARN_ON(chan_radio_idx < 0)) - return true; + /* The radio index either matched successfully, or an error + * occurred. For example, if radio-level information is + * missing, the same error value is returned. This + * typically implies a single-radio setup, in which case + * the operation should not be allowed. 
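Returning to the ieee80211_chandef_s1g_oper() change above, a short numeric illustration of how the 1 MHz primary is derived from a 2 MHz primary (editorial addition, not part of the patch; the centre frequency is made up for the example):

/*
 * Assume the 2 MHz primary channel is centred at 920.5 MHz (920500 kHz):
 *   location LOWER -> 1 MHz primary at 920.0 MHz (920500 - 500 kHz)
 *   location UPPER -> 1 MHz primary at 921.0 MHz (920500 + 500 kHz)
 * chandef->chan is then looked up from that 1 MHz centre frequency via
 * ieee80211_get_channel_khz().
 */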
+ */ if (chan_radio_idx == radio_idx) return true; } @@ -4514,3 +4538,11 @@ void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe) sizeof(tpe->psd_reg_client[i].power)); } } + +bool ieee80211_vif_nan_started(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + return vif->type == NL80211_IFTYPE_NAN && sdata->u.nan.started; +} +EXPORT_SYMBOL_GPL(ieee80211_vif_nan_started); diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 40d5d9e48479..4a858112e4ef 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -828,12 +828,14 @@ static inline void bip_ipn_swap(u8 *d, const u8 *s) ieee80211_tx_result -ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) +ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx, + unsigned int mic_len) { struct sk_buff *skb; struct ieee80211_tx_info *info; struct ieee80211_key *key = tx->key; - struct ieee80211_mmie *mmie; + struct ieee80211_mmie_var *mmie; + size_t mmie_len; u8 aad[20]; u64 pn64; @@ -848,60 +850,14 @@ ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx) !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE)) return TX_CONTINUE; - if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) - return TX_DROP; - - mmie = skb_put(skb, sizeof(*mmie)); - mmie->element_id = WLAN_EID_MMIE; - mmie->length = sizeof(*mmie) - 2; - mmie->key_id = cpu_to_le16(key->conf.keyidx); - - /* PN = PN + 1 */ - pn64 = atomic64_inc_return(&key->conf.tx_pn); - - bip_ipn_set64(mmie->sequence_number, pn64); - - if (info->control.hw_key) - return TX_CONTINUE; - - bip_aad(skb, aad); - - /* - * MIC = AES-128-CMAC(IGTK, AAD || Management Frame Body || MMIE, 64) - */ - ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad, - skb->data + 24, skb->len - 24, mmie->mic); - - return TX_CONTINUE; -} - -ieee80211_tx_result -ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx) -{ - struct sk_buff *skb; - struct ieee80211_tx_info *info; - struct ieee80211_key *key = tx->key; - struct ieee80211_mmie_16 *mmie; - u8 aad[20]; - u64 pn64; - - if (WARN_ON(skb_queue_len(&tx->skbs) != 1)) - return TX_DROP; - - skb = skb_peek(&tx->skbs); - - info = IEEE80211_SKB_CB(skb); - - if (info->control.hw_key && - !(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_MMIE)) - return TX_CONTINUE; + mmie_len = sizeof(*mmie) + mic_len; - if (WARN_ON(skb_tailroom(skb) < sizeof(*mmie))) + if (WARN_ON(skb_tailroom(skb) < mmie_len)) return TX_DROP; - mmie = skb_put(skb, sizeof(*mmie)); + mmie = skb_put(skb, mmie_len); mmie->element_id = WLAN_EID_MMIE; - mmie->length = sizeof(*mmie) - 2; + mmie->length = mmie_len - 2; mmie->key_id = cpu_to_le16(key->conf.keyidx); /* PN = PN + 1 */ @@ -914,86 +870,40 @@ ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx) bip_aad(skb, aad); - /* MIC = AES-256-CMAC(IGTK, AAD || Management Frame Body || MMIE, 128) - */ - ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad, - skb->data + 24, skb->len - 24, mmie->mic); + if (ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad, + skb->data + 24, skb->len - 24, + mmie->mic, mic_len)) + return TX_DROP; return TX_CONTINUE; } ieee80211_rx_result -ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx) +ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx, + unsigned int mic_len) { struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_key *key = rx->key; - struct ieee80211_mmie *mmie; - u8 aad[20], mic[8], ipn[6]; + struct ieee80211_mmie_var *mmie; + size_t mmie_len; + u8 aad[20], 
mic[IEEE80211_CMAC_256_MIC_LEN], ipn[6]; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; if (!ieee80211_is_mgmt(hdr->frame_control)) return RX_CONTINUE; - /* management frames are already linear */ - - if (skb->len < 24 + sizeof(*mmie)) - return RX_DROP_U_SHORT_CMAC; - - mmie = (struct ieee80211_mmie *) - (skb->data + skb->len - sizeof(*mmie)); - if (mmie->element_id != WLAN_EID_MMIE || - mmie->length != sizeof(*mmie) - 2) - return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */ - - bip_ipn_swap(ipn, mmie->sequence_number); - - if (memcmp(ipn, key->u.aes_cmac.rx_pn, 6) <= 0) { - key->u.aes_cmac.replays++; - return RX_DROP_U_REPLAY; - } - - if (!(status->flag & RX_FLAG_DECRYPTED)) { - /* hardware didn't decrypt/verify MIC */ - bip_aad(skb, aad); - ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad, - skb->data + 24, skb->len - 24, mic); - if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) { - key->u.aes_cmac.icverrors++; - return RX_DROP_U_MIC_FAIL; - } - } - - memcpy(key->u.aes_cmac.rx_pn, ipn, 6); - - /* Remove MMIE */ - skb_trim(skb, skb->len - sizeof(*mmie)); - - return RX_CONTINUE; -} - -ieee80211_rx_result -ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx) -{ - struct sk_buff *skb = rx->skb; - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - struct ieee80211_key *key = rx->key; - struct ieee80211_mmie_16 *mmie; - u8 aad[20], mic[16], ipn[6]; - struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; - - if (!ieee80211_is_mgmt(hdr->frame_control)) - return RX_CONTINUE; + mmie_len = sizeof(*mmie) + mic_len; /* management frames are already linear */ - if (skb->len < 24 + sizeof(*mmie)) - return RX_DROP_U_SHORT_CMAC256; + if (skb->len < 24 + mmie_len) + return mic_len == IEEE80211_CMAC_128_MIC_LEN ? + RX_DROP_U_SHORT_CMAC : RX_DROP_U_SHORT_CMAC256; - mmie = (struct ieee80211_mmie_16 *) - (skb->data + skb->len - sizeof(*mmie)); + mmie = (struct ieee80211_mmie_var *)(skb->data + skb->len - mmie_len); if (mmie->element_id != WLAN_EID_MMIE || - mmie->length != sizeof(*mmie) - 2) + mmie->length != mmie_len - 2) return RX_DROP_U_BAD_MMIE; /* Invalid MMIE */ bip_ipn_swap(ipn, mmie->sequence_number); @@ -1006,9 +916,11 @@ ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx) if (!(status->flag & RX_FLAG_DECRYPTED)) { /* hardware didn't decrypt/verify MIC */ bip_aad(skb, aad); - ieee80211_aes_cmac_256(key->u.aes_cmac.tfm, aad, - skb->data + 24, skb->len - 24, mic); - if (crypto_memneq(mic, mmie->mic, sizeof(mmie->mic))) { + if (ieee80211_aes_cmac(key->u.aes_cmac.tfm, aad, + skb->data + 24, skb->len - 24, + mic, mic_len)) + return RX_DROP_U_DECRYPT_FAIL; + if (crypto_memneq(mic, mmie->mic, mic_len)) { key->u.aes_cmac.icverrors++; return RX_DROP_U_MIC_FAIL; } @@ -1017,7 +929,7 @@ ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx) memcpy(key->u.aes_cmac.rx_pn, ipn, 6); /* Remove MMIE */ - skb_trim(skb, skb->len - sizeof(*mmie)); + skb_trim(skb, skb->len - mmie_len); return RX_CONTINUE; } @@ -1113,7 +1025,7 @@ ieee80211_crypto_aes_gmac_decrypt(struct ieee80211_rx_data *rx) memcpy(nonce, hdr->addr2, ETH_ALEN); memcpy(nonce + ETH_ALEN, ipn, 6); - mic = kmalloc(GMAC_MIC_LEN, GFP_ATOMIC); + mic = kmalloc(IEEE80211_GMAC_MIC_LEN, GFP_ATOMIC); if (!mic) return RX_DROP_U_OOM; if (ieee80211_aes_gmac(key->u.aes_gmac.tfm, aad, nonce, diff --git a/net/mac80211/wpa.h b/net/mac80211/wpa.h index a9a81abb5479..6e8846dfe710 100644 --- a/net/mac80211/wpa.h +++ b/net/mac80211/wpa.h @@ -29,13 +29,11 @@ ieee80211_crypto_ccmp_decrypt(struct 
ieee80211_rx_data *rx, unsigned int mic_len); ieee80211_tx_result -ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx); -ieee80211_tx_result -ieee80211_crypto_aes_cmac_256_encrypt(struct ieee80211_tx_data *tx); -ieee80211_rx_result -ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx); +ieee80211_crypto_aes_cmac_encrypt(struct ieee80211_tx_data *tx, + unsigned int mic_len); ieee80211_rx_result -ieee80211_crypto_aes_cmac_256_decrypt(struct ieee80211_rx_data *rx); +ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx, + unsigned int mic_len); ieee80211_tx_result ieee80211_crypto_aes_gmac_encrypt(struct ieee80211_tx_data *tx); ieee80211_rx_result diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index df4e8cf33899..209a963112e3 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -49,7 +49,7 @@ static bool mctp_sockaddr_ext_is_ok(const struct sockaddr_mctp_ext *addr) !addr->__smctp_pad0[2]; } -static int mctp_bind(struct socket *sock, struct sockaddr *addr, int addrlen) +static int mctp_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen) { struct sock *sk = sock->sk; struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk); @@ -128,7 +128,7 @@ out_release: /* Used to set a specific peer prior to bind. Not used for outbound * connections (Tag Owner set) since MCTP is a datagram protocol. */ -static int mctp_connect(struct socket *sock, struct sockaddr *addr, +static int mctp_connect(struct socket *sock, struct sockaddr_unsized *addr, int addrlen, int flags) { struct sock *sk = sock->sk; @@ -256,7 +256,7 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) skb_reserve(skb, hlen); - /* set type as fist byte in payload */ + /* set type as first byte in payload */ *(u8 *)skb_put(skb, 1) = addr->smctp_type; rc = memcpy_from_msg((void *)skb_put(skb, len), msg, len); @@ -425,7 +425,7 @@ static int mctp_getsockopt(struct socket *sock, int level, int optname, return 0; } - return -EINVAL; + return -ENOPROTOOPT; } /* helpers for reading/writing the tag ioc, handling compatibility across the diff --git a/net/mctp/route.c b/net/mctp/route.c index 2b2b958ef6a3..2ac4011a953f 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -378,6 +378,7 @@ static void mctp_skb_set_flow(struct sk_buff *skb, struct mctp_sk_key *key) {} static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) {} #endif +/* takes ownership of skb, both in success and failure cases */ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) { struct mctp_hdr *hdr = mctp_hdr(skb); @@ -387,8 +388,10 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) & MCTP_HDR_SEQ_MASK; if (!key->reasm_head) { - /* Since we're manipulating the shared frag_list, ensure it isn't - * shared with any other SKBs. + /* Since we're manipulating the shared frag_list, ensure it + * isn't shared with any other SKBs. In the cloned case, + * this will free the skb; callers can no longer access it + * safely. 
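Editorial sketch (not part of the patch) of the calling convention this comment describes: mctp_frag_queue() now consumes the skb on both success and failure, so a caller hands the buffer over unconditionally and only inspects the return code afterwards, roughly:

	rc = mctp_frag_queue(key, skb);
	skb = NULL;		/* queued or already freed; no longer ours */
	if (rc)
		goto out_unlock;	/* label as used in mctp_dst_input() */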
*/ key->reasm_head = skb_unshare(skb, GFP_ATOMIC); if (!key->reasm_head) @@ -402,10 +405,10 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) exp_seq = (key->last_seq + 1) & MCTP_HDR_SEQ_MASK; if (this_seq != exp_seq) - return -EINVAL; + goto err_free; if (key->reasm_head->len + skb->len > mctp_message_maxlen) - return -EINVAL; + goto err_free; skb->next = NULL; skb->sk = NULL; @@ -419,6 +422,10 @@ static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb) key->reasm_head->truesize += skb->truesize; return 0; + +err_free: + kfree_skb(skb); + return -EINVAL; } static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb) @@ -532,18 +539,16 @@ static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb) * key isn't observable yet */ mctp_frag_queue(key, skb); + skb = NULL; /* if the key_add fails, we've raced with another * SOM packet with the same src, dest and tag. There's * no way to distinguish future packets, so all we - * can do is drop; we'll free the skb on exit from - * this function. + * can do is drop. */ rc = mctp_key_add(key, msk); - if (!rc) { + if (!rc) trace_mctp_key_acquire(key); - skb = NULL; - } /* we don't need to release key->lock on exit, so * clean up here and suppress the unlock via @@ -561,8 +566,7 @@ static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb) key = NULL; } else { rc = mctp_frag_queue(key, skb); - if (!rc) - skb = NULL; + skb = NULL; } } @@ -572,17 +576,16 @@ static int mctp_dst_input(struct mctp_dst *dst, struct sk_buff *skb) */ /* we need to be continuing an existing reassembly... */ - if (!key->reasm_head) + if (!key->reasm_head) { rc = -EINVAL; - else + } else { rc = mctp_frag_queue(key, skb); + skb = NULL; + } if (rc) goto out_unlock; - /* we've queued; the queue owns the skb now */ - skb = NULL; - /* end of message? 
deliver to socket, and we're done with * the reassembly/response key */ @@ -620,6 +623,7 @@ static int mctp_dst_output(struct mctp_dst *dst, struct sk_buff *skb) skb->protocol = htons(ETH_P_MCTP); skb->pkt_type = PACKET_OUTGOING; + skb->dev = dst->dev->dev; if (skb->len > dst->mtu) { kfree_skb(skb); diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c index fb6b46a952cb..75ea96c10e49 100644 --- a/net/mctp/test/route-test.c +++ b/net/mctp/test/route-test.c @@ -20,7 +20,6 @@ struct mctp_frag_test { static void mctp_test_fragment(struct kunit *test) { const struct mctp_frag_test *params; - struct mctp_test_pktqueue tpq; int rc, i, n, mtu, msgsize; struct mctp_test_dev *dev; struct mctp_dst dst; @@ -43,13 +42,12 @@ static void mctp_test_fragment(struct kunit *test) dev = mctp_test_create_dev(); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); - mctp_test_dst_setup(test, &dst, dev, &tpq, mtu); + mctp_test_dst_setup(test, &dst, dev, mtu); rc = mctp_do_fragment_route(&dst, skb, mtu, MCTP_TAG_OWNER); KUNIT_EXPECT_FALSE(test, rc); - n = tpq.pkts.qlen; - + n = dev->pkts.qlen; KUNIT_EXPECT_EQ(test, n, params->n_frags); for (i = 0;; i++) { @@ -61,8 +59,7 @@ static void mctp_test_fragment(struct kunit *test) first = i == 0; last = i == (n - 1); - skb2 = skb_dequeue(&tpq.pkts); - + skb2 = skb_dequeue(&dev->pkts); if (!skb2) break; @@ -99,7 +96,7 @@ static void mctp_test_fragment(struct kunit *test) kfree_skb(skb2); } - mctp_test_dst_release(&dst, &tpq); + mctp_dst_release(&dst); mctp_test_destroy_dev(dev); } @@ -130,13 +127,11 @@ struct mctp_rx_input_test { static void mctp_test_rx_input(struct kunit *test) { const struct mctp_rx_input_test *params; - struct mctp_test_pktqueue tpq; struct mctp_test_route *rt; struct mctp_test_dev *dev; struct sk_buff *skb; params = test->param_value; - test->priv = &tpq; dev = mctp_test_create_dev(); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); @@ -147,13 +142,10 @@ static void mctp_test_rx_input(struct kunit *test) skb = mctp_test_create_skb(¶ms->hdr, 1); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); - mctp_test_pktqueue_init(&tpq); - mctp_pkttype_receive(skb, dev->ndev, &mctp_packet_type, NULL); - KUNIT_EXPECT_EQ(test, !!tpq.pkts.qlen, params->input); + KUNIT_EXPECT_EQ(test, !!dev->pkts.qlen, params->input); - skb_queue_purge(&tpq.pkts); mctp_test_route_destroy(test, rt); mctp_test_destroy_dev(dev); } @@ -182,7 +174,6 @@ KUNIT_ARRAY_PARAM(mctp_rx_input, mctp_rx_input_tests, static void __mctp_route_test_init(struct kunit *test, struct mctp_test_dev **devp, struct mctp_dst *dst, - struct mctp_test_pktqueue *tpq, struct socket **sockp, unsigned int netid) { @@ -196,7 +187,7 @@ static void __mctp_route_test_init(struct kunit *test, if (netid != MCTP_NET_ANY) WRITE_ONCE(dev->mdev->net, netid); - mctp_test_dst_setup(test, dst, dev, tpq, 68); + mctp_test_dst_setup(test, dst, dev, 68); rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); KUNIT_ASSERT_EQ(test, rc, 0); @@ -205,7 +196,7 @@ static void __mctp_route_test_init(struct kunit *test, addr.smctp_network = netid; addr.smctp_addr.s_addr = 8; addr.smctp_type = 0; - rc = kernel_bind(sock, (struct sockaddr *)&addr, sizeof(addr)); + rc = kernel_bind(sock, (struct sockaddr_unsized *)&addr, sizeof(addr)); KUNIT_ASSERT_EQ(test, rc, 0); *devp = dev; @@ -215,11 +206,10 @@ static void __mctp_route_test_init(struct kunit *test, static void __mctp_route_test_fini(struct kunit *test, struct mctp_test_dev *dev, struct mctp_dst *dst, - struct mctp_test_pktqueue *tpq, struct socket *sock) { sock_release(sock); - 
mctp_test_dst_release(dst, tpq); + mctp_dst_release(dst); mctp_test_destroy_dev(dev); } @@ -232,7 +222,6 @@ struct mctp_route_input_sk_test { static void mctp_test_route_input_sk(struct kunit *test) { const struct mctp_route_input_sk_test *params; - struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; struct mctp_test_dev *dev; struct mctp_dst dst; @@ -241,13 +230,12 @@ static void mctp_test_route_input_sk(struct kunit *test) params = test->param_value; - __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &sock, MCTP_NET_ANY); skb = mctp_test_create_skb_data(¶ms->hdr, ¶ms->type); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); mctp_test_skb_set_dev(skb, dev); - mctp_test_pktqueue_init(&tpq); rc = mctp_dst_input(&dst, skb); @@ -266,7 +254,7 @@ static void mctp_test_route_input_sk(struct kunit *test) KUNIT_EXPECT_NULL(test, skb2); } - __mctp_route_test_fini(test, dev, &dst, &tpq, sock); + __mctp_route_test_fini(test, dev, &dst, sock); } #define FL_S (MCTP_HDR_FLAG_SOM) @@ -303,7 +291,6 @@ struct mctp_route_input_sk_reasm_test { static void mctp_test_route_input_sk_reasm(struct kunit *test) { const struct mctp_route_input_sk_reasm_test *params; - struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; struct mctp_test_dev *dev; struct mctp_dst dst; @@ -313,7 +300,7 @@ static void mctp_test_route_input_sk_reasm(struct kunit *test) params = test->param_value; - __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &sock, MCTP_NET_ANY); for (i = 0; i < params->n_hdrs; i++) { c = i; @@ -336,7 +323,7 @@ static void mctp_test_route_input_sk_reasm(struct kunit *test) KUNIT_EXPECT_NULL(test, skb2); } - __mctp_route_test_fini(test, dev, &dst, &tpq, sock); + __mctp_route_test_fini(test, dev, &dst, sock); } #define RX_FRAG(f, s) RX_HDR(1, 10, 8, FL_TO | (f) | ((s) << MCTP_HDR_SEQ_SHIFT)) @@ -438,7 +425,6 @@ struct mctp_route_input_sk_keys_test { static void mctp_test_route_input_sk_keys(struct kunit *test) { const struct mctp_route_input_sk_keys_test *params; - struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; struct mctp_test_dev *dev; struct mctp_sk_key *key; @@ -457,7 +443,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); net = READ_ONCE(dev->mdev->net); - mctp_test_dst_setup(test, &dst, dev, &tpq, 68); + mctp_test_dst_setup(test, &dst, dev, 68); rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); KUNIT_ASSERT_EQ(test, rc, 0); @@ -497,7 +483,7 @@ static void mctp_test_route_input_sk_keys(struct kunit *test) skb_free_datagram(sock->sk, skb2); mctp_key_unref(key); - __mctp_route_test_fini(test, dev, &dst, &tpq, sock); + __mctp_route_test_fini(test, dev, &dst, sock); } static const struct mctp_route_input_sk_keys_test mctp_route_input_sk_keys_tests[] = { @@ -572,7 +558,6 @@ KUNIT_ARRAY_PARAM(mctp_route_input_sk_keys, mctp_route_input_sk_keys_tests, struct test_net { unsigned int netid; struct mctp_test_dev *dev; - struct mctp_test_pktqueue tpq; struct mctp_dst dst; struct socket *sock; struct sk_buff *skb; @@ -591,20 +576,18 @@ mctp_test_route_input_multiple_nets_bind_init(struct kunit *test, t->msg.data = t->netid; - __mctp_route_test_init(test, &t->dev, &t->dst, &t->tpq, &t->sock, - t->netid); + __mctp_route_test_init(test, &t->dev, &t->dst, &t->sock, t->netid); t->skb = mctp_test_create_skb_data(&hdr, &t->msg); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, t->skb); mctp_test_skb_set_dev(t->skb, t->dev); - 
mctp_test_pktqueue_init(&t->tpq); } static void mctp_test_route_input_multiple_nets_bind_fini(struct kunit *test, struct test_net *t) { - __mctp_route_test_fini(test, t->dev, &t->dst, &t->tpq, t->sock); + __mctp_route_test_fini(test, t->dev, &t->dst, t->sock); } /* Test that skbs from different nets (otherwise identical) get routed to their @@ -661,8 +644,7 @@ mctp_test_route_input_multiple_nets_key_init(struct kunit *test, t->msg.data = t->netid; - __mctp_route_test_init(test, &t->dev, &t->dst, &t->tpq, &t->sock, - t->netid); + __mctp_route_test_init(test, &t->dev, &t->dst, &t->sock, t->netid); msk = container_of(t->sock->sk, struct mctp_sock, sk); @@ -685,7 +667,7 @@ mctp_test_route_input_multiple_nets_key_fini(struct kunit *test, struct test_net *t) { mctp_key_unref(t->key); - __mctp_route_test_fini(test, t->dev, &t->dst, &t->tpq, t->sock); + __mctp_route_test_fini(test, t->dev, &t->dst, t->sock); } /* test that skbs from different nets (otherwise identical) get routed to their @@ -738,14 +720,13 @@ static void mctp_test_route_input_multiple_nets_key(struct kunit *test) static void mctp_test_route_input_sk_fail_single(struct kunit *test) { const struct mctp_hdr hdr = RX_HDR(1, 10, 8, FL_S | FL_E | FL_TO); - struct mctp_test_pktqueue tpq; struct mctp_test_dev *dev; struct mctp_dst dst; struct socket *sock; struct sk_buff *skb; int rc; - __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &sock, MCTP_NET_ANY); /* No rcvbuf space, so delivery should fail. __sock_set_rcvbuf will * clamp the minimum to SOCK_MIN_RCVBUF, so we open-code this. @@ -768,7 +749,7 @@ static void mctp_test_route_input_sk_fail_single(struct kunit *test) KUNIT_EXPECT_EQ(test, refcount_read(&skb->users), 1); kfree_skb(skb); - __mctp_route_test_fini(test, dev, &dst, &tpq, sock); + __mctp_route_test_fini(test, dev, &dst, sock); } /* Input route to socket, using a fragmented message, where sock delivery fails. @@ -776,7 +757,6 @@ static void mctp_test_route_input_sk_fail_single(struct kunit *test) static void mctp_test_route_input_sk_fail_frag(struct kunit *test) { const struct mctp_hdr hdrs[2] = { RX_FRAG(FL_S, 0), RX_FRAG(FL_E, 1) }; - struct mctp_test_pktqueue tpq; struct mctp_test_dev *dev; struct sk_buff *skbs[2]; struct mctp_dst dst; @@ -784,7 +764,7 @@ static void mctp_test_route_input_sk_fail_frag(struct kunit *test) unsigned int i; int rc; - __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &sock, MCTP_NET_ANY); lock_sock(sock->sk); WRITE_ONCE(sock->sk->sk_rcvbuf, 0); @@ -815,7 +795,7 @@ static void mctp_test_route_input_sk_fail_frag(struct kunit *test) KUNIT_EXPECT_EQ(test, refcount_read(&skbs[1]->users), 1); kfree_skb(skbs[1]); - __mctp_route_test_fini(test, dev, &dst, &tpq, sock); + __mctp_route_test_fini(test, dev, &dst, sock); } /* Input route to socket, using a fragmented message created from clones. 
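Editorial note, not part of the patch: fragments created as clones share their data area with the original skb, which is why mctp_frag_queue() passes the first fragment through skb_unshare() before using it as the reassembly head. Roughly (variable names are illustrative):

	/* frag shares its data with orig until unshared */
	frag = skb_clone(orig, GFP_KERNEL);
	head = skb_unshare(frag, GFP_ATOMIC);	/* private copy; may free frag, may return NULL */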
@@ -833,7 +813,6 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) const size_t data_len = 3; /* arbitrary */ u8 compare[3 * ARRAY_SIZE(hdrs)]; u8 flat[3 * ARRAY_SIZE(hdrs)]; - struct mctp_test_pktqueue tpq; struct mctp_test_dev *dev; struct sk_buff *skb[5]; struct sk_buff *rx_skb; @@ -845,7 +824,7 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) total = data_len + sizeof(struct mctp_hdr); - __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &sock, MCTP_NET_ANY); /* Create a single skb initially with concatenated packets */ skb[0] = mctp_test_create_skb(&hdrs[0], 5 * total); @@ -922,7 +901,7 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) kfree_skb(skb[i]); } - __mctp_route_test_fini(test, dev, &dst, &tpq, sock); + __mctp_route_test_fini(test, dev, &dst, sock); } #if IS_ENABLED(CONFIG_MCTP_FLOWS) @@ -930,7 +909,6 @@ static void mctp_test_route_input_cloned_frag(struct kunit *test) static void mctp_test_flow_init(struct kunit *test, struct mctp_test_dev **devp, struct mctp_dst *dst, - struct mctp_test_pktqueue *tpq, struct socket **sock, struct sk_buff **skbp, unsigned int len) @@ -944,7 +922,7 @@ static void mctp_test_flow_init(struct kunit *test, * mctp_local_output, which will call dst->output on whatever * route we provide */ - __mctp_route_test_init(test, &dev, dst, tpq, sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, dst, sock, MCTP_NET_ANY); /* Assign a single EID. ->addrs is freed on mctp netdev release */ dev->mdev->addrs = kmalloc(sizeof(u8), GFP_KERNEL); @@ -965,16 +943,14 @@ static void mctp_test_flow_init(struct kunit *test, static void mctp_test_flow_fini(struct kunit *test, struct mctp_test_dev *dev, struct mctp_dst *dst, - struct mctp_test_pktqueue *tpq, struct socket *sock) { - __mctp_route_test_fini(test, dev, dst, tpq, sock); + __mctp_route_test_fini(test, dev, dst, sock); } /* test that an outgoing skb has the correct MCTP extension data set */ static void mctp_test_packet_flow(struct kunit *test) { - struct mctp_test_pktqueue tpq; struct sk_buff *skb, *skb2; struct mctp_test_dev *dev; struct mctp_dst dst; @@ -983,15 +959,15 @@ static void mctp_test_packet_flow(struct kunit *test) u8 dst_eid = 8; int n, rc; - mctp_test_flow_init(test, &dev, &dst, &tpq, &sock, &skb, 30); + mctp_test_flow_init(test, &dev, &dst, &sock, &skb, 30); rc = mctp_local_output(sock->sk, &dst, skb, dst_eid, MCTP_TAG_OWNER); KUNIT_ASSERT_EQ(test, rc, 0); - n = tpq.pkts.qlen; + n = dev->pkts.qlen; KUNIT_ASSERT_EQ(test, n, 1); - skb2 = skb_dequeue(&tpq.pkts); + skb2 = skb_dequeue(&dev->pkts); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb2); flow = skb_ext_find(skb2, SKB_EXT_MCTP); @@ -1000,7 +976,7 @@ static void mctp_test_packet_flow(struct kunit *test) KUNIT_ASSERT_PTR_EQ(test, flow->key->sk, sock->sk); kfree_skb(skb2); - mctp_test_flow_fini(test, dev, &dst, &tpq, sock); + mctp_test_flow_fini(test, dev, &dst, sock); } /* test that outgoing skbs, after fragmentation, all have the correct MCTP @@ -1008,7 +984,6 @@ static void mctp_test_packet_flow(struct kunit *test) */ static void mctp_test_fragment_flow(struct kunit *test) { - struct mctp_test_pktqueue tpq; struct mctp_flow *flows[2]; struct sk_buff *tx_skbs[2]; struct mctp_test_dev *dev; @@ -1018,17 +993,17 @@ static void mctp_test_fragment_flow(struct kunit *test) u8 dst_eid = 8; int n, rc; - mctp_test_flow_init(test, &dev, &dst, &tpq, &sock, &skb, 100); + mctp_test_flow_init(test, &dev, &dst, &sock, &skb, 100); rc = 
mctp_local_output(sock->sk, &dst, skb, dst_eid, MCTP_TAG_OWNER); KUNIT_ASSERT_EQ(test, rc, 0); - n = tpq.pkts.qlen; + n = dev->pkts.qlen; KUNIT_ASSERT_EQ(test, n, 2); /* both resulting packets should have the same flow data */ - tx_skbs[0] = skb_dequeue(&tpq.pkts); - tx_skbs[1] = skb_dequeue(&tpq.pkts); + tx_skbs[0] = skb_dequeue(&dev->pkts); + tx_skbs[1] = skb_dequeue(&dev->pkts); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, tx_skbs[0]); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, tx_skbs[1]); @@ -1044,7 +1019,7 @@ static void mctp_test_fragment_flow(struct kunit *test) kfree_skb(tx_skbs[0]); kfree_skb(tx_skbs[1]); - mctp_test_flow_fini(test, dev, &dst, &tpq, sock); + mctp_test_flow_fini(test, dev, &dst, sock); } #else @@ -1063,7 +1038,6 @@ static void mctp_test_fragment_flow(struct kunit *test) static void mctp_test_route_output_key_create(struct kunit *test) { const u8 dst_eid = 26, src_eid = 15; - struct mctp_test_pktqueue tpq; const unsigned int netid = 50; struct mctp_test_dev *dev; struct mctp_sk_key *key; @@ -1080,7 +1054,7 @@ static void mctp_test_route_output_key_create(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); WRITE_ONCE(dev->mdev->net, netid); - mctp_test_dst_setup(test, &dst, dev, &tpq, 68); + mctp_test_dst_setup(test, &dst, dev, 68); rc = sock_create_kern(&init_net, AF_MCTP, SOCK_DGRAM, 0, &sock); KUNIT_ASSERT_EQ(test, rc, 0); @@ -1127,14 +1101,13 @@ static void mctp_test_route_output_key_create(struct kunit *test) KUNIT_EXPECT_FALSE(test, key->tag & MCTP_TAG_OWNER); sock_release(sock); - mctp_test_dst_release(&dst, &tpq); + mctp_dst_release(&dst); mctp_test_destroy_dev(dev); } static void mctp_test_route_extaddr_input(struct kunit *test) { static const unsigned char haddr[] = { 0xaa, 0x55 }; - struct mctp_test_pktqueue tpq; struct mctp_skb_cb *cb, *cb2; const unsigned int len = 40; struct mctp_test_dev *dev; @@ -1149,7 +1122,7 @@ static void mctp_test_route_extaddr_input(struct kunit *test) hdr.dest = 8; hdr.flags_seq_tag = FL_S | FL_E | FL_TO; - __mctp_route_test_init(test, &dev, &dst, &tpq, &sock, MCTP_NET_ANY); + __mctp_route_test_init(test, &dev, &dst, &sock, MCTP_NET_ANY); skb = mctp_test_create_skb(&hdr, len); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb); @@ -1178,7 +1151,7 @@ static void mctp_test_route_extaddr_input(struct kunit *test) KUNIT_EXPECT_MEMEQ(test, cb2->haddr, haddr, sizeof(haddr)); kfree_skb(skb2); - __mctp_route_test_fini(test, dev, &dst, &tpq, sock); + __mctp_route_test_fini(test, dev, &dst, sock); } static void mctp_test_route_gw_lookup(struct kunit *test) @@ -1530,14 +1503,13 @@ static void mctp_test_bind_lookup(struct kunit *test) struct socket *socks[ARRAY_SIZE(lookup_binds)]; struct sk_buff *skb_pkt = NULL, *skb_sock = NULL; struct socket *sock_ty0, *sock_expect = NULL; - struct mctp_test_pktqueue tpq; struct mctp_test_dev *dev; struct mctp_dst dst; int rc; rx = test->param_value; - __mctp_route_test_init(test, &dev, &dst, &tpq, &sock_ty0, rx->net); + __mctp_route_test_init(test, &dev, &dst, &sock_ty0, rx->net); /* Create all binds */ for (size_t i = 0; i < ARRAY_SIZE(lookup_binds); i++) { mctp_test_bind_run(test, &lookup_binds[i], @@ -1557,7 +1529,6 @@ static void mctp_test_bind_lookup(struct kunit *test) skb_pkt = mctp_test_create_skb_data(&rx->hdr, &rx->ty); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, skb_pkt); mctp_test_skb_set_dev(skb_pkt, dev); - mctp_test_pktqueue_init(&tpq); rc = mctp_dst_input(&dst, skb_pkt); if (rx->expect) { @@ -1586,13 +1557,12 @@ static void mctp_test_bind_lookup(struct kunit *test) cleanup: kfree_skb(skb_sock); - 
kfree_skb(skb_pkt); /* Drop all binds */ for (size_t i = 0; i < ARRAY_SIZE(lookup_binds); i++) sock_release(socks[i]); - __mctp_route_test_fini(test, dev, &dst, &tpq, sock_ty0); + __mctp_route_test_fini(test, dev, &dst, sock_ty0); } static struct kunit_case mctp_test_cases[] = { diff --git a/net/mctp/test/utils.c b/net/mctp/test/utils.c index 953d41902771..37f1ba62a2ab 100644 --- a/net/mctp/test/utils.c +++ b/net/mctp/test/utils.c @@ -13,7 +13,10 @@ static netdev_tx_t mctp_test_dev_tx(struct sk_buff *skb, struct net_device *ndev) { - kfree_skb(skb); + struct mctp_test_dev *dev = netdev_priv(ndev); + + skb_queue_tail(&dev->pkts, skb); + return NETDEV_TX_OK; } @@ -26,7 +29,7 @@ static void mctp_test_dev_setup(struct net_device *ndev) ndev->type = ARPHRD_MCTP; ndev->mtu = MCTP_DEV_TEST_MTU; ndev->hard_header_len = 0; - ndev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; + ndev->tx_queue_len = 0; ndev->flags = IFF_NOARP; ndev->netdev_ops = &mctp_test_netdev_ops; ndev->needs_free_netdev = true; @@ -51,6 +54,7 @@ static struct mctp_test_dev *__mctp_test_create_dev(unsigned short lladdr_len, dev->ndev = ndev; ndev->addr_len = lladdr_len; dev_addr_set(ndev, lladdr); + skb_queue_head_init(&dev->pkts); rc = register_netdev(ndev); if (rc) { @@ -63,6 +67,11 @@ static struct mctp_test_dev *__mctp_test_create_dev(unsigned short lladdr_len, dev->mdev->net = mctp_default_net(dev_net(ndev)); rcu_read_unlock(); + /* bring the device up; we want to be able to TX immediately */ + rtnl_lock(); + dev_open(ndev, NULL); + rtnl_unlock(); + return dev; } @@ -79,26 +88,15 @@ struct mctp_test_dev *mctp_test_create_dev_lladdr(unsigned short lladdr_len, void mctp_test_destroy_dev(struct mctp_test_dev *dev) { + skb_queue_purge(&dev->pkts); mctp_dev_put(dev->mdev); unregister_netdev(dev->ndev); } -static const unsigned int test_pktqueue_magic = 0x5f713aef; - -void mctp_test_pktqueue_init(struct mctp_test_pktqueue *tpq) -{ - tpq->magic = test_pktqueue_magic; - skb_queue_head_init(&tpq->pkts); -} - static int mctp_test_dst_output(struct mctp_dst *dst, struct sk_buff *skb) { - struct kunit *test = current->kunit_test; - struct mctp_test_pktqueue *tpq = test->priv; - - KUNIT_ASSERT_EQ(test, tpq->magic, test_pktqueue_magic); - - skb_queue_tail(&tpq->pkts, skb); + skb->dev = dst->dev->dev; + dev_queue_xmit(skb); return 0; } @@ -169,11 +167,9 @@ struct mctp_test_route *mctp_test_create_route_gw(struct net *net, return rt; } -/* Convenience function for our test dst; release with mctp_test_dst_release() - */ +/* Convenience function for our test dst; release with mctp_dst_release() */ void mctp_test_dst_setup(struct kunit *test, struct mctp_dst *dst, - struct mctp_test_dev *dev, - struct mctp_test_pktqueue *tpq, unsigned int mtu) + struct mctp_test_dev *dev, unsigned int mtu) { KUNIT_EXPECT_NOT_ERR_OR_NULL(test, dev); @@ -183,15 +179,6 @@ void mctp_test_dst_setup(struct kunit *test, struct mctp_dst *dst, __mctp_dev_get(dst->dev->dev); dst->mtu = mtu; dst->output = mctp_test_dst_output; - mctp_test_pktqueue_init(tpq); - test->priv = tpq; -} - -void mctp_test_dst_release(struct mctp_dst *dst, - struct mctp_test_pktqueue *tpq) -{ - mctp_dst_release(dst); - skb_queue_purge(&tpq->pkts); } void mctp_test_route_destroy(struct kunit *test, struct mctp_test_route *rt) @@ -279,7 +266,7 @@ void mctp_test_bind_run(struct kunit *test, addr.smctp_addr.s_addr = setup->peer_addr; /* connect() type must match bind() type */ addr.smctp_type = setup->bind_type; - rc = kernel_connect(*sock, (struct sockaddr *)&addr, + rc = kernel_connect(*sock, (struct 
sockaddr_unsized *)&addr, sizeof(addr), 0); KUNIT_EXPECT_EQ(test, rc, 0); } @@ -292,5 +279,6 @@ void mctp_test_bind_run(struct kunit *test, addr.smctp_type = setup->bind_type; *ret_bind_errno = - kernel_bind(*sock, (struct sockaddr *)&addr, sizeof(addr)); + kernel_bind(*sock, (struct sockaddr_unsized *)&addr, + sizeof(addr)); } diff --git a/net/mctp/test/utils.h b/net/mctp/test/utils.h index 06bdb6cb5eff..4cc90c9da4d1 100644 --- a/net/mctp/test/utils.h +++ b/net/mctp/test/utils.h @@ -18,6 +18,8 @@ struct mctp_test_dev { unsigned short lladdr_len; unsigned char lladdr[MAX_ADDR_LEN]; + + struct sk_buff_head pkts; }; struct mctp_test_dev; @@ -26,11 +28,6 @@ struct mctp_test_route { struct mctp_route rt; }; -struct mctp_test_pktqueue { - unsigned int magic; - struct sk_buff_head pkts; -}; - struct mctp_test_bind_setup { mctp_eid_t bind_addr; int bind_net; @@ -59,11 +56,7 @@ struct mctp_test_route *mctp_test_create_route_gw(struct net *net, mctp_eid_t gw, unsigned int mtu); void mctp_test_dst_setup(struct kunit *test, struct mctp_dst *dst, - struct mctp_test_dev *dev, - struct mctp_test_pktqueue *tpq, unsigned int mtu); -void mctp_test_dst_release(struct mctp_dst *dst, - struct mctp_test_pktqueue *tpq); -void mctp_test_pktqueue_init(struct mctp_test_pktqueue *tpq); + struct mctp_test_dev *dev, unsigned int mtu); void mctp_test_route_destroy(struct kunit *test, struct mctp_test_route *rt); void mctp_test_skb_set_dev(struct sk_buff *skb, struct mctp_test_dev *dev); struct sk_buff *mctp_test_create_skb(const struct mctp_hdr *hdr, diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 25c88cba5c48..580aac112dd2 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -75,16 +75,23 @@ static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, struct nlmsghdr *nlh, struct net *net, u32 portid, unsigned int nlm_flags); -static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned index) +static struct mpls_route *mpls_route_input(struct net *net, unsigned int index) { - struct mpls_route *rt = NULL; + struct mpls_route __rcu **platform_label; - if (index < net->mpls.platform_labels) { - struct mpls_route __rcu **platform_label = - rcu_dereference_rtnl(net->mpls.platform_label); - rt = rcu_dereference_rtnl(platform_label[index]); - } - return rt; + platform_label = mpls_dereference(net, net->mpls.platform_label); + return mpls_dereference(net, platform_label[index]); +} + +static struct mpls_route *mpls_route_input_rcu(struct net *net, unsigned int index) +{ + struct mpls_route __rcu **platform_label; + + if (index >= net->mpls.platform_labels) + return NULL; + + platform_label = rcu_dereference(net->mpls.platform_label); + return rcu_dereference(platform_label[index]); } bool mpls_output_possible(const struct net_device *dev) @@ -129,25 +136,26 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) } EXPORT_SYMBOL_GPL(mpls_pkt_too_big); -void mpls_stats_inc_outucastpkts(struct net_device *dev, +void mpls_stats_inc_outucastpkts(struct net *net, + struct net_device *dev, const struct sk_buff *skb) { struct mpls_dev *mdev; if (skb->protocol == htons(ETH_P_MPLS_UC)) { - mdev = mpls_dev_get(dev); + mdev = mpls_dev_rcu(dev); if (mdev) MPLS_INC_STATS_LEN(mdev, skb->len, tx_packets, tx_bytes); } else if (skb->protocol == htons(ETH_P_IP)) { - IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len); + IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len); #if IS_ENABLED(CONFIG_IPV6) } else if (skb->protocol == htons(ETH_P_IPV6)) { - struct inet6_dev *in6dev = 
__in6_dev_get(dev); + struct inet6_dev *in6dev = in6_dev_rcu(dev); if (in6dev) - IP6_UPD_PO_STATS(dev_net(dev), in6dev, + IP6_UPD_PO_STATS(net, in6dev, IPSTATS_MIB_OUT, skb->len); #endif } @@ -342,7 +350,7 @@ static bool mpls_egress(struct net *net, struct mpls_route *rt, static int mpls_forward(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct net *net = dev_net(dev); + struct net *net = dev_net_rcu(dev); struct mpls_shim_hdr *hdr; const struct mpls_nh *nh; struct mpls_route *rt; @@ -357,7 +365,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, /* Careful this entire function runs inside of an rcu critical section */ - mdev = mpls_dev_get(dev); + mdev = mpls_dev_rcu(dev); if (!mdev) goto drop; @@ -434,7 +442,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, dec.ttl -= 1; if (unlikely(!new_header_size && dec.bos)) { /* Penultimate hop popping */ - if (!mpls_egress(dev_net(out_dev), rt, skb, dec)) + if (!mpls_egress(net, rt, skb, dec)) goto err; } else { bool bos; @@ -451,7 +459,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, } } - mpls_stats_inc_outucastpkts(out_dev, skb); + mpls_stats_inc_outucastpkts(net, out_dev, skb); /* If via wasn't specified then send out using device address */ if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC) @@ -466,7 +474,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, return 0; tx_err: - out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL; + out_mdev = out_dev ? mpls_dev_rcu(out_dev) : NULL; if (out_mdev) MPLS_INC_STATS(out_mdev, tx_errors); goto drop; @@ -530,10 +538,23 @@ static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 max_alen, u8 max_labels) return rt; } +static void mpls_rt_free_rcu(struct rcu_head *head) +{ + struct mpls_route *rt; + + rt = container_of(head, struct mpls_route, rt_rcu); + + change_nexthops(rt) { + netdev_put(nh->nh_dev, &nh->nh_dev_tracker); + } endfor_nexthops(rt); + + kfree(rt); +} + static void mpls_rt_free(struct mpls_route *rt) { if (rt) - kfree_rcu(rt, rt_rcu); + call_rcu(&rt->rt_rcu, mpls_rt_free_rcu); } static void mpls_notify_route(struct net *net, unsigned index, @@ -557,10 +578,8 @@ static void mpls_route_update(struct net *net, unsigned index, struct mpls_route __rcu **platform_label; struct mpls_route *rt; - ASSERT_RTNL(); - - platform_label = rtnl_dereference(net->mpls.platform_label); - rt = rtnl_dereference(platform_label[index]); + platform_label = mpls_dereference(net, net->mpls.platform_label); + rt = mpls_dereference(net, platform_label[index]); rcu_assign_pointer(platform_label[index], new); mpls_notify_route(net, index, rt, new, info); @@ -569,24 +588,23 @@ static void mpls_route_update(struct net *net, unsigned index, mpls_rt_free(rt); } -static unsigned find_free_label(struct net *net) +static unsigned int find_free_label(struct net *net) { - struct mpls_route __rcu **platform_label; - size_t platform_labels; - unsigned index; + unsigned int index; - platform_label = rtnl_dereference(net->mpls.platform_label); - platform_labels = net->mpls.platform_labels; - for (index = MPLS_LABEL_FIRST_UNRESERVED; index < platform_labels; + for (index = MPLS_LABEL_FIRST_UNRESERVED; + index < net->mpls.platform_labels; index++) { - if (!rtnl_dereference(platform_label[index])) + if (!mpls_route_input(net, index)) return index; } + return LABEL_NOT_SPECIFIED; } #if IS_ENABLED(CONFIG_INET) static struct net_device *inet_fib_lookup_dev(struct net *net, + struct mpls_nh *nh, 
const void *addr) { struct net_device *dev; @@ -599,14 +617,14 @@ static struct net_device *inet_fib_lookup_dev(struct net *net, return ERR_CAST(rt); dev = rt->dst.dev; - dev_hold(dev); - + netdev_hold(dev, &nh->nh_dev_tracker, GFP_KERNEL); ip_rt_put(rt); return dev; } #else static struct net_device *inet_fib_lookup_dev(struct net *net, + struct mpls_nh *nh, const void *addr) { return ERR_PTR(-EAFNOSUPPORT); @@ -615,6 +633,7 @@ static struct net_device *inet_fib_lookup_dev(struct net *net, #if IS_ENABLED(CONFIG_IPV6) static struct net_device *inet6_fib_lookup_dev(struct net *net, + struct mpls_nh *nh, const void *addr) { struct net_device *dev; @@ -631,13 +650,14 @@ static struct net_device *inet6_fib_lookup_dev(struct net *net, return ERR_CAST(dst); dev = dst->dev; - dev_hold(dev); + netdev_hold(dev, &nh->nh_dev_tracker, GFP_KERNEL); dst_release(dst); return dev; } #else static struct net_device *inet6_fib_lookup_dev(struct net *net, + struct mpls_nh *nh, const void *addr) { return ERR_PTR(-EAFNOSUPPORT); @@ -653,16 +673,17 @@ static struct net_device *find_outdev(struct net *net, if (!oif) { switch (nh->nh_via_table) { case NEIGH_ARP_TABLE: - dev = inet_fib_lookup_dev(net, mpls_nh_via(rt, nh)); + dev = inet_fib_lookup_dev(net, nh, mpls_nh_via(rt, nh)); break; case NEIGH_ND_TABLE: - dev = inet6_fib_lookup_dev(net, mpls_nh_via(rt, nh)); + dev = inet6_fib_lookup_dev(net, nh, mpls_nh_via(rt, nh)); break; case NEIGH_LINK_TABLE: break; } } else { - dev = dev_get_by_index(net, oif); + dev = netdev_get_by_index(net, oif, + &nh->nh_dev_tracker, GFP_KERNEL); } if (!dev) @@ -671,8 +692,7 @@ static struct net_device *find_outdev(struct net *net, if (IS_ERR(dev)) return dev; - /* The caller is holding rtnl anyways, so release the dev reference */ - dev_put(dev); + nh->nh_dev = dev; return dev; } @@ -686,20 +706,17 @@ static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt, dev = find_outdev(net, rt, nh, oif); if (IS_ERR(dev)) { err = PTR_ERR(dev); - dev = NULL; goto errout; } /* Ensure this is a supported device */ err = -EINVAL; - if (!mpls_dev_get(dev)) - goto errout; + if (!mpls_dev_get(net, dev)) + goto errout_put; if ((nh->nh_via_table == NEIGH_LINK_TABLE) && (dev->addr_len != nh->nh_via_alen)) - goto errout; - - nh->nh_dev = dev; + goto errout_put; if (!(dev->flags & IFF_UP)) { nh->nh_flags |= RTNH_F_DEAD; @@ -713,6 +730,9 @@ static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt, return 0; +errout_put: + netdev_put(nh->nh_dev, &nh->nh_dev_tracker); + nh->nh_dev = NULL; errout: return err; } @@ -890,7 +910,8 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg, struct nlattr *nla_via, *nla_newdst; int remaining = cfg->rc_mp_len; int err = 0; - u8 nhs = 0; + + rt->rt_nhn = 0; change_nexthops(rt) { int attrlen; @@ -926,11 +947,9 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg, rt->rt_nhn_alive--; rtnh = rtnh_next(rtnh, &remaining); - nhs++; + rt->rt_nhn++; } endfor_nexthops(rt); - rt->rt_nhn = nhs; - return 0; errout: @@ -940,30 +959,28 @@ errout: static bool mpls_label_ok(struct net *net, unsigned int *index, struct netlink_ext_ack *extack) { - bool is_ok = true; - /* Reserved labels may not be set */ if (*index < MPLS_LABEL_FIRST_UNRESERVED) { NL_SET_ERR_MSG(extack, "Invalid label - must be MPLS_LABEL_FIRST_UNRESERVED or higher"); - is_ok = false; + return false; } /* The full 20 bit range may not be supported. 
*/ - if (is_ok && *index >= net->mpls.platform_labels) { + if (*index >= net->mpls.platform_labels) { NL_SET_ERR_MSG(extack, "Label >= configured maximum in platform_labels"); - is_ok = false; + return false; } *index = array_index_nospec(*index, net->mpls.platform_labels); - return is_ok; + + return true; } static int mpls_route_add(struct mpls_route_config *cfg, struct netlink_ext_ack *extack) { - struct mpls_route __rcu **platform_label; struct net *net = cfg->rc_nlinfo.nl_net; struct mpls_route *rt, *old; int err = -EINVAL; @@ -991,8 +1008,7 @@ static int mpls_route_add(struct mpls_route_config *cfg, } err = -EEXIST; - platform_label = rtnl_dereference(net->mpls.platform_label); - old = rtnl_dereference(platform_label[index]); + old = mpls_route_input(net, index); if ((cfg->rc_nlflags & NLM_F_EXCL) && old) goto errout; @@ -1103,7 +1119,7 @@ static int mpls_fill_stats_af(struct sk_buff *skb, struct mpls_dev *mdev; struct nlattr *nla; - mdev = mpls_dev_get(dev); + mdev = mpls_dev_rcu(dev); if (!mdev) return -ENODATA; @@ -1123,7 +1139,7 @@ static size_t mpls_get_stats_af_size(const struct net_device *dev) { struct mpls_dev *mdev; - mdev = mpls_dev_get(dev); + mdev = mpls_dev_rcu(dev); if (!mdev) return 0; @@ -1264,23 +1280,32 @@ static int mpls_netconf_get_devconf(struct sk_buff *in_skb, if (err < 0) goto errout; - err = -EINVAL; - if (!tb[NETCONFA_IFINDEX]) + if (!tb[NETCONFA_IFINDEX]) { + err = -EINVAL; goto errout; + } ifindex = nla_get_s32(tb[NETCONFA_IFINDEX]); - dev = __dev_get_by_index(net, ifindex); - if (!dev) - goto errout; - - mdev = mpls_dev_get(dev); - if (!mdev) - goto errout; - err = -ENOBUFS; skb = nlmsg_new(mpls_netconf_msgsize_devconf(NETCONFA_ALL), GFP_KERNEL); - if (!skb) + if (!skb) { + err = -ENOBUFS; goto errout; + } + + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, ifindex); + if (!dev) { + err = -EINVAL; + goto errout_unlock; + } + + mdev = mpls_dev_rcu(dev); + if (!mdev) { + err = -EINVAL; + goto errout_unlock; + } err = mpls_netconf_fill_devconf(skb, mdev, NETLINK_CB(in_skb).portid, @@ -1289,12 +1314,19 @@ static int mpls_netconf_get_devconf(struct sk_buff *in_skb, if (err < 0) { /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); - kfree_skb(skb); - goto errout; + goto errout_unlock; } + err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); + + rcu_read_unlock(); errout: return err; + +errout_unlock: + rcu_read_unlock(); + kfree_skb(skb); + goto errout; } static int mpls_netconf_dump_devconf(struct sk_buff *skb, @@ -1326,7 +1358,7 @@ static int mpls_netconf_dump_devconf(struct sk_buff *skb, rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->ifindex) { - mdev = mpls_dev_get(dev); + mdev = mpls_dev_rcu(dev); if (!mdev) continue; err = mpls_netconf_fill_devconf(skb, mdev, @@ -1438,8 +1470,6 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev) int err = -ENOMEM; int i; - ASSERT_RTNL(); - mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return ERR_PTR(err); @@ -1481,16 +1511,15 @@ static void mpls_dev_destroy_rcu(struct rcu_head *head) static int mpls_ifdown(struct net_device *dev, int event) { - struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); - unsigned index; + unsigned int index; - platform_label = rtnl_dereference(net->mpls.platform_label); for (index = 0; index < net->mpls.platform_labels; index++) { - struct mpls_route *rt = rtnl_dereference(platform_label[index]); + struct mpls_route *rt; bool nh_del = false; u8 alive = 0; + rt = mpls_route_input(net, index); if (!rt) 
continue; @@ -1524,8 +1553,12 @@ static int mpls_ifdown(struct net_device *dev, int event) change_nexthops(rt) { unsigned int nh_flags = nh->nh_flags; - if (nh->nh_dev != dev) + if (nh->nh_dev != dev) { + if (nh_del) + netdev_hold(nh->nh_dev, &nh->nh_dev_tracker, + GFP_KERNEL); goto next; + } switch (event) { case NETDEV_DOWN: @@ -1557,15 +1590,14 @@ next: static void mpls_ifup(struct net_device *dev, unsigned int flags) { - struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); - unsigned index; + unsigned int index; u8 alive; - platform_label = rtnl_dereference(net->mpls.platform_label); for (index = 0; index < net->mpls.platform_labels; index++) { - struct mpls_route *rt = rtnl_dereference(platform_label[index]); + struct mpls_route *rt; + rt = mpls_route_input(net, index); if (!rt) continue; @@ -1592,28 +1624,33 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); struct mpls_dev *mdev; unsigned int flags; int err; + mutex_lock(&net->mpls.platform_mutex); + if (event == NETDEV_REGISTER) { mdev = mpls_add_dev(dev); - if (IS_ERR(mdev)) - return notifier_from_errno(PTR_ERR(mdev)); + if (IS_ERR(mdev)) { + err = PTR_ERR(mdev); + goto err; + } - return NOTIFY_OK; + goto out; } - mdev = mpls_dev_get(dev); + mdev = mpls_dev_get(net, dev); if (!mdev) - return NOTIFY_OK; + goto out; switch (event) { case NETDEV_DOWN: err = mpls_ifdown(dev, event); if (err) - return notifier_from_errno(err); + goto err; break; case NETDEV_UP: flags = netif_get_flags(dev); @@ -1629,14 +1666,15 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, } else { err = mpls_ifdown(dev, event); if (err) - return notifier_from_errno(err); + goto err; } break; case NETDEV_UNREGISTER: err = mpls_ifdown(dev, event); if (err) - return notifier_from_errno(err); - mdev = mpls_dev_get(dev); + goto err; + + mdev = mpls_dev_get(net, dev); if (mdev) { mpls_dev_sysctl_unregister(dev, mdev); RCU_INIT_POINTER(dev->mpls_ptr, NULL); @@ -1644,16 +1682,23 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, } break; case NETDEV_CHANGENAME: - mdev = mpls_dev_get(dev); + mdev = mpls_dev_get(net, dev); if (mdev) { mpls_dev_sysctl_unregister(dev, mdev); err = mpls_dev_sysctl_register(dev, mdev); if (err) - return notifier_from_errno(err); + goto err; } break; } + +out: + mutex_unlock(&net->mpls.platform_mutex); return NOTIFY_OK; + +err: + mutex_unlock(&net->mpls.platform_mutex); + return notifier_from_errno(err); } static struct notifier_block mpls_dev_notifier = { @@ -1928,6 +1973,7 @@ errout: static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct net *net = sock_net(skb->sk); struct mpls_route_config *cfg; int err; @@ -1939,7 +1985,9 @@ static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto out; + mutex_lock(&net->mpls.platform_mutex); err = mpls_route_del(cfg, extack); + mutex_unlock(&net->mpls.platform_mutex); out: kfree(cfg); @@ -1950,6 +1998,7 @@ out: static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + struct net *net = sock_net(skb->sk); struct mpls_route_config *cfg; int err; @@ -1961,7 +2010,9 @@ static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto out; + mutex_lock(&net->mpls.platform_mutex); err = mpls_route_add(cfg, extack); + 
mutex_unlock(&net->mpls.platform_mutex); out: kfree(cfg); @@ -2124,7 +2175,7 @@ static int mpls_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, if (i == RTA_OIF) { ifindex = nla_get_u32(tb[i]); - filter->dev = __dev_get_by_index(net, ifindex); + filter->dev = dev_get_by_index_rcu(net, ifindex); if (!filter->dev) return -ENODEV; filter->filter_set = 1; @@ -2162,20 +2213,19 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) struct net *net = sock_net(skb->sk); struct mpls_route __rcu **platform_label; struct fib_dump_filter filter = { - .rtnl_held = true, + .rtnl_held = false, }; unsigned int flags = NLM_F_MULTI; size_t platform_labels; unsigned int index; + int err; - ASSERT_RTNL(); + rcu_read_lock(); if (cb->strict_check) { - int err; - err = mpls_valid_fib_dump_req(net, nlh, &filter, cb); if (err < 0) - return err; + goto err; /* for MPLS, there is only 1 table with fixed type and flags. * If either are set in the filter then return nothing. @@ -2183,14 +2233,14 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) if ((filter.table_id && filter.table_id != RT_TABLE_MAIN) || (filter.rt_type && filter.rt_type != RTN_UNICAST) || filter.flags) - return skb->len; + goto unlock; } index = cb->args[0]; if (index < MPLS_LABEL_FIRST_UNRESERVED) index = MPLS_LABEL_FIRST_UNRESERVED; - platform_label = rtnl_dereference(net->mpls.platform_label); + platform_label = rcu_dereference(net->mpls.platform_label); platform_labels = net->mpls.platform_labels; if (filter.filter_set) @@ -2199,7 +2249,7 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) for (; index < platform_labels; index++) { struct mpls_route *rt; - rt = rtnl_dereference(platform_label[index]); + rt = rcu_dereference(platform_label[index]); if (!rt) continue; @@ -2214,7 +2264,13 @@ static int mpls_dump_routes(struct sk_buff *skb, struct netlink_callback *cb) } cb->args[0] = index; +unlock: + rcu_read_unlock(); return skb->len; + +err: + rcu_read_unlock(); + return err; } static inline size_t lfib_nlmsg_size(struct mpls_route *rt) @@ -2345,18 +2401,20 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, u32 portid = NETLINK_CB(in_skb).portid; u32 in_label = LABEL_NOT_SPECIFIED; struct nlattr *tb[RTA_MAX + 1]; + struct mpls_route *rt = NULL; u32 labels[MAX_NEW_LABELS]; struct mpls_shim_hdr *hdr; unsigned int hdr_size = 0; const struct mpls_nh *nh; struct net_device *dev; - struct mpls_route *rt; struct rtmsg *rtm, *r; struct nlmsghdr *nlh; struct sk_buff *skb; u8 n_labels; int err; + mutex_lock(&net->mpls.platform_mutex); + err = mpls_valid_getroute_req(in_skb, in_nlh, tb, extack); if (err < 0) goto errout; @@ -2378,7 +2436,8 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, } } - rt = mpls_route_input_rcu(net, in_label); + if (in_label < net->mpls.platform_labels) + rt = mpls_route_input(net, in_label); if (!rt) { err = -ENETUNREACH; goto errout; @@ -2399,7 +2458,8 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, goto errout_free; } - return rtnl_unicast(skb, net, portid); + err = rtnl_unicast(skb, net, portid); + goto errout; } if (tb[RTA_NEWDST]) { @@ -2491,12 +2551,14 @@ static int mpls_getroute(struct sk_buff *in_skb, struct nlmsghdr *in_nlh, err = rtnl_unicast(skb, net, portid); errout: + mutex_unlock(&net->mpls.platform_mutex); return err; nla_put_failure: nlmsg_cancel(skb, nlh); err = -EMSGSIZE; errout_free: + mutex_unlock(&net->mpls.platform_mutex); 
kfree_skb(skb); return err; } @@ -2519,10 +2581,13 @@ static int resize_platform_label_table(struct net *net, size_t limit) /* In case the predefined labels need to be populated */ if (limit > MPLS_LABEL_IPV4NULL) { struct net_device *lo = net->loopback_dev; + rt0 = mpls_rt_alloc(1, lo->addr_len, 0); if (IS_ERR(rt0)) goto nort0; + rt0->rt_nh->nh_dev = lo; + netdev_hold(lo, &rt0->rt_nh->nh_dev_tracker, GFP_KERNEL); rt0->rt_protocol = RTPROT_KERNEL; rt0->rt_payload_type = MPT_IPV4; rt0->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; @@ -2533,10 +2598,13 @@ static int resize_platform_label_table(struct net *net, size_t limit) } if (limit > MPLS_LABEL_IPV6NULL) { struct net_device *lo = net->loopback_dev; + rt2 = mpls_rt_alloc(1, lo->addr_len, 0); if (IS_ERR(rt2)) goto nort2; + rt2->rt_nh->nh_dev = lo; + netdev_hold(lo, &rt2->rt_nh->nh_dev_tracker, GFP_KERNEL); rt2->rt_protocol = RTPROT_KERNEL; rt2->rt_payload_type = MPT_IPV6; rt2->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; @@ -2546,9 +2614,10 @@ static int resize_platform_label_table(struct net *net, size_t limit) lo->addr_len); } - rtnl_lock(); + mutex_lock(&net->mpls.platform_mutex); + /* Remember the original table */ - old = rtnl_dereference(net->mpls.platform_label); + old = mpls_dereference(net, net->mpls.platform_label); old_limit = net->mpls.platform_labels; /* Free any labels beyond the new table */ @@ -2579,7 +2648,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) net->mpls.platform_labels = limit; rcu_assign_pointer(net->mpls.platform_label, labels); - rtnl_unlock(); + mutex_unlock(&net->mpls.platform_mutex); mpls_rt_free(rt2); mpls_rt_free(rt0); @@ -2652,12 +2721,13 @@ static const struct ctl_table mpls_table[] = { }, }; -static int mpls_net_init(struct net *net) +static __net_init int mpls_net_init(struct net *net) { size_t table_size = ARRAY_SIZE(mpls_table); struct ctl_table *table; int i; + mutex_init(&net->mpls.platform_mutex); net->mpls.platform_labels = 0; net->mpls.platform_label = NULL; net->mpls.ip_ttl_propagate = 1; @@ -2683,7 +2753,7 @@ static int mpls_net_init(struct net *net) return 0; } -static void mpls_net_exit(struct net *net) +static __net_exit void mpls_net_exit(struct net *net) { struct mpls_route __rcu **platform_label; size_t platform_labels; @@ -2703,16 +2773,20 @@ static void mpls_net_exit(struct net *net) * As such no additional rcu synchronization is necessary when * freeing the platform_label table. 
*/ - rtnl_lock(); - platform_label = rtnl_dereference(net->mpls.platform_label); + mutex_lock(&net->mpls.platform_mutex); + + platform_label = mpls_dereference(net, net->mpls.platform_label); platform_labels = net->mpls.platform_labels; + for (index = 0; index < platform_labels; index++) { - struct mpls_route *rt = rtnl_dereference(platform_label[index]); - RCU_INIT_POINTER(platform_label[index], NULL); + struct mpls_route *rt; + + rt = mpls_dereference(net, platform_label[index]); mpls_notify_route(net, index, rt, NULL, NULL); mpls_rt_free(rt); } - rtnl_unlock(); + + mutex_unlock(&net->mpls.platform_mutex); kvfree(platform_label); } @@ -2729,12 +2803,15 @@ static struct rtnl_af_ops mpls_af_ops __read_mostly = { }; static const struct rtnl_msg_handler mpls_rtnl_msg_handlers[] __initdata_or_module = { - {THIS_MODULE, PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, 0}, - {THIS_MODULE, PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, 0}, - {THIS_MODULE, PF_MPLS, RTM_GETROUTE, mpls_getroute, mpls_dump_routes, 0}, + {THIS_MODULE, PF_MPLS, RTM_NEWROUTE, mpls_rtm_newroute, NULL, + RTNL_FLAG_DOIT_UNLOCKED}, + {THIS_MODULE, PF_MPLS, RTM_DELROUTE, mpls_rtm_delroute, NULL, + RTNL_FLAG_DOIT_UNLOCKED}, + {THIS_MODULE, PF_MPLS, RTM_GETROUTE, mpls_getroute, mpls_dump_routes, + RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, {THIS_MODULE, PF_MPLS, RTM_GETNETCONF, mpls_netconf_get_devconf, mpls_netconf_dump_devconf, - RTNL_FLAG_DUMP_UNLOCKED}, + RTNL_FLAG_DOIT_UNLOCKED | RTNL_FLAG_DUMP_UNLOCKED}, }; static int __init mpls_init(void) diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 83c629529b57..80cb5bbcd946 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -88,6 +88,7 @@ enum mpls_payload_type { struct mpls_nh { /* next hop label forwarding entry */ struct net_device *nh_dev; + netdevice_tracker nh_dev_tracker; /* nh_flags is accessed under RCU in the packet path; it is * modified handling netdev events with rtnl lock held @@ -184,9 +185,20 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr * return result; } -static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) +#define mpls_dereference(net, p) \ + rcu_dereference_protected( \ + (p), \ + lockdep_is_held(&(net)->mpls.platform_mutex)) + +static inline struct mpls_dev *mpls_dev_rcu(const struct net_device *dev) +{ + return rcu_dereference(dev->mpls_ptr); +} + +static inline struct mpls_dev *mpls_dev_get(const struct net *net, + const struct net_device *dev) { - return rcu_dereference_rtnl(dev->mpls_ptr); + return mpls_dereference(net, dev->mpls_ptr); } int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, @@ -196,7 +208,8 @@ int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels, bool mpls_output_possible(const struct net_device *dev); unsigned int mpls_dev_mtu(const struct net_device *dev); bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu); -void mpls_stats_inc_outucastpkts(struct net_device *dev, +void mpls_stats_inc_outucastpkts(struct net *net, + struct net_device *dev, const struct sk_buff *skb); #endif /* MPLS_INTERNAL_H */ diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index 6e73da94af7f..1a1a0eb5b787 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -53,7 +53,7 @@ static int mpls_xmit(struct sk_buff *skb) /* Find the output device */ out_dev = dst->dev; - net = dev_net(out_dev); + net = dev_net_rcu(out_dev); if (!mpls_output_possible(out_dev) || !dst->lwtstate || skb_warn_if_lro(skb)) @@ 
-128,7 +128,7 @@ static int mpls_xmit(struct sk_buff *skb) bos = false; } - mpls_stats_inc_outucastpkts(out_dev, skb); + mpls_stats_inc_outucastpkts(net, out_dev, skb); if (rt) { if (rt->rt_gw_family == AF_INET6) @@ -153,7 +153,7 @@ static int mpls_xmit(struct sk_buff *skb) return LWTUNNEL_XMIT_DONE; drop: - out_mdev = out_dev ? mpls_dev_get(out_dev) : NULL; + out_mdev = out_dev ? mpls_dev_rcu(out_dev) : NULL; if (out_mdev) MPLS_INC_STATS(out_mdev, tx_errors); kfree_skb(skb); diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index b08ba959ac4f..31948e18d97d 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -22,7 +22,6 @@ #include <linux/kernel.h> #include <crypto/sha2.h> -#include <linux/unaligned.h> #include "protocol.h" @@ -43,39 +42,9 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn) void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) { - u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; - u8 key1be[8]; - u8 key2be[8]; - int i; + __be64 key[2] = { cpu_to_be64(key1), cpu_to_be64(key2) }; - if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE)) - len = SHA256_DIGEST_SIZE; - - put_unaligned_be64(key1, key1be); - put_unaligned_be64(key2, key2be); - - /* Generate key xored with ipad */ - memset(input, 0x36, SHA256_BLOCK_SIZE); - for (i = 0; i < 8; i++) - input[i] ^= key1be[i]; - for (i = 0; i < 8; i++) - input[i + 8] ^= key2be[i]; - - memcpy(&input[SHA256_BLOCK_SIZE], msg, len); - - /* emit sha256(K1 || msg) on the second input block, so we can - * reuse 'input' for the last hashing - */ - sha256(input, SHA256_BLOCK_SIZE + len, &input[SHA256_BLOCK_SIZE]); - - /* Prepare second part of hmac */ - memset(input, 0x5C, SHA256_BLOCK_SIZE); - for (i = 0; i < 8; i++) - input[i] ^= key1be[i]; - for (i = 0; i < 8; i++) - input[i + 8] ^= key2be[i]; - - sha256(input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE, hmac); + hmac_sha256_usingrawkey((const u8 *)key, sizeof(key), msg, len, hmac); } #if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST) diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index fed40dae5583..d96130e49942 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -501,10 +501,15 @@ void mptcp_active_enable(struct sock *sk) struct mptcp_pernet *pernet = mptcp_get_pernet(sock_net(sk)); if (atomic_read(&pernet->active_disable_times)) { - struct dst_entry *dst = sk_dst_get(sk); + struct net_device *dev; + struct dst_entry *dst; - if (dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)) + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? 
dst_dev_rcu(dst) : NULL; + if (!(dev && (dev->flags & IFF_LOOPBACK))) atomic_set(&pernet->active_disable_times, 0); + rcu_read_unlock(); } } diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index b9e451197902..82ec15bcfd7f 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -32,7 +32,8 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf /* dequeue the skb from sk receive queue */ __skb_unlink(skb, &ssk->sk_receive_queue); skb_ext_reset(skb); - skb_orphan(skb); + + mptcp_subflow_lend_fwdmem(subflow, skb); /* We copy the fastopen data, but that don't belong to the mptcp sequence * space, need to offset it in the subflow sequence, see mptcp_subflow_get_map_offset() @@ -50,6 +51,7 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf mptcp_data_lock(sk); DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk)); + mptcp_borrow_fwdmem(sk, skb); skb_set_owner_r(skb, sk); __skb_queue_tail(&sk->sk_receive_queue, skb); mptcp_sk(sk)->bytes_received += skb->len; diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index cf879c188ca2..f23fda0c55a7 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -71,7 +71,6 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPFastcloseRx", MPTCP_MIB_MPFASTCLOSERX), SNMP_MIB_ITEM("MPRstTx", MPTCP_MIB_MPRSTTX), SNMP_MIB_ITEM("MPRstRx", MPTCP_MIB_MPRSTRX), - SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED), SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED), @@ -85,7 +84,7 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK), SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK), SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED), - SNMP_MIB_SENTINEL + SNMP_MIB_ITEM("WinProbe", MPTCP_MIB_WINPROBE), }; /* mptcp_mib_alloc - allocate percpu mib counters @@ -108,22 +107,23 @@ bool mptcp_mib_alloc(struct net *net) void mptcp_seq_show(struct seq_file *seq) { - unsigned long sum[ARRAY_SIZE(mptcp_snmp_list) - 1]; + unsigned long sum[ARRAY_SIZE(mptcp_snmp_list)]; + const int cnt = ARRAY_SIZE(mptcp_snmp_list); struct net *net = seq->private; int i; seq_puts(seq, "MPTcpExt:"); - for (i = 0; mptcp_snmp_list[i].name; i++) + for (i = 0; i < cnt; i++) seq_printf(seq, " %s", mptcp_snmp_list[i].name); seq_puts(seq, "\nMPTcpExt:"); memset(sum, 0, sizeof(sum)); if (net->mib.mptcp_statistics) - snmp_get_cpu_field_batch(sum, mptcp_snmp_list, - net->mib.mptcp_statistics); + snmp_get_cpu_field_batch_cnt(sum, mptcp_snmp_list, cnt, + net->mib.mptcp_statistics); - for (i = 0; mptcp_snmp_list[i].name; i++) + for (i = 0; i < cnt; i++) seq_printf(seq, " %lu", sum[i]); seq_putc(seq, '\n'); diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h index 309bac6fea32..812218b5ed2b 100644 --- a/net/mptcp/mib.h +++ b/net/mptcp/mib.h @@ -70,7 +70,6 @@ enum linux_mptcp_mib_field { MPTCP_MIB_MPFASTCLOSERX, /* Received a MP_FASTCLOSE */ MPTCP_MIB_MPRSTTX, /* Transmit a MP_RST */ MPTCP_MIB_MPRSTRX, /* Received a MP_RST */ - MPTCP_MIB_RCVPRUNED, /* Incoming packet dropped due to memory limit */ MPTCP_MIB_SUBFLOWSTALE, /* Subflows entered 'stale' status */ MPTCP_MIB_SUBFLOWRECOVER, /* Subflows returned to active status after being stale */ MPTCP_MIB_SNDWNDSHARED, /* Subflow snd wnd is overridden by msk's one */ @@ -88,6 +87,7 @@ enum linux_mptcp_mib_field { MPTCP_MIB_DSSFALLBACK, /* Bad or missing DSS */ MPTCP_MIB_SIMULTCONNFALLBACK, /* 
Simultaneous connect */ MPTCP_MIB_FALLBACKFAILED, /* Can't fallback due to msk status */ + MPTCP_MIB_WINPROBE, /* MPTCP-level zero window probe */ __MPTCP_MIB_MAX }; diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 0566dd793810..136c2d05c0ee 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -15,9 +15,9 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, - struct nlattr *bc, bool net_admin) + bool net_admin) { - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb->data, sk)) return 0; return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI, @@ -76,9 +76,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba const struct inet_diag_req_v2 *r, bool net_admin) { - struct inet_diag_dump_data *cb_data = cb->data; struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx; - struct nlattr *bc = cb_data->inet_diag_nla_bc; struct net *net = sock_net(skb->sk); struct inet_hashinfo *hinfo; int i; @@ -121,7 +119,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba if (!refcount_inc_not_zero(&sk->sk_refcnt)) goto next_listen; - ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); + ret = sk_diag_dump(sk, skb, cb, r, net_admin); sock_put(sk); @@ -154,15 +152,10 @@ static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); - struct inet_diag_dump_data *cb_data; struct mptcp_sock *msk; - struct nlattr *bc; BUILD_BUG_ON(sizeof(cb->ctx) < sizeof(*diag_ctx)); - cb_data = cb->data; - bc = cb_data->inet_diag_nla_bc; - while ((msk = mptcp_token_iter_next(net, &diag_ctx->s_slot, &diag_ctx->s_num)) != NULL) { struct inet_sock *inet = (struct inet_sock *)msk; @@ -181,7 +174,7 @@ static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, r->id.idiag_dport) goto next; - ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); + ret = sk_diag_dump(sk, skb, cb, r, net_admin); next: sock_put(sk); if (ret < 0) { @@ -202,7 +195,8 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_info *info = _info; - r->idiag_rqueue = sk_rmem_alloc_get(sk); + r->idiag_rqueue = sk_rmem_alloc_get(sk) + + READ_ONCE(mptcp_sk(sk)->backlog_len); r->idiag_wqueue = sk_wmem_alloc_get(sk); if (inet_sk_state_load(sk) == TCP_LISTEN) { diff --git a/net/mptcp/mptcp_pm_gen.c b/net/mptcp/mptcp_pm_gen.c index dcffd847af33..c180930a8e0e 100644 --- a/net/mptcp/mptcp_pm_gen.c +++ b/net/mptcp/mptcp_pm_gen.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/mptcp_pm.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include <net/netlink.h> #include <net/genetlink.h> diff --git a/net/mptcp/mptcp_pm_gen.h b/net/mptcp/mptcp_pm_gen.h index e24258f6f819..b9280bcb43f5 100644 --- a/net/mptcp/mptcp_pm_gen.h +++ b/net/mptcp/mptcp_pm_gen.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/mptcp_pm.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_MPTCP_PM_GEN_H #define _LINUX_MPTCP_PM_GEN_H diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 70c0ab0ecf90..f24ae7d40e88 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -838,8 +838,11 
@@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, opts->suboptions = 0; + /* Force later mptcp_write_options(), but do not use any actual + * option space. + */ if (unlikely(__mptcp_check_fallback(msk) && !mptcp_check_infinite_map(skb))) - return false; + return true; if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) { if (mptcp_established_options_fastclose(sk, &opt_size, remaining, opts) || @@ -985,13 +988,13 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, return false; } - if (mp_opt->deny_join_id0) - WRITE_ONCE(msk->pm.remote_deny_join_id0, true); - if (unlikely(!READ_ONCE(msk->pm.server_side))) pr_warn_once("bogus mpc option on established client sk"); set_fully_established: + if (mp_opt->deny_join_id0) + WRITE_ONCE(msk->pm.remote_deny_join_id0, true); + mptcp_data_lock((struct sock *)msk); __mptcp_subflow_fully_established(msk, subflow, mp_opt); mptcp_data_unlock((struct sock *)msk); @@ -1041,6 +1044,31 @@ static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una) WRITE_ONCE(msk->snd_una, new_snd_una); } +static void rwin_update(struct mptcp_sock *msk, struct sock *ssk, + struct sk_buff *skb) +{ + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct tcp_sock *tp = tcp_sk(ssk); + u64 mptcp_rcv_wnd; + + /* Avoid touching extra cachelines if TCP is going to accept this + * skb without filling the TCP-level window even with a possibly + * outdated mptcp-level rwin. + */ + if (!skb->len || skb->len < tcp_receive_window(tp)) + return; + + mptcp_rcv_wnd = atomic64_read(&msk->rcv_wnd_sent); + if (!after64(mptcp_rcv_wnd, subflow->rcv_wnd_sent)) + return; + + /* Some other subflow grew the mptcp-level rwin since rcv_wup, + * resync. + */ + tp->rcv_wnd += mptcp_rcv_wnd - subflow->rcv_wnd_sent; + subflow->rcv_wnd_sent = mptcp_rcv_wnd; +} + static void ack_update_msk(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_options_received *mp_opt) @@ -1118,7 +1146,9 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk, return hmac == mp_opt->ahmac; } -/* Return false if a subflow has been reset, else return true */ +/* Return false in case of error (or subflow has been reset), + * else return true. + */ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -1206,6 +1236,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) */ if (mp_opt.use_ack) ack_update_msk(msk, sk, &mp_opt); + rwin_update(msk, sk, skb); /* Zero-data-length packets are dropped by the caller and not * propagated to the MPTCP layer, so the skb extension does not @@ -1222,7 +1253,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) mpext = skb_ext_add(skb, SKB_EXT_MPTCP); if (!mpext) - return true; + return false; memset(mpext, 0, sizeof(*mpext)); @@ -1292,6 +1323,10 @@ static void mptcp_set_rwin(struct tcp_sock *tp, struct tcphdr *th) if (rcv_wnd_new != rcv_wnd_old) { raise_win: + /* The msk-level rcv wnd is after the tcp level one, + * sync the latter. 
+ */ + rcv_wnd_new = rcv_wnd_old; win = rcv_wnd_old - ack_seq; tp->rcv_wnd = min_t(u64, win, U32_MAX); new_win = tp->rcv_wnd; @@ -1315,6 +1350,21 @@ raise_win: update_wspace: WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); + subflow->rcv_wnd_sent = rcv_wnd_new; +} + +static void mptcp_track_rwin(struct tcp_sock *tp) +{ + const struct sock *ssk = (const struct sock *)tp; + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk; + + if (!ssk) + return; + + subflow = mptcp_subflow_ctx(ssk); + msk = mptcp_sk(subflow->conn); + WRITE_ONCE(msk->old_wspace, tp->rcv_wnd); } __sum16 __mptcp_make_csum(u64 data_seq, u32 subflow_seq, u16 data_len, __wsum sum) @@ -1609,6 +1659,10 @@ mp_rst: opts->reset_transient, opts->reset_reason); return; + } else if (unlikely(!opts->suboptions)) { + /* Fallback to TCP */ + mptcp_track_rwin(tp); + return; } if (OPTION_MPTCP_PRIO & opts->suboptions) { diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 420d416e2603..e2040c327af6 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -18,6 +18,7 @@ struct mptcp_pm_add_entry { u8 retrans_times; struct timer_list add_timer; struct mptcp_sock *sock; + struct rcu_head rcu; }; static DEFINE_SPINLOCK(mptcp_pm_list_lock); @@ -155,7 +156,7 @@ bool mptcp_remove_anno_list_by_saddr(struct mptcp_sock *msk, entry = mptcp_pm_del_add_timer(msk, addr, false); ret = entry; - kfree(entry); + kfree_rcu(entry, rcu); return ret; } @@ -268,12 +269,34 @@ int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, return -EINVAL; } +static unsigned int mptcp_adjust_add_addr_timeout(struct mptcp_sock *msk) +{ + const struct net *net = sock_net((struct sock *)msk); + unsigned int rto = mptcp_get_add_addr_timeout(net); + struct mptcp_subflow_context *subflow; + unsigned int max = 0; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + struct inet_connection_sock *icsk = inet_csk(ssk); + + if (icsk->icsk_rto > max) + max = icsk->icsk_rto; + } + + if (max && max < rto) + rto = max; + + return rto; +} + static void mptcp_pm_add_timer(struct timer_list *timer) { struct mptcp_pm_add_entry *entry = timer_container_of(entry, timer, add_timer); struct mptcp_sock *msk = entry->sock; struct sock *sk = (struct sock *)msk; + unsigned int timeout; pr_debug("msk=%p\n", msk); @@ -291,6 +314,10 @@ static void mptcp_pm_add_timer(struct timer_list *timer) goto out; } + timeout = mptcp_adjust_add_addr_timeout(msk); + if (!timeout) + goto out; + spin_lock_bh(&msk->pm.lock); if (!mptcp_pm_should_add_signal_addr(msk)) { @@ -302,7 +329,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) sk_reset_timer(sk, timer, - jiffies + mptcp_get_add_addr_timeout(sock_net(sk))); + jiffies + (timeout << entry->retrans_times)); spin_unlock_bh(&msk->pm.lock); @@ -319,22 +346,27 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, { struct mptcp_pm_add_entry *entry; struct sock *sk = (struct sock *)msk; - struct timer_list *add_timer = NULL; + bool stop_timer = false; + + rcu_read_lock(); spin_lock_bh(&msk->pm.lock); entry = mptcp_lookup_anno_list_by_saddr(msk, addr); if (entry && (!check_id || entry->addr.id == addr->id)) { entry->retrans_times = ADD_ADDR_RETRANS_MAX; - add_timer = &entry->add_timer; + stop_timer = true; } if (!check_id && entry) list_del(&entry->list); spin_unlock_bh(&msk->pm.lock); - /* no lock, because sk_stop_timer_sync() is calling timer_delete_sync() */ - if (add_timer) - sk_stop_timer_sync(sk, add_timer); + /* Note: entry might have been removed by another thread. 
+ * We hold rcu_read_lock() to ensure it is not freed under us. + */ + if (stop_timer) + sk_stop_timer_sync(sk, &entry->add_timer); + rcu_read_unlock(); return entry; } @@ -343,7 +375,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; - struct net *net = sock_net(sk); + unsigned int timeout; lockdep_assert_held(&msk->pm.lock); @@ -353,9 +385,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, if (WARN_ON_ONCE(mptcp_pm_is_kernel(msk))) return false; - sk_reset_timer(sk, &add_entry->add_timer, - jiffies + mptcp_get_add_addr_timeout(net)); - return true; + goto reset_timer; } add_entry = kmalloc(sizeof(*add_entry), GFP_ATOMIC); @@ -369,8 +399,10 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, add_entry->retrans_times = 0; timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); - sk_reset_timer(sk, &add_entry->add_timer, - jiffies + mptcp_get_add_addr_timeout(net)); +reset_timer: + timeout = mptcp_adjust_add_addr_timeout(msk); + if (timeout) + sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout); return true; } @@ -389,7 +421,7 @@ static void mptcp_pm_free_anno_list(struct mptcp_sock *msk) list_for_each_entry_safe(entry, tmp, &free_list, list) { sk_stop_timer_sync(sk, &entry->add_timer); - kfree(entry); + kfree_rcu(entry, rcu); } } @@ -457,23 +489,24 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; - unsigned int subflows_max; + unsigned int limit_extra_subflows; int ret = 0; if (mptcp_pm_is_userspace(msk)) { if (mptcp_userspace_pm_active(msk)) { spin_lock_bh(&pm->lock); - pm->subflows++; + pm->extra_subflows++; spin_unlock_bh(&pm->lock); return true; } return false; } - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); - pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, pm->subflows, - subflows_max, READ_ONCE(pm->accept_subflow)); + pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, + pm->extra_subflows, limit_extra_subflows, + READ_ONCE(pm->accept_subflow)); /* try to avoid acquiring the lock below */ if (!READ_ONCE(pm->accept_subflow)) @@ -481,8 +514,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) spin_lock_bh(&pm->lock); if (READ_ONCE(pm->accept_subflow)) { - ret = pm->subflows < subflows_max; - if (ret && ++pm->subflows == subflows_max) + ret = pm->extra_subflows < limit_extra_subflows; + if (ret && ++pm->extra_subflows == limit_extra_subflows) WRITE_ONCE(pm->accept_subflow, false); } spin_unlock_bh(&pm->lock); @@ -561,6 +594,7 @@ void mptcp_pm_subflow_established(struct mptcp_sock *msk) void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct mptcp_subflow_context *subflow) { + struct sock *sk = (struct sock *)msk; struct mptcp_pm_data *pm = &msk->pm; bool update_subflows; @@ -568,7 +602,7 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, if (mptcp_pm_is_userspace(msk)) { if (update_subflows) { spin_lock_bh(&pm->lock); - pm->subflows--; + pm->extra_subflows--; spin_unlock_bh(&pm->lock); } return; @@ -584,7 +618,8 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, /* Even if this subflow is not really established, tell the PM to try * to pick the next ones, if possible. 
*/ - if (mptcp_pm_nl_check_work_pending(msk)) + if (mptcp_is_fully_established(sk) && + mptcp_pm_nl_check_work_pending(msk)) mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED); spin_unlock_bh(&pm->lock); @@ -611,9 +646,12 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, } else { __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } - /* id0 should not have a different address */ + /* - id0 should not have a different address + * - special case for C-flag: linked to fill_local_addresses_vec() + */ } else if ((addr->id == 0 && !mptcp_pm_is_init_remote_addr(msk, addr)) || - (addr->id > 0 && !READ_ONCE(pm->accept_addr))) { + (addr->id > 0 && !READ_ONCE(pm->accept_addr) && + !mptcp_pm_add_addr_c_flag_case(msk))) { mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { @@ -999,17 +1037,17 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) WRITE_ONCE(pm->pm_type, pm_type); if (pm_type == MPTCP_PM_TYPE_KERNEL) { - bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk); + bool subflows_allowed = !!mptcp_pm_get_limit_extra_subflows(msk); /* pm->work_pending must be only be set to 'true' when * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL */ WRITE_ONCE(pm->work_pending, - (!!mptcp_pm_get_local_addr_max(msk) && + (!!mptcp_pm_get_endp_subflow_max(msk) && subflows_allowed) || - !!mptcp_pm_get_add_addr_signal_max(msk)); + !!mptcp_pm_get_endp_signal_max(msk)); WRITE_ONCE(pm->accept_addr, - !!mptcp_pm_get_add_addr_accept_max(msk) && + !!mptcp_pm_get_limit_add_addr_accepted(msk) && subflows_allowed); WRITE_ONCE(pm->accept_subflow, subflows_allowed); diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index d39e7c178460..57570a44e418 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -17,14 +17,15 @@ static int pm_nl_pernet_id; struct pm_nl_pernet { /* protects pernet updates */ spinlock_t lock; - struct list_head local_addr_list; - unsigned int addrs; - unsigned int stale_loss_cnt; - unsigned int add_addr_signal_max; - unsigned int add_addr_accept_max; - unsigned int local_addr_max; - unsigned int subflows_max; - unsigned int next_id; + struct list_head endp_list; + u8 endpoints; + u8 endp_signal_max; + u8 endp_subflow_max; + u8 endp_laminar_max; + u8 endp_fullmesh_max; + u8 limit_add_addr_accepted; + u8 limit_extra_subflows; + u8 next_id; DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); }; @@ -46,37 +47,53 @@ static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) return pm_nl_get_pernet(genl_info_net(info)); } -unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) +u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) { const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->add_addr_signal_max); + return READ_ONCE(pernet->endp_signal_max); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max); -unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) +u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->add_addr_accept_max); + return READ_ONCE(pernet->endp_subflow_max); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max); -unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) +u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk) 
{ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->subflows_max); + return READ_ONCE(pernet->endp_laminar_max); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_laminar_max); -unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) +u8 mptcp_pm_get_endp_fullmesh_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->local_addr_max); + return READ_ONCE(pernet->endp_fullmesh_max); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_fullmesh_max); + +u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->limit_add_addr_accepted); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_add_addr_accepted); + +u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->limit_extra_subflows); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_extra_subflows); static bool lookup_subflow_by_daddr(const struct list_head *list, const struct mptcp_addr_info *daddr) @@ -110,7 +127,7 @@ select_local_address(const struct pm_nl_pernet *pernet, msk_owned_by_me(msk); rcu_read_lock(); - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + list_for_each_entry_rcu(entry, &pernet->endp_list, list) { if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; @@ -141,7 +158,7 @@ select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, * Note: removal from the local address list during the msk life-cycle * can lead to additional addresses not being announced. */ - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + list_for_each_entry_rcu(entry, &pernet->endp_list, list) { if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) continue; @@ -159,80 +176,96 @@ select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, return found; } -/* Fill all the remote addresses into the array addrs[], - * and return the array size. 
- */ -static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, - struct mptcp_addr_info *local, - bool fullmesh, - struct mptcp_addr_info *addrs) +static unsigned int +fill_remote_addr(struct mptcp_sock *msk, struct mptcp_addr_info *local, + struct mptcp_addr_info *addrs) { bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); - struct sock *sk = (struct sock *)msk, *ssk; - struct mptcp_subflow_context *subflow; struct mptcp_addr_info remote = { 0 }; - unsigned int subflows_max; - int i = 0; + struct sock *sk = (struct sock *)msk; + + if (deny_id0) + return 0; - subflows_max = mptcp_pm_get_subflows_max(msk); mptcp_remote_address((struct sock_common *)sk, &remote); - /* Non-fullmesh endpoint, fill in the single entry - * corresponding to the primary MPC subflow remote address - */ - if (!fullmesh) { - if (deny_id0) - return 0; + if (!mptcp_pm_addr_families_match(sk, local, &remote)) + return 0; - if (!mptcp_pm_addr_families_match(sk, local, &remote)) - return 0; + msk->pm.extra_subflows++; + *addrs = remote; - msk->pm.subflows++; - addrs[i++] = remote; - } else { - DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + return 1; +} - /* Forbid creation of new subflows matching existing - * ones, possibly already created by incoming ADD_ADDR - */ - bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); - mptcp_for_each_subflow(msk, subflow) - if (READ_ONCE(subflow->local_id) == local->id) - __set_bit(subflow->remote_id, unavail_id); - - mptcp_for_each_subflow(msk, subflow) { - ssk = mptcp_subflow_tcp_sock(subflow); - mptcp_remote_address((struct sock_common *)ssk, &addrs[i]); - addrs[i].id = READ_ONCE(subflow->remote_id); - if (deny_id0 && !addrs[i].id) - continue; +static unsigned int +fill_remote_addresses_fullmesh(struct mptcp_sock *msk, + struct mptcp_addr_info *local, + struct mptcp_addr_info *addrs) +{ + u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); + bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); + DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + struct sock *sk = (struct sock *)msk, *ssk; + struct mptcp_subflow_context *subflow; + int i = 0; - if (test_bit(addrs[i].id, unavail_id)) - continue; + /* Forbid creation of new subflows matching existing ones, possibly + * already created by incoming ADD_ADDR + */ + bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + mptcp_for_each_subflow(msk, subflow) + if (READ_ONCE(subflow->local_id) == local->id) + __set_bit(subflow->remote_id, unavail_id); + + mptcp_for_each_subflow(msk, subflow) { + ssk = mptcp_subflow_tcp_sock(subflow); + mptcp_remote_address((struct sock_common *)ssk, &addrs[i]); + addrs[i].id = READ_ONCE(subflow->remote_id); + if (deny_id0 && !addrs[i].id) + continue; - if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) - continue; + if (test_bit(addrs[i].id, unavail_id)) + continue; - if (msk->pm.subflows < subflows_max) { - /* forbid creating multiple address towards - * this id - */ - __set_bit(addrs[i].id, unavail_id); - msk->pm.subflows++; - i++; - } - } + if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) + continue; + + /* forbid creating multiple address towards this id */ + __set_bit(addrs[i].id, unavail_id); + msk->pm.extra_subflows++; + i++; + + if (msk->pm.extra_subflows >= limit_extra_subflows) + break; } return i; } +/* Fill all the remote addresses into the array addrs[], + * and return the array size. 
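As a side note on fill_remote_addresses_fullmesh() above: it marks every remote address ID already used by an existing subflow in a local bitmap, skips those candidates, and stops as soon as the extra-subflow limit is reached. A standalone sketch of that filtering idea; the IDs, limit and array sizes below are made up for the demo.

#include <stdio.h>
#include <stdbool.h>

#define MAX_ADDR_ID	8	/* assumed, far below MPTCP_PM_MAX_ADDR_ID */

int main(void)
{
	bool unavail[MAX_ADDR_ID + 1] = { false };
	int existing[] = { 2, 5 };		/* remote IDs already in use (assumed) */
	int candidates[] = { 1, 2, 3, 5, 6 };	/* announced/derived remote IDs (assumed) */
	unsigned int limit_extra_subflows = 2, extra_subflows = 0;
	size_t i;

	for (i = 0; i < sizeof(existing) / sizeof(existing[0]); i++)
		unavail[existing[i]] = true;

	for (i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++) {
		int id = candidates[i];

		if (unavail[id])
			continue;
		unavail[id] = true;	/* at most one subflow per remote ID */
		printf("would create an extra subflow towards remote id %d\n", id);
		if (++extra_subflows >= limit_extra_subflows)
			break;
	}
	return 0;
}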
+ */ +static unsigned int +fill_remote_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *local, + bool fullmesh, struct mptcp_addr_info *addrs) +{ + /* Non-fullmesh: fill in the single entry corresponding to the primary + * MPC subflow remote address, and return 1, corresponding to 1 entry. + */ + if (!fullmesh) + return fill_remote_addr(msk, local, addrs); + + /* Fullmesh endpoint: fill all possible remote addresses */ + return fill_remote_addresses_fullmesh(msk, local, addrs); +} + static struct mptcp_pm_addr_entry * __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) { struct mptcp_pm_addr_entry *entry; - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + list_for_each_entry_rcu(entry, &pernet->endp_list, list, lockdep_is_held(&pernet->lock)) { if (entry->addr.id == id) return entry; @@ -245,7 +278,7 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) { struct mptcp_pm_addr_entry *entry; - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + list_for_each_entry_rcu(entry, &pernet->endp_list, list, lockdep_is_held(&pernet->lock)) { if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) return entry; @@ -253,52 +286,67 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) return NULL; } -static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) +static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) { - struct sock *sk = (struct sock *)msk; - unsigned int add_addr_signal_max; - bool signal_and_subflow = false; - unsigned int local_addr_max; - struct pm_nl_pernet *pernet; - struct mptcp_pm_local local; - unsigned int subflows_max; - - pernet = pm_nl_get_pernet(sock_net(sk)); + return msk->mpc_endpoint_id == addr->id ? 
0 : addr->id; +} - add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); - local_addr_max = mptcp_pm_get_local_addr_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); +/* Set mpc_endpoint_id, and send MP_PRIO for ID0 if needed */ +static void mptcp_mpc_endpoint_setup(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info mpc_addr; + struct pm_nl_pernet *pernet; + bool backup = false; /* do lazy endpoint usage accounting for the MPC subflows */ - if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first); - struct mptcp_pm_addr_entry *entry; - struct mptcp_addr_info mpc_addr; - bool backup = false; - - mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); - rcu_read_lock(); - entry = __lookup_addr(pernet, &mpc_addr); - if (entry) { - __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); - msk->mpc_endpoint_id = entry->addr.id; - backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - } - rcu_read_unlock(); + if (likely(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED)) || + !msk->first) + return; - if (backup) - mptcp_pm_send_ack(msk, subflow, true, backup); + subflow = mptcp_subflow_ctx(msk->first); + pernet = pm_nl_get_pernet_from_msk(msk); - msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); + mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); + rcu_read_lock(); + entry = __lookup_addr(pernet, &mpc_addr); + if (entry) { + __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); + msk->mpc_endpoint_id = entry->addr.id; + backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); } + rcu_read_unlock(); + + /* Send MP_PRIO */ + if (backup) + mptcp_pm_send_ack(msk, subflow, true, backup); + + msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); +} + +static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) +{ + u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + u8 endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); + u8 endp_signal_max = mptcp_pm_get_endp_signal_max(msk); + struct sock *sk = (struct sock *)msk; + bool signal_and_subflow = false; + struct mptcp_pm_local local; + + mptcp_mpc_endpoint_setup(msk); + if (!mptcp_is_fully_established(sk)) + return; pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", - msk->pm.local_addr_used, local_addr_max, - msk->pm.add_addr_signaled, add_addr_signal_max, - msk->pm.subflows, subflows_max); + msk->pm.local_addr_used, endp_subflow_max, + msk->pm.add_addr_signaled, endp_signal_max, + msk->pm.extra_subflows, limit_extra_subflows); /* check first for announce */ - if (msk->pm.add_addr_signaled < add_addr_signal_max) { + if (msk->pm.add_addr_signaled < endp_signal_max) { /* due to racing events on both ends we can reach here while * previous add address is still running: if we invoke now * mptcp_pm_announce_addr(), that will fail and the @@ -333,9 +381,13 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) } subflow: + /* No need to try establishing subflows to remote id0 if not allowed */ + if (mptcp_pm_add_addr_c_flag_case(msk)) + goto exit; + /* check if should create a new subflow */ - while (msk->pm.local_addr_used < local_addr_max && - msk->pm.subflows < subflows_max) { + while (msk->pm.local_addr_used < endp_subflow_max && + msk->pm.extra_subflows < limit_extra_subflows) { struct mptcp_addr_info 
addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; int i, nr; @@ -364,6 +416,8 @@ subflow: __mptcp_subflow_connect(sk, &local, &addrs[i]); spin_lock_bh(&msk->pm.lock); } + +exit: mptcp_pm_nl_check_work_pending(msk); } @@ -377,90 +431,224 @@ static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) mptcp_pm_create_subflow_or_signal_addr(msk); } -/* Fill all the local addresses into the array addrs[], - * and return the array size. - */ -static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, - struct mptcp_addr_info *remote, - struct mptcp_pm_local *locals) +static unsigned int +fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, + struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals, + bool c_flag_case) { + u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; - struct mptcp_addr_info mpc_addr; - struct pm_nl_pernet *pernet; - unsigned int subflows_max; + struct mptcp_pm_local *local; int i = 0; - pernet = pm_nl_get_pernet_from_msk(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); - - mptcp_local_address((struct sock_common *)msk, &mpc_addr); - rcu_read_lock(); - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + list_for_each_entry_rcu(entry, &pernet->endp_list, list) { + bool is_id0; + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) continue; if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) continue; - if (msk->pm.subflows < subflows_max) { - locals[i].addr = entry->addr; - locals[i].flags = entry->flags; - locals[i].ifindex = entry->ifindex; + local = &locals[i]; + local->addr = entry->addr; + local->flags = entry->flags; + local->ifindex = entry->ifindex; + + is_id0 = local->addr.id == msk->mpc_endpoint_id; - /* Special case for ID0: set the correct ID */ - if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port)) - locals[i].addr.id = 0; + if (c_flag_case && + (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) { + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); - msk->pm.subflows++; - i++; + if (!is_id0) + msk->pm.local_addr_used++; } + + /* Special case for ID0: set the correct ID */ + if (is_id0) + local->addr.id = 0; + + msk->pm.extra_subflows++; + i++; + + if (msk->pm.extra_subflows >= limit_extra_subflows) + break; } rcu_read_unlock(); - /* If the array is empty, fill in the single - * 'IPADDRANY' local address + return i; +} + +static unsigned int +fill_local_laminar_endp(struct mptcp_sock *msk, struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *entry; + struct mptcp_pm_local *local; + int found = 0; + + /* Forbid creation of new subflows matching existing ones, possibly + * already created by 'subflow' endpoints */ - if (!i) { - memset(&locals[i], 0, sizeof(locals[i])); - locals[i].addr.family = -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - remote->family == AF_INET6 && - ipv6_addr_v4mapped(&remote->addr6) ? 
AF_INET : -#endif - remote->family; + bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if ((1 << inet_sk_state_load(ssk)) & + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | + TCPF_CLOSE)) + continue; + + __set_bit(subflow_get_local_id(subflow), unavail_id); + } + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->endp_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR)) + continue; + + if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) + continue; + + if (test_bit(mptcp_endp_get_local_id(msk, &entry->addr), + unavail_id)) + continue; + + local = &locals[0]; + local->addr = entry->addr; + local->flags = entry->flags; + local->ifindex = entry->ifindex; + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + + if (local->addr.id != msk->mpc_endpoint_id) + msk->pm.local_addr_used++; + } + + msk->pm.extra_subflows++; + found = 1; + break; + } + rcu_read_unlock(); + + return found; +} - if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote)) - return 0; +static unsigned int +fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, + struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals) +{ + u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + u8 endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_local *local; + int i = 0; + + while (msk->pm.local_addr_used < endp_subflow_max) { + local = &locals[i]; - msk->pm.subflows++; + if (!select_local_address(pernet, msk, local)) + break; + + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + + if (!mptcp_pm_addr_families_match(sk, &local->addr, remote)) + continue; + + if (local->addr.id == msk->mpc_endpoint_id) + continue; + + msk->pm.local_addr_used++; + msk->pm.extra_subflows++; i++; + + if (msk->pm.extra_subflows >= limit_extra_subflows) + break; } return i; } +static unsigned int +fill_local_address_any(struct mptcp_sock *msk, struct mptcp_addr_info *remote, + struct mptcp_pm_local *local) +{ + struct sock *sk = (struct sock *)msk; + + memset(local, 0, sizeof(*local)); + local->addr.family = +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + remote->family == AF_INET6 && + ipv6_addr_v4mapped(&remote->addr6) ? AF_INET : +#endif + remote->family; + + if (!mptcp_pm_addr_families_match(sk, &local->addr, remote)) + return 0; + + msk->pm.extra_subflows++; + + return 1; +} + +/* Fill all the local addresses into the array addrs[], + * and return the array size. 
+ */ +static unsigned int +fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals) +{ + bool c_flag_case = remote->id && mptcp_pm_add_addr_c_flag_case(msk); + + /* If there is at least one MPTCP endpoint with a fullmesh flag */ + if (mptcp_pm_get_endp_fullmesh_max(msk)) + return fill_local_addresses_vec_fullmesh(msk, remote, locals, + c_flag_case); + + /* If there is at least one MPTCP endpoint with a laminar flag */ + if (mptcp_pm_get_endp_laminar_max(msk)) + return fill_local_laminar_endp(msk, remote, locals); + + /* Special case: peer sets the C flag, accept one ADD_ADDR if default + * limits are used -- accepting no ADD_ADDR -- and use subflow endpoints + */ + if (c_flag_case) + return fill_local_addresses_vec_c_flag(msk, remote, locals); + + /* No special case: fill in the single 'IPADDRANY' local address */ + return fill_local_address_any(msk, remote, &locals[0]); +} + static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) { + u8 limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); + u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; - unsigned int add_addr_accept_max; struct mptcp_addr_info remote; - unsigned int subflows_max; bool sf_created = false; int i, nr; - add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); - pr_debug("accepted %d:%d remote family %d\n", - msk->pm.add_addr_accepted, add_addr_accept_max, + msk->pm.add_addr_accepted, limit_add_addr_accepted, msk->pm.remote.family); remote = msk->pm.remote; mptcp_pm_announce_addr(msk, &remote, true); mptcp_pm_addr_send_ack(msk); + mptcp_mpc_endpoint_setup(msk); if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) return; @@ -486,19 +674,22 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) /* add_addr_accepted is not decr for ID 0 */ if (remote.id) msk->pm.add_addr_accepted++; - if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.subflows >= subflows_max) + if (msk->pm.add_addr_accepted >= limit_add_addr_accepted || + msk->pm.extra_subflows >= limit_extra_subflows) WRITE_ONCE(msk->pm.accept_addr, false); } } void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) { - if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { + if (rm_id && !WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { + u8 limit_add_addr_accepted = + mptcp_pm_get_limit_add_addr_accepted(msk); + /* Note: if the subflow has been closed before, this * add_addr_accepted counter will not be decremented. 
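To summarise the new fill_local_addresses_vec() dispatch shown above: fullmesh endpoints take precedence over laminar endpoints, which take precedence over the C-flag special case, with the single 'IPADDRANY' local address as the final fallback. A hedged userspace sketch of that ordering; struct pm_state and pick_local_source() are invented names for illustration only, not kernel APIs.

#include <stdio.h>
#include <stdbool.h>

struct pm_state {			/* assumed per-connection view for the demo */
	unsigned int endp_fullmesh_max;
	unsigned int endp_laminar_max;
	bool add_addr_c_flag_case;
};

static const char *pick_local_source(const struct pm_state *pm, bool remote_id_nonzero)
{
	bool c_flag_case = remote_id_nonzero && pm->add_addr_c_flag_case;

	if (pm->endp_fullmesh_max)
		return "fullmesh endpoints";
	if (pm->endp_laminar_max)
		return "laminar endpoint";
	if (c_flag_case)
		return "subflow endpoints (C-flag special case)";
	return "single IPADDRANY local address";
}

int main(void)
{
	struct pm_state pm = { .endp_fullmesh_max = 0, .endp_laminar_max = 1 };

	printf("local addresses come from: %s\n", pick_local_source(&pm, true));
	return 0;
}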
*/ - if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) + if (--msk->pm.add_addr_accepted < limit_add_addr_accepted) WRITE_ONCE(msk->pm.accept_addr, true); } } @@ -523,8 +714,8 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, bool needs_id, bool replace) { struct mptcp_pm_addr_entry *cur, *del_entry = NULL; - unsigned int addr_max; int ret = -EINVAL; + u8 addr_max; spin_lock_bh(&pernet->lock); /* to keep the code simple, don't do IDR-like allocation for address ID, @@ -532,7 +723,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, */ if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) pernet->next_id = 1; - if (pernet->addrs >= MPTCP_PM_ADDR_MAX) { + if (pernet->endpoints >= MPTCP_PM_ADDR_MAX) { ret = -ERANGE; goto out; } @@ -546,7 +737,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, */ if (!address_use_port(entry)) entry->addr.port = 0; - list_for_each_entry(cur, &pernet->local_addr_list, list) { + list_for_each_entry(cur, &pernet->endp_list, list) { if (mptcp_addresses_equal(&cur->addr, &entry->addr, cur->addr.port || entry->addr.port)) { /* allow replacing the exiting endpoint only if such @@ -571,7 +762,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, goto out; } - pernet->addrs--; + pernet->endpoints--; entry->addr.id = cur->addr.id; list_del_rcu(&cur->list); del_entry = cur; @@ -598,19 +789,27 @@ find_next: pernet->next_id = entry->addr.id; if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); + addr_max = pernet->endp_signal_max; + WRITE_ONCE(pernet->endp_signal_max, addr_max + 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max + 1); + addr_max = pernet->endp_subflow_max; + WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1); + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) { + addr_max = pernet->endp_laminar_max; + WRITE_ONCE(pernet->endp_laminar_max, addr_max + 1); + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { + addr_max = pernet->endp_fullmesh_max; + WRITE_ONCE(pernet->endp_fullmesh_max, addr_max + 1); } - pernet->addrs++; + pernet->endpoints++; if (!entry->addr.port) - list_add_tail_rcu(&entry->list, &pernet->local_addr_list); + list_add_tail_rcu(&entry->list, &pernet->endp_list); else - list_add_rcu(&entry->list, &pernet->local_addr_list); + list_add_rcu(&entry->list, &pernet->endp_list); ret = entry->addr.id; out: @@ -670,10 +869,10 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, addrlen = sizeof(struct sockaddr_in6); #endif if (ssk->sk_family == AF_INET) - err = inet_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); + err = inet_bind_sk(ssk, (struct sockaddr_unsized *)&addr, addrlen); #if IS_ENABLED(CONFIG_MPTCP_IPV6) else if (ssk->sk_family == AF_INET6) - err = inet6_bind_sk(ssk, (struct sockaddr *)&addr, addrlen); + err = inet6_bind_sk(ssk, (struct sockaddr_unsized *)&addr, addrlen); #endif if (err) return err; @@ -845,12 +1044,6 @@ out_free: return ret; } -static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) -{ - return msk->mpc_endpoint_id == addr->id ? 
0 : addr->id; -} - static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool force) @@ -969,8 +1162,8 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; - unsigned int addr_max; struct nlattr *attr; + u8 addr_max; int ret; if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) @@ -997,15 +1190,23 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); + addr_max = pernet->endp_signal_max; + WRITE_ONCE(pernet->endp_signal_max, addr_max - 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max - 1); + addr_max = pernet->endp_subflow_max; + WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1); + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) { + addr_max = pernet->endp_laminar_max; + WRITE_ONCE(pernet->endp_laminar_max, addr_max - 1); + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) { + addr_max = pernet->endp_fullmesh_max; + WRITE_ONCE(pernet->endp_fullmesh_max, addr_max - 1); } - pernet->addrs--; + pernet->endpoints--; list_del_rcu(&entry->list); __clear_bit(entry->addr.id, pernet->id_bitmap); spin_unlock_bh(&pernet->lock); @@ -1084,10 +1285,10 @@ static void __flush_addrs(struct list_head *list) static void __reset_counters(struct pm_nl_pernet *pernet) { - WRITE_ONCE(pernet->add_addr_signal_max, 0); - WRITE_ONCE(pernet->add_addr_accept_max, 0); - WRITE_ONCE(pernet->local_addr_max, 0); - pernet->addrs = 0; + WRITE_ONCE(pernet->endp_signal_max, 0); + WRITE_ONCE(pernet->endp_subflow_max, 0); + WRITE_ONCE(pernet->endp_laminar_max, 0); + pernet->endpoints = 0; } int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) @@ -1096,7 +1297,7 @@ int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) LIST_HEAD(free_list); spin_lock_bh(&pernet->lock); - list_splice_init(&pernet->local_addr_list, &free_list); + list_splice_init(&pernet->endp_list, &free_list); __reset_counters(pernet); pernet->next_id = 1; bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); @@ -1182,18 +1383,18 @@ int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) int ret; spin_lock_bh(&pernet->lock); - rcv_addrs = pernet->add_addr_accept_max; + rcv_addrs = pernet->limit_add_addr_accepted; ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); if (ret) goto unlock; - subflows = pernet->subflows_max; + subflows = pernet->limit_extra_subflows; ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); if (ret) goto unlock; - WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); - WRITE_ONCE(pernet->subflows_max, subflows); + WRITE_ONCE(pernet->limit_add_addr_accepted, rcv_addrs); + WRITE_ONCE(pernet->limit_extra_subflows, subflows); unlock: spin_unlock_bh(&pernet->lock); @@ -1216,11 +1417,11 @@ int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, - READ_ONCE(pernet->add_addr_accept_max))) + READ_ONCE(pernet->limit_add_addr_accepted))) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, - READ_ONCE(pernet->subflows_max))) + READ_ONCE(pernet->limit_extra_subflows))) goto fail; genlmsg_end(msg, reply); @@ -1319,6 +1520,18 @@ int 
mptcp_pm_nl_set_flags(struct mptcp_pm_addr_entry *local, changed = (local->flags ^ entry->flags) & mask; entry->flags = (entry->flags & ~mask) | (local->flags & mask); *local = *entry; + + if (changed & MPTCP_PM_ADDR_FLAG_FULLMESH) { + u8 addr_max = pernet->endp_fullmesh_max; + + if (entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH) + addr_max++; + else + addr_max--; + + WRITE_ONCE(pernet->endp_fullmesh_max, addr_max); + } + spin_unlock_bh(&pernet->lock); mptcp_pm_nl_set_flags_all(net, local, changed); @@ -1329,7 +1542,7 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || + if (msk->pm.extra_subflows == mptcp_pm_get_limit_extra_subflows(msk) || (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { WRITE_ONCE(msk->pm.work_pending, false); @@ -1361,12 +1574,11 @@ static int __net_init pm_nl_init_net(struct net *net) { struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); - INIT_LIST_HEAD_RCU(&pernet->local_addr_list); + INIT_LIST_HEAD_RCU(&pernet->endp_list); /* Cit. 2 subflows ought to be enough for anybody. */ - pernet->subflows_max = 2; + pernet->limit_extra_subflows = 2; pernet->next_id = 1; - pernet->stale_loss_cnt = 4; spin_lock_init(&pernet->lock); /* No need to initialize other pernet fields, the struct is zeroed at @@ -1387,7 +1599,7 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list) * other modifiers, also netns core already waited for a * RCU grace period. */ - __flush_addrs(&pernet->local_addr_list); + __flush_addrs(&pernet->endp_list); } } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 50aaf259959a..d5b383870f79 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -113,7 +113,7 @@ int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, return err; if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) { - u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); + s32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); entry->ifindex = val; } @@ -408,11 +408,23 @@ static int mptcp_event_created(struct sk_buff *skb, const struct sock *ssk) { int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)); + u16 flags = 0; if (err) return err; - if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) + if (READ_ONCE(msk->pm.server_side)) { + flags |= MPTCP_PM_EV_FLAG_SERVER_SIDE; + + /* Deprecated, and only set when it is the server side */ + if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) + return -EMSGSIZE; + } + + if (READ_ONCE(msk->pm.remote_deny_join_id0)) + flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; + + if (flags && nla_put_u16(skb, MPTCP_ATTR_FLAGS, flags)) return -EMSGSIZE; return mptcp_event_add_subflow(skb, ssk); diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index a715dcbe0146..8cbc1920afb4 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -419,7 +419,7 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) if (err) mptcp_userspace_pm_delete_local_addr(msk, &entry); else - msk->pm.subflows++; + msk->pm.extra_subflows++; spin_unlock_bh(&msk->pm.lock); create_err: diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 9a287b75c1b3..e212c1374bd0 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -12,6 +12,7 @@ #include <linux/sched/signal.h> #include <linux/atomic.h> #include <net/aligned_data.h> +#include <net/rps.h> #include 
<net/sock.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> @@ -60,11 +61,13 @@ static u64 mptcp_wnd_end(const struct mptcp_sock *msk) static const struct proto_ops *mptcp_fallback_tcp_ops(const struct sock *sk) { + unsigned short family = READ_ONCE(sk->sk_family); + #if IS_ENABLED(CONFIG_MPTCP_IPV6) - if (sk->sk_prot == &tcpv6_prot) + if (family == AF_INET6) return &inet6_stream_ops; #endif - WARN_ON_ONCE(sk->sk_prot != &tcp_prot); + WARN_ON_ONCE(family != AF_INET); return &inet_stream_ops; } @@ -75,6 +78,13 @@ bool __mptcp_try_fallback(struct mptcp_sock *msk, int fb_mib) if (__mptcp_check_fallback(msk)) return true; + /* The caller possibly is not holding the msk socket lock, but + * in the fallback case only the current subflow is touching + * the OoO queue. + */ + if (!RB_EMPTY_ROOT(&msk->out_of_order_queue)) + return false; + spin_lock_bh(&msk->fallback_lock); if (!msk->allow_infinite_fallback) { spin_unlock_bh(&msk->fallback_lock); @@ -137,26 +147,37 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk) static void mptcp_drop(struct sock *sk, struct sk_buff *skb) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); __kfree_skb(skb); } -static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, - struct sk_buff *from) +static bool __mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, + struct sk_buff *from, bool *fragstolen, + int *delta) { - bool fragstolen; - int delta; + int limit = READ_ONCE(sk->sk_rcvbuf); if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) || MPTCP_SKB_CB(from)->offset || - ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || - !skb_try_coalesce(to, from, &fragstolen, &delta)) + ((to->len + from->len) > (limit >> 3)) || + !skb_try_coalesce(to, from, fragstolen, delta)) return false; pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n", MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, to->len, MPTCP_SKB_CB(from)->end_seq); MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; + return true; +} + +static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, + struct sk_buff *from) +{ + bool fragstolen; + int delta; + + if (!__mptcp_try_coalesce(sk, to, from, &fragstolen, &delta)) + return false; /* note the fwd memory can reach a negative value after accounting * for the delta, but the later skb free will restore a non @@ -178,6 +199,44 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, return mptcp_try_coalesce((struct sock *)msk, to, from); } +/* "inspired" by tcp_rcvbuf_grow(), main difference: + * - mptcp does not maintain a msk-level window clamp + * - returns true when the receive buffer is actually updated + */ +static bool mptcp_rcvbuf_grow(struct sock *sk, u32 newval) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + const struct net *net = sock_net(sk); + u32 rcvwin, rcvbuf, cap, oldval; + u64 grow; + + oldval = msk->rcvq_space.space; + msk->rcvq_space.space = newval; + if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || + (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) + return false; + + /* DRS is always one RTT late. */ + rcvwin = newval << 1; + + /* slow start: allow the sender to double its rate. 
*/ + grow = (u64)rcvwin * (newval - oldval); + do_div(grow, oldval); + rcvwin += grow << 1; + + if (!RB_EMPTY_ROOT(&msk->out_of_order_queue)) + rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq; + + cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); + + rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap); + if (rcvbuf > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + return true; + } + return false; +} + /* "inspired" by tcp_data_queue_ofo(), main differences: * - use mptcp seqs * - don't cope with sacks @@ -291,29 +350,16 @@ merge_right: end: skb_condense(skb); skb_set_owner_r(skb, sk); + /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */ + if (sk->sk_socket) + mptcp_rcvbuf_grow(sk, msk->rcvq_space.space); } -static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, - struct sk_buff *skb, unsigned int offset, - size_t copy_len) +static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset, + int copy_len) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); - struct sock *sk = (struct sock *)msk; - struct sk_buff *tail; - bool has_rxtstamp; - - __skb_unlink(skb, &ssk->sk_receive_queue); - - skb_ext_reset(skb); - skb_orphan(skb); - - /* try to fetch required memory from subflow */ - if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); - goto drop; - } - - has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; + bool has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; /* the skb map_seq accounts for the skb offset: * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq @@ -325,6 +371,20 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; MPTCP_SKB_CB(skb)->cant_coalesce = 0; + __skb_unlink(skb, &ssk->sk_receive_queue); + + skb_ext_reset(skb); + skb_dst_drop(skb); +} + +static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb) +{ + u64 copy_len = MPTCP_SKB_CB(skb)->end_seq - MPTCP_SKB_CB(skb)->map_seq; + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *tail; + + mptcp_borrow_fwdmem(sk, skb); + if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ msk->bytes_received += copy_len; @@ -345,16 +405,13 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, * will retransmit as needed, if needed. 
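The arithmetic in mptcp_rcvbuf_grow() above mirrors tcp_rcvbuf_grow(): target twice the newly measured value, then add twice the relative growth since the previous measurement. A small sketch of the computation, assuming newval >= oldval > 0; the sample values are arbitrary.

#include <stdio.h>
#include <stdint.h>

static uint64_t rcvwin_estimate(uint32_t oldval, uint32_t newval)
{
	uint64_t rcvwin = (uint64_t)newval << 1;		/* DRS is one RTT late */
	uint64_t grow = rcvwin * (newval - oldval) / oldval;	/* relative growth */

	return rcvwin + (grow << 1);	/* slow start: let the sender double its rate */
}

int main(void)
{
	printf("old=%u new=%u -> advertised window target=%llu bytes\n",
	       64 * 1024, 96 * 1024,
	       (unsigned long long)rcvwin_estimate(64 * 1024, 96 * 1024));
	return 0;
}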
*/ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_DUPDATA); -drop: mptcp_drop(sk, skb); return false; } static void mptcp_stop_rtx_timer(struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); - - sk_stop_timer(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer(sk, &sk->mptcp_retransmit_timer); mptcp_sk(sk)->timer_ival = 0; } @@ -371,6 +428,20 @@ static void mptcp_close_wake_up(struct sock *sk) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } +static void mptcp_shutdown_subflows(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + bool slow; + + slow = lock_sock_fast(ssk); + tcp_shutdown(ssk, SEND_SHUTDOWN); + unlock_sock_fast(ssk, slow); + } +} + /* called under the msk socket lock */ static bool mptcp_pending_data_fin_ack(struct sock *sk) { @@ -395,6 +466,7 @@ static void mptcp_check_data_fin_ack(struct sock *sk) break; case TCP_CLOSING: case TCP_LAST_ACK: + mptcp_shutdown_subflows(msk); mptcp_set_state(sk, TCP_CLOSE); break; } @@ -445,7 +517,7 @@ static long mptcp_timeout_from_subflow(const struct mptcp_subflow_context *subfl const struct sock *ssk = mptcp_subflow_tcp_sock(subflow); return inet_csk(ssk)->icsk_pending && !subflow->stale_count ? - icsk_timeout(inet_csk(ssk)) - jiffies : 0; + tcp_timeout_expires(ssk) - jiffies : 0; } static void mptcp_set_timeout(struct sock *sk) @@ -529,11 +601,10 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied) } } -static bool mptcp_check_data_fin(struct sock *sk) +static void mptcp_check_data_fin(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); u64 rcv_data_fin_seq; - bool ret = false; /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. @@ -563,6 +634,7 @@ static bool mptcp_check_data_fin(struct sock *sk) mptcp_set_state(sk, TCP_CLOSING); break; case TCP_FIN_WAIT2: + mptcp_shutdown_subflows(msk); mptcp_set_state(sk, TCP_CLOSE); break; default: @@ -571,12 +643,10 @@ static bool mptcp_check_data_fin(struct sock *sk) break; } - ret = true; if (!__mptcp_check_fallback(msk)) mptcp_send_ack(msk); mptcp_close_wake_up(sk); } - return ret; } static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) @@ -587,8 +657,50 @@ static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) } } +static void __mptcp_add_backlog(struct sock *sk, + struct mptcp_subflow_context *subflow, + struct sk_buff *skb) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *tail = NULL; + struct sock *ssk = skb->sk; + bool fragstolen; + int delta; + + if (unlikely(sk->sk_state == TCP_CLOSE)) { + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); + return; + } + + /* Try to coalesce with the last skb in our backlog */ + if (!list_empty(&msk->backlog_list)) + tail = list_last_entry(&msk->backlog_list, struct sk_buff, list); + + if (tail && MPTCP_SKB_CB(skb)->map_seq == MPTCP_SKB_CB(tail)->end_seq && + ssk == tail->sk && + __mptcp_try_coalesce(sk, tail, skb, &fragstolen, &delta)) { + skb->truesize -= delta; + kfree_skb_partial(skb, fragstolen); + __mptcp_subflow_lend_fwdmem(subflow, delta); + goto account; + } + + list_add_tail(&skb->list, &msk->backlog_list); + mptcp_subflow_lend_fwdmem(subflow, skb); + delta = skb->truesize; + +account: + WRITE_ONCE(msk->backlog_len, msk->backlog_len + delta); + + /* Possibly not accept()ed yet, keep track of memory not CG + * accounted, mptcp_graft_subflows() will handle it. 
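__mptcp_add_backlog() above only appends a new skb to the msk backlog when it cannot be merged into the backlog tail, i.e. when the data is not contiguous or comes from a different subflow. A toy model of that coalescing test; struct chunk and try_coalesce() are stand-ins for the demo, not kernel structures.

#include <stdio.h>
#include <stdbool.h>

struct chunk {				/* toy stand-in for a backlogged skb */
	unsigned long long map_seq;
	unsigned long long end_seq;
	int ssk_id;			/* which subflow the data came from */
};

/* merge 'from' into 'tail' when contiguous and from the same subflow */
static bool try_coalesce(struct chunk *tail, const struct chunk *from)
{
	if (!tail || from->map_seq != tail->end_seq || from->ssk_id != tail->ssk_id)
		return false;
	tail->end_seq = from->end_seq;
	return true;
}

int main(void)
{
	struct chunk tail = { .map_seq = 0, .end_seq = 1000, .ssk_id = 1 };
	struct chunk next = { .map_seq = 1000, .end_seq = 1500, .ssk_id = 1 };

	if (try_coalesce(&tail, &next))
		printf("coalesced, tail now covers [%llu, %llu)\n",
		       tail.map_seq, tail.end_seq);
	else
		printf("appended as a new backlog entry\n");
	return 0;
}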
+ */ + if (!mem_cgroup_from_sk(ssk)) + msk->backlog_unaccounted += delta; +} + static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, - struct sock *ssk) + struct sock *ssk, bool own_msk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; @@ -604,9 +716,6 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, struct sk_buff *skb; bool fin; - if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) - break; - /* try to move as much data as available */ map_remaining = subflow->map_data_len - mptcp_subflow_get_map_offset(subflow); @@ -632,7 +741,14 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, if (offset < skb->len) { size_t len = skb->len - offset; - ret = __mptcp_move_skb(msk, ssk, skb, offset, len) || ret; + mptcp_init_skb(ssk, skb, offset, len); + + if (own_msk && sk_rmem_alloc_get(sk) < sk->sk_rcvbuf) { + mptcp_subflow_lend_fwdmem(subflow, skb); + ret |= __mptcp_move_skb(sk, skb); + } else { + __mptcp_add_backlog(sk, subflow, skb); + } seq += len; if (unlikely(map_remaining < len)) { @@ -751,14 +867,10 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) struct sock *sk = (struct sock *)msk; bool moved; - moved = __mptcp_move_skbs_from_subflow(msk, ssk); + moved = __mptcp_move_skbs_from_subflow(msk, ssk, true); __mptcp_ofo_queue(msk); - if (unlikely(ssk->sk_err)) { - if (!sock_owned_by_user(sk)) - __mptcp_error_report(sk); - else - __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags); - } + if (unlikely(ssk->sk_err)) + __mptcp_subflow_error_report(sk, ssk); /* If the moves have caught up with the DATA_FIN sequence number * it's time to ack the DATA_FIN and change socket state, but @@ -770,39 +882,26 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) return moved; } -static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk) -{ - if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf)) - WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf); -} - -static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) -{ - struct mptcp_sock *msk = mptcp_sk(sk); - - __mptcp_rcvbuf_update(sk, ssk); - - /* Wake-up the reader only for in-sequence data */ - if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) - sk->sk_data_ready(sk); -} - void mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct mptcp_sock *msk = mptcp_sk(sk); /* The peer can send data while we are shutting down this - * subflow at msk destruction time, but we must avoid enqueuing + * subflow at subflow destruction time, but we must avoid enqueuing * more data to the msk receive queue */ - if (unlikely(subflow->disposable)) + if (unlikely(subflow->closing)) return; mptcp_data_lock(sk); - if (!sock_owned_by_user(sk)) - __mptcp_data_ready(sk, ssk); - else - __set_bit(MPTCP_DEQUEUE, &mptcp_sk(sk)->cb_flags); + if (!sock_owned_by_user(sk)) { + /* Wake-up the reader only for in-sequence data */ + if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) + sk->sk_data_ready(sk); + } else { + __mptcp_move_skbs_from_subflow(msk, ssk, false); + } mptcp_data_unlock(sk); } @@ -828,12 +927,6 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk) mptcp_subflow_joined(msk, ssk); spin_unlock_bh(&msk->fallback_lock); - /* attach to msk socket only after we are sure we will deal with it - * at close time - */ - if (sk->sk_socket && !ssk->sk_socket) - mptcp_sock_graft(ssk, sk->sk_socket); - mptcp_subflow_ctx(ssk)->subflow_id = 
msk->subflow_id++; mptcp_sockopt_sync_locked(msk, ssk); mptcp_stop_tout_timer(sk); @@ -859,12 +952,11 @@ static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list static bool mptcp_rtx_timer_pending(struct sock *sk) { - return timer_pending(&inet_csk(sk)->icsk_retransmit_timer); + return timer_pending(&sk->mptcp_retransmit_timer); } static void mptcp_reset_rtx_timer(struct sock *sk) { - struct inet_connection_sock *icsk = inet_csk(sk); unsigned long tout; /* prevent rescheduling on close */ @@ -872,19 +964,24 @@ static void mptcp_reset_rtx_timer(struct sock *sk) return; tout = mptcp_sk(sk)->timer_ival; - sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + tout); + sk_reset_timer(sk, &sk->mptcp_retransmit_timer, jiffies + tout); } bool mptcp_schedule_work(struct sock *sk) { - if (inet_sk_state_load(sk) != TCP_CLOSE && - schedule_work(&mptcp_sk(sk)->work)) { - /* each subflow already holds a reference to the sk, and the - * workqueue is invoked by a subflow, so sk can't go away here. - */ - sock_hold(sk); + if (inet_sk_state_load(sk) == TCP_CLOSE) + return false; + + /* Get a reference on this socket, mptcp_worker() will release it. + * As mptcp_worker() might complete before us, we can not avoid + * a sock_hold()/sock_put() if schedule_work() returns false. + */ + sock_hold(sk); + + if (schedule_work(&mptcp_sk(sk)->work)) return true; - } + + sock_put(sk); return false; } @@ -949,7 +1046,7 @@ static void __mptcp_clean_una(struct sock *sk) if (WARN_ON_ONCE(!msk->recovery)) break; - WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + msk->first_pending = mptcp_send_next(sk); } dfrag_clear(sk, dfrag); @@ -1016,11 +1113,12 @@ static void mptcp_enter_memory_pressure(struct sock *sk) mptcp_for_each_subflow(msk, subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - if (first) + if (first && !ssk->sk_bypass_prot_mem) { tcp_enter_memory_pressure(ssk); - sk_stream_moderate_sndbuf(ssk); + first = false; + } - first = false; + sk_stream_moderate_sndbuf(ssk); } __mptcp_sync_sndbuf(sk); } @@ -1241,7 +1339,12 @@ alloc_skb: if (copy == 0) { u64 snd_una = READ_ONCE(msk->snd_una); - if (snd_una != msk->snd_nxt || tcp_write_queue_tail(ssk)) { + /* No need for zero probe if there are any data pending + * either at the msk or ssk level; skb is the current write + * queue tail and can be empty at this point. 
+ */ + if (snd_una != msk->snd_nxt || skb->len || + skb != tcp_send_head(ssk)) { tcp_remove_empty_skb(ssk); return 0; } @@ -1292,6 +1395,7 @@ alloc_skb: mpext->dsn64); if (zero_window_probe) { + MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_WINPROBE); mptcp_subflow_ctx(ssk)->rel_write_seq += copy; mpext->frozen = 1; if (READ_ONCE(msk->csum_enabled)) @@ -1494,7 +1598,7 @@ static int __subflow_push_pending(struct sock *sk, struct sock *ssk, mptcp_update_post_push(msk, dfrag, ret); } - WRITE_ONCE(msk->first_pending, mptcp_send_next(sk)); + msk->first_pending = mptcp_send_next(sk); if (msk->snd_burst <= 0 || !sk_stream_memory_free(ssk) || @@ -1740,6 +1844,20 @@ static u32 mptcp_send_limit(const struct sock *sk) return limit - not_sent; } +static void mptcp_rps_record_subflows(const struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + if (!rfs_is_needed()) + return; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + sock_rps_record_flow(ssk); + } +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1753,6 +1871,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); + mptcp_rps_record_subflows(msk); + if (unlikely(inet_test_bit(DEFER_CONNECT, sk) || msg->msg_flags & MSG_FASTOPEN)) { int copied_syn = 0; @@ -1838,7 +1958,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) get_page(dfrag->page); list_add_tail(&dfrag->list, &msk->rtx_queue); if (!msk->first_pending) - WRITE_ONCE(msk->first_pending, dfrag); + msk->first_pending = dfrag; } pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk, dfrag->data_seq, dfrag->data_len, dfrag->already_sent, @@ -1871,22 +1991,36 @@ do_error: static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); -static int __mptcp_recvmsg_mskq(struct sock *sk, - struct msghdr *msg, - size_t len, int flags, +static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, + size_t len, int flags, int copied_total, struct scm_timestamping_internal *tss, int *cmsg_flags) { struct mptcp_sock *msk = mptcp_sk(sk); struct sk_buff *skb, *tmp; + int total_data_len = 0; int copied = 0; skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { - u32 offset = MPTCP_SKB_CB(skb)->offset; + u32 delta, offset = MPTCP_SKB_CB(skb)->offset; u32 data_len = skb->len - offset; - u32 count = min_t(size_t, len - copied, data_len); + u32 count; int err; + if (flags & MSG_PEEK) { + /* skip already peeked skbs */ + if (total_data_len + data_len <= copied_total) { + total_data_len += data_len; + continue; + } + + /* skip the already peeked data in the current skb */ + delta = copied_total - total_data_len; + offset += delta; + data_len -= delta; + } + + count = min_t(size_t, len - copied, data_len); if (!(flags & MSG_TRUNC)) { err = skb_copy_datagram_msg(skb, offset, msg, count); if (unlikely(err < 0)) { @@ -1903,23 +2037,21 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, copied += count; - if (count < data_len) { - if (!(flags & MSG_PEEK)) { + if (!(flags & MSG_PEEK)) { + msk->bytes_consumed += count; + if (count < data_len) { MPTCP_SKB_CB(skb)->offset += count; MPTCP_SKB_CB(skb)->map_seq += count; - msk->bytes_consumed += count; + break; } - break; - } - if (!(flags & MSG_PEEK)) { - /* avoid the indirect call, we know the destructor is sock_wfree */ + /* avoid the indirect call, we know the destructor is sock_rfree */ skb->destructor = NULL; + skb->sk = NULL; 
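The MSG_PEEK handling added to __mptcp_recvmsg_mskq() above skips skbs that a previous peek already covered and resumes at the correct offset inside the first partially-peeked one. A userspace sketch of that bookkeeping; the queue contents and the already-peeked byte count are assumptions for the demo, and it only locates the resume point rather than copying data.

#include <stdio.h>

int main(void)
{
	unsigned int skb_len[] = { 300, 500, 400 };	/* assumed receive queue */
	unsigned int copied_total = 600;		/* bytes already peeked (assumed) */
	unsigned int total_data_len = 0;
	size_t i;

	for (i = 0; i < sizeof(skb_len) / sizeof(skb_len[0]); i++) {
		unsigned int data_len = skb_len[i];
		unsigned int offset;

		if (total_data_len + data_len <= copied_total) {
			total_data_len += data_len;
			continue;	/* this skb was peeked in full already */
		}
		offset = copied_total - total_data_len;
		printf("skb %zu: resume peeking at offset %u (%u bytes left)\n",
		       i, offset, data_len - offset);
		break;
	}
	return 0;
}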
atomic_sub(skb->truesize, &sk->sk_rmem_alloc); sk_mem_uncharge(sk, skb->truesize); __skb_unlink(skb, &sk->sk_receive_queue); - __kfree_skb(skb); - msk->bytes_consumed += count; + skb_attempt_defer_free(skb); } if (copied >= len) @@ -1983,112 +2115,104 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - u64 rcvwin, grow; - int rcvbuf; - - rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; - - grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); - - do_div(grow, msk->rcvq_space.space); - rcvwin += (grow << 1); - - rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin), - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); - - if (rcvbuf > sk->sk_rcvbuf) { - u32 window_clamp; - - window_clamp = mptcp_win_from_space(sk, rcvbuf); - WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); - - /* Make subflows follow along. If we do not do this, we - * get drops at subflow level if skbs can't be moved to - * the mptcp rx queue fast enough (announced rcv_win can - * exceed ssk->sk_rcvbuf). - */ - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk; - bool slow; + if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) { + /* Make subflows follow along. If we do not do this, we + * get drops at subflow level if skbs can't be moved to + * the mptcp rx queue fast enough (announced rcv_win can + * exceed ssk->sk_rcvbuf). + */ + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk; + bool slow; - ssk = mptcp_subflow_tcp_sock(subflow); - slow = lock_sock_fast(ssk); - WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); - WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp); - if (tcp_can_send_ack(ssk)) - tcp_cleanup_rbuf(ssk, 1); - unlock_sock_fast(ssk, slow); - } + ssk = mptcp_subflow_tcp_sock(subflow); + slow = lock_sock_fast(ssk); + /* subflows can be added before tcp_init_transfer() */ + if (tcp_sk(ssk)->rcvq_space.space) + tcp_rcvbuf_grow(ssk, msk->rcvq_space.copied); + unlock_sock_fast(ssk, slow); } } - msk->rcvq_space.space = msk->rcvq_space.copied; new_measure: msk->rcvq_space.copied = 0; msk->rcvq_space.time = mstamp; } -static struct mptcp_subflow_context * -__mptcp_first_ready_from(struct mptcp_sock *msk, - struct mptcp_subflow_context *subflow) +static bool __mptcp_move_skbs(struct sock *sk, struct list_head *skbs, u32 *delta) { - struct mptcp_subflow_context *start_subflow = subflow; + struct sk_buff *skb = list_first_entry(skbs, struct sk_buff, list); + struct mptcp_sock *msk = mptcp_sk(sk); + bool moved = false; + + *delta = 0; + while (1) { + /* If the msk recvbuf is full stop, don't drop */ + if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) + break; + + prefetch(skb->next); + list_del(&skb->list); + *delta += skb->truesize; - while (!READ_ONCE(subflow->data_avail)) { - subflow = mptcp_next_subflow(msk, subflow); - if (subflow == start_subflow) - return NULL; + moved |= __mptcp_move_skb(sk, skb); + if (list_empty(skbs)) + break; + + skb = list_first_entry(skbs, struct sk_buff, list); } - return subflow; + + __mptcp_ofo_queue(msk); + if (moved) + mptcp_check_data_fin((struct sock *)msk); + return moved; } -static bool __mptcp_move_skbs(struct sock *sk) +static bool mptcp_can_spool_backlog(struct sock *sk, struct list_head *skbs) { - struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); - bool ret = false; - if (list_empty(&msk->conn_list)) + /* After CG initialization, subflows should 
never add skb before + * gaining the CG themself. + */ + DEBUG_NET_WARN_ON_ONCE(msk->backlog_unaccounted && sk->sk_socket && + mem_cgroup_from_sk(sk)); + + /* Don't spool the backlog if the rcvbuf is full. */ + if (list_empty(&msk->backlog_list) || + sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) return false; - /* verify we can move any data from the subflow, eventually updating */ - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) - mptcp_for_each_subflow(msk, subflow) - __mptcp_rcvbuf_update(sk, subflow->tcp_sock); + INIT_LIST_HEAD(skbs); + list_splice_init(&msk->backlog_list, skbs); + return true; +} - subflow = list_first_entry(&msk->conn_list, - struct mptcp_subflow_context, node); - for (;;) { - struct sock *ssk; - bool slowpath; +static void mptcp_backlog_spooled(struct sock *sk, u32 moved, + struct list_head *skbs) +{ + struct mptcp_sock *msk = mptcp_sk(sk); - /* - * As an optimization avoid traversing the subflows list - * and ev. acquiring the subflow socket lock before baling out - */ - if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) - break; + WRITE_ONCE(msk->backlog_len, msk->backlog_len - moved); + list_splice(skbs, &msk->backlog_list); +} - subflow = __mptcp_first_ready_from(msk, subflow); - if (!subflow) - break; +static bool mptcp_move_skbs(struct sock *sk) +{ + struct list_head skbs; + bool enqueued = false; + u32 moved; - ssk = mptcp_subflow_tcp_sock(subflow); - slowpath = lock_sock_fast(ssk); - ret = __mptcp_move_skbs_from_subflow(msk, ssk) || ret; - if (unlikely(ssk->sk_err)) - __mptcp_error_report(sk); - unlock_sock_fast(ssk, slowpath); + mptcp_data_lock(sk); + while (mptcp_can_spool_backlog(sk, &skbs)) { + mptcp_data_unlock(sk); + enqueued |= __mptcp_move_skbs(sk, &skbs, &moved); - subflow = mptcp_next_subflow(msk, subflow); + mptcp_data_lock(sk); + mptcp_backlog_spooled(sk, moved, &skbs); } - - __mptcp_ofo_queue(msk); - if (ret) - mptcp_check_data_fin((struct sock *)msk); - return ret; + mptcp_data_unlock(sk); + return enqueued; } static unsigned int mptcp_inq_hint(const struct sock *sk) @@ -2131,6 +2255,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out_err; } + mptcp_rps_record_subflows(msk); + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); len = min_t(size_t, len, INT_MAX); @@ -2142,7 +2268,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, while (copied < len) { int err, bytes_read; - bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, &tss, &cmsg_flags); + bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, + copied, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { if (!copied) copied = bytes_read; @@ -2151,7 +2278,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += bytes_read; - if (skb_queue_empty(&sk->sk_receive_queue) && __mptcp_move_skbs(sk)) + if (!list_empty(&msk->backlog_list) && mptcp_move_skbs(sk)) continue; /* only the MPTCP socket status is relevant here. 
The exit @@ -2173,14 +2300,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, break; } - if (sk->sk_shutdown & RCV_SHUTDOWN) { - /* race breaker: the shutdown could be after the - * previous receive queue check - */ - if (__mptcp_move_skbs(sk)) - continue; + if (sk->sk_shutdown & RCV_SHUTDOWN) break; - } if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; @@ -2230,9 +2351,7 @@ out_err: static void mptcp_retransmit_timer(struct timer_list *t) { - struct inet_connection_sock *icsk = timer_container_of(icsk, t, - icsk_retransmit_timer); - struct sock *sk = &icsk->icsk_inet.sk; + struct sock *sk = timer_container_of(sk, t, mptcp_retransmit_timer); struct mptcp_sock *msk = mptcp_sk(sk); bh_lock_sock(sk); @@ -2250,7 +2369,9 @@ static void mptcp_retransmit_timer(struct timer_list *t) static void mptcp_tout_timer(struct timer_list *t) { - struct sock *sk = timer_container_of(sk, t, sk_timer); + struct inet_connection_sock *icsk = + timer_container_of(icsk, t, mptcp_tout_timer); + struct sock *sk = &icsk->icsk_inet.sk; mptcp_schedule_work(sk); sock_put(sk); @@ -2336,7 +2457,6 @@ bool __mptcp_retransmit_pending_data(struct sock *sk) /* flags for __mptcp_close_ssk() */ #define MPTCP_CF_PUSH BIT(1) -#define MPTCP_CF_FASTCLOSE BIT(2) /* be sure to send a reset only if the caller asked for it, also * clean completely the subflow status when the subflow reaches @@ -2347,7 +2467,7 @@ static void __mptcp_subflow_disconnect(struct sock *ssk, unsigned int flags) { if (((1 << ssk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) || - (flags & MPTCP_CF_FASTCLOSE)) { + subflow->send_fastclose) { /* The MPTCP code never wait on the subflow sockets, TCP-level * disconnect should never fail */ @@ -2372,6 +2492,25 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, { struct mptcp_sock *msk = mptcp_sk(sk); bool dispose_it, need_push = false; + int fwd_remaining; + + /* Do not pass RX data to the msk, even if the subflow socket is not + * going to be freed (i.e. even for the first subflow on graceful + * subflow close. + */ + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + subflow->closing = 1; + + /* Borrow the fwd allocated page left-over; fwd memory for the subflow + * could be negative at this point, but will be reach zero soon - when + * the data allocated using such fragment will be freed. + */ + if (subflow->lent_mem_frag) { + fwd_remaining = PAGE_SIZE - subflow->lent_mem_frag; + sk_forward_alloc_add(sk, fwd_remaining); + sk_forward_alloc_add(ssk, -fwd_remaining); + subflow->lent_mem_frag = 0; + } /* If the first subflow moved to a close state before accept, e.g. 
due * to an incoming reset or listener shutdown, the subflow socket is @@ -2383,7 +2522,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, /* ensure later check in mptcp_worker() will dispose the msk */ sock_set_flag(sk, SOCK_DEAD); mptcp_set_close_tout(sk, tcp_jiffies32 - (mptcp_close_timeout(sk) + 1)); - lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); mptcp_subflow_drop_ctx(ssk); goto out_release; } @@ -2392,16 +2530,8 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (dispose_it) list_del(&subflow->node); - lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); - - if ((flags & MPTCP_CF_FASTCLOSE) && !__mptcp_check_fallback(msk)) { - /* be sure to force the tcp_close path - * to generate the egress reset - */ - ssk->sk_lingertime = 0; - sock_set_flag(ssk, SOCK_LINGER); - subflow->send_fastclose = 1; - } + if (subflow->send_fastclose && ssk->sk_state != TCP_CLOSE) + tcp_set_state(ssk, TCP_CLOSE); need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { @@ -2462,6 +2592,9 @@ out: void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow) { + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *skb; + /* The first subflow can already be closed and still in the list */ if (subflow->close_event_done) return; @@ -2471,6 +2604,17 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (sk->sk_state == TCP_ESTABLISHED) mptcp_event(MPTCP_EVENT_SUB_CLOSED, mptcp_sk(sk), ssk, GFP_KERNEL); + /* Remove any reference from the backlog to this ssk; backlog skbs consume + * space in the msk receive queue, no need to touch sk->sk_rmem_alloc + */ + list_for_each_entry(skb, &msk->backlog_list, list) { + if (skb->sk != ssk) + continue; + + atomic_sub(skb->truesize, &skb->sk->sk_rmem_alloc); + skb->sk = NULL; + } + /* subflow aborted before reaching the fully_established status * attempt the creation of the next subflow */ @@ -2497,7 +2641,8 @@ static void __mptcp_close_subflow(struct sock *sk) if (ssk_state != TCP_CLOSE && (ssk_state != TCP_CLOSE_WAIT || - inet_sk_state_load(sk) != TCP_ESTABLISHED)) + inet_sk_state_load(sk) != TCP_ESTABLISHED || + __mptcp_check_fallback(msk))) continue; /* 'subflow_data_ready' will re-sched once rx queue is empty */ @@ -2587,7 +2732,8 @@ static void __mptcp_retrans(struct sock *sk) if (mptcp_data_fin_enabled(msk)) { struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_retransmits++; + WRITE_ONCE(icsk->icsk_retransmits, + icsk->icsk_retransmits + 1); mptcp_set_datafin_timeout(sk); mptcp_send_ack(msk); @@ -2595,7 +2741,7 @@ static void __mptcp_retrans(struct sock *sk) } if (!mptcp_send_head(sk)) - return; + goto clear_scheduled; goto reset_timer; } @@ -2626,7 +2772,7 @@ static void __mptcp_retrans(struct sock *sk) if (__mptcp_check_fallback(msk)) { spin_unlock_bh(&msk->fallback_lock); release_sock(ssk); - return; + goto clear_scheduled; } while (info.sent < info.limit) { @@ -2658,6 +2804,15 @@ reset_timer: if (!mptcp_rtx_timer_pending(sk)) mptcp_reset_rtx_timer(sk); + +clear_scheduled: + /* If no rtx data was available or in case of fallback, there + * could be left-over scheduled subflows; clear them all + * or later xmit could use bad ones + */ + mptcp_for_each_subflow(msk, subflow) + if (READ_ONCE(subflow->scheduled)) + mptcp_subflow_set_scheduled(subflow, false); } /* schedule the timeout timer for the relevant event: either close timeout @@ -2679,7 +2834,7 @@ void mptcp_reset_tout_timer(struct mptcp_sock *msk, unsigned long fail_tout) */ timeout = 
inet_csk(sk)->icsk_mtup.probe_timestamp ? close_timeout : fail_tout; - sk_reset_timer(sk, &sk->sk_timer, timeout); + sk_reset_timer(sk, &inet_csk(sk)->mptcp_tout_timer, timeout); } static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) @@ -2698,15 +2853,57 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) unlock_sock_fast(ssk, slow); } +static void mptcp_backlog_purge(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *tmp, *skb; + LIST_HEAD(backlog); + + mptcp_data_lock(sk); + list_splice_init(&msk->backlog_list, &backlog); + msk->backlog_len = 0; + mptcp_data_unlock(sk); + + list_for_each_entry_safe(skb, tmp, &backlog, list) { + mptcp_borrow_fwdmem(sk, skb); + kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); + } + sk_mem_reclaim(sk); +} + static void mptcp_do_fastclose(struct sock *sk) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); mptcp_set_state(sk, TCP_CLOSE); - mptcp_for_each_subflow_safe(msk, subflow, tmp) - __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), - subflow, MPTCP_CF_FASTCLOSE); + mptcp_backlog_purge(sk); + + /* Explicitly send the fastclose reset as need */ + if (__mptcp_check_fallback(msk)) + return; + + mptcp_for_each_subflow_safe(msk, subflow, tmp) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + + /* Some subflow socket states don't allow/need a reset.*/ + if ((1 << ssk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) + goto unlock; + + subflow->send_fastclose = 1; + + /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 + * issue in __tcp_select_window(), see tcp_disconnect(). + */ + inet_csk(ssk)->icsk_ack.rcv_mss = TCP_MIN_MSS; + + tcp_send_active_reset(ssk, ssk->sk_allocation, + SK_RST_REASON_TCP_ABORT_ON_CLOSE); +unlock: + release_sock(ssk); + } } static void mptcp_worker(struct work_struct *work) @@ -2733,7 +2930,11 @@ static void mptcp_worker(struct work_struct *work) __mptcp_close_subflow(sk); if (mptcp_close_tout_expired(sk)) { + struct mptcp_subflow_context *subflow, *tmp; + mptcp_do_fastclose(sk); + mptcp_for_each_subflow_safe(msk, subflow, tmp) + __mptcp_close_ssk(sk, subflow->tcp_sock, subflow, 0); mptcp_close_wake_up(sk); } @@ -2761,11 +2962,13 @@ static void __mptcp_init_sock(struct sock *sk) INIT_LIST_HEAD(&msk->conn_list); INIT_LIST_HEAD(&msk->join_list); INIT_LIST_HEAD(&msk->rtx_queue); + INIT_LIST_HEAD(&msk->backlog_list); INIT_WORK(&msk->work, mptcp_worker); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; msk->timer_ival = TCP_RTO_MIN; msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; + msk->backlog_len = 0; WRITE_ONCE(msk->first, NULL); inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss; @@ -2782,8 +2985,8 @@ static void __mptcp_init_sock(struct sock *sk) spin_lock_init(&msk->fallback_lock); /* re-use the csk retrans timer for MPTCP-level retrans */ - timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0); - timer_setup(&sk->sk_timer, mptcp_tout_timer, 0); + timer_setup(&sk->mptcp_retransmit_timer, mptcp_retransmit_timer, 0); + timer_setup(&msk->sk.mptcp_tout_timer, mptcp_tout_timer, 0); } static void mptcp_ca_reset(struct sock *sk) @@ -2838,7 +3041,7 @@ static void __mptcp_clear_xmit(struct sock *sk) struct mptcp_sock *msk = mptcp_sk(sk); struct mptcp_data_frag *dtmp, *dfrag; - WRITE_ONCE(msk->first_pending, NULL); + msk->first_pending = NULL; list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list) dfrag_clear(sk, dfrag); } @@ -2985,7 +3188,7 @@ static void __mptcp_destroy_sock(struct sock *sk) 
might_sleep(); mptcp_stop_rtx_timer(sk); - sk_stop_timer(sk, &sk->sk_timer); + sk_stop_timer(sk, &inet_csk(sk)->mptcp_tout_timer); msk->pm.status = 0; mptcp_release_sched(msk); @@ -3136,6 +3339,28 @@ static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) inet_sk(msk)->inet_rcv_saddr = inet_sk(ssk)->inet_rcv_saddr; } +static void mptcp_destroy_common(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct sock *sk = (struct sock *)msk; + + __mptcp_clear_xmit(sk); + mptcp_backlog_purge(sk); + + /* join list will be eventually flushed (with rst) at sock lock release time */ + mptcp_for_each_subflow_safe(msk, subflow, tmp) + __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, 0); + + __skb_queue_purge(&sk->sk_receive_queue); + skb_rbtree_purge(&msk->out_of_order_queue); + + /* move all the rx fwd alloc into the sk_mem_reclaim_final in + * inet_sock_destruct() will dispose it + */ + mptcp_token_destroy(msk); + mptcp_pm_destroy(msk); +} + static int mptcp_disconnect(struct sock *sk, int flags) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -3158,7 +3383,8 @@ static int mptcp_disconnect(struct sock *sk, int flags) /* msk->subflow is still intact, the following will not free the first * subflow */ - mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE); + mptcp_do_fastclose(sk); + mptcp_destroy_common(msk); /* The first subflow is already in TCP_CLOSE status, the following * can't overlap with a fallback anymore @@ -3187,6 +3413,9 @@ static int mptcp_disconnect(struct sock *sk, int flags) msk->bytes_retrans = 0; msk->rcvspace_init = 0; + /* for fallback's sake */ + WRITE_ONCE(msk->ack_seq, 0); + WRITE_ONCE(sk->sk_shutdown, 0); sk_error_report(sk); return 0; @@ -3337,34 +3566,13 @@ void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; } -void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) -{ - struct mptcp_subflow_context *subflow, *tmp; - struct sock *sk = (struct sock *)msk; - - __mptcp_clear_xmit(sk); - - /* join list will be eventually flushed (with rst) at sock lock release time */ - mptcp_for_each_subflow_safe(msk, subflow, tmp) - __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); - - __skb_queue_purge(&sk->sk_receive_queue); - skb_rbtree_purge(&msk->out_of_order_queue); - - /* move all the rx fwd alloc into the sk_mem_reclaim_final in - * inet_sock_destruct() will dispose it - */ - mptcp_token_destroy(msk); - mptcp_pm_destroy(msk); -} - static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); /* allow the following to close even the initial subflow */ msk->free_first = 1; - mptcp_destroy_common(msk, 0); + mptcp_destroy_common(msk); sk_sockets_allocated_dec(sk); } @@ -3378,9 +3586,6 @@ void __mptcp_data_acked(struct sock *sk) void __mptcp_check_push(struct sock *sk, struct sock *ssk) { - if (!mptcp_send_head(sk)) - return; - if (!sock_owned_by_user(sk)) __mptcp_subflow_push_pending(sk, ssk, false); else @@ -3389,8 +3594,7 @@ void __mptcp_check_push(struct sock *sk, struct sock *ssk) #define MPTCP_FLAGS_PROCESS_CTX_NEED (BIT(MPTCP_PUSH_PENDING) | \ BIT(MPTCP_RETRANSMIT) | \ - BIT(MPTCP_FLUSH_JOIN_LIST) | \ - BIT(MPTCP_DEQUEUE)) + BIT(MPTCP_FLUSH_JOIN_LIST)) /* processes deferred events and flush wmem */ static void mptcp_release_cb(struct sock *sk) @@ -3400,9 +3604,12 @@ static void mptcp_release_cb(struct sock *sk) for (;;) { unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED); - struct 
list_head join_list; + struct list_head join_list, skbs; + bool spool_bl; + u32 moved; - if (!flags) + spool_bl = mptcp_can_spool_backlog(sk, &skbs); + if (!flags && !spool_bl) break; INIT_LIST_HEAD(&join_list); @@ -3424,7 +3631,7 @@ static void mptcp_release_cb(struct sock *sk) __mptcp_push_pending(sk, 0); if (flags & BIT(MPTCP_RETRANSMIT)) __mptcp_retrans(sk); - if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) { + if (spool_bl && __mptcp_move_skbs(sk, &skbs, &moved)) { /* notify ack seq update */ mptcp_cleanup_rbuf(msk, 0); sk->sk_data_ready(sk); @@ -3432,6 +3639,8 @@ static void mptcp_release_cb(struct sock *sk) cond_resched(); spin_lock_bh(&sk->sk_lock.slock); + if (spool_bl) + mptcp_backlog_spooled(sk, moved, &skbs); } if (__test_and_clear_bit(MPTCP_CLEAN_UNA, &msk->cb_flags)) @@ -3554,10 +3763,26 @@ void mptcp_sock_graft(struct sock *sk, struct socket *parent) write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); sk_set_socket(sk, parent); - WRITE_ONCE(sk->sk_uid, SOCK_INODE(parent)->i_uid); write_unlock_bh(&sk->sk_callback_lock); } +/* Can be called without holding the msk socket lock; use the callback lock + * to avoid {READ_,WRITE_}ONCE annotations on sk_socket. + */ +static void mptcp_sock_check_graft(struct sock *sk, struct sock *ssk) +{ + struct socket *sock; + + write_lock_bh(&sk->sk_callback_lock); + sock = sk->sk_socket; + write_unlock_bh(&sk->sk_callback_lock); + if (sock) { + mptcp_sock_graft(ssk, sock); + __mptcp_inherit_cgrp_data(sk, ssk); + __mptcp_inherit_memcg(sk, ssk, GFP_ATOMIC); + } +} + bool mptcp_finish_join(struct sock *ssk) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); @@ -3573,7 +3798,9 @@ bool mptcp_finish_join(struct sock *ssk) return false; } - /* active subflow, already present inside the conn_list */ + /* Active subflow, already present inside the conn_list; is grafted + * either by __mptcp_subflow_connect() or accept. + */ if (!list_empty(&subflow->node)) { spin_lock_bh(&msk->fallback_lock); if (!msk->allow_subflows) { @@ -3600,11 +3827,17 @@ bool mptcp_finish_join(struct sock *ssk) if (ret) { sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); + mptcp_sock_check_graft(parent, ssk); } } else { sock_hold(ssk); list_add_tail(&subflow->node, &msk->join_list); __set_bit(MPTCP_FLUSH_JOIN_LIST, &msk->cb_flags); + + /* In case of later failures, __mptcp_flush_join_list() will + * properly orphan the ssk via mptcp_close_ssk(). 
+ */ + mptcp_sock_check_graft(parent, ssk); } mptcp_data_unlock(parent); @@ -3665,7 +3898,7 @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) return -EINVAL; lock_sock(sk); - if (__mptcp_move_skbs(sk)) + if (mptcp_move_skbs(sk)) mptcp_cleanup_rbuf(msk, 0); *karg = mptcp_inq_hint(sk); release_sock(sk); @@ -3687,7 +3920,8 @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) return 0; } -static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +static int mptcp_connect(struct sock *sk, struct sockaddr_unsized *uaddr, + int addr_len) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); @@ -3797,7 +4031,7 @@ static struct proto mptcp_prot = { .no_autobind = true, }; -static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int mptcp_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct mptcp_sock *msk = mptcp_sk(sock->sk); struct sock *ssk, *sk = sock->sk; @@ -3864,6 +4098,69 @@ unlock: return err; } +static void mptcp_graft_subflows(struct sock *sk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + + if (mem_cgroup_sockets_enabled) { + LIST_HEAD(join_list); + + /* Subflows joining after __inet_accept() will get the + * mem CG properly initialized at mptcp_finish_join() time, + * but subflows pending in join_list need explicit + * initialization before flushing `backlog_unaccounted` + * or MPTCP can later unexpectedly observe unaccounted memory. + */ + mptcp_data_lock(sk); + list_splice_init(&msk->join_list, &join_list); + mptcp_data_unlock(sk); + + __mptcp_flush_join_list(sk, &join_list); + } + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + lock_sock(ssk); + + /* Set ssk->sk_socket of accept()ed flows to mptcp socket. + * This is needed so NOSPACE flag can be set from tcp stack. + */ + if (!ssk->sk_socket) + mptcp_sock_graft(ssk, sk->sk_socket); + + if (!mem_cgroup_sk_enabled(sk)) + goto unlock; + + __mptcp_inherit_cgrp_data(sk, ssk); + __mptcp_inherit_memcg(sk, ssk, GFP_KERNEL); + +unlock: + release_sock(ssk); + } + + if (mem_cgroup_sk_enabled(sk)) { + gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL; + int amt; + + /* Account the backlog memory; prior accept() is aware of + * fwd and rmem only. + */ + mptcp_data_lock(sk); + amt = sk_mem_pages(sk->sk_forward_alloc + + msk->backlog_unaccounted + + atomic_read(&sk->sk_rmem_alloc)) - + sk_mem_pages(sk->sk_forward_alloc + + atomic_read(&sk->sk_rmem_alloc)); + msk->backlog_unaccounted = 0; + mptcp_data_unlock(sk); + + if (amt) + mem_cgroup_sk_charge(sk, amt, gfp); + } +} + static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { @@ -3911,24 +4208,17 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, msk = mptcp_sk(newsk); msk->in_accept_queue = 0; - /* set ssk->sk_socket of accept()ed flows to mptcp socket. - * This is needed so NOSPACE flag can be set from tcp stack. - */ - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - - if (!ssk->sk_socket) - mptcp_sock_graft(ssk, newsock); - } + mptcp_graft_subflows(newsk); + mptcp_rps_record_subflows(msk); /* Do late cleanup for the first subflow as necessary. Also * deal with bad peers not doing a complete shutdown. 
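mptcp_graft_subflows() above charges the memcg only for the whole pages that the unaccounted backlog adds on top of what accept() already accounted (forward alloc plus rmem). A standalone arithmetic sketch of that delta, with mem_pages() as a simplified stand-in for the kernel's sk_mem_pages() and made-up byte counts:

#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

/* Round a byte amount up to whole pages, like sk_mem_pages() does. */
static unsigned long mem_pages(unsigned long bytes)
{
	return (bytes + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
	unsigned long fwd_alloc = 3000;		/* bytes already forward-allocated */
	unsigned long rmem = 5000;		/* bytes charged to the receive queue */
	unsigned long backlog = 70000;		/* backlog bytes not yet accounted */

	/* Charge only the pages the backlog adds on top of what accept()
	 * already accounted for (fwd_alloc + rmem).
	 */
	unsigned long amt = mem_pages(fwd_alloc + backlog + rmem) -
			    mem_pages(fwd_alloc + rmem);

	printf("charge %lu extra page(s) to the memcg\n", amt);
	return 0;
}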
*/ if (unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { - __mptcp_close_ssk(newsk, msk->first, - mptcp_subflow_ctx(msk->first), 0); if (unlikely(list_is_singular(&msk->conn_list))) mptcp_set_state(newsk, TCP_CLOSE); + mptcp_close_ssk(newsk, msk->first, + mptcp_subflow_ctx(msk->first)); } } else { tcpfallback: diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index b15d7fab5c4b..9c0d17876b22 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -124,7 +124,6 @@ #define MPTCP_FLUSH_JOIN_LIST 5 #define MPTCP_SYNC_STATE 6 #define MPTCP_SYNC_SNDBUF 7 -#define MPTCP_DEQUEUE 8 struct mptcp_skb_cb { u64 map_seq; @@ -235,7 +234,7 @@ struct mptcp_pm_data { u8 add_addr_accepted; u8 local_addr_used; u8 pm_type; - u8 subflows; + u8 extra_subflows; u8 status; ); @@ -341,8 +340,8 @@ struct mptcp_sock { struct mptcp_pm_data pm; struct mptcp_sched_ops *sched; struct { - u32 space; /* bytes copied in last measurement window */ - u32 copied; /* bytes copied in this measurement window */ + int space; /* bytes copied in last measurement window */ + int copied; /* bytes copied in this measurement window */ u64 time; /* start time of measurement window */ u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; @@ -357,6 +356,10 @@ struct mptcp_sock { * allow_infinite_fallback and * allow_join */ + + struct list_head backlog_list; /* protected by the data lock */ + u32 backlog_len; + u32 backlog_unaccounted; }; #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock) @@ -407,6 +410,7 @@ static inline int mptcp_space_from_win(const struct sock *sk, int win) static inline int __mptcp_space(const struct sock *sk) { return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - + READ_ONCE(mptcp_sk(sk)->backlog_len) - sk_rmem_alloc_get(sk)); } @@ -414,7 +418,7 @@ static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) { const struct mptcp_sock *msk = mptcp_sk(sk); - return READ_ONCE(msk->first_pending); + return msk->first_pending; } static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk) @@ -509,6 +513,7 @@ struct mptcp_subflow_context { u64 remote_key; u64 idsn; u64 map_seq; + u64 rcv_wnd_sent; u32 snd_isn; u32 token; u32 rel_write_seq; @@ -535,16 +540,18 @@ struct mptcp_subflow_context { send_infinite_map : 1, remote_key_valid : 1, /* received the peer key from */ disposable : 1, /* ctx can be free at ulp release time */ + closing : 1, /* must not pass rx data to msk anymore */ stale : 1, /* unable to snd/rcv data, do not use for xmit */ valid_csum_seen : 1, /* at least one csum validated */ is_mptfo : 1, /* subflow is doing TFO */ close_event_done : 1, /* has done the post-closed part */ mpc_drop : 1, /* the MPC option has been dropped in a rtx */ - __unused : 9; + __unused : 8; bool data_avail; bool scheduled; bool pm_listener; /* a listener managed by the kernel PM? */ bool fully_established; /* path validated */ + u32 lent_mem_frag; u32 remote_nonce; u64 thmac; u32 local_nonce; @@ -644,6 +651,42 @@ mptcp_send_active_reset_reason(struct sock *sk) tcp_send_active_reset(sk, GFP_ATOMIC, reason); } +/* Made the fwd mem carried by the given skb available to the msk, + * To be paired with a previous mptcp_subflow_lend_fwdmem() before freeing + * the skb or setting the skb ownership. + */ +static inline void mptcp_borrow_fwdmem(struct sock *sk, struct sk_buff *skb) +{ + struct sock *ssk = skb->sk; + + /* The subflow just lend the skb fwd memory; if the subflow meanwhile + * closed, mptcp_close_ssk() already released the ssk rcv memory. 
+ */ + DEBUG_NET_WARN_ON_ONCE(skb->destructor); + sk_forward_alloc_add(sk, skb->truesize); + if (!ssk) + return; + + atomic_sub(skb->truesize, &ssk->sk_rmem_alloc); + skb->sk = NULL; +} + +static inline void +__mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow, int size) +{ + int frag = (subflow->lent_mem_frag + size) & (PAGE_SIZE - 1); + + subflow->lent_mem_frag = frag; +} + +static inline void +mptcp_subflow_lend_fwdmem(struct mptcp_subflow_context *subflow, + struct sk_buff *skb) +{ + __mptcp_subflow_lend_fwdmem(subflow, skb->truesize); + skb->destructor = NULL; +} + static inline u64 mptcp_subflow_get_map_offset(const struct mptcp_subflow_context *subflow) { @@ -706,6 +749,9 @@ mptcp_subflow_delegated_next(struct mptcp_delegated_action *delegated) return ret; } +void __mptcp_inherit_memcg(struct sock *sk, struct sock *ssk, gfp_t gfp); +void __mptcp_inherit_cgrp_data(struct sock *sk, struct sock *ssk); + int mptcp_is_enabled(const struct net *net); unsigned int mptcp_get_add_addr_timeout(const struct net *net); int mptcp_is_checksum_enabled(const struct net *net); @@ -788,9 +834,7 @@ static inline bool mptcp_epollin_ready(const struct sock *sk) * as it can always coalesce them */ return (data_avail >= sk->sk_rcvlowat) || - (mem_cgroup_sockets_enabled && sk->sk_memcg && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) || - READ_ONCE(tcp_memory_pressure); + tcp_under_memory_pressure(sk); } int mptcp_set_rcvlowat(struct sock *sk, int val); @@ -848,7 +892,7 @@ static inline void mptcp_stop_tout_timer(struct sock *sk) if (!inet_csk(sk)->icsk_mtup.probe_timestamp) return; - sk_stop_timer(sk, &sk->sk_timer); + sk_stop_timer(sk, &inet_csk(sk)->mptcp_tout_timer); inet_csk(sk)->icsk_mtup.probe_timestamp = 0; } @@ -978,8 +1022,6 @@ static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk) local_bh_enable(); } -void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags); - #define MPTCP_TOKEN_MAX_RETRIES 4 void __init mptcp_token_init(void); @@ -1182,15 +1224,17 @@ void __init mptcp_pm_userspace_register(void); void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); -unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_endp_fullmesh_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); +u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) { - if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk)) + if (--msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk)) WRITE_ONCE(msk->pm.accept_subflow, true); } @@ -1201,6 +1245,14 @@ static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk) spin_unlock_bh(&msk->pm.lock); } +static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.remote_deny_join_id0) && + msk->pm.local_addr_used == 0 && + mptcp_pm_get_limit_add_addr_accepted(msk) == 0 && + msk->pm.extra_subflows 
< mptcp_pm_get_limit_extra_subflows(msk); +} + void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 2c267aff95be..de90a2897d2d 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -962,7 +962,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) memset(info, 0, sizeof(*info)); - info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); + info->mptcpi_extra_subflows = READ_ONCE(msk->pm.extra_subflows); info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); @@ -972,14 +972,18 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) /* The following limits only make sense for the in-kernel PM */ if (mptcp_pm_is_kernel(msk)) { - info->mptcpi_subflows_max = - mptcp_pm_get_subflows_max(msk); - info->mptcpi_add_addr_signal_max = - mptcp_pm_get_add_addr_signal_max(msk); - info->mptcpi_add_addr_accepted_max = - mptcp_pm_get_add_addr_accept_max(msk); - info->mptcpi_local_addr_max = - mptcp_pm_get_local_addr_max(msk); + info->mptcpi_limit_extra_subflows = + mptcp_pm_get_limit_extra_subflows(msk); + info->mptcpi_endp_signal_max = + mptcp_pm_get_endp_signal_max(msk); + info->mptcpi_limit_add_addr_accepted = + mptcp_pm_get_limit_add_addr_accepted(msk); + info->mptcpi_endp_subflow_max = + mptcp_pm_get_endp_subflow_max(msk); + info->mptcpi_endp_laminar_max = + mptcp_pm_get_endp_laminar_max(msk); + info->mptcpi_endp_fullmesh_max = + mptcp_pm_get_endp_fullmesh_max(msk); } if (__mptcp_check_fallback(msk)) @@ -996,7 +1000,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_bytes_sent = msk->bytes_sent; info->mptcpi_bytes_received = msk->bytes_received; info->mptcpi_bytes_retrans = msk->bytes_retrans; - info->mptcpi_subflows_total = info->mptcpi_subflows + + info->mptcpi_subflows_total = info->mptcpi_extra_subflows + __mptcp_has_initial_subflow(msk); now = tcp_jiffies32; info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent); @@ -1532,13 +1536,12 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk) { static const unsigned int tx_rx_locks = SOCK_RCVBUF_LOCK | SOCK_SNDBUF_LOCK; struct sock *sk = (struct sock *)msk; + bool keep_open; - if (ssk->sk_prot->keepalive) { - if (sock_flag(sk, SOCK_KEEPOPEN)) - ssk->sk_prot->keepalive(ssk, 1); - else - ssk->sk_prot->keepalive(ssk, 0); - } + keep_open = sock_flag(sk, SOCK_KEEPOPEN); + if (ssk->sk_prot->keepalive) + ssk->sk_prot->keepalive(ssk, keep_open); + sock_valbool_flag(ssk, SOCK_KEEPOPEN, keep_open); ssk->sk_priority = sk->sk_priority; ssk->sk_bound_dev_if = sk->sk_bound_dev_if; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 3f1b62a9fe88..86ce58ae533d 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -491,6 +491,9 @@ static void subflow_set_remote_key(struct mptcp_sock *msk, mptcp_crypto_key_sha(subflow->remote_key, NULL, &subflow->iasn); subflow->iasn++; + /* for fallback's sake */ + subflow->map_seq = subflow->iasn; + WRITE_ONCE(msk->remote_key, subflow->remote_key); WRITE_ONCE(msk->ack_seq, subflow->iasn); WRITE_ONCE(msk->can_ack, true); @@ -883,6 +886,10 @@ create_child: ctx->subflow_id = 1; owner = mptcp_sk(ctx->conn); + + if (mp_opt.deny_join_id0) + WRITE_ONCE(owner->pm.remote_deny_join_id0, true); + 
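The lent_mem_frag helpers added to protocol.h above track only the sub-page remainder of the forward memory a subflow lends to the msk; when the subflow is closed, __mptcp_close_ssk() hands the unused tail of the last page back. A toy model of that bookkeeping, with simplified names and a plain global standing in for the subflow context:

#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Running sub-page remainder of memory lent to the parent socket,
 * mirroring the lent_mem_frag bookkeeping in the hunks above.
 */
static unsigned long lent_frag;

static void lend(unsigned long size)
{
	/* Only the offset inside the current page needs tracking. */
	lent_frag = (lent_frag + size) & (PAGE_SIZE - 1);
}

static unsigned long settle_on_close(void)
{
	unsigned long remaining = 0;

	/* The unused tail of the last partially lent page goes back
	 * to the parent when the lender goes away.
	 */
	if (lent_frag) {
		remaining = PAGE_SIZE - lent_frag;
		lent_frag = 0;
	}
	return remaining;
}

int main(void)
{
	lend(1500);
	lend(3000);	/* crosses a page boundary: remainder wraps to 404 */
	printf("fragment now %lu bytes\n", lent_frag);
	printf("returned %lu bytes on close\n", settle_on_close());
	return 0;
}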
mptcp_pm_new_connection(owner, child, 1); /* with OoO packets we can reach here without ingress @@ -1281,6 +1288,7 @@ static bool subflow_is_done(const struct sock *sk) /* sched mptcp worker for subflow cleanup if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); struct sock *sk = (struct sock *)msk; if (likely(ssk->sk_state != TCP_CLOSE && @@ -1299,7 +1307,8 @@ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ss */ if (__mptcp_check_fallback(msk) && subflow_is_done(ssk) && msk->first == ssk && - mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) + mptcp_update_rcv_data_fin(msk, subflow->map_seq + + subflow->map_data_len, true)) mptcp_schedule_work(sk); } @@ -1429,9 +1438,12 @@ reset: skb = skb_peek(&ssk->sk_receive_queue); subflow->map_valid = 1; - subflow->map_seq = READ_ONCE(msk->ack_seq); subflow->map_data_len = skb->len; subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset; + subflow->map_seq = __mptcp_expand_seq(subflow->map_seq, + subflow->iasn + + TCP_SKB_CB(skb)->seq - + subflow->ssn_offset - 1); WRITE_ONCE(subflow->data_avail, true); return true; } @@ -1656,7 +1668,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, addrlen = sizeof(struct sockaddr_in6); #endif ssk->sk_bound_dev_if = local->ifindex; - err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen); + err = kernel_bind(sf, (struct sockaddr_unsized *)&addr, addrlen); if (err) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXBINDERR); pr_debug("msk=%p local=%d remote=%d bind error: %d\n", @@ -1676,7 +1688,7 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local, sock_hold(ssk); list_add_tail(&subflow->node, &msk->conn_list); - err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK); + err = kernel_connect(sf, (struct sockaddr_unsized *)&addr, addrlen, O_NONBLOCK); if (err && err != -EINPROGRESS) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNTXCONNECTERR); pr_debug("msk=%p local=%d remote=%d connect error: %d\n", @@ -1708,30 +1720,39 @@ err_out: return err; } -static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) +void __mptcp_inherit_memcg(struct sock *sk, struct sock *ssk, gfp_t gfp) +{ + /* Only if the msk has been accepted already (and not orphaned).*/ + if (!mem_cgroup_sockets_enabled || !sk->sk_socket) + return; + + mem_cgroup_sk_inherit(sk, ssk); + __sk_charge(ssk, gfp); +} + +void __mptcp_inherit_cgrp_data(struct sock *sk, struct sock *ssk) { #ifdef CONFIG_SOCK_CGROUP_DATA - struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data, - *child_skcd = &child->sk_cgrp_data; + struct sock_cgroup_data *sk_cd = &sk->sk_cgrp_data, + *ssk_cd = &ssk->sk_cgrp_data; /* only the additional subflows created by kworkers have to be modified */ - if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != - cgroup_id(sock_cgroup_ptr(child_skcd))) { -#ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = parent->sk_memcg; - - mem_cgroup_sk_free(child); - if (memcg && css_tryget(&memcg->css)) - child->sk_memcg = memcg; -#endif /* CONFIG_MEMCG */ - - cgroup_sk_free(child_skcd); - *child_skcd = *parent_skcd; - cgroup_sk_clone(child_skcd); + if (cgroup_id(sock_cgroup_ptr(sk_cd)) != + cgroup_id(sock_cgroup_ptr(ssk_cd))) { + cgroup_sk_free(ssk_cd); + *ssk_cd = *sk_cd; + cgroup_sk_clone(sk_cd); } #endif /* CONFIG_SOCK_CGROUP_DATA */ } +static void 
mptcp_attach_cgroup(struct sock *parent, struct sock *child) +{ + __mptcp_inherit_cgrp_data(parent, child); + if (mem_cgroup_sockets_enabled) + mem_cgroup_sk_inherit(parent, child); +} + static void mptcp_subflow_ops_override(struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -2145,6 +2166,10 @@ void __init mptcp_subflow_init(void) tcp_prot_override = tcp_prot; tcp_prot_override.release_cb = tcp_release_cb_override; tcp_prot_override.diag_destroy = tcp_abort_override; +#ifdef CONFIG_BPF_SYSCALL + /* Disable sockmap processing for subflows */ + tcp_prot_override.psock_update_sk_prot = NULL; +#endif #if IS_ENABLED(CONFIG_MPTCP_IPV6) /* In struct mptcp_subflow_request_sock, we assume the TCP request sock @@ -2181,6 +2206,10 @@ void __init mptcp_subflow_init(void) tcpv6_prot_override = tcpv6_prot; tcpv6_prot_override.release_cb = tcp_release_cb_override; tcpv6_prot_override.diag_destroy = tcp_abort_override; +#ifdef CONFIG_BPF_SYSCALL + /* Disable sockmap processing for subflows */ + tcpv6_prot_override.psock_update_sk_prot = NULL; +#endif #endif mptcp_diag_subflow_init(&subflow_ulp_ops); diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index e43e20f529f8..6bfc250e474f 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -141,6 +141,7 @@ obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o # flow table infrastructure obj-$(CONFIG_NF_FLOW_TABLE) += nf_flow_table.o nf_flow_table-objs := nf_flow_table_core.o nf_flow_table_ip.o \ + nf_flow_table_path.o \ nf_flow_table_offload.o nf_flow_table_xdp.o nf_flow_table-$(CONFIG_NF_FLOW_TABLE_PROCFS) += nf_flow_table_procfs.o ifeq ($(CONFIG_NF_FLOW_TABLE),m) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 5251524b96af..5e4453e9ef8e 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -63,7 +63,7 @@ struct hbucket { : jhash_size((htable_bits) - HTABLE_REGION_BITS)) #define ahash_sizeof_regions(htable_bits) \ (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region)) -#define ahash_region(n, htable_bits) \ +#define ahash_region(n) \ ((n) / jhash_size(HTABLE_REGION_BITS)) #define ahash_bucket_start(h, htable_bits) \ ((htable_bits) < HTABLE_REGION_BITS ? 
0 \ @@ -702,7 +702,7 @@ retry: #endif key = HKEY(data, h->initval, htable_bits); m = __ipset_dereference(hbucket(t, key)); - nr = ahash_region(key, htable_bits); + nr = ahash_region(key); if (!m) { m = kzalloc(sizeof(*m) + AHASH_INIT_SIZE * dsize, @@ -852,7 +852,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); - r = ahash_region(key, t->htable_bits); + r = ahash_region(key); atomic_inc(&t->uref); elements = t->hregion[r].elements; maxelem = t->maxelem; @@ -1050,7 +1050,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); - r = ahash_region(key, t->htable_bits); + r = ahash_region(key); atomic_inc(&t->uref); rcu_read_unlock_bh(); diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index fdacbc3c15be..d54d7da58334 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -13,8 +13,7 @@ * Author: Juan Jose Ciarlante, <jjciarla@raiz.uncu.edu.ar> */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 965f3c8e5089..50cc492c7553 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -17,8 +17,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/interrupt.h> #include <linux/in.h> @@ -885,7 +884,7 @@ static void ip_vs_conn_expire(struct timer_list *t) * conntrack cleanup for the net. */ smp_rmb(); - if (ipvs->enable) + if (READ_ONCE(ipvs->enable)) ip_vs_conn_drop_conntrack(cp); } @@ -1439,7 +1438,7 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) cond_resched_rcu(); /* netns clean up started, abort delayed work */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) break; } rcu_read_unlock(); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index c7a8a08b7308..90d56f92c0f6 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -19,8 +19,7 @@ * Harald Welte don't use nfcache */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> @@ -1353,9 +1352,6 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *stat if (unlikely(!skb_dst(skb))) return NF_ACCEPT; - if (!ipvs->enable) - return NF_ACCEPT; - ip_vs_fill_iph_skb(af, skb, false, &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1940,7 +1936,7 @@ ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state return NF_ACCEPT; } /* ipvs enabled in this netns ? */ - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + if (unlikely(sysctl_backup_only(ipvs))) return NF_ACCEPT; ip_vs_fill_iph_skb(af, skb, false, &iph); @@ -2108,7 +2104,7 @@ ip_vs_forward_icmp(void *priv, struct sk_buff *skb, int r; /* ipvs enabled in this netns ? 
*/ - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + if (unlikely(sysctl_backup_only(ipvs))) return NF_ACCEPT; if (state->pf == NFPROTO_IPV4) { @@ -2295,7 +2291,7 @@ static int __net_init __ip_vs_init(struct net *net) return -ENOMEM; /* Hold the beast until a service is registered */ - ipvs->enable = 0; + WRITE_ONCE(ipvs->enable, 0); ipvs->net = net; /* Counters used for creating unique names */ ipvs->gen = atomic_read(&ipvs_netns_cnt); @@ -2367,7 +2363,7 @@ static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) ipvs = net_ipvs(net); ip_vs_unregister_hooks(ipvs, AF_INET); ip_vs_unregister_hooks(ipvs, AF_INET6); - ipvs->enable = 0; /* Disable packet reception */ + WRITE_ONCE(ipvs->enable, 0); /* Disable packet reception */ smp_wmb(); ip_vs_sync_net_cleanup(ipvs); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 6a6fc4478533..068702894377 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -13,8 +13,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/init.h> @@ -256,7 +255,7 @@ static void est_reload_work_handler(struct work_struct *work) struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; /* netns clean up started, abort delayed work */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) goto unlock; if (!kd) continue; @@ -1483,9 +1482,9 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, *svc_p = svc; - if (!ipvs->enable) { + if (!READ_ONCE(ipvs->enable)) { /* Now there is a service - full throttle */ - ipvs->enable = 1; + WRITE_ONCE(ipvs->enable, 1); /* Start estimation for first time */ ip_vs_est_reload_start(ipvs); diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c index 75f4c231f4a0..bb7aca4601ff 100644 --- a/net/netfilter/ipvs/ip_vs_dh.c +++ b/net/netfilter/ipvs/ip_vs_dh.c @@ -30,8 +30,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index f821ad2e19b3..77f4f637ff67 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -12,8 +12,7 @@ * get_stats()) do the per cpu summing. 
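The ipvs->enable accesses in these hunks are converted to READ_ONCE()/WRITE_ONCE(), since the flag is now polled by estimator and cleanup paths without any stronger synchronization. A userspace sketch of why the annotation matters, using simplified volatile-cast stand-ins for the kernel macros (the real ones also cope with larger and compound types):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Simplified stand-ins: force a single, non-torn access and keep the
 * compiler from caching the value across iterations.
 */
#define READ_ONCE(x)     (*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v) (*(volatile __typeof__(x) *)&(x) = (v))

static int enable;	/* written by one thread, polled by another */

static void *worker(void *arg)
{
	(void)arg;
	/* Without the annotation the compiler may hoist the load out of
	 * the loop and spin on a stale value forever.
	 */
	while (!READ_ONCE(enable))
		usleep(1000);
	printf("worker saw enable=1\n");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	usleep(10000);
	WRITE_ONCE(enable, 1);
	pthread_join(t, NULL);
	return 0;
}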
*/ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/jiffies.h> @@ -231,7 +230,7 @@ static int ip_vs_estimation_kthread(void *data) void ip_vs_est_reload_start(struct netns_ipvs *ipvs) { /* Ignore reloads before first service is added */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) return; ip_vs_est_stopped_recalc(ipvs); /* Bump the kthread configuration genid */ @@ -265,7 +264,8 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, } set_user_nice(kd->task, sysctl_est_nice(ipvs)); - set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs)); + if (sysctl_est_preferred_cpulist(ipvs)) + kthread_affine_preferred(kd->task, sysctl_est_preferred_cpulist(ipvs)); pr_info("starting estimator thread %d...\n", kd->id); wake_up_process(kd->task); @@ -305,7 +305,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) int i; if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && - ipvs->enable && ipvs->est_max_threads) + READ_ONCE(ipvs->enable) && ipvs->est_max_threads) return -EINVAL; mutex_lock(&ipvs->est_mutex); @@ -342,7 +342,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) } /* Start kthread tasks only when services are present */ - if (ipvs->enable && !ip_vs_est_stopped(ipvs)) { + if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) { ret = ip_vs_est_kthread_start(ipvs, kd); if (ret < 0) goto out; @@ -485,7 +485,7 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) struct ip_vs_estimator *est = &stats->est; int ret; - if (!ipvs->est_max_threads && ipvs->enable) + if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable)) ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); est->ktid = -1; @@ -662,7 +662,7 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) /* Wait for cpufreq frequency transition */ wait_event_idle_timeout(wq, kthread_should_stop(), HZ / 50); - if (!ipvs->enable || kthread_should_stop()) + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) goto stop; } @@ -680,7 +680,7 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) rcu_read_unlock(); local_bh_enable(); - if (!ipvs->enable || kthread_should_stop()) + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) goto stop; cond_resched(); @@ -756,7 +756,7 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) mutex_lock(&ipvs->est_mutex); for (id = 1; id < ipvs->est_kt_count; id++) { /* netns clean up started, abort */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) goto unlock2; kd = ipvs->est_kt_arr[id]; if (!kd) @@ -786,7 +786,7 @@ last_kt: id = ipvs->est_kt_count; next_kt: - if (!ipvs->enable || kthread_should_stop()) + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) goto unlock; id--; if (id < 0) diff --git a/net/netfilter/ipvs/ip_vs_fo.c b/net/netfilter/ipvs/ip_vs_fo.c index ab117e5bc34e..d657b47c6511 100644 --- a/net/netfilter/ipvs/ip_vs_fo.c +++ b/net/netfilter/ipvs/ip_vs_fo.c @@ -8,8 +8,7 @@ * Kenny Mathis : added initial functionality based on weight */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index d8a284999544..b315c608fda4 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -16,8 +16,7 @@ * Author: Wouter Gadeyne */ -#define KMSG_COMPONENT "IPVS" -#define 
pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/moduleparam.h> @@ -53,6 +52,7 @@ enum { IP_VS_FTP_EPSV, }; +static bool exiting_module; /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper * First port is set to the default port. @@ -605,7 +605,7 @@ static void __ip_vs_ftp_exit(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - if (!ipvs) + if (!ipvs || !exiting_module) return; unregister_ip_vs_app(ipvs, &ip_vs_ftp); @@ -627,6 +627,7 @@ static int __init ip_vs_ftp_init(void) */ static void __exit ip_vs_ftp_exit(void) { + exiting_module = true; unregister_pernet_subsys(&ip_vs_ftp_ops); /* rcu_barrier() is called by netns */ } diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 156181a3bacd..e6c8ed0c92f6 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -34,8 +34,7 @@ * me to write this module. */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index a021e6aba3d7..a25cf7bb6185 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -32,8 +32,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/module.h> diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index c2764505e380..38cc38c5d8bb 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -9,8 +9,7 @@ * Wensong Zhang : added any dest with weight=0 is quiesced */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c index e3d7f5c879ce..f61f54004c9e 100644 --- a/net/netfilter/ipvs/ip_vs_mh.c +++ b/net/netfilter/ipvs/ip_vs_mh.c @@ -17,8 +17,7 @@ https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index 08adcb222986..81974f69e5bb 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -30,8 +30,7 @@ * PASV response can not be NAT-ed) but Active FTP should work */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/types.h> diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index ed7f5c889b41..ada158c610ce 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -26,8 +26,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_ovf.c b/net/netfilter/ipvs/ip_vs_ovf.c index c7708b809700..c5c67df80a0b 100644 --- a/net/netfilter/ipvs/ip_vs_ovf.c +++ b/net/netfilter/ipvs/ip_vs_ovf.c @@ -12,8 +12,7 @@ * active connections */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include 
<linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c index 166c669f0763..3035079ebd99 100644 --- a/net/netfilter/ipvs/ip_vs_pe.c +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/spinlock.h> diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index e4ce1d9a63f9..85f31d71e29a 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index a9fd1d3fc2cb..fd9dbca24c85 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -8,8 +8,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 89602c16f6b6..44e14acc187e 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -6,8 +6,7 @@ * Wensong Zhang <wensong@linuxvirtualserver.org> */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/in.h> #include <linux/ip.h> diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 7da51390cea6..f68a1533ee45 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -13,8 +13,7 @@ * protocol ip_vs_proto_data and is handled by netns */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/ip.h> diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 68260d91c988..0f0107c80dd2 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -9,8 +9,7 @@ * Network name space (netns) aware. 
*/ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/in.h> #include <linux/ip.h> diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index 6baa34dff9f0..4125ee561cdc 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -14,8 +14,7 @@ * Wensong Zhang : added any dest with weight=0 is quiesced */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index d4903723be7e..c6e421c4e299 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -12,8 +12,7 @@ * Changes: */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/spinlock.h> diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index a46f99a56618..245a323c84cd 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -30,8 +30,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index 92e77d7a6b50..0e85e07e23b9 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -32,8 +32,7 @@ * */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/ip.h> #include <linux/slab.h> diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 3402675bf521..54dd1514ac45 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -32,8 +32,7 @@ * Persistence support, fwmark and time-out. 
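These IPVS files all drop the KMSG_COMPONENT indirection in favour of a one-line pr_fmt() definition; the resulting log prefix is unchanged. A tiny userspace illustration of how the prefix is applied, with printf() standing in for printk():

#include <stdio.h>

/* What the IPVS hunks now define directly, without KMSG_COMPONENT. */
#define pr_fmt(fmt) "IPVS: " fmt

/* Userspace stand-in for pr_info(): the kernel macro routes its format
 * string through pr_fmt() in exactly this way.
 */
#define pr_info(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
	pr_info("starting estimator thread %d...\n", 3);
	/* prints: IPVS: starting estimator thread 3... */
	return 0;
}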
*/ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/slab.h> @@ -1435,7 +1434,7 @@ static int bind_mcastif_addr(struct socket *sock, struct net_device *dev) sin.sin_addr.s_addr = addr; sin.sin_port = 0; - return kernel_bind(sock, (struct sockaddr *)&sin, sizeof(sin)); + return kernel_bind(sock, (struct sockaddr_unsized *)&sin, sizeof(sin)); } static void get_mcast_sockaddr(union ipvs_sockaddr *sa, int *salen, @@ -1501,7 +1500,7 @@ static int make_send_sock(struct netns_ipvs *ipvs, int id, } get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->mcfg, id); - result = kernel_connect(sock, (struct sockaddr *)&mcast_addr, + result = kernel_connect(sock, (struct sockaddr_unsized *)&mcast_addr, salen, 0); if (result < 0) { pr_err("Error connecting to the multicast addr\n"); @@ -1542,7 +1541,7 @@ static int make_receive_sock(struct netns_ipvs *ipvs, int id, get_mcast_sockaddr(&mcast_addr, &salen, &ipvs->bcfg, id); sock->sk->sk_bound_dev_if = dev->ifindex; - result = kernel_bind(sock, (struct sockaddr *)&mcast_addr, salen); + result = kernel_bind(sock, (struct sockaddr_unsized *)&mcast_addr, salen); if (result < 0) { pr_err("Error binding to the multicast addr\n"); goto error; diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c index 8d5419edde50..dbb7f5fd4688 100644 --- a/net/netfilter/ipvs/ip_vs_twos.c +++ b/net/netfilter/ipvs/ip_vs_twos.c @@ -4,8 +4,7 @@ * Authors: Darby Payne <darby.payne@applovin.com> */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/module.h> diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index 9fa500927c0a..9da445ca09a1 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -14,8 +14,7 @@ * Wensong Zhang : added any dest with weight=0 is quiesced */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index 85ce0d04afac..99f09cbf2d9b 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -13,8 +13,7 @@ * with weight 0 when all weights are zero */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/module.h> #include <linux/kernel.h> diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 95af252b2939..3162ce3c2640 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -21,8 +21,7 @@ * - the only place where we can see skb->sk != NULL */ -#define KMSG_COMPONENT "IPVS" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "IPVS: " fmt #include <linux/kernel.h> #include <linux/slab.h> diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 913ede2f57f9..f1be4dd5cf85 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -122,15 +122,65 @@ find_or_evict(struct net *net, struct nf_conncount_list *list, return ERR_PTR(-EAGAIN); } +static bool get_ct_or_tuple_from_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conn **ct, + struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone **zone, + bool *refcounted) +{ + const struct nf_conntrack_tuple_hash *h; + enum 
ip_conntrack_info ctinfo; + struct nf_conn *found_ct; + + found_ct = nf_ct_get(skb, &ctinfo); + if (found_ct && !nf_ct_is_template(found_ct)) { + *tuple = found_ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + *zone = nf_ct_zone(found_ct); + *ct = found_ct; + return true; + } + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, net, tuple)) + return false; + + if (found_ct) + *zone = nf_ct_zone(found_ct); + + h = nf_conntrack_find_get(net, *zone, tuple); + if (!h) + return true; + + found_ct = nf_ct_tuplehash_to_ctrack(h); + *refcounted = true; + *ct = found_ct; + + return true; +} + static int __nf_conncount_add(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_list *list) { + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct = NULL; struct nf_conn *found_ct; unsigned int collect = 0; + bool refcounted = false; + + if (!get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) + return -ENOENT; + + if (ct && nf_ct_is_confirmed(ct)) { + if (refcounted) + nf_ct_put(ct); + return -EEXIST; + } if ((u32)jiffies == list->last_gc) goto add_new_node; @@ -144,10 +194,10 @@ static int __nf_conncount_add(struct net *net, if (IS_ERR(found)) { /* Not found, but might be about to be confirmed */ if (PTR_ERR(found) == -EAGAIN) { - if (nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && nf_ct_zone_id(&conn->zone, conn->zone.dir) == nf_ct_zone_id(zone, zone->dir)) - return 0; /* already exists */ + goto out_put; /* already exists */ } else { collect++; } @@ -156,7 +206,7 @@ static int __nf_conncount_add(struct net *net, found_ct = nf_ct_tuplehash_to_ctrack(found); - if (nf_ct_tuple_equal(&conn->tuple, tuple) && + if (nf_ct_tuple_equal(&conn->tuple, &tuple) && nf_ct_zone_equal(found_ct, zone, zone->dir)) { /* * We should not see tuples twice unless someone hooks @@ -165,7 +215,7 @@ static int __nf_conncount_add(struct net *net, * Attempt to avoid a re-add in this case. */ nf_ct_put(found_ct); - return 0; + goto out_put; } else if (already_closed(found_ct)) { /* * we do not care about connections which are @@ -188,31 +238,35 @@ add_new_node: if (conn == NULL) return -ENOMEM; - conn->tuple = *tuple; + conn->tuple = tuple; conn->zone = *zone; conn->cpu = raw_smp_processor_id(); conn->jiffies32 = (u32)jiffies; list_add_tail(&conn->node, &list->head); list->count++; list->last_gc = (u32)jiffies; + +out_put: + if (refcounted) + nf_ct_put(ct); return 0; } -int nf_conncount_add(struct net *net, - struct nf_conncount_list *list, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +int nf_conncount_add_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_list *list) { int ret; /* check the saved connections */ spin_lock_bh(&list->list_lock); - ret = __nf_conncount_add(net, list, tuple, zone); + ret = __nf_conncount_add(net, skb, l3num, list); spin_unlock_bh(&list->list_lock); return ret; } -EXPORT_SYMBOL_GPL(nf_conncount_add); +EXPORT_SYMBOL_GPL(nf_conncount_add_skb); void nf_conncount_list_init(struct nf_conncount_list *list) { @@ -224,8 +278,8 @@ void nf_conncount_list_init(struct nf_conncount_list *list) EXPORT_SYMBOL_GPL(nf_conncount_list_init); /* Return true if the list is empty. 
Must be called with BH disabled. */ -bool nf_conncount_gc_list(struct net *net, - struct nf_conncount_list *list) +static bool __nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) { const struct nf_conntrack_tuple_hash *found; struct nf_conncount_tuple *conn, *conn_n; @@ -237,10 +291,6 @@ bool nf_conncount_gc_list(struct net *net, if ((u32)jiffies == READ_ONCE(list->last_gc)) return false; - /* don't bother if other cpu is already doing GC */ - if (!spin_trylock(&list->list_lock)) - return false; - list_for_each_entry_safe(conn, conn_n, &list->head, node) { found = find_or_evict(net, list, conn); if (IS_ERR(found)) { @@ -269,7 +319,21 @@ bool nf_conncount_gc_list(struct net *net, if (!list->count) ret = true; list->last_gc = (u32)jiffies; - spin_unlock(&list->list_lock); + + return ret; +} + +bool nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) +{ + bool ret; + + /* don't bother if other cpu is already doing GC */ + if (!spin_trylock_bh(&list->list_lock)) + return false; + + ret = __nf_conncount_gc_list(net, list); + spin_unlock_bh(&list->list_lock); return ret; } @@ -309,19 +373,22 @@ static void schedule_gc_worker(struct nf_conncount_data *data, int tree) static unsigned int insert_tree(struct net *net, + const struct sk_buff *skb, + u16 l3num, struct nf_conncount_data *data, struct rb_root *root, unsigned int hash, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const u32 *key) { struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; + bool do_gc = true, refcounted = false; + unsigned int count = 0, gc_count = 0; struct rb_node **rbnode, *parent; - struct nf_conncount_rb *rbconn; + struct nf_conntrack_tuple tuple; struct nf_conncount_tuple *conn; - unsigned int count = 0, gc_count = 0; - bool do_gc = true; + struct nf_conncount_rb *rbconn; + struct nf_conn *ct = NULL; spin_lock_bh(&nf_conncount_locks[hash]); restart: @@ -340,8 +407,8 @@ restart: } else { int ret; - ret = nf_conncount_add(net, &rbconn->list, tuple, zone); - if (ret) + ret = nf_conncount_add_skb(net, skb, l3num, &rbconn->list); + if (ret && ret != -EEXIST) count = 0; /* hotdrop */ else count = rbconn->list.count; @@ -364,30 +431,35 @@ restart: goto restart; } - /* expected case: match, insert new node */ - rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); - if (rbconn == NULL) - goto out_unlock; + if (get_ct_or_tuple_from_skb(net, skb, l3num, &ct, &tuple, &zone, &refcounted)) { + /* expected case: match, insert new node */ + rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); + if (rbconn == NULL) + goto out_unlock; - conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); - if (conn == NULL) { - kmem_cache_free(conncount_rb_cachep, rbconn); - goto out_unlock; - } + conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); + if (conn == NULL) { + kmem_cache_free(conncount_rb_cachep, rbconn); + goto out_unlock; + } - conn->tuple = *tuple; - conn->zone = *zone; - conn->cpu = raw_smp_processor_id(); - conn->jiffies32 = (u32)jiffies; - memcpy(rbconn->key, key, sizeof(u32) * data->keylen); + conn->tuple = tuple; + conn->zone = *zone; + conn->cpu = raw_smp_processor_id(); + conn->jiffies32 = (u32)jiffies; + memcpy(rbconn->key, key, sizeof(u32) * data->keylen); - nf_conncount_list_init(&rbconn->list); - list_add(&conn->node, &rbconn->list.head); - count = 1; - rbconn->list.count = count; + nf_conncount_list_init(&rbconn->list); + 
list_add(&conn->node, &rbconn->list.head); + count = 1; + rbconn->list.count = count; - rb_link_node_rcu(&rbconn->node, parent, rbnode); - rb_insert_color(&rbconn->node, root); + rb_link_node_rcu(&rbconn->node, parent, rbnode); + rb_insert_color(&rbconn->node, root); + + if (refcounted) + nf_ct_put(ct); + } out_unlock: spin_unlock_bh(&nf_conncount_locks[hash]); return count; @@ -395,10 +467,10 @@ out_unlock: static unsigned int count_tree(struct net *net, + const struct sk_buff *skb, + u16 l3num, struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) + const u32 *key) { struct rb_root *root; struct rb_node *parent; @@ -422,7 +494,7 @@ count_tree(struct net *net, } else { int ret; - if (!tuple) { + if (!skb) { nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; } @@ -437,19 +509,23 @@ count_tree(struct net *net, } /* same source network -> be counted! */ - ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); + ret = __nf_conncount_add(net, skb, l3num, &rbconn->list); spin_unlock_bh(&rbconn->list.list_lock); - if (ret) + if (ret && ret != -EEXIST) { return 0; /* hotdrop */ - else + } else { + /* -EEXIST means add was skipped, update the list */ + if (ret == -EEXIST) + nf_conncount_gc_list(net, &rbconn->list); return rbconn->list.count; + } } } - if (!tuple) + if (!skb) return 0; - return insert_tree(net, data, root, hash, key, tuple, zone); + return insert_tree(net, skb, l3num, data, root, hash, key); } static void tree_gc_worker(struct work_struct *work) @@ -511,18 +587,19 @@ next: } /* Count and return number of conntrack entries in 'net' with particular 'key'. - * If 'tuple' is not null, insert it into the accounting data structure. - * Call with RCU read lock. + * If 'skb' is not null, insert the corresponding tuple into the accounting + * data structure. Call with RCU read lock. 
*/ -unsigned int nf_conncount_count(struct net *net, - struct nf_conncount_data *data, - const u32 *key, - const struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_zone *zone) +unsigned int nf_conncount_count_skb(struct net *net, + const struct sk_buff *skb, + u16 l3num, + struct nf_conncount_data *data, + const u32 *key) { - return count_tree(net, data, key, tuple, zone); + return count_tree(net, skb, l3num, data, key); + } -EXPORT_SYMBOL_GPL(nf_conncount_count); +EXPORT_SYMBOL_GPL(nf_conncount_count_skb); struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int keylen) { diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 344f88295976..0b95f226f211 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1668,7 +1668,7 @@ __nf_conntrack_alloc(struct net *net, /* We don't want any race condition at early drop stage */ ct_count = atomic_inc_return(&cnet->count); - if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { + if (unlikely(ct_count > nf_conntrack_max)) { if (!early_drop(net, hash)) { if (!conntrack_gc_work.early_drop) conntrack_gc_work.early_drop = true; diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index af68c64acaab..81baf2082604 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -301,7 +301,7 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) net->ct.ecache_dwork_pending = true; } else if (state == NFCT_ECACHE_DESTROY_SENT) { if (!hlist_nulls_empty(&cnet->ecache.dying_list)) - mod_delayed_work(system_wq, &cnet->ecache.dwork, 0); + mod_delayed_work(system_percpu_wq, &cnet->ecache.dwork, 0); else net->ct.ecache_dwork_pending = false; } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 4ed5878cb25b..ceb48c3ca0a4 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -368,7 +368,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) (cur->tuple.src.l3num == NFPROTO_UNSPEC || cur->tuple.src.l3num == me->tuple.src.l3num) && cur->tuple.dst.protonum == me->tuple.dst.protonum) { - ret = -EEXIST; + ret = -EBUSY; goto out; } } @@ -379,7 +379,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) { - ret = -EEXIST; + ret = -EBUSY; goto out; } } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 486d52b45fe5..3a04665adf99 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -60,7 +60,7 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("List and change connection tracking table"); struct ctnetlink_list_dump_ctx { - struct nf_conn *last; + unsigned long last_id; unsigned int cpu; bool done; }; @@ -884,8 +884,6 @@ errout: static int ctnetlink_done(struct netlink_callback *cb) { - if (cb->args[1]) - nf_ct_put((struct nf_conn *)cb->args[1]); kfree(cb->data); return 0; } @@ -1208,19 +1206,26 @@ ignore_entry: return 0; } +static unsigned long ctnetlink_get_id(const struct nf_conn *ct) +{ + unsigned long id = nf_ct_get_id(ct); + + return id ? id : 1; +} + static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { unsigned int flags = cb->data ? 
NLM_F_DUMP_FILTERED : 0; struct net *net = sock_net(skb->sk); - struct nf_conn *ct, *last; + unsigned long last_id = cb->args[1]; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; struct nf_conn *nf_ct_evict[8]; + struct nf_conn *ct; int res, i; spinlock_t *lockp; - last = (struct nf_conn *)cb->args[1]; i = 0; local_bh_disable(); @@ -1257,7 +1262,7 @@ restart: continue; if (cb->args[1]) { - if (ct != last) + if (ctnetlink_get_id(ct) != last_id) continue; cb->args[1] = 0; } @@ -1270,8 +1275,7 @@ restart: NFNL_MSG_TYPE(cb->nlh->nlmsg_type), ct, true, flags); if (res < 0) { - nf_conntrack_get(&ct->ct_general); - cb->args[1] = (unsigned long)ct; + cb->args[1] = ctnetlink_get_id(ct); spin_unlock(lockp); goto out; } @@ -1284,12 +1288,10 @@ restart: } out: local_bh_enable(); - if (last) { + if (last_id) { /* nf ct hash resize happened, now clear the leftover. */ - if ((struct nf_conn *)cb->args[1] == last) + if (cb->args[1] == last_id) cb->args[1] = 0; - - nf_ct_put(last); } while (i) { @@ -1731,16 +1733,6 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb, return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } -static int ctnetlink_done_list(struct netlink_callback *cb) -{ - struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; - - if (ctx->last) - nf_ct_put(ctx->last); - - return 0; -} - #ifdef CONFIG_NF_CONNTRACK_EVENTS static int ctnetlink_dump_one_entry(struct sk_buff *skb, struct netlink_callback *cb, @@ -1755,11 +1747,11 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb, if (l3proto && nf_ct_l3num(ct) != l3proto) return 0; - if (ctx->last) { - if (ct != ctx->last) + if (ctx->last_id) { + if (ctnetlink_get_id(ct) != ctx->last_id) return 0; - ctx->last = NULL; + ctx->last_id = 0; } /* We can't dump extension info for the unconfirmed @@ -1773,12 +1765,8 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), ct, dying, 0); - if (res < 0) { - if (!refcount_inc_not_zero(&ct->ct_general.use)) - return 0; - - ctx->last = ct; - } + if (res < 0) + ctx->last_id = ctnetlink_get_id(ct); return res; } @@ -1794,10 +1782,10 @@ static int ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) { struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; - struct nf_conn *last = ctx->last; #ifdef CONFIG_NF_CONNTRACK_EVENTS const struct net *net = sock_net(skb->sk); struct nf_conntrack_net_ecache *ecache_net; + unsigned long last_id = ctx->last_id; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; #endif @@ -1805,7 +1793,7 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) if (ctx->done) return 0; - ctx->last = NULL; + ctx->last_id = 0; #ifdef CONFIG_NF_CONNTRACK_EVENTS ecache_net = nf_conn_pernet_ecache(net); @@ -1816,24 +1804,21 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) int res; ct = nf_ct_tuplehash_to_ctrack(h); - if (last && last != ct) + if (last_id && last_id != ctnetlink_get_id(ct)) continue; res = ctnetlink_dump_one_entry(skb, cb, ct, true); if (res < 0) { spin_unlock_bh(&ecache_net->dying_lock); - nf_ct_put(last); return skb->len; } - nf_ct_put(last); - last = NULL; + last_id = 0; } spin_unlock_bh(&ecache_net->dying_lock); #endif ctx->done = true; - nf_ct_put(last); return skb->len; } @@ -1845,7 +1830,6 @@ static int ctnetlink_get_ct_dying(struct sk_buff *skb, if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_dying, - .done = ctnetlink_done_list, }; return 
netlink_dump_start(info->sk, skb, info->nlh, &c); } @@ -1860,7 +1844,6 @@ static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb, if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_unconfirmed, - .done = ctnetlink_done_list, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } @@ -3168,23 +3151,27 @@ errout: return 0; } #endif -static int ctnetlink_exp_done(struct netlink_callback *cb) + +static unsigned long ctnetlink_exp_id(const struct nf_conntrack_expect *exp) { - if (cb->args[1]) - nf_ct_expect_put((struct nf_conntrack_expect *)cb->args[1]); - return 0; + unsigned long id = (unsigned long)exp; + + id += nf_ct_get_id(exp->master); + id += exp->class; + + return id ? id : 1; } static int ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); - struct nf_conntrack_expect *exp, *last; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; + unsigned long last_id = cb->args[1]; + struct nf_conntrack_expect *exp; rcu_read_lock(); - last = (struct nf_conntrack_expect *)cb->args[1]; for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) { restart: hlist_for_each_entry_rcu(exp, &nf_ct_expect_hash[cb->args[0]], @@ -3196,7 +3183,7 @@ restart: continue; if (cb->args[1]) { - if (exp != last) + if (ctnetlink_exp_id(exp) != last_id) continue; cb->args[1] = 0; } @@ -3205,9 +3192,7 @@ restart: cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp) < 0) { - if (!refcount_inc_not_zero(&exp->use)) - continue; - cb->args[1] = (unsigned long)exp; + cb->args[1] = ctnetlink_exp_id(exp); goto out; } } @@ -3218,32 +3203,30 @@ restart: } out: rcu_read_unlock(); - if (last) - nf_ct_expect_put(last); - return skb->len; } static int ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { - struct nf_conntrack_expect *exp, *last; struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh); struct nf_conn *ct = cb->data; struct nf_conn_help *help = nfct_help(ct); u_int8_t l3proto = nfmsg->nfgen_family; + unsigned long last_id = cb->args[1]; + struct nf_conntrack_expect *exp; if (cb->args[0]) return 0; rcu_read_lock(); - last = (struct nf_conntrack_expect *)cb->args[1]; + restart: hlist_for_each_entry_rcu(exp, &help->expectations, lnode) { if (l3proto && exp->tuple.src.l3num != l3proto) continue; if (cb->args[1]) { - if (exp != last) + if (ctnetlink_exp_id(exp) != last_id) continue; cb->args[1] = 0; } @@ -3251,9 +3234,7 @@ restart: cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp) < 0) { - if (!refcount_inc_not_zero(&exp->use)) - continue; - cb->args[1] = (unsigned long)exp; + cb->args[1] = ctnetlink_exp_id(exp); goto out; } } @@ -3264,9 +3245,6 @@ restart: cb->args[0] = 1; out: rcu_read_unlock(); - if (last) - nf_ct_expect_put(last); - return skb->len; } @@ -3285,7 +3263,6 @@ static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl, struct nf_conntrack_zone zone; struct netlink_dump_control c = { .dump = ctnetlink_exp_ct_dump_table, - .done = ctnetlink_exp_done, }; err = ctnetlink_parse_tuple(cda, &tuple, CTA_EXPECT_MASTER, @@ -3335,7 +3312,6 @@ static int ctnetlink_get_expect(struct sk_buff *skb, else { struct netlink_dump_control c = { .dump = ctnetlink_exp_dump_table, - .done = ctnetlink_exp_done, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9b8b10a85233..207b240b14e5 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ 
b/net/netfilter/nf_conntrack_standalone.c @@ -317,6 +317,9 @@ static int ct_seq_show(struct seq_file *s, void *v) smp_acquire__after_ctrl_dep(); if (nf_ct_should_gc(ct)) { + struct ct_iter_state *st = s->private; + + st->skip_elems--; nf_ct_kill(ct); goto release; } @@ -567,16 +570,16 @@ nf_conntrack_log_invalid_sysctl(const struct ctl_table *table, int write, return ret; if (*(u8 *)table->data == 0) - return ret; + return 0; /* Load nf_log_syslog only if no logger is currently registered */ for (i = 0; i < NFPROTO_NUMPROTO; i++) { if (nf_log_is_registered(i)) - return ret; + return 0; } request_module("%s", "nf_log_syslog"); - return ret; + return 0; } static struct ctl_table_header *nf_ct_netfilter_header; @@ -645,7 +648,7 @@ static struct ctl_table nf_ct_sysctl_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_COUNT] = { @@ -926,7 +929,7 @@ static struct ctl_table nf_ct_netfilter_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = SYSCTL_ONE, .extra2 = SYSCTL_INT_MAX, }, }; diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 9441ac3d8c1a..06e8251a6644 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -118,7 +118,10 @@ static int flow_offload_fill_route(struct flow_offload *flow, flow_tuple->in_vlan_ingress |= BIT(j); j++; } + + flow_tuple->tun = route->tuple[dir].in.tun; flow_tuple->encap_num = route->tuple[dir].in.num_encaps; + flow_tuple->tun_num = route->tuple[dir].in.num_tuns; switch (route->tuple[dir].xmit_type) { case FLOW_OFFLOAD_XMIT_DIRECT: @@ -127,11 +130,11 @@ static int flow_offload_fill_route(struct flow_offload *flow, memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source, ETH_ALEN); flow_tuple->out.ifidx = route->tuple[dir].out.ifindex; - flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex; dst_release(dst); break; case FLOW_OFFLOAD_XMIT_XFRM: case FLOW_OFFLOAD_XMIT_NEIGH: + flow_tuple->ifidx = route->tuple[dir].out.ifindex; flow_tuple->dst_cache = dst; flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple); break; diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 8cd4cf7ae211..78883343e5d6 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -145,8 +145,11 @@ static bool ip_has_options(unsigned int thoff) static void nf_flow_tuple_encap(struct sk_buff *skb, struct flow_offload_tuple *tuple) { + __be16 inner_proto = skb->protocol; struct vlan_ethhdr *veth; struct pppoe_hdr *phdr; + struct iphdr *iph; + u16 offset = 0; int i = 0; if (skb_vlan_tag_present(skb)) { @@ -159,13 +162,26 @@ static void nf_flow_tuple_encap(struct sk_buff *skb, veth = (struct vlan_ethhdr *)skb_mac_header(skb); tuple->encap[i].id = ntohs(veth->h_vlan_TCI); tuple->encap[i].proto = skb->protocol; + inner_proto = veth->h_vlan_encapsulated_proto; + offset += VLAN_HLEN; break; case htons(ETH_P_PPP_SES): phdr = (struct pppoe_hdr *)skb_network_header(skb); tuple->encap[i].id = ntohs(phdr->sid); tuple->encap[i].proto = skb->protocol; + inner_proto = *((__be16 *)(phdr + 1)); + offset += PPPOE_SES_HLEN; break; } + + if (inner_proto == htons(ETH_P_IP)) { + iph = (struct iphdr *)(skb_network_header(skb) + offset); + if (iph->protocol == IPPROTO_IPIP) { + tuple->tun.dst_v4.s_addr = iph->daddr; + tuple->tun.src_v4.s_addr = iph->saddr; + 
tuple->tun.l3_proto = IPPROTO_IPIP; + } + } } struct nf_flowtable_ctx { @@ -277,11 +293,46 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } +static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize) +{ + struct iphdr *iph; + u16 size; + + if (!pskb_may_pull(skb, sizeof(*iph) + *psize)) + return false; + + iph = (struct iphdr *)(skb_network_header(skb) + *psize); + size = iph->ihl << 2; + + if (ip_is_fragment(iph) || unlikely(ip_has_options(size))) + return false; + + if (iph->ttl <= 1) + return false; + + if (iph->protocol == IPPROTO_IPIP) + *psize += size; + + return true; +} + +static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb) +{ + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); + + if (iph->protocol != IPPROTO_IPIP) + return; + + skb_pull(skb, iph->ihl << 2); + skb_reset_network_header(skb); +} + static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, u32 *offset) { + __be16 inner_proto = skb->protocol; struct vlan_ethhdr *veth; - __be16 inner_proto; + bool ret = false; switch (skb->protocol) { case htons(ETH_P_8021Q): @@ -291,19 +342,23 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, veth = (struct vlan_ethhdr *)skb_mac_header(skb); if (veth->h_vlan_encapsulated_proto == proto) { *offset += VLAN_HLEN; - return true; + inner_proto = proto; + ret = true; } break; case htons(ETH_P_PPP_SES): if (nf_flow_pppoe_proto(skb, &inner_proto) && inner_proto == proto) { *offset += PPPOE_SES_HLEN; - return true; + ret = true; } break; } - return false; + if (inner_proto == htons(ETH_P_IP)) + ret = nf_flow_ip4_tunnel_proto(skb, offset); + + return ret; } static void nf_flow_encap_pop(struct sk_buff *skb, @@ -331,21 +386,23 @@ static void nf_flow_encap_pop(struct sk_buff *skb, break; } } + + if (skb->protocol == htons(ETH_P_IP)) + nf_flow_ip4_tunnel_pop(skb); } +struct nf_flow_xmit { + const void *dest; + const void *source; + struct net_device *outdev; +}; + static unsigned int nf_flow_queue_xmit(struct net *net, struct sk_buff *skb, - const struct flow_offload_tuple_rhash *tuplehash, - unsigned short type) + struct nf_flow_xmit *xmit) { - struct net_device *outdev; - - outdev = dev_get_by_index_rcu(net, tuplehash->tuple.out.ifidx); - if (!outdev) - return NF_DROP; - - skb->dev = outdev; - dev_hard_header(skb, skb->dev, type, tuplehash->tuple.out.h_dest, - tuplehash->tuple.out.h_source, skb->len); + skb->dev = xmit->outdev; + dev_hard_header(skb, skb->dev, ntohs(skb->protocol), + xmit->dest, xmit->source, skb->len); dev_queue_xmit(skb); return NF_STOLEN; @@ -357,8 +414,7 @@ nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, { struct flow_offload_tuple tuple = {}; - if (skb->protocol != htons(ETH_P_IP) && - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) + if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) return NULL; if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0) @@ -381,6 +437,9 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; + if (flow->tuplehash[!dir].tuple.tun_num) + mtu -= sizeof(*iph); + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return 0; @@ -414,20 +473,139 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, return 1; } +static int nf_flow_pppoe_push(struct sk_buff *skb, u16 id) +{ + int data_len = skb->len + sizeof(__be16); + struct ppp_hdr { + struct pppoe_hdr hdr; + __be16 proto; + } *ph; + 
__be16 proto; + + if (skb_cow_head(skb, PPPOE_SES_HLEN)) + return -1; + + switch (skb->protocol) { + case htons(ETH_P_IP): + proto = htons(PPP_IP); + break; + case htons(ETH_P_IPV6): + proto = htons(PPP_IPV6); + break; + default: + return -1; + } + + __skb_push(skb, PPPOE_SES_HLEN); + skb_reset_network_header(skb); + + ph = (struct ppp_hdr *)(skb->data); + ph->hdr.ver = 1; + ph->hdr.type = 1; + ph->hdr.code = 0; + ph->hdr.sid = htons(id); + ph->hdr.length = htons(data_len); + ph->proto = proto; + skb->protocol = htons(ETH_P_PPP_SES); + + return 0; +} + +static int nf_flow_tunnel_ipip_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + __be32 *ip_daddr) +{ + struct iphdr *iph = (struct iphdr *)skb_network_header(skb); + struct rtable *rt = dst_rtable(tuple->dst_cache); + u8 tos = iph->tos, ttl = iph->ttl; + __be16 frag_off = iph->frag_off; + u32 headroom = sizeof(*iph); + int err; + + err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP4); + if (err) + return err; + + skb_set_inner_ipproto(skb, IPPROTO_IPIP); + headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; + err = skb_cow_head(skb, headroom); + if (err) + return err; + + skb_scrub_packet(skb, true); + skb_clear_hash_if_not_l4(skb); + + /* Push down and install the IP header. */ + skb_push(skb, sizeof(*iph)); + skb_reset_network_header(skb); + + iph = ip_hdr(skb); + iph->version = 4; + iph->ihl = sizeof(*iph) >> 2; + iph->frag_off = ip_mtu_locked(&rt->dst) ? 0 : frag_off; + iph->protocol = tuple->tun.l3_proto; + iph->tos = tos; + iph->daddr = tuple->tun.src_v4.s_addr; + iph->saddr = tuple->tun.dst_v4.s_addr; + iph->ttl = ttl; + iph->tot_len = htons(skb->len); + __ip_select_ident(net, iph, skb_shinfo(skb)->gso_segs ?: 1); + ip_send_check(iph); + + *ip_daddr = tuple->tun.src_v4.s_addr; + + return 0; +} + +static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + __be32 *ip_daddr) +{ + if (tuple->tun_num) + return nf_flow_tunnel_ipip_push(net, skb, tuple, ip_daddr); + + return 0; +} + +static int nf_flow_encap_push(struct sk_buff *skb, + struct flow_offload_tuple *tuple) +{ + int i; + + for (i = 0; i < tuple->encap_num; i++) { + switch (tuple->encap[i].proto) { + case htons(ETH_P_8021Q): + case htons(ETH_P_8021AD): + if (skb_vlan_push(skb, tuple->encap[i].proto, + tuple->encap[i].id) < 0) + return -1; + break; + case htons(ETH_P_PPP_SES): + if (nf_flow_pppoe_push(skb, tuple->encap[i].id) < 0) + return -1; + break; + } + } + + return 0; +} + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct flow_offload_tuple_rhash *tuplehash; struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple *other_tuple; enum flow_offload_tuple_dir dir; struct nf_flowtable_ctx ctx = { .in = state->in, }; + struct nf_flow_xmit xmit = {}; struct flow_offload *flow; - struct net_device *outdev; + struct neighbour *neigh; struct rtable *rt; - __be32 nexthop; + __be32 ip_daddr; int ret; tuplehash = nf_flow_offload_lookup(&ctx, flow_table, skb); @@ -450,29 +628,46 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + other_tuple = &flow->tuplehash[!dir].tuple; + ip_daddr = other_tuple->src_v4.s_addr; + + if (nf_flow_tunnel_v4_push(state->net, skb, other_tuple, &ip_daddr) < 0) + return NF_DROP; + + if (nf_flow_encap_push(skb, other_tuple) < 0) + return NF_DROP; switch (tuplehash->tuple.xmit_type) { case 
FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rtable(tuplehash->tuple.dst_cache); - outdev = rt->dst.dev; - skb->dev = outdev; - nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx); + if (!xmit.outdev) { + flow_offload_teardown(flow); + return NF_DROP; + } + neigh = ip_neigh_gw4(rt->dst.dev, rt_nexthop(rt, ip_daddr)); + if (IS_ERR(neigh)) { + flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = neigh->ha; skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ARP_TABLE, outdev, &nexthop, skb); - ret = NF_STOLEN; break; case FLOW_OFFLOAD_XMIT_DIRECT: - ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IP); - if (ret == NF_DROP) + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); + if (!xmit.outdev) { flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = tuplehash->tuple.out.h_dest; + xmit.source = tuplehash->tuple.out.h_source; break; default: WARN_ON_ONCE(1); - ret = NF_DROP; - break; + return NF_DROP; } - return ret; + return nf_flow_queue_xmit(state->net, skb, &xmit); } EXPORT_SYMBOL_GPL(nf_flow_offload_ip_hook); @@ -715,13 +910,15 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, { struct flow_offload_tuple_rhash *tuplehash; struct nf_flowtable *flow_table = priv; + struct flow_offload_tuple *other_tuple; enum flow_offload_tuple_dir dir; struct nf_flowtable_ctx ctx = { .in = state->in, }; - const struct in6_addr *nexthop; + struct nf_flow_xmit xmit = {}; + struct in6_addr *ip6_daddr; struct flow_offload *flow; - struct net_device *outdev; + struct neighbour *neigh; struct rt6_info *rt; int ret; @@ -745,28 +942,42 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, dir = tuplehash->tuple.dir; flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); + other_tuple = &flow->tuplehash[!dir].tuple; + ip6_daddr = &other_tuple->src_v6; + + if (nf_flow_encap_push(skb, other_tuple) < 0) + return NF_DROP; switch (tuplehash->tuple.xmit_type) { case FLOW_OFFLOAD_XMIT_NEIGH: rt = dst_rt6_info(tuplehash->tuple.dst_cache); - outdev = rt->dst.dev; - skb->dev = outdev; - nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.ifidx); + if (!xmit.outdev) { + flow_offload_teardown(flow); + return NF_DROP; + } + neigh = ip_neigh_gw6(rt->dst.dev, rt6_nexthop(rt, ip6_daddr)); + if (IS_ERR(neigh)) { + flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = neigh->ha; skb_dst_set_noref(skb, &rt->dst); - neigh_xmit(NEIGH_ND_TABLE, outdev, nexthop, skb); - ret = NF_STOLEN; break; case FLOW_OFFLOAD_XMIT_DIRECT: - ret = nf_flow_queue_xmit(state->net, skb, tuplehash, ETH_P_IPV6); - if (ret == NF_DROP) + xmit.outdev = dev_get_by_index_rcu(state->net, tuplehash->tuple.out.ifidx); + if (!xmit.outdev) { flow_offload_teardown(flow); + return NF_DROP; + } + xmit.dest = tuplehash->tuple.out.h_dest; + xmit.source = tuplehash->tuple.out.h_source; break; default: WARN_ON_ONCE(1); - ret = NF_DROP; - break; + return NF_DROP; } - return ret; + return nf_flow_queue_xmit(state->net, skb, &xmit); } EXPORT_SYMBOL_GPL(nf_flow_offload_ipv6_hook); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index e06bc36f49fe..d8f7bfd60ac6 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -555,7 +555,7 @@ static void flow_offload_redirect(struct net *net, switch (this_tuple->xmit_type) { case 
FLOW_OFFLOAD_XMIT_DIRECT: this_tuple = &flow->tuplehash[dir].tuple; - ifindex = this_tuple->out.hw_ifidx; + ifindex = this_tuple->out.ifidx; break; case FLOW_OFFLOAD_XMIT_NEIGH: other_tuple = &flow->tuplehash[!dir].tuple; diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c new file mode 100644 index 000000000000..f0984cf69a09 --- /dev/null +++ b/net/netfilter/nf_flow_table_path.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/spinlock.h> +#include <linux/netfilter/nf_conntrack_common.h> +#include <linux/netfilter/nf_tables.h> +#include <net/ip.h> +#include <net/inet_dscp.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_extend.h> +#include <net/netfilter/nf_flow_table.h> + +static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) +{ + if (dst_xfrm(dst)) + return FLOW_OFFLOAD_XMIT_XFRM; + + return FLOW_OFFLOAD_XMIT_NEIGH; +} + +static void nft_default_forward_path(struct nf_flow_route *route, + struct dst_entry *dst_cache, + enum ip_conntrack_dir dir) +{ + route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; + route->tuple[dir].dst = dst_cache; + route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); +} + +static bool nft_is_valid_ether_device(const struct net_device *dev) +{ + if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || + dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) + return false; + + return true; +} + +static int nft_dev_fill_forward_path(const struct nf_flow_route *route, + const struct dst_entry *dst_cache, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, u8 *ha, + struct net_device_path_stack *stack) +{ + const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; + struct net_device *dev = dst_cache->dev; + struct neighbour *n; + u8 nud_state; + + if (!nft_is_valid_ether_device(dev)) + goto out; + + n = dst_neigh_lookup(dst_cache, daddr); + if (!n) + return -1; + + read_lock_bh(&n->lock); + nud_state = n->nud_state; + ether_addr_copy(ha, n->ha); + read_unlock_bh(&n->lock); + neigh_release(n); + + if (!(nud_state & NUD_VALID)) + return -1; + +out: + return dev_fill_forward_path(dev, ha, stack); +} + +struct nft_forward_info { + const struct net_device *indev; + const struct net_device *outdev; + struct id { + __u16 id; + __be16 proto; + } encap[NF_FLOW_TABLE_ENCAP_MAX]; + u8 num_encaps; + struct flow_offload_tunnel tun; + u8 num_tuns; + u8 ingress_vlans; + u8 h_source[ETH_ALEN]; + u8 h_dest[ETH_ALEN]; + enum flow_offload_xmit_type xmit_type; +}; + +static void nft_dev_path_info(const struct net_device_path_stack *stack, + struct nft_forward_info *info, + unsigned char *ha, struct nf_flowtable *flowtable) +{ + const struct net_device_path *path; + int i; + + memcpy(info->h_dest, ha, ETH_ALEN); + + for (i = 0; i < stack->num_paths; i++) { + path = &stack->path[i]; + switch (path->type) { + case DEV_PATH_ETHERNET: + case DEV_PATH_DSA: + case DEV_PATH_VLAN: + case DEV_PATH_PPPOE: + case DEV_PATH_TUN: + info->indev = path->dev; + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + if (path->type == DEV_PATH_ETHERNET) + break; + if (path->type == DEV_PATH_DSA) { + i = stack->num_paths; + break; + } + + /* DEV_PATH_VLAN, DEV_PATH_PPPOE and DEV_PATH_TUN */ + if 
(path->type == DEV_PATH_TUN) { + if (info->num_tuns) { + info->indev = NULL; + break; + } + info->tun.src_v6 = path->tun.src_v6; + info->tun.dst_v6 = path->tun.dst_v6; + info->tun.l3_proto = path->tun.l3_proto; + info->num_tuns++; + } else { + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { + info->indev = NULL; + break; + } + info->encap[info->num_encaps].id = + path->encap.id; + info->encap[info->num_encaps].proto = + path->encap.proto; + info->num_encaps++; + } + if (path->type == DEV_PATH_PPPOE) + memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); + break; + case DEV_PATH_BRIDGE: + if (is_zero_ether_addr(info->h_source)) + memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); + + switch (path->bridge.vlan_mode) { + case DEV_PATH_BR_VLAN_UNTAG_HW: + info->ingress_vlans |= BIT(info->num_encaps - 1); + break; + case DEV_PATH_BR_VLAN_TAG: + if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { + info->indev = NULL; + break; + } + info->encap[info->num_encaps].id = path->bridge.vlan_id; + info->encap[info->num_encaps].proto = path->bridge.vlan_proto; + info->num_encaps++; + break; + case DEV_PATH_BR_VLAN_UNTAG: + if (WARN_ON_ONCE(info->num_encaps-- == 0)) { + info->indev = NULL; + break; + } + break; + case DEV_PATH_BR_VLAN_KEEP: + break; + } + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; + break; + default: + info->indev = NULL; + break; + } + } + info->outdev = info->indev; + + if (nf_flowtable_hw_offload(flowtable) && + nft_is_valid_ether_device(info->indev)) + info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; +} + +static bool nft_flowtable_find_dev(const struct net_device *dev, + struct nft_flowtable *ft) +{ + struct nft_hook *hook; + bool found = false; + + list_for_each_entry_rcu(hook, &ft->hook_list, list) { + if (!nft_hook_find_ops_rcu(hook, dev)) + continue; + + found = true; + break; + } + + return found; +} + +static int nft_flow_tunnel_update_route(const struct nft_pktinfo *pkt, + struct flow_offload_tunnel *tun, + struct nf_flow_route *route, + enum ip_conntrack_dir dir) +{ + struct dst_entry *cur_dst = route->tuple[dir].dst; + struct dst_entry *tun_dst = NULL; + struct flowi fl = {}; + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = tun->dst_v4.s_addr; + fl.u.ip4.saddr = tun->src_v4.s_addr; + fl.u.ip4.flowi4_iif = nft_in(pkt)->ifindex; + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); + fl.u.ip4.flowi4_mark = pkt->skb->mark; + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = tun->dst_v6; + fl.u.ip6.saddr = tun->src_v6; + fl.u.ip6.flowi6_iif = nft_in(pkt)->ifindex; + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); + fl.u.ip6.flowi6_mark = pkt->skb->mark; + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; + break; + } + + nf_route(nft_net(pkt), &tun_dst, &fl, false, nft_pf(pkt)); + if (!tun_dst) + return -ENOENT; + + route->tuple[dir].dst = tun_dst; + dst_release(cur_dst); + + return 0; +} + +static void nft_dev_forward_path(const struct nft_pktinfo *pkt, + struct nf_flow_route *route, + const struct nf_conn *ct, + enum ip_conntrack_dir dir, + struct nft_flowtable *ft) +{ + const struct dst_entry *dst = route->tuple[dir].dst; + struct net_device_path_stack stack; + struct nft_forward_info info = {}; + unsigned char ha[ETH_ALEN]; + int i; + + if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) + nft_dev_path_info(&stack, &info, ha, &ft->data); + + if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) + return; + + route->tuple[!dir].in.ifindex = info.indev->ifindex; + for (i = 0; i < 
info.num_encaps; i++) { + route->tuple[!dir].in.encap[i].id = info.encap[i].id; + route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; + } + + if (info.num_tuns && + !nft_flow_tunnel_update_route(pkt, &info.tun, route, dir)) { + route->tuple[!dir].in.tun.src_v6 = info.tun.dst_v6; + route->tuple[!dir].in.tun.dst_v6 = info.tun.src_v6; + route->tuple[!dir].in.tun.l3_proto = info.tun.l3_proto; + route->tuple[!dir].in.num_tuns = info.num_tuns; + } + + route->tuple[!dir].in.num_encaps = info.num_encaps; + route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; + route->tuple[dir].out.ifindex = info.outdev->ifindex; + + if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { + memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); + memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); + route->tuple[dir].xmit_type = info.xmit_type; + } +} + +int nft_flow_route(const struct nft_pktinfo *pkt, const struct nf_conn *ct, + struct nf_flow_route *route, enum ip_conntrack_dir dir, + struct nft_flowtable *ft) +{ + struct dst_entry *this_dst = skb_dst(pkt->skb); + struct dst_entry *other_dst = NULL; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; + fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; + fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; + fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); + fl.u.ip4.flowi4_mark = pkt->skb->mark; + fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; + fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; + fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; + fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; + fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); + fl.u.ip6.flowi6_mark = pkt->skb->mark; + fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; + break; + } + + if (!dst_hold_safe(this_dst)) + return -ENOENT; + + nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); + if (!other_dst) { + dst_release(this_dst); + return -ENOENT; + } + + nft_default_forward_path(route, this_dst, dir); + nft_default_forward_path(route, other_dst, !dir); + + if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) + nft_dev_forward_path(pkt, route, ct, dir, ft); + if (route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) + nft_dev_forward_path(pkt, route, ct, !dir, ft); + + return 0; +} +EXPORT_SYMBOL_GPL(nft_flow_route); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 13d0ed9d1895..f3de2f9bbebf 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -151,12 +151,12 @@ static void nft_ctx_init(struct nft_ctx *ctx, bitmap_zero(ctx->reg_inited, NFT_REG32_NUM); } -static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, - int msg_type, u32 size, gfp_t gfp) +static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, + int msg_type, u32 size) { struct nft_trans *trans; - trans = kzalloc(size, gfp); + trans = kzalloc(size, GFP_KERNEL); if (trans == NULL) return NULL; @@ -172,12 +172,6 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, return trans; } -static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, - int msg_type, u32 size) -{ - return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL); -} - static struct nft_trans_binding *nft_trans_get_binding(struct nft_trans *trans) { switch (trans->msg_type) { @@ -442,8 
+436,7 @@ static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net, struct nft_trans_elem *tail, - struct nft_trans_elem *trans, - gfp_t gfp) + struct nft_trans_elem *trans) { unsigned int nelems, old_nelems = tail->nelems; struct nft_trans_elem *new_trans; @@ -466,9 +459,11 @@ static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net, /* krealloc might free tail which invalidates list pointers */ list_del_init(&tail->nft_trans.list); - new_trans = krealloc(tail, struct_size(tail, elems, nelems), gfp); + new_trans = krealloc(tail, struct_size(tail, elems, nelems), + GFP_KERNEL); if (!new_trans) { - list_add_tail(&tail->nft_trans.list, &nft_net->commit_list); + list_add_tail(&tail->nft_trans.list, + &nft_net->commit_list); return false; } @@ -484,7 +479,7 @@ static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net, } static bool nft_trans_try_collapse(struct nftables_pernet *nft_net, - struct nft_trans *trans, gfp_t gfp) + struct nft_trans *trans) { struct nft_trans *tail; @@ -501,7 +496,7 @@ static bool nft_trans_try_collapse(struct nftables_pernet *nft_net, case NFT_MSG_DELSETELEM: return nft_trans_collapse_set_elem(nft_net, nft_trans_container_elem(tail), - nft_trans_container_elem(trans), gfp); + nft_trans_container_elem(trans)); } return false; @@ -537,17 +532,14 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr } } -static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans, - gfp_t gfp) +static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans) { struct nftables_pernet *nft_net = nft_pernet(net); WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM && trans->msg_type != NFT_MSG_DELSETELEM); - might_alloc(gfp); - - if (nft_trans_try_collapse(nft_net, trans, gfp)) { + if (nft_trans_try_collapse(nft_net, trans)) { kfree(trans); return; } @@ -1131,11 +1123,14 @@ nf_tables_chain_type_lookup(struct net *net, const struct nlattr *nla, return ERR_PTR(-ENOENT); } -static __be16 nft_base_seq(const struct net *net) +static unsigned int nft_base_seq(const struct net *net) { - struct nftables_pernet *nft_net = nft_pernet(net); + return READ_ONCE(net->nft.base_seq); +} - return htons(nft_net->base_seq & 0xffff); +static __be16 nft_base_seq_be16(const struct net *net) +{ + return htons(nft_base_seq(net) & 0xffff); } static const struct nla_policy nft_table_policy[NFTA_TABLE_MAX + 1] = { @@ -1155,7 +1150,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -1248,7 +1243,7 @@ static int nf_tables_dump_tables(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -1959,6 +1954,18 @@ nla_put_failure: return -ENOSPC; } +static bool hook_is_prefix(struct nft_hook *hook) +{ + return strlen(hook->ifname) >= hook->ifnamelen; +} + +static int nft_nla_put_hook_dev(struct sk_buff *skb, struct nft_hook *hook) +{ + int attr = hook_is_prefix(hook) ? 
NFTA_DEVICE_PREFIX : NFTA_DEVICE_NAME; + + return nla_put_string(skb, attr, hook->ifname); +} + static int nft_dump_basechain_hook(struct sk_buff *skb, const struct net *net, int family, const struct nft_base_chain *basechain, @@ -1990,16 +1997,15 @@ static int nft_dump_basechain_hook(struct sk_buff *skb, if (!first) first = hook; - if (nla_put(skb, NFTA_DEVICE_NAME, - hook->ifnamelen, hook->ifname)) + if (nft_nla_put_hook_dev(skb, hook)) goto nla_put_failure; n++; } nla_nest_end(skb, nest_devs); if (n == 1 && - nla_put(skb, NFTA_HOOK_DEV, - first->ifnamelen, first->ifname)) + !hook_is_prefix(first) && + nla_put_string(skb, NFTA_HOOK_DEV, first->ifname)) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -2019,7 +2025,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -2122,7 +2128,7 @@ static int nf_tables_dump_chains(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -2310,7 +2316,8 @@ void nf_tables_chain_destroy(struct nft_chain *chain) } static struct nft_hook *nft_netdev_hook_alloc(struct net *net, - const struct nlattr *attr) + const struct nlattr *attr, + bool prefix) { struct nf_hook_ops *ops; struct net_device *dev; @@ -2327,7 +2334,8 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, if (err < 0) goto err_hook_free; - hook->ifnamelen = nla_len(attr); + /* include the terminating NUL-char when comparing non-prefixes */ + hook->ifnamelen = strlen(hook->ifname) + !prefix; /* nf_tables_netdev_event() is called under rtnl_mutex, this is * indirectly serializing all the other holders of the commit_mutex with @@ -2374,14 +2382,22 @@ static int nf_tables_parse_netdev_hooks(struct net *net, struct nft_hook *hook, *next; const struct nlattr *tmp; int rem, n = 0, err; + bool prefix; nla_for_each_nested(tmp, attr, rem) { - if (nla_type(tmp) != NFTA_DEVICE_NAME) { + switch (nla_type(tmp)) { + case NFTA_DEVICE_NAME: + prefix = false; + break; + case NFTA_DEVICE_PREFIX: + prefix = true; + break; + default: err = -EINVAL; goto err_hook; } - hook = nft_netdev_hook_alloc(net, tmp); + hook = nft_netdev_hook_alloc(net, tmp, prefix); if (IS_ERR(hook)) { NL_SET_BAD_ATTR(extack, tmp); err = PTR_ERR(hook); @@ -2427,7 +2443,7 @@ static int nft_chain_parse_netdev(struct net *net, struct nlattr *tb[], int err; if (tb[NFTA_HOOK_DEV]) { - hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV]); + hook = nft_netdev_hook_alloc(net, tb[NFTA_HOOK_DEV], false); if (IS_ERR(hook)) { NL_SET_BAD_ATTR(extack, tb[NFTA_HOOK_DEV]); return PTR_ERR(hook); @@ -2803,6 +2819,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, struct nft_chain *chain = ctx->chain; struct nft_chain_hook hook = {}; struct nft_stats __percpu *stats = NULL; + struct nftables_pernet *nft_net; struct nft_hook *h, *next; struct nf_hook_ops *ops; struct nft_trans *trans; @@ -2845,6 +2862,20 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (nft_hook_list_find(&basechain->hook_list, h)) { list_del(&h->list); nft_netdev_hook_free(h); + continue; + } + + nft_net = nft_pernet(ctx->net); + list_for_each_entry(trans, &nft_net->commit_list, 
list) { + if (trans->msg_type != NFT_MSG_NEWCHAIN || + trans->table != ctx->table || + !nft_trans_chain_update(trans)) + continue; + + if (nft_hook_list_find(&nft_trans_chain_hooks(trans), h)) { + nft_chain_release_hook(&hook); + return -EEXIST; + } } } } else { @@ -3635,7 +3666,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, type, flags, family, NFNETLINK_V0, - nft_base_seq(net)); + nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -3803,7 +3834,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -4014,7 +4045,7 @@ static int nf_tables_getrule_reset(struct sk_buff *skb, buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_RULE_TABLE]), (char *)nla_data(nla[NFTA_RULE_TABLE]), - nft_net->base_seq); + nft_base_seq(net)); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_RULE_RESET, GFP_ATOMIC); kfree(buf); @@ -4851,7 +4882,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), flags, ctx->family, NFNETLINK_V0, - nft_base_seq(ctx->net)); + nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -4996,7 +5027,7 @@ static int nf_tables_dump_sets(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (ctx->family != NFPROTO_UNSPEC && @@ -5739,7 +5770,11 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding) { struct nft_set_binding *i; - struct nft_set_iter iter; + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, + .fn = nf_tables_bind_check_setelem, + }; if (!list_empty(&set->bindings) && nft_set_is_anonymous(set)) return -EBUSY; @@ -5754,13 +5789,6 @@ int nf_tables_bind_set(const struct nft_ctx *ctx, struct nft_set *set, goto bind; } - iter.genmask = nft_genmask_next(ctx->net); - iter.type = NFT_ITER_UPDATE; - iter.skip = 0; - iter.count = 0; - iter.err = 0; - iter.fn = nf_tables_bind_check_setelem; - set->ops->walk(ctx, set, &iter); if (!iter.err) iter.err = nft_set_catchall_bind_check(ctx, set); @@ -6164,7 +6192,17 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) struct nftables_pernet *nft_net; struct nft_table *table; struct nft_set *set; - struct nft_set_dump_args args; + struct nft_set_dump_args args = { + .cb = cb, + .skb = skb, + .reset = dump_ctx->reset, + .iter = { + .genmask = nft_genmask_cur(net), + .type = NFT_ITER_READ, + .skip = cb->args[0], + .fn = nf_tables_dump_setelem, + }, + }; bool set_found = false; struct nlmsghdr *nlh; struct nlattr *nest; @@ -6173,7 +6211,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (dump_ctx->ctx.family != NFPROTO_UNSPEC && @@ -6202,7 +6240,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) seq = 
cb->nlh->nlmsg_seq; nlh = nfnl_msg_put(skb, portid, seq, event, NLM_F_MULTI, - table->family, NFNETLINK_V0, nft_base_seq(net)); + table->family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -6215,15 +6253,6 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto nla_put_failure; - args.cb = cb; - args.skb = skb; - args.reset = dump_ctx->reset; - args.iter.genmask = nft_genmask_cur(net); - args.iter.type = NFT_ITER_READ; - args.iter.skip = cb->args[0]; - args.iter.count = 0; - args.iter.err = 0; - args.iter.fn = nf_tables_dump_setelem; set->ops->walk(&dump_ctx->ctx, set, &args.iter); if (!args.iter.err && args.iter.count == cb->args[0]) @@ -6295,7 +6324,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nfnl_msg_put(skb, portid, seq, event, flags, ctx->family, - NFNETLINK_V0, nft_base_seq(ctx->net)); + NFNETLINK_V0, nft_base_seq_be16(ctx->net)); if (!nlh) goto nla_put_failure; @@ -6594,7 +6623,7 @@ static int nf_tables_getsetelem_reset(struct sk_buff *skb, } nelems++; } - audit_log_nft_set_reset(dump_ctx.ctx.table, nft_net->base_seq, nelems); + audit_log_nft_set_reset(dump_ctx.ctx.table, nft_base_seq(info->net), nelems); out_unlock: rcu_read_unlock(); @@ -7534,7 +7563,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } ue->priv = elem_priv; - nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); + nft_trans_commit_list_add_elem(ctx->net, trans); goto err_elem_free; } } @@ -7558,7 +7587,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } nft_trans_container_elem(trans)->elems[0].priv = elem.priv; - nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); + nft_trans_commit_list_add_elem(ctx->net, trans); return 0; err_set_full: @@ -7824,7 +7853,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, nft_setelem_data_deactivate(ctx->net, set, elem.priv); nft_trans_container_elem(trans)->elems[0].priv = elem.priv; - nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); + nft_trans_commit_list_add_elem(ctx->net, trans); return 0; fail_ops: @@ -7849,9 +7878,8 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, iter->genmask)) return 0; - trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, - struct_size_t(struct nft_trans_elem, elems, 1), - GFP_ATOMIC); + trans = nft_trans_alloc(ctx, NFT_MSG_DELSETELEM, + struct_size_t(struct nft_trans_elem, elems, 1)); if (!trans) return -ENOMEM; @@ -7862,7 +7890,7 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, nft_trans_elem_set(trans) = set; nft_trans_container_elem(trans)->nelems = 1; nft_trans_container_elem(trans)->elems[0].priv = elem_priv; - nft_trans_commit_list_add_elem(ctx->net, trans, GFP_ATOMIC); + nft_trans_commit_list_add_elem(ctx->net, trans); return 0; } @@ -7879,7 +7907,7 @@ static int __nft_set_catchall_flush(const struct nft_ctx *ctx, nft_setelem_data_deactivate(ctx->net, set, elem_priv); nft_trans_container_elem(trans)->elems[0].priv = elem_priv; - nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); + nft_trans_commit_list_add_elem(ctx->net, trans); return 0; } @@ -8345,7 +8373,7 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) 
goto nla_put_failure; @@ -8410,7 +8438,7 @@ static int nf_tables_dump_obj(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -8444,7 +8472,7 @@ cont: idx++; } if (ctx->reset && entries) - audit_log_obj_reset(table, nft_net->base_seq, entries); + audit_log_obj_reset(table, nft_base_seq(net), entries); if (rc < 0) break; } @@ -8613,7 +8641,7 @@ static int nf_tables_getobj_reset(struct sk_buff *skb, buf = kasprintf(GFP_ATOMIC, "%.*s:%u", nla_len(nla[NFTA_OBJ_TABLE]), (char *)nla_data(nla[NFTA_OBJ_TABLE]), - nft_net->base_seq); + nft_base_seq(net)); audit_log_nfcfg(buf, info->nfmsg->nfgen_family, 1, AUDIT_NFT_OP_OBJ_RESET, GFP_ATOMIC); kfree(buf); @@ -8718,9 +8746,8 @@ void nft_obj_notify(struct net *net, const struct nft_table *table, struct nft_object *obj, u32 portid, u32 seq, int event, u16 flags, int family, int report, gfp_t gfp) { - struct nftables_pernet *nft_net = nft_pernet(net); char *buf = kasprintf(gfp, "%s:%u", - table->name, nft_net->base_seq); + table->name, nft_base_seq(net)); audit_log_nfcfg(buf, family, @@ -9060,6 +9087,7 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, { const struct nlattr * const *nla = ctx->nla; struct nft_flowtable_hook flowtable_hook; + struct nftables_pernet *nft_net; struct nft_hook *hook, *next; struct nf_hook_ops *ops; struct nft_trans *trans; @@ -9076,6 +9104,20 @@ static int nft_flowtable_update(struct nft_ctx *ctx, const struct nlmsghdr *nlh, if (nft_hook_list_find(&flowtable->hook_list, hook)) { list_del(&hook->list); nft_netdev_hook_free(hook); + continue; + } + + nft_net = nft_pernet(ctx->net); + list_for_each_entry(trans, &nft_net->commit_list, list) { + if (trans->msg_type != NFT_MSG_NEWFLOWTABLE || + trans->table != ctx->table || + !nft_trans_flowtable_update(trans)) + continue; + + if (nft_hook_list_find(&nft_trans_flowtable_hooks(trans), hook)) { + err = -EEXIST; + goto err_flowtable_update_hook; + } } } @@ -9391,7 +9433,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, nlh = nfnl_msg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event), - flags, family, NFNETLINK_V0, nft_base_seq(net)); + flags, family, NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; @@ -9428,8 +9470,7 @@ static int nf_tables_fill_flowtable_info(struct sk_buff *skb, struct net *net, list_for_each_entry_rcu(hook, hook_list, list, lockdep_commit_lock_is_held(net)) { - if (nla_put(skb, NFTA_DEVICE_NAME, - hook->ifnamelen, hook->ifname)) + if (nft_nla_put_hook_dev(skb, hook)) goto nla_put_failure; } nla_nest_end(skb, nest_devs); @@ -9461,7 +9502,7 @@ static int nf_tables_dump_flowtable(struct sk_buff *skb, rcu_read_lock(); nft_net = nft_pernet(net); - cb->seq = READ_ONCE(nft_net->base_seq); + cb->seq = nft_base_seq(net); list_for_each_entry_rcu(table, &nft_net->tables, list) { if (family != NFPROTO_UNSPEC && family != table->family) @@ -9646,17 +9687,16 @@ static void nf_tables_flowtable_destroy(struct nft_flowtable *flowtable) static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, u32 portid, u32 seq) { - struct nftables_pernet *nft_net = nft_pernet(net); struct nlmsghdr *nlh; char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); nlh = nfnl_msg_put(skb, portid, seq, event, 0, AF_UNSPEC, - 
NFNETLINK_V0, nft_base_seq(net)); + NFNETLINK_V0, nft_base_seq_be16(net)); if (!nlh) goto nla_put_failure; - if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_net->base_seq)) || + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(nft_base_seq(net))) || nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) goto nla_put_failure; @@ -10918,11 +10958,12 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) * Bump generation counter, invalidate any dump in progress. * Cannot fail after this point. */ - base_seq = READ_ONCE(nft_net->base_seq); + base_seq = nft_base_seq(net); while (++base_seq == 0) ; - WRITE_ONCE(nft_net->base_seq, base_seq); + /* pairs with smp_load_acquire in nft_lookup_eval */ + smp_store_release(&net->nft.base_seq, base_seq); gc_seq = nft_gc_seq_begin(nft_net); @@ -11131,7 +11172,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) nft_commit_notify(net, NETLINK_CB(skb).portid); nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); - nf_tables_commit_audit_log(&adl, nft_net->base_seq); + nf_tables_commit_audit_log(&adl, nft_base_seq(net)); nft_gc_seq_end(nft_net, gc_seq); nft_net->validate_state = NFT_VALIDATE_SKIP; @@ -11456,7 +11497,7 @@ static bool nf_tables_valid_genid(struct net *net, u32 genid) mutex_lock(&nft_net->commit_mutex); nft_net->tstamp = get_jiffies_64(); - genid_ok = genid == 0 || nft_net->base_seq == genid; + genid_ok = genid == 0 || nft_base_seq(net) == genid; if (!genid_ok) mutex_unlock(&nft_net->commit_mutex); @@ -12093,7 +12134,7 @@ static int __net_init nf_tables_init_net(struct net *net) INIT_LIST_HEAD(&nft_net->module_list); INIT_LIST_HEAD(&nft_net->notify_list); mutex_init(&nft_net->commit_mutex); - nft_net->base_seq = 1; + net->nft.base_seq = 1; nft_net->gc_seq = 0; nft_net->validate_state = NFT_VALIDATE_SKIP; INIT_WORK(&nft_net->destroy_work, nf_tables_trans_destroy_work); diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index e598a2a252b0..811d02b4c4f7 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -376,6 +376,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, const struct nfnetlink_subsystem *ss; const struct nfnl_callback *nc; struct netlink_ext_ack extack; + struct nlmsghdr *onlh = nlh; LIST_HEAD(err_list); u32 status; int err; @@ -386,6 +387,7 @@ replay: status = 0; replay_abort: skb = netlink_skb_clone(oskb, GFP_KERNEL); + nlh = onlh; if (!skb) return netlink_ack(oskb, nlh, -ENOMEM, NULL); diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index 92b984fa8175..657764774a2d 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -24,33 +24,27 @@ static inline void nft_connlimit_do_eval(struct nft_connlimit *priv, const struct nft_pktinfo *pkt, const struct nft_set_ext *ext) { - const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; - const struct nf_conntrack_tuple *tuple_ptr; - struct nf_conntrack_tuple tuple; - enum ip_conntrack_info ctinfo; - const struct nf_conn *ct; unsigned int count; + int err; - tuple_ptr = &tuple; - - ct = nf_ct_get(pkt->skb, &ctinfo); - if (ct != NULL) { - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - zone = nf_ct_zone(ct); - } else if (!nf_ct_get_tuplepr(pkt->skb, skb_network_offset(pkt->skb), - nft_pf(pkt), nft_net(pkt), &tuple)) { - regs->verdict.code = NF_DROP; - return; - } - - if (nf_conncount_add(nft_net(pkt), priv->list, tuple_ptr, zone)) { - regs->verdict.code = NF_DROP; - 
return; + err = nf_conncount_add_skb(nft_net(pkt), pkt->skb, nft_pf(pkt), priv->list); + if (err) { + if (err == -EEXIST) { + /* Call gc to update the list count if any connection has + * been closed already. This is useful for softlimit + * connections like limiting bandwidth based on a number + * of open connections. + */ + nf_conncount_gc_list(nft_net(pkt), priv->list); + } else { + regs->verdict.code = NF_DROP; + return; + } } - count = priv->list->count; + count = READ_ONCE(priv->list->count); - if ((count > priv->limit) ^ priv->invert) { + if ((count > READ_ONCE(priv->limit)) ^ READ_ONCE(priv->invert)) { regs->verdict.code = NFT_BREAK; return; } @@ -137,6 +131,16 @@ static int nft_connlimit_obj_init(const struct nft_ctx *ctx, return nft_connlimit_do_init(ctx, tb, priv); } +static void nft_connlimit_obj_update(struct nft_object *obj, + struct nft_object *newobj) +{ + struct nft_connlimit *newpriv = nft_obj_data(newobj); + struct nft_connlimit *priv = nft_obj_data(obj); + + WRITE_ONCE(priv->limit, newpriv->limit); + WRITE_ONCE(priv->invert, newpriv->invert); +} + static void nft_connlimit_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) { @@ -166,6 +170,7 @@ static const struct nft_object_ops nft_connlimit_obj_ops = { .init = nft_connlimit_obj_init, .destroy = nft_connlimit_obj_destroy, .dump = nft_connlimit_obj_dump, + .update = nft_connlimit_obj_update, }; static struct nft_object_type nft_connlimit_obj_type __read_mostly = { @@ -238,13 +243,8 @@ static void nft_connlimit_destroy_clone(const struct nft_ctx *ctx, static bool nft_connlimit_gc(struct net *net, const struct nft_expr *expr) { struct nft_connlimit *priv = nft_expr_priv(expr); - bool ret; - - local_bh_disable(); - ret = nf_conncount_gc_list(net, priv->list); - local_bh_enable(); - return ret; + return nf_conncount_gc_list(net, priv->list); } static struct nft_expr_type nft_connlimit_type; diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index d526e69a2a2b..6f2ae7cad731 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -22,6 +22,7 @@ #include <net/netfilter/nf_conntrack_timeout.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> +#include <net/netfilter/nf_conntrack_seqadj.h> struct nft_ct_helper_obj { struct nf_conntrack_helper *helper4; @@ -379,6 +380,14 @@ static bool nft_ct_tmpl_alloc_pcpu(void) } #endif +static void __nft_ct_get_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) +{ +#ifdef CONFIG_NF_CONNTRACK_LABELS + if (priv->key == NFT_CT_LABELS) + nf_connlabels_put(ctx->net); +#endif +} + static int nft_ct_get_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -413,6 +422,10 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, if (tb[NFTA_CT_DIRECTION] != NULL) return -EINVAL; len = NF_CT_LABELS_MAX_SIZE; + + err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1); + if (err) + return err; break; #endif case NFT_CT_HELPER: @@ -494,7 +507,8 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, case IP_CT_DIR_REPLY: break; default: - return -EINVAL; + err = -EINVAL; + goto err; } } @@ -502,11 +516,11 @@ static int nft_ct_get_init(const struct nft_ctx *ctx, err = nft_parse_register_store(ctx, tb[NFTA_CT_DREG], &priv->dreg, NULL, NFT_DATA_VALUE, len); if (err < 0) - return err; + goto err; err = nf_ct_netns_get(ctx->net, ctx->family); if (err < 0) - return err; + goto err; if (priv->key == NFT_CT_BYTES || priv->key == NFT_CT_PKTS || @@ -514,6 +528,9 @@ static int 
nft_ct_get_init(const struct nft_ctx *ctx, nf_ct_set_acct(ctx->net, true); return 0; +err: + __nft_ct_get_destroy(ctx, priv); + return err; } static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) @@ -626,6 +643,9 @@ err1: static void nft_ct_get_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) { + struct nft_ct *priv = nft_expr_priv(expr); + + __nft_ct_get_destroy(ctx, priv); nf_ct_netns_put(ctx->net, ctx->family); } @@ -1173,6 +1193,10 @@ static void nft_ct_helper_obj_eval(struct nft_object *obj, if (help) { rcu_assign_pointer(help->helper, to_assign); set_bit(IPS_HELPER_BIT, &ct->status); + + if ((ct->status & IPS_NAT_MASK) && !nfct_seqadj(ct)) + if (!nfct_seqadj_ext_add(ct)) + regs->verdict.code = NF_DROP; } } diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 225ff293cd50..b8f76c9057fd 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -9,7 +9,7 @@ #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_tables.h> #include <net/ip.h> -#include <net/inet_dscp.h> +#include <net/flow.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack_core.h> @@ -20,258 +20,6 @@ struct nft_flow_offload { struct nft_flowtable *flowtable; }; -static enum flow_offload_xmit_type nft_xmit_type(struct dst_entry *dst) -{ - if (dst_xfrm(dst)) - return FLOW_OFFLOAD_XMIT_XFRM; - - return FLOW_OFFLOAD_XMIT_NEIGH; -} - -static void nft_default_forward_path(struct nf_flow_route *route, - struct dst_entry *dst_cache, - enum ip_conntrack_dir dir) -{ - route->tuple[!dir].in.ifindex = dst_cache->dev->ifindex; - route->tuple[dir].dst = dst_cache; - route->tuple[dir].xmit_type = nft_xmit_type(dst_cache); -} - -static bool nft_is_valid_ether_device(const struct net_device *dev) -{ - if (!dev || (dev->flags & IFF_LOOPBACK) || dev->type != ARPHRD_ETHER || - dev->addr_len != ETH_ALEN || !is_valid_ether_addr(dev->dev_addr)) - return false; - - return true; -} - -static int nft_dev_fill_forward_path(const struct nf_flow_route *route, - const struct dst_entry *dst_cache, - const struct nf_conn *ct, - enum ip_conntrack_dir dir, u8 *ha, - struct net_device_path_stack *stack) -{ - const void *daddr = &ct->tuplehash[!dir].tuple.src.u3; - struct net_device *dev = dst_cache->dev; - struct neighbour *n; - u8 nud_state; - - if (!nft_is_valid_ether_device(dev)) - goto out; - - n = dst_neigh_lookup(dst_cache, daddr); - if (!n) - return -1; - - read_lock_bh(&n->lock); - nud_state = n->nud_state; - ether_addr_copy(ha, n->ha); - read_unlock_bh(&n->lock); - neigh_release(n); - - if (!(nud_state & NUD_VALID)) - return -1; - -out: - return dev_fill_forward_path(dev, ha, stack); -} - -struct nft_forward_info { - const struct net_device *indev; - const struct net_device *outdev; - const struct net_device *hw_outdev; - struct id { - __u16 id; - __be16 proto; - } encap[NF_FLOW_TABLE_ENCAP_MAX]; - u8 num_encaps; - u8 ingress_vlans; - u8 h_source[ETH_ALEN]; - u8 h_dest[ETH_ALEN]; - enum flow_offload_xmit_type xmit_type; -}; - -static void nft_dev_path_info(const struct net_device_path_stack *stack, - struct nft_forward_info *info, - unsigned char *ha, struct nf_flowtable *flowtable) -{ - const struct net_device_path *path; - int i; - - memcpy(info->h_dest, ha, ETH_ALEN); - - for (i = 0; i < stack->num_paths; i++) { - path = &stack->path[i]; - switch (path->type) { - case DEV_PATH_ETHERNET: - case DEV_PATH_DSA: - case DEV_PATH_VLAN: - case DEV_PATH_PPPOE: 
- info->indev = path->dev; - if (is_zero_ether_addr(info->h_source)) - memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); - - if (path->type == DEV_PATH_ETHERNET) - break; - if (path->type == DEV_PATH_DSA) { - i = stack->num_paths; - break; - } - - /* DEV_PATH_VLAN and DEV_PATH_PPPOE */ - if (info->num_encaps >= NF_FLOW_TABLE_ENCAP_MAX) { - info->indev = NULL; - break; - } - if (!info->outdev) - info->outdev = path->dev; - info->encap[info->num_encaps].id = path->encap.id; - info->encap[info->num_encaps].proto = path->encap.proto; - info->num_encaps++; - if (path->type == DEV_PATH_PPPOE) - memcpy(info->h_dest, path->encap.h_dest, ETH_ALEN); - break; - case DEV_PATH_BRIDGE: - if (is_zero_ether_addr(info->h_source)) - memcpy(info->h_source, path->dev->dev_addr, ETH_ALEN); - - switch (path->bridge.vlan_mode) { - case DEV_PATH_BR_VLAN_UNTAG_HW: - info->ingress_vlans |= BIT(info->num_encaps - 1); - break; - case DEV_PATH_BR_VLAN_TAG: - info->encap[info->num_encaps].id = path->bridge.vlan_id; - info->encap[info->num_encaps].proto = path->bridge.vlan_proto; - info->num_encaps++; - break; - case DEV_PATH_BR_VLAN_UNTAG: - info->num_encaps--; - break; - case DEV_PATH_BR_VLAN_KEEP: - break; - } - info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; - break; - default: - info->indev = NULL; - break; - } - } - if (!info->outdev) - info->outdev = info->indev; - - info->hw_outdev = info->indev; - - if (nf_flowtable_hw_offload(flowtable) && - nft_is_valid_ether_device(info->indev)) - info->xmit_type = FLOW_OFFLOAD_XMIT_DIRECT; -} - -static bool nft_flowtable_find_dev(const struct net_device *dev, - struct nft_flowtable *ft) -{ - struct nft_hook *hook; - bool found = false; - - list_for_each_entry_rcu(hook, &ft->hook_list, list) { - if (!nft_hook_find_ops_rcu(hook, dev)) - continue; - - found = true; - break; - } - - return found; -} - -static void nft_dev_forward_path(struct nf_flow_route *route, - const struct nf_conn *ct, - enum ip_conntrack_dir dir, - struct nft_flowtable *ft) -{ - const struct dst_entry *dst = route->tuple[dir].dst; - struct net_device_path_stack stack; - struct nft_forward_info info = {}; - unsigned char ha[ETH_ALEN]; - int i; - - if (nft_dev_fill_forward_path(route, dst, ct, dir, ha, &stack) >= 0) - nft_dev_path_info(&stack, &info, ha, &ft->data); - - if (!info.indev || !nft_flowtable_find_dev(info.indev, ft)) - return; - - route->tuple[!dir].in.ifindex = info.indev->ifindex; - for (i = 0; i < info.num_encaps; i++) { - route->tuple[!dir].in.encap[i].id = info.encap[i].id; - route->tuple[!dir].in.encap[i].proto = info.encap[i].proto; - } - route->tuple[!dir].in.num_encaps = info.num_encaps; - route->tuple[!dir].in.ingress_vlans = info.ingress_vlans; - - if (info.xmit_type == FLOW_OFFLOAD_XMIT_DIRECT) { - memcpy(route->tuple[dir].out.h_source, info.h_source, ETH_ALEN); - memcpy(route->tuple[dir].out.h_dest, info.h_dest, ETH_ALEN); - route->tuple[dir].out.ifindex = info.outdev->ifindex; - route->tuple[dir].out.hw_ifindex = info.hw_outdev->ifindex; - route->tuple[dir].xmit_type = info.xmit_type; - } -} - -static int nft_flow_route(const struct nft_pktinfo *pkt, - const struct nf_conn *ct, - struct nf_flow_route *route, - enum ip_conntrack_dir dir, - struct nft_flowtable *ft) -{ - struct dst_entry *this_dst = skb_dst(pkt->skb); - struct dst_entry *other_dst = NULL; - struct flowi fl; - - memset(&fl, 0, sizeof(fl)); - switch (nft_pf(pkt)) { - case NFPROTO_IPV4: - fl.u.ip4.daddr = ct->tuplehash[dir].tuple.src.u3.ip; - fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; - fl.u.ip4.flowi4_oif = 
nft_in(pkt)->ifindex; - fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; - fl.u.ip4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(pkt->skb))); - fl.u.ip4.flowi4_mark = pkt->skb->mark; - fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; - break; - case NFPROTO_IPV6: - fl.u.ip6.daddr = ct->tuplehash[dir].tuple.src.u3.in6; - fl.u.ip6.saddr = ct->tuplehash[!dir].tuple.src.u3.in6; - fl.u.ip6.flowi6_oif = nft_in(pkt)->ifindex; - fl.u.ip6.flowi6_iif = this_dst->dev->ifindex; - fl.u.ip6.flowlabel = ip6_flowinfo(ipv6_hdr(pkt->skb)); - fl.u.ip6.flowi6_mark = pkt->skb->mark; - fl.u.ip6.flowi6_flags = FLOWI_FLAG_ANYSRC; - break; - } - - if (!dst_hold_safe(this_dst)) - return -ENOENT; - - nf_route(nft_net(pkt), &other_dst, &fl, false, nft_pf(pkt)); - if (!other_dst) { - dst_release(this_dst); - return -ENOENT; - } - - nft_default_forward_path(route, this_dst, dir); - nft_default_forward_path(route, other_dst, !dir); - - if (route->tuple[dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH && - route->tuple[!dir].xmit_type == FLOW_OFFLOAD_XMIT_NEIGH) { - nft_dev_forward_path(route, ct, dir, ft); - nft_dev_forward_path(route, ct, !dir, ft); - } - - return 0; -} - static bool nft_flow_offload_skip(struct sk_buff *skb, int family) { if (skb_sec_path(skb)) diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index 40c602ffbcba..fc2d7c5d83c8 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -24,11 +24,11 @@ struct nft_lookup { struct nft_set_binding binding; }; -#ifdef CONFIG_MITIGATION_RETPOLINE -const struct nft_set_ext * -nft_set_do_lookup(const struct net *net, const struct nft_set *set, - const u32 *key) +static const struct nft_set_ext * +__nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) { +#ifdef CONFIG_MITIGATION_RETPOLINE if (set->ops == &nft_set_hash_fast_type.ops) return nft_hash_lookup_fast(net, set, key); if (set->ops == &nft_set_hash_type.ops) @@ -51,10 +51,46 @@ nft_set_do_lookup(const struct net *net, const struct nft_set *set, return nft_rbtree_lookup(net, set, key); WARN_ON_ONCE(1); +#endif return set->ops->lookup(net, set, key); } + +static unsigned int nft_base_seq(const struct net *net) +{ + /* pairs with smp_store_release() in nf_tables_commit() */ + return smp_load_acquire(&net->nft.base_seq); +} + +static bool nft_lookup_should_retry(const struct net *net, unsigned int seq) +{ + return unlikely(seq != nft_base_seq(net)); +} + +const struct nft_set_ext * +nft_set_do_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ + const struct nft_set_ext *ext; + unsigned int base_seq; + + do { + base_seq = nft_base_seq(net); + + ext = __nft_set_do_lookup(net, set, key); + if (ext) + break; + /* No match? There is a small chance that lookup was + * performed in the old generation, but nf_tables_commit() + * already unlinked a (matching) element. + * + * We need to repeat the lookup to make sure that we didn't + * miss a matching element in the new generation. 
+ */ + } while (nft_lookup_should_retry(net, base_seq)); + + return ext; +} EXPORT_SYMBOL_GPL(nft_set_do_lookup); -#endif void nft_lookup_eval(const struct nft_expr *expr, struct nft_regs *regs, @@ -210,19 +246,16 @@ static int nft_lookup_validate(const struct nft_ctx *ctx, const struct nft_expr *expr) { const struct nft_lookup *priv = nft_expr_priv(expr); - struct nft_set_iter iter; + struct nft_set_iter iter = { + .genmask = nft_genmask_next(ctx->net), + .type = NFT_ITER_UPDATE, + .fn = nft_setelem_validate, + }; if (!(priv->set->flags & NFT_SET_MAP) || priv->set->dtype != NFT_DATA_VERDICT) return 0; - iter.genmask = nft_genmask_next(ctx->net); - iter.type = NFT_ITER_UPDATE; - iter.skip = 0; - iter.count = 0; - iter.err = 0; - iter.fn = nft_setelem_validate; - priv->set->ops->walk(ctx, priv->set, &iter); if (!iter.err) iter.err = nft_set_catchall_validate(ctx, priv->set); diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 8ee66a86c3bc..1a62e384766a 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -22,6 +22,35 @@ void nft_objref_eval(const struct nft_expr *expr, obj->ops->eval(obj, regs, pkt); } +static int nft_objref_validate_obj_type(const struct nft_ctx *ctx, u32 type) +{ + unsigned int hooks; + + switch (type) { + case NFT_OBJECT_SYNPROXY: + if (ctx->family != NFPROTO_IPV4 && + ctx->family != NFPROTO_IPV6 && + ctx->family != NFPROTO_INET) + return -EOPNOTSUPP; + + hooks = (1 << NF_INET_LOCAL_IN) | (1 << NF_INET_FORWARD); + + return nft_chain_validate_hooks(ctx->chain, hooks); + default: + break; + } + + return 0; +} + +static int nft_objref_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + struct nft_object *obj = nft_objref_priv(expr); + + return nft_objref_validate_obj_type(ctx, obj->ops->type->type); +} + static int nft_objref_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -93,6 +122,7 @@ static const struct nft_expr_ops nft_objref_ops = { .activate = nft_objref_activate, .deactivate = nft_objref_deactivate, .dump = nft_objref_dump, + .validate = nft_objref_validate, .reduce = NFT_REDUCE_READONLY, }; @@ -197,6 +227,14 @@ static void nft_objref_map_destroy(const struct nft_ctx *ctx, nf_tables_destroy_set(ctx, priv->set); } +static int nft_objref_map_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr) +{ + const struct nft_objref_map *priv = nft_expr_priv(expr); + + return nft_objref_validate_obj_type(ctx, priv->set->objtype); +} + static const struct nft_expr_ops nft_objref_map_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), @@ -206,6 +244,7 @@ static const struct nft_expr_ops nft_objref_map_ops = { .deactivate = nft_objref_map_deactivate, .destroy = nft_objref_map_destroy, .dump = nft_objref_map_dump, + .validate = nft_objref_map_validate, .reduce = NFT_REDUCE_READONLY, }; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 7dfc5343dae4..b0214418f75a 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -40,7 +40,7 @@ static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, /* add vlan header into the user buffer for if tag was removed by offloads */ static bool -nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) +nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u16 offset, u8 len) { int mac_off = skb_mac_header(skb) - skb->data; u8 *vlanh, *dst_u8 = (u8 *) d; @@ -212,7 +212,7 @@ static const struct 
nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { [NFTA_PAYLOAD_SREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_PAYLOAD_OFFSET] = { .type = NLA_BE32 }, [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), @@ -684,7 +684,7 @@ static const struct nft_expr_ops nft_payload_inner_ops = { static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum) { - *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum)); + csum_replace4(sum, (__force __be32)fsum, (__force __be32)tsum); if (*sum == 0) *sum = CSUM_MANGLED_0; } @@ -797,7 +797,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, struct nft_payload_set { enum nft_payload_bases base:8; - u8 offset; + u16 offset; u8 len; u8 sreg; u8 csum_type; @@ -812,7 +812,7 @@ struct nft_payload_vlan_hdr { }; static bool -nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u8 offset, u8 len, +nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len, int *vlan_hlen) { struct nft_payload_vlan_hdr *vlanh; @@ -940,14 +940,18 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { + u32 csum_offset, offset, csum_type = NFT_PAYLOAD_CSUM_NONE; struct nft_payload_set *priv = nft_expr_priv(expr); - u32 csum_offset, csum_type = NFT_PAYLOAD_CSUM_NONE; int err; priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); - priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); + if (err < 0) + return err; + priv->offset = offset; + if (tb[NFTA_PAYLOAD_CSUM_TYPE]) csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) { @@ -1069,7 +1073,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_PAYLOAD_DREG] == NULL) return ERR_PTR(-EINVAL); - err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U8_MAX, &offset); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); if (err < 0) return ERR_PTR(err); diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index c24c922f895d..8d3f040a904a 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -226,7 +226,8 @@ static void nft_bitmap_walk(const struct nft_ctx *ctx, const struct nft_bitmap *priv = nft_set_priv(set); struct nft_bitmap_elem *be; - list_for_each_entry_rcu(be, &priv->list, head) { + list_for_each_entry_rcu(be, &priv->list, head, + lockdep_is_held(&nft_pernet(ctx->net)->commit_mutex)) { if (iter->count < iter->skip) goto cont; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 266d0c637225..ba01ce75d6de 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -30,6 +30,7 @@ struct nft_rhash { struct nft_rhash_elem { struct nft_elem_priv priv; struct rhash_head node; + struct llist_node walk_node; u32 wq_gc_seq; struct nft_set_ext ext; }; @@ -144,6 +145,7 @@ nft_rhash_update(struct nft_set *set, const u32 *key, goto err1; he = nft_elem_priv_cast(elem_priv); + init_llist_node(&he->walk_node); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) @@ -180,6 +182,7 @@ static int 
nft_rhash_insert(const struct net *net, const struct nft_set *set, }; struct nft_rhash_elem *prev; + init_llist_node(&he->walk_node); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) @@ -261,12 +264,12 @@ static bool nft_rhash_delete(const struct nft_set *set, return true; } -static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_iter *iter) +static void nft_rhash_walk_ro(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) { struct nft_rhash *priv = nft_set_priv(set); - struct nft_rhash_elem *he; struct rhashtable_iter hti; + struct nft_rhash_elem *he; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); @@ -295,6 +298,97 @@ cont: rhashtable_walk_exit(&hti); } +static void nft_rhash_walk_update(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_elem *he, *tmp; + struct llist_node *first_node; + struct rhashtable_iter hti; + LLIST_HEAD(walk_list); + + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + + if (set->in_update_walk) { + /* This can happen with bogus rulesets during ruleset validation + * when a verdict map causes a jump back to the same map. + * + * Without this extra check the walk_next loop below will see + * elems on the callers walk_list and skip (not validate) them. + */ + iter->err = -EMLINK; + return; + } + + /* walk happens under RCU. + * + * We create a snapshot list so ->iter callback can sleep. + * commit_mutex is held, elements can ... + * .. be added in parallel from dataplane (dynset) + * .. be marked as dead in parallel from dataplane (dynset). + * .. be queued for removal in parallel (gc timeout). + * .. not be freed: transaction mutex is held. + */ + rhashtable_walk_enter(&priv->ht, &hti); + rhashtable_walk_start(&hti); + + while ((he = rhashtable_walk_next(&hti))) { + if (IS_ERR(he)) { + if (PTR_ERR(he) != -EAGAIN) { + iter->err = PTR_ERR(he); + break; + } + + continue; + } + + /* rhashtable resized during walk, skip */ + if (llist_on_list(&he->walk_node)) + continue; + + llist_add(&he->walk_node, &walk_list); + } + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); + + first_node = __llist_del_all(&walk_list); + set->in_update_walk = true; + llist_for_each_entry_safe(he, tmp, first_node, walk_node) { + if (iter->err == 0) { + iter->err = iter->fn(ctx, set, iter, &he->priv); + if (iter->err == 0) + iter->count++; + } + + /* all entries must be cleared again, else next ->walk iteration + * will skip entries. 
+ */ + init_llist_node(&he->walk_node); + } + set->in_update_walk = false; +} + +static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + switch (iter->type) { + case NFT_ITER_UPDATE: + /* only relevant for netlink dumps which use READ type */ + WARN_ON_ONCE(iter->skip != 0); + + nft_rhash_walk_update(ctx, set, iter); + break; + case NFT_ITER_READ: + nft_rhash_walk_ro(ctx, set, iter); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } +} + static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, struct nft_set_ext *ext) { diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 1a19649c2851..112fe46788b6 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -397,7 +397,7 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, } /** - * pipapo_get() - Get matching element reference given key data + * pipapo_get_slow() - Get matching element reference given key data * @m: storage containing the set elements * @data: Key data to be matched against existing elements * @genmask: If set, check that element is active in given genmask @@ -414,27 +414,28 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, * * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. */ -static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, - const u8 *data, u8 genmask, - u64 tstamp) +static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) { + unsigned long *res_map, *fill_map, *map; struct nft_pipapo_scratch *scratch; - unsigned long *res_map, *fill_map; const struct nft_pipapo_field *f; bool map_index; int i; local_bh_disable(); - if (unlikely(!raw_cpu_ptr(m->scratch))) - goto out; - scratch = *raw_cpu_ptr(m->scratch); + if (unlikely(!scratch)) + goto out; + __local_lock_nested_bh(&scratch->bh_lock); map_index = scratch->map_index; - res_map = scratch->map + (map_index ? m->bsize_max : 0); - fill_map = scratch->map + (map_index ? 0 : m->bsize_max); + map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]); + res_map = map + (map_index ? m->bsize_max : 0); + fill_map = map + (map_index ? 0 : m->bsize_max); pipapo_resmap_init(m, res_map); @@ -465,6 +466,7 @@ next_match: last); if (b < 0) { scratch->map_index = map_index; + __local_unlock_nested_bh(&scratch->bh_lock); local_bh_enable(); return NULL; @@ -484,6 +486,7 @@ next_match: * *next* bitmap (not initial) for the next packet. */ scratch->map_index = map_index; + __local_unlock_nested_bh(&scratch->bh_lock); local_bh_enable(); return e; } @@ -498,12 +501,47 @@ next_match: data += NFT_PIPAPO_GROUPS_PADDING(f); } + __local_unlock_nested_bh(&scratch->bh_lock); out: local_bh_enable(); return NULL; } /** + * pipapo_get() - Get matching element reference given key data + * @m: Storage containing the set elements + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * @tstamp: Timestamp to check for expired elements + * + * This is a dispatcher function, either calling out the generic C + * implementation or, if available, the AVX2 one. + * This helper is only called from the control plane, with either RCU + * read lock or transaction mutex held. + * + * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. 
+ */ +static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) +{ + struct nft_pipapo_elem *e; + + local_bh_disable(); + +#if defined(CONFIG_X86_64) && !defined(CONFIG_UML) + if (boot_cpu_has(X86_FEATURE_AVX2) && irq_fpu_usable()) { + e = pipapo_get_avx2(m, data, genmask, tstamp); + local_bh_enable(); + return e; + } +#endif + e = pipapo_get_slow(m, data, genmask, tstamp); + local_bh_enable(); + return e; +} + +/** * nft_pipapo_lookup() - Dataplane fronted for main lookup function * @net: Network namespace * @set: nftables API set representation @@ -511,6 +549,22 @@ out: * * This function is called from the data path. It will search for * an element matching the given key in the current active copy. + * Unlike other set types, this uses 0 instead of nft_genmask_cur(). + * + * This is because new (future) elements are not reachable from + * priv->match, they get added to priv->clone instead. + * When the commit phase flips the generation bitmask, the + * 'now old' entries are skipped but without the 'now current' + * elements becoming visible. Using nft_genmask_cur() thus creates + * inconsistent state: matching old entries get skipped but thew + * newly matching entries are unreachable. + * + * GENMASK_ANY doesn't work for the same reason: old-gen entries get + * skipped, new-gen entries are only reachable from priv->clone. + * + * nft_pipapo_commit swaps ->clone and ->match shortly after the + * genbit flip. As ->clone doesn't contain the old entries in the first + * place, lookup will only find the now-current ones. * * Return: ntables API extension pointer or NULL if no match. */ @@ -519,12 +573,11 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { struct nft_pipapo *priv = nft_set_priv(set); - u8 genmask = nft_genmask_cur(net); const struct nft_pipapo_match *m; const struct nft_pipapo_elem *e; m = rcu_dereference(priv->match); - e = pipapo_get(m, (const u8 *)key, genmask, get_jiffies_64()); + e = pipapo_get_slow(m, (const u8 *)key, 0, get_jiffies_64()); return e ? &e->ext : NULL; } @@ -1137,22 +1190,17 @@ static void pipapo_map(struct nft_pipapo_match *m, } /** - * pipapo_free_scratch() - Free per-CPU map at original (not aligned) address + * pipapo_free_scratch() - Free per-CPU map at original address * @m: Matching data * @cpu: CPU number */ static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu) { struct nft_pipapo_scratch *s; - void *mem; s = *per_cpu_ptr(m->scratch, cpu); - if (!s) - return; - mem = s; - mem -= s->align_off; - kvfree(mem); + kvfree(s); } /** @@ -1169,11 +1217,8 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, for_each_possible_cpu(i) { struct nft_pipapo_scratch *scratch; -#ifdef NFT_PIPAPO_ALIGN - void *scratch_aligned; - u32 align_off; -#endif - scratch = kvzalloc_node(struct_size(scratch, map, bsize_max * 2) + + + scratch = kvzalloc_node(struct_size(scratch, __map, bsize_max * 2) + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL_ACCOUNT, cpu_to_node(i)); if (!scratch) { @@ -1188,23 +1233,7 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, } pipapo_free_scratch(clone, i); - -#ifdef NFT_PIPAPO_ALIGN - /* Align &scratch->map (not the struct itself): the extra - * %NFT_PIPAPO_ALIGN_HEADROOM bytes passed to kzalloc_node() - * above guarantee we can waste up to those bytes in order - * to align the map field regardless of its offset within - * the struct. 
- */ - BUILD_BUG_ON(offsetof(struct nft_pipapo_scratch, map) > NFT_PIPAPO_ALIGN_HEADROOM); - - scratch_aligned = NFT_PIPAPO_LT_ALIGN(&scratch->map); - scratch_aligned -= offsetof(struct nft_pipapo_scratch, map); - align_off = scratch_aligned - (void *)scratch; - - scratch = scratch_aligned; - scratch->align_off = align_off; -#endif + local_lock_init(&scratch->bh_lock); *per_cpu_ptr(clone->scratch, i) = scratch; } diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index 4a2ff85ce1c4..eaab422aa56a 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -124,14 +124,14 @@ struct nft_pipapo_field { /** * struct nft_pipapo_scratch - percpu data used for lookup and matching + * @bh_lock: PREEMPT_RT local spinlock * @map_index: Current working bitmap index, toggled between field matches - * @align_off: Offset to get the originally allocated address - * @map: store partial matching results during lookup + * @__map: store partial matching results during lookup */ struct nft_pipapo_scratch { + local_lock_t bh_lock; u8 map_index; - u32 align_off; - unsigned long map[]; + unsigned long __map[]; }; /** diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index db5d367e43c4..7ff90325c97f 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1099,7 +1099,7 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, desc->field_count < NFT_PIPAPO_MIN_FIELDS) return false; - if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX)) + if (!boot_cpu_has(X86_FEATURE_AVX2)) return false; est->size = pipapo_estimate_size(desc); @@ -1133,76 +1133,59 @@ static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, uns } /** - * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation - * @net: Network namespace - * @set: nftables API set representation - * @key: nftables API element representation containing key data + * pipapo_get_avx2() - Lookup function for AVX2 implementation + * @m: Storage containing the set elements + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * @tstamp: Timestamp to check for expired elements * * For more details, see DOC: Theory of Operation in nft_set_pipapo.c. * * This implementation exploits the repetitive characteristic of the algorithm * to provide a fast, vectorised version using the AVX2 SIMD instruction set. * - * Return: true on match, false otherwise. + * The caller must check that the FPU is usable. + * This function must be called with BH disabled. + * + * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. 
*/ -const struct nft_set_ext * -nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, - const u32 *key) +struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) { - struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo_scratch *scratch; - u8 genmask = nft_genmask_cur(net); - const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; - const u8 *rp = (const u8 *)key; - const struct nft_set_ext *ext; - unsigned long *res, *fill; + unsigned long *res, *fill, *map; bool map_index; int i; - local_bh_disable(); - - if (unlikely(!irq_fpu_usable())) { - ext = nft_pipapo_lookup(net, set, key); + scratch = *raw_cpu_ptr(m->scratch); + if (unlikely(!scratch)) + return NULL; - local_bh_enable(); - return ext; - } + __local_lock_nested_bh(&scratch->bh_lock); + map_index = scratch->map_index; + map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]); + res = map + (map_index ? m->bsize_max : 0); + fill = map + (map_index ? 0 : m->bsize_max); - m = rcu_dereference(priv->match); + pipapo_resmap_init_avx2(m, res); - /* This also protects access to all data related to scratch maps. - * - * Note that we don't need a valid MXCSR state for any of the + /* Note that we don't need a valid MXCSR state for any of the * operations we use here, so pass 0 as mask and spare a LDMXCSR * instruction. */ kernel_fpu_begin_mask(0); - scratch = *raw_cpu_ptr(m->scratch); - if (unlikely(!scratch)) { - kernel_fpu_end(); - local_bh_enable(); - return NULL; - } - - map_index = scratch->map_index; - - res = scratch->map + (map_index ? m->bsize_max : 0); - fill = scratch->map + (map_index ? 0 : m->bsize_max); - - pipapo_resmap_init_avx2(m, res); - nft_pipapo_avx2_prepare(); -next_match: nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1, first = !i; int ret = 0; #define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \ (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \ - ret, rp, \ + ret, data, \ first, last)) if (likely(f->bb == 8)) { @@ -1218,7 +1201,7 @@ next_match: NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16); } else { ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, - ret, rp, + ret, data, first, last); } } else { @@ -1234,7 +1217,7 @@ next_match: NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32); } else { ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, - ret, rp, + ret, data, first, last); } } @@ -1242,29 +1225,78 @@ next_match: #undef NFT_SET_PIPAPO_AVX2_LOOKUP - if (ret < 0) - goto out; +next_match: + if (ret < 0) { + scratch->map_index = map_index; + kernel_fpu_end(); + __local_unlock_nested_bh(&scratch->bh_lock); + return NULL; + } if (last) { - ext = &f->mt[ret].e->ext; - if (unlikely(nft_set_elem_expired(ext) || - !nft_set_elem_active(ext, genmask))) { - ext = NULL; + struct nft_pipapo_elem *e; + + e = f->mt[ret].e; + if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) || + !nft_set_elem_active(&e->ext, genmask))) { + ret = pipapo_refill(res, f->bsize, f->rules, + fill, f->mt, last); goto next_match; } - goto out; + scratch->map_index = map_index; + kernel_fpu_end(); + __local_unlock_nested_bh(&scratch->bh_lock); + return e; } + map_index = !map_index; swap(res, fill); - rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); + data += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } -out: - if (i % 2) - scratch->map_index = !map_index; kernel_fpu_end(); + __local_unlock_nested_bh(&scratch->bh_lock); + return NULL; +} + +/** + * nft_pipapo_avx2_lookup() - Dataplane frontend for AVX2 implementation + * @net: Network namespace + * @set: nftables API set representation + 
* @key: nftables API element representation containing key data + * + * This function is called from the data path. It will search for + * an element matching the given key in the current active copy using + * the AVX2 routines if the FPU is usable or fall back to the generic + * implementation of the algorithm otherwise. + * + * Return: nftables API extension pointer or NULL if no match. + */ +const struct nft_set_ext * +nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ + struct nft_pipapo *priv = nft_set_priv(set); + const struct nft_pipapo_match *m; + const u8 *rp = (const u8 *)key; + const struct nft_pipapo_elem *e; + + local_bh_disable(); + + if (unlikely(!irq_fpu_usable())) { + const struct nft_set_ext *ext; + + ext = nft_pipapo_lookup(net, set, key); + + local_bh_enable(); + return ext; + } + + m = rcu_dereference(priv->match); + + e = pipapo_get_avx2(m, rp, 0, get_jiffies_64()); local_bh_enable(); - return ext; + return e ? &e->ext : NULL; } diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h index dbb6aaca8a7a..c2999b63da3f 100644 --- a/net/netfilter/nft_set_pipapo_avx2.h +++ b/net/netfilter/nft_set_pipapo_avx2.h @@ -5,8 +5,12 @@ #include <asm/fpu/xstate.h> #define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE) +struct nft_pipapo_match; bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); +struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp); #endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */ #endif /* _NFT_SET_PIPAPO_AVX2_H */ diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 938a257c069e..ca594161b840 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -77,7 +77,9 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, nft_rbtree_interval_end(rbe) && nft_rbtree_interval_start(interval)) continue; - interval = rbe; + if (nft_set_elem_active(&rbe->ext, genmask) && + !nft_rbtree_elem_expired(rbe)) + interval = rbe; } else if (d > 0) parent = rcu_dereference_raw(parent->rb_right); else { @@ -102,8 +104,6 @@ __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, } if (set->flags & NFT_SET_INTERVAL && interval != NULL && - nft_set_elem_active(&interval->ext, genmask) && - !nft_rbtree_elem_expired(interval) && nft_rbtree_interval_start(interval)) return &interval->ext; @@ -584,15 +584,14 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, return NULL; } -static void nft_rbtree_walk(const struct nft_ctx *ctx, - struct nft_set *set, - struct nft_set_iter *iter) +static void nft_rbtree_do_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; struct rb_node *node; - read_lock_bh(&priv->lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { rbe = rb_entry(node, struct nft_rbtree_elem, node); @@ -600,14 +599,34 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, goto cont; iter->err = iter->fn(ctx, set, iter, &rbe->priv); - if (iter->err < 0) { - read_unlock_bh(&priv->lock); + if (iter->err < 0) return; - } cont: iter->count++; } - read_unlock_bh(&priv->lock); +} + +static void nft_rbtree_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_rbtree *priv = nft_set_priv(set); + + switch 
(iter->type) { + case NFT_ITER_UPDATE: + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + nft_rbtree_do_walk(ctx, set, iter); + break; + case NFT_ITER_READ: + read_lock_bh(&priv->lock); + nft_rbtree_do_walk(ctx, set, iter); + read_unlock_bh(&priv->lock); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } } static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 35d0409b0095..36affbb697c2 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -217,7 +217,7 @@ static int nft_socket_init(const struct nft_ctx *ctx, level += err; /* Implies a giant cgroup tree */ - if (WARN_ON_ONCE(level > 255)) + if (level > 255) return -EOPNOTSUPP; priv->level = level; diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 0189f8b6b0bd..848287ab79cf 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -31,8 +31,6 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct net *net = xt_net(par); const struct xt_connlimit_info *info = par->matchinfo; - struct nf_conntrack_tuple tuple; - const struct nf_conntrack_tuple *tuple_ptr = &tuple; const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; enum ip_conntrack_info ctinfo; const struct nf_conn *ct; @@ -40,13 +38,8 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) u32 key[5]; ct = nf_ct_get(skb, &ctinfo); - if (ct != NULL) { - tuple_ptr = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + if (ct) zone = nf_ct_zone(ct); - } else if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), - xt_family(par), net, &tuple)) { - goto hotdrop; - } if (xt_family(par) == NFPROTO_IPV6) { const struct ipv6hdr *iph = ipv6_hdr(skb); @@ -69,10 +62,9 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) key[1] = zone->id; } - connections = nf_conncount_count(net, info->data, key, tuple_ptr, - zone); + connections = nf_conncount_count_skb(net, skb, xt_family(par), info->data, key); if (connections == 0) - /* kmalloc failed, drop it entirely */ + /* kmalloc failed or tuple couldn't be found, drop it entirely */ goto hotdrop; return (connections > info->limit) ^ !!(info->flags & XT_CONNLIMIT_INVERT); diff --git a/net/netlabel/netlabel_user.c b/net/netlabel/netlabel_user.c index 0d04d23aafe7..0da652844dd6 100644 --- a/net/netlabel/netlabel_user.c +++ b/net/netlabel/netlabel_user.c @@ -84,7 +84,6 @@ struct audit_buffer *netlbl_audit_start_common(int type, struct netlbl_audit *audit_info) { struct audit_buffer *audit_buf; - struct lsm_context ctx; if (audit_enabled == AUDIT_OFF) return NULL; @@ -96,12 +95,7 @@ struct audit_buffer *netlbl_audit_start_common(int type, audit_log_format(audit_buf, "netlabel: auid=%u ses=%u", from_kuid(&init_user_ns, audit_info->loginuid), audit_info->sessionid); - - if (lsmprop_is_set(&audit_info->prop) && - security_lsmprop_to_secctx(&audit_info->prop, &ctx) > 0) { - audit_log_format(audit_buf, " subj=%s", ctx.context); - security_release_secctx(&ctx); - } + audit_log_subj_ctx(audit_buf, &audit_info->prop); return audit_buf; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index e2f7080dd5d7..8e5151f0c6e4 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -356,7 +356,7 @@ static void netlink_overrun(struct sock *sk) sk_error_report(sk); } } - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); } static void netlink_rcv_wake(struct sock *sk) @@ -596,10 +596,8 @@ static void 
netlink_remove(struct sock *sk) table = &nl_table[sk->sk_protocol]; if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node, - netlink_rhashtable_params)) { - WARN_ON(refcount_read(&sk->sk_refcnt) == 1); + netlink_rhashtable_params)) __sock_put(sk); - } netlink_table_grab(); if (nlk_sk(sk)->subscriptions) { @@ -968,7 +966,7 @@ static void netlink_undo_bind(int group, long unsigned int groups, nlk->netlink_unbind(sock_net(sk), undo + 1); } -static int netlink_bind(struct socket *sock, struct sockaddr *addr, +static int netlink_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sock *sk = sock->sk; @@ -1056,7 +1054,7 @@ unlock: return err; } -static int netlink_connect(struct socket *sock, struct sockaddr *addr, +static int netlink_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { int err = 0; @@ -2711,7 +2709,7 @@ static int netlink_native_seq_show(struct seq_file *seq, void *v) sk_wmem_alloc_get(s), READ_ONCE(nlk->cb_running), refcount_read(&s->sk_refcnt), - atomic_read(&s->sk_drops), + sk_drops_read(s), sock_i_ino(s) ); diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 61981e01fd6f..b8e58132e8af 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -168,7 +168,7 @@ mc_list: NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - __sock_i_ino(sk)) < 0) { + sock_i_ino(sk)) < 0) { ret = 1; break; } diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 104732d34543..978c129c6095 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -1836,6 +1836,9 @@ static int genl_bind(struct net *net, int group) !ns_capable(net->user_ns, CAP_SYS_ADMIN)) ret = -EPERM; + if (ret) + break; + if (family->bind) family->bind(i); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 3331669d8e33..5ed1a71ceec1 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -561,7 +561,7 @@ static int nr_release(struct socket *sock) return 0; } -static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int nr_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); @@ -632,8 +632,8 @@ static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return 0; } -static int nr_connect(struct socket *sock, struct sockaddr *uaddr, - int addr_len, int flags) +static int nr_connect(struct socket *sock, struct sockaddr_unsized *uaddr, + int addr_len, int flags) { struct sock *sk = sock->sk; struct nr_sock *nr = nr_sk(sk); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 57a2f97004e1..f1be1e84f665 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -56,7 +56,7 @@ static struct proto llcp_sock_proto = { .obj_size = sizeof(struct nfc_llcp_sock), }; -static int llcp_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) +static int llcp_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int alen) { struct sock *sk = sock->sk; struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); @@ -146,7 +146,7 @@ error: return ret; } -static int llcp_raw_sock_bind(struct socket *sock, struct sockaddr *addr, +static int llcp_raw_sock_bind(struct socket *sock, struct sockaddr_unsized *addr, int alen) { struct sock *sk = sock->sk; @@ -648,7 +648,7 @@ out: return err; } -static int llcp_sock_connect(struct socket *sock, struct sockaddr *_addr, +static int llcp_sock_connect(struct socket *sock, struct sockaddr_unsized *_addr, int len, int flags) { struct 
sock *sk = sock->sk; diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index a818eff27e6b..418b84e2b260 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -27,11 +27,16 @@ /* Handle NCI Notification packets */ -static void nci_core_reset_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_core_reset_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { /* Handle NCI 2.x core reset notification */ - const struct nci_core_reset_ntf *ntf = (void *)skb->data; + const struct nci_core_reset_ntf *ntf; + + if (skb->len < sizeof(struct nci_core_reset_ntf)) + return -EINVAL; + + ntf = (struct nci_core_reset_ntf *)skb->data; ndev->nci_ver = ntf->nci_ver; pr_debug("nci_ver 0x%x, config_status 0x%x\n", @@ -42,15 +47,22 @@ static void nci_core_reset_ntf_packet(struct nci_dev *ndev, __le32_to_cpu(ntf->manufact_specific_info); nci_req_complete(ndev, NCI_STATUS_OK); + + return 0; } -static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) +static int nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, + struct sk_buff *skb) { - struct nci_core_conn_credit_ntf *ntf = (void *) skb->data; + struct nci_core_conn_credit_ntf *ntf; struct nci_conn_info *conn_info; int i; + if (skb->len < sizeof(struct nci_core_conn_credit_ntf)) + return -EINVAL; + + ntf = (struct nci_core_conn_credit_ntf *)skb->data; + pr_debug("num_entries %d\n", ntf->num_entries); if (ntf->num_entries > NCI_MAX_NUM_CONN) @@ -68,7 +80,7 @@ static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, conn_info = nci_get_conn_info_by_conn_id(ndev, ntf->conn_entries[i].conn_id); if (!conn_info) - return; + return 0; atomic_add(ntf->conn_entries[i].credits, &conn_info->credits_cnt); @@ -77,12 +89,19 @@ static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, /* trigger the next tx */ if (!skb_queue_empty(&ndev->tx_q)) queue_work(ndev->tx_wq, &ndev->tx_work); + + return 0; } -static void nci_core_generic_error_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_core_generic_error_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { - __u8 status = skb->data[0]; + __u8 status; + + if (skb->len < 1) + return -EINVAL; + + status = skb->data[0]; pr_debug("status 0x%x\n", status); @@ -91,12 +110,19 @@ static void nci_core_generic_error_ntf_packet(struct nci_dev *ndev, (the state remains the same) */ nci_req_complete(ndev, status); } + + return 0; } -static void nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) +static int nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev, + struct sk_buff *skb) { - struct nci_core_intf_error_ntf *ntf = (void *) skb->data; + struct nci_core_intf_error_ntf *ntf; + + if (skb->len < sizeof(struct nci_core_intf_error_ntf)) + return -EINVAL; + + ntf = (struct nci_core_intf_error_ntf *)skb->data; ntf->conn_id = nci_conn_id(&ntf->conn_id); @@ -105,6 +131,8 @@ static void nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev, /* complete the data exchange transaction, if exists */ if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags)) nci_data_exchange_complete(ndev, NULL, ntf->conn_id, -EIO); + + return 0; } static const __u8 * @@ -329,13 +357,18 @@ void nci_clear_target_list(struct nci_dev *ndev) ndev->n_targets = 0; } -static void nci_rf_discover_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_rf_discover_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { struct nci_rf_discover_ntf ntf; - const __u8 *data = skb->data; + 
const __u8 *data; bool add_target = true; + if (skb->len < sizeof(struct nci_rf_discover_ntf)) + return -EINVAL; + + data = skb->data; + ntf.rf_discovery_id = *data++; ntf.rf_protocol = *data++; ntf.rf_tech_and_mode = *data++; @@ -390,6 +423,8 @@ static void nci_rf_discover_ntf_packet(struct nci_dev *ndev, nfc_targets_found(ndev->nfc_dev, ndev->targets, ndev->n_targets); } + + return 0; } static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev, @@ -553,14 +588,19 @@ static int nci_store_ats_nfc_iso_dep(struct nci_dev *ndev, return NCI_STATUS_OK; } -static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { struct nci_conn_info *conn_info; struct nci_rf_intf_activated_ntf ntf; - const __u8 *data = skb->data; + const __u8 *data; int err = NCI_STATUS_OK; + if (skb->len < sizeof(struct nci_rf_intf_activated_ntf)) + return -EINVAL; + + data = skb->data; + ntf.rf_discovery_id = *data++; ntf.rf_interface = *data++; ntf.rf_protocol = *data++; @@ -667,7 +707,7 @@ exit: if (err == NCI_STATUS_OK) { conn_info = ndev->rf_conn_info; if (!conn_info) - return; + return 0; conn_info->max_pkt_payload_len = ntf.max_data_pkt_payload_size; conn_info->initial_num_credits = ntf.initial_num_credits; @@ -721,19 +761,26 @@ listen: pr_err("error when signaling tm activation\n"); } } + + return 0; } -static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { const struct nci_conn_info *conn_info; - const struct nci_rf_deactivate_ntf *ntf = (void *)skb->data; + const struct nci_rf_deactivate_ntf *ntf; + + if (skb->len < sizeof(struct nci_rf_deactivate_ntf)) + return -EINVAL; + + ntf = (struct nci_rf_deactivate_ntf *)skb->data; pr_debug("entry, type 0x%x, reason 0x%x\n", ntf->type, ntf->reason); conn_info = ndev->rf_conn_info; if (!conn_info) - return; + return 0; /* drop tx data queue */ skb_queue_purge(&ndev->tx_q); @@ -765,14 +812,20 @@ static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, } nci_req_complete(ndev, NCI_STATUS_OK); + + return 0; } -static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { u8 status = NCI_STATUS_OK; - const struct nci_nfcee_discover_ntf *nfcee_ntf = - (struct nci_nfcee_discover_ntf *)skb->data; + const struct nci_nfcee_discover_ntf *nfcee_ntf; + + if (skb->len < sizeof(struct nci_nfcee_discover_ntf)) + return -EINVAL; + + nfcee_ntf = (struct nci_nfcee_discover_ntf *)skb->data; /* NFCForum NCI 9.2.1 HCI Network Specific Handling * If the NFCC supports the HCI Network, it SHALL return one, @@ -783,6 +836,8 @@ static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, ndev->cur_params.id = nfcee_ntf->nfcee_id; nci_req_complete(ndev, status); + + return 0; } void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) @@ -809,35 +864,43 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) switch (ntf_opcode) { case NCI_OP_CORE_RESET_NTF: - nci_core_reset_ntf_packet(ndev, skb); + if (nci_core_reset_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_CORE_CONN_CREDITS_NTF: - nci_core_conn_credits_ntf_packet(ndev, skb); + if (nci_core_conn_credits_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_CORE_GENERIC_ERROR_NTF: - 
nci_core_generic_error_ntf_packet(ndev, skb); + if (nci_core_generic_error_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_CORE_INTF_ERROR_NTF: - nci_core_conn_intf_error_ntf_packet(ndev, skb); + if (nci_core_conn_intf_error_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_RF_DISCOVER_NTF: - nci_rf_discover_ntf_packet(ndev, skb); + if (nci_rf_discover_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_RF_INTF_ACTIVATED_NTF: - nci_rf_intf_activated_ntf_packet(ndev, skb); + if (nci_rf_intf_activated_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_RF_DEACTIVATE_NTF: - nci_rf_deactivate_ntf_packet(ndev, skb); + if (nci_rf_deactivate_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_NFCEE_DISCOVER_NTF: - nci_nfcee_discover_ntf_packet(ndev, skb); + if (nci_nfcee_discover_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_RF_NFCEE_ACTION_NTF: diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 5125392bb68e..b049022399ae 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -73,7 +73,7 @@ static int rawsock_release(struct socket *sock) return 0; } -static int rawsock_connect(struct socket *sock, struct sockaddr *_addr, +static int rawsock_connect(struct socket *sock, struct sockaddr_unsized *_addr, int len, int flags) { struct sock *sk = sock->sk; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 2832e0794197..792ca44a461d 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -572,69 +572,6 @@ static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, return 0; } -static int set_nsh(struct sk_buff *skb, struct sw_flow_key *flow_key, - const struct nlattr *a) -{ - struct nshhdr *nh; - size_t length; - int err; - u8 flags; - u8 ttl; - int i; - - struct ovs_key_nsh key; - struct ovs_key_nsh mask; - - err = nsh_key_from_nlattr(a, &key, &mask); - if (err) - return err; - - /* Make sure the NSH base header is there */ - if (!pskb_may_pull(skb, skb_network_offset(skb) + NSH_BASE_HDR_LEN)) - return -ENOMEM; - - nh = nsh_hdr(skb); - length = nsh_hdr_len(nh); - - /* Make sure the whole NSH header is there */ - err = skb_ensure_writable(skb, skb_network_offset(skb) + - length); - if (unlikely(err)) - return err; - - nh = nsh_hdr(skb); - skb_postpull_rcsum(skb, nh, length); - flags = nsh_get_flags(nh); - flags = OVS_MASKED(flags, key.base.flags, mask.base.flags); - flow_key->nsh.base.flags = flags; - ttl = nsh_get_ttl(nh); - ttl = OVS_MASKED(ttl, key.base.ttl, mask.base.ttl); - flow_key->nsh.base.ttl = ttl; - nsh_set_flags_and_ttl(nh, flags, ttl); - nh->path_hdr = OVS_MASKED(nh->path_hdr, key.base.path_hdr, - mask.base.path_hdr); - flow_key->nsh.base.path_hdr = nh->path_hdr; - switch (nh->mdtype) { - case NSH_M_TYPE1: - for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) { - nh->md1.context[i] = - OVS_MASKED(nh->md1.context[i], key.context[i], - mask.context[i]); - } - memcpy(flow_key->nsh.context, nh->md1.context, - sizeof(nh->md1.context)); - break; - case NSH_M_TYPE2: - memset(flow_key->nsh.context, 0, - sizeof(flow_key->nsh.context)); - break; - default: - return -EINVAL; - } - skb_postpush_rcsum(skb, nh, length); - return 0; -} - /* Must follow skb_ensure_writable() since that can move the skb data. 
*/ static void set_tp_port(struct sk_buff *skb, __be16 *port, __be16 new_port, __sum16 *check) @@ -1130,10 +1067,6 @@ static int execute_masked_set_action(struct sk_buff *skb, get_mask(a, struct ovs_key_ethernet *)); break; - case OVS_KEY_ATTR_NSH: - err = set_nsh(skb, flow_key, a); - break; - case OVS_KEY_ATTR_IPV4: err = set_ipv4(skb, flow_key, nla_data(a), get_mask(a, struct ovs_key_ipv4 *)); @@ -1170,6 +1103,7 @@ static int execute_masked_set_action(struct sk_buff *skb, case OVS_KEY_ATTR_CT_LABELS: case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4: case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6: + case OVS_KEY_ATTR_NSH: err = -EINVAL; break; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index e573e9221302..a0811e1fba65 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -928,8 +928,8 @@ static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone) } static int ovs_ct_check_limit(struct net *net, - const struct ovs_conntrack_info *info, - const struct nf_conntrack_tuple *tuple) + const struct sk_buff *skb, + const struct ovs_conntrack_info *info) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; @@ -942,8 +942,9 @@ static int ovs_ct_check_limit(struct net *net, if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED) return 0; - connections = nf_conncount_count(net, ct_limit_info->data, - &conncount_key, tuple, &info->zone); + connections = nf_conncount_count_skb(net, skb, info->family, + ct_limit_info->data, + &conncount_key); if (connections > per_zone_limit) return -ENOMEM; @@ -972,8 +973,7 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, #if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) if (static_branch_unlikely(&ovs_ct_limit_enabled)) { if (!nf_ct_is_confirmed(ct)) { - err = ovs_ct_check_limit(net, info, - &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + err = ovs_ct_check_limit(net, skb, info); if (err) { net_warn_ratelimited("openvswitch: zone: %u " "exceeds conntrack limit\n", @@ -1770,8 +1770,8 @@ static int __ovs_ct_limit_get_zone_limit(struct net *net, zone_limit.limit = limit; nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0); - zone_limit.count = nf_conncount_count(net, data, &conncount_key, NULL, - &ct_zone); + zone_limit.count = nf_conncount_count_skb(net, NULL, 0, data, + &conncount_key); return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); } diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index 7af0cde8b293..a2af90ee99af 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -75,7 +75,7 @@ static int dp_device_event(struct notifier_block *unused, unsigned long event, /* schedule vport destroy, dev_put and genl notification */ ovs_net = net_generic(dev_net(dev), ovs_net_id); - queue_work(system_wq, &ovs_net->dp_notify_work); + queue_work(system_percpu_wq, &ovs_net->dp_notify_work); } return NOTIFY_DONE; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index b80bd3a90773..66366982f604 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -129,15 +129,13 @@ void ovs_flow_stats_get(const struct sw_flow *flow, struct ovs_flow_stats *ovs_stats, unsigned long *used, __be16 *tcp_flags) { - int cpu; + unsigned int cpu; *used = 0; *tcp_flags = 0; memset(ovs_stats, 0, sizeof(*ovs_stats)); - /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; - cpu = cpumask_next(cpu, flow->cpu_used_mask)) { + for_each_cpu(cpu, 
flow->cpu_used_mask) { struct sw_flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]); if (stats) { @@ -158,11 +156,9 @@ void ovs_flow_stats_get(const struct sw_flow *flow, /* Called with ovs_mutex. */ void ovs_flow_stats_clear(struct sw_flow *flow) { - int cpu; + unsigned int cpu; - /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; - cpu = cpumask_next(cpu, flow->cpu_used_mask)) { + for_each_cpu(cpu, flow->cpu_used_mask) { struct sw_flow_stats *stats = ovsl_dereference(flow->stats[cpu]); if (stats) { diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index ad64bb9ab5e2..1cb4f97335d8 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1305,6 +1305,11 @@ static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, return 0; } +/* + * Constructs NSH header 'nh' from attributes of OVS_ACTION_ATTR_PUSH_NSH, + * where 'nh' points to a memory block of 'size' bytes. It's assumed that + * attributes were previously validated with validate_push_nsh(). + */ int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *nh, size_t size) { @@ -1314,8 +1319,6 @@ int nsh_hdr_from_nlattr(const struct nlattr *attr, u8 ttl = 0; int mdlen = 0; - /* validate_nsh has check this, so we needn't do duplicate check here - */ if (size < NSH_BASE_HDR_LEN) return -ENOBUFS; @@ -1359,46 +1362,6 @@ int nsh_hdr_from_nlattr(const struct nlattr *attr, return 0; } -int nsh_key_from_nlattr(const struct nlattr *attr, - struct ovs_key_nsh *nsh, struct ovs_key_nsh *nsh_mask) -{ - struct nlattr *a; - int rem; - - /* validate_nsh has check this, so we needn't do duplicate check here - */ - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - - switch (type) { - case OVS_NSH_KEY_ATTR_BASE: { - const struct ovs_nsh_key_base *base = nla_data(a); - const struct ovs_nsh_key_base *base_mask = base + 1; - - nsh->base = *base; - nsh_mask->base = *base_mask; - break; - } - case OVS_NSH_KEY_ATTR_MD1: { - const struct ovs_nsh_key_md1 *md1 = nla_data(a); - const struct ovs_nsh_key_md1 *md1_mask = md1 + 1; - - memcpy(nsh->context, md1->context, sizeof(*md1)); - memcpy(nsh_mask->context, md1_mask->context, - sizeof(*md1_mask)); - break; - } - case OVS_NSH_KEY_ATTR_MD2: - /* Not supported yet */ - return -ENOTSUPP; - default: - return -EINVAL; - } - } - - return 0; -} - static int nsh_key_put_from_nlattr(const struct nlattr *attr, struct sw_flow_match *match, bool is_mask, bool is_push_nsh, bool log) @@ -2839,17 +2802,13 @@ static int validate_and_copy_set_tun(const struct nlattr *attr, return err; } -static bool validate_nsh(const struct nlattr *attr, bool is_mask, - bool is_push_nsh, bool log) +static bool validate_push_nsh(const struct nlattr *attr, bool log) { struct sw_flow_match match; struct sw_flow_key key; - int ret = 0; ovs_match_init(&match, &key, true, NULL); - ret = nsh_key_put_from_nlattr(attr, &match, is_mask, - is_push_nsh, log); - return !ret; + return !nsh_key_put_from_nlattr(attr, &match, false, true, log); } /* Return false if there are any non-masked bits set. 
@@ -2997,13 +2956,6 @@ static int validate_set(const struct nlattr *a, break; - case OVS_KEY_ATTR_NSH: - if (eth_type != htons(ETH_P_NSH)) - return -EINVAL; - if (!validate_nsh(nla_data(a), masked, false, log)) - return -EINVAL; - break; - default: return -EINVAL; } @@ -3437,7 +3389,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return -EINVAL; } mac_proto = MAC_PROTO_NONE; - if (!validate_nsh(nla_data(a), false, true, true)) + if (!validate_push_nsh(nla_data(a), log)) return -EINVAL; break; diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h index fe7f77fc5f18..ff8cdecbe346 100644 --- a/net/openvswitch/flow_netlink.h +++ b/net/openvswitch/flow_netlink.h @@ -65,8 +65,6 @@ int ovs_nla_put_actions(const struct nlattr *attr, void ovs_nla_free_flow_actions(struct sw_flow_actions *); void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); -int nsh_key_from_nlattr(const struct nlattr *attr, struct ovs_key_nsh *nsh, - struct ovs_key_nsh *nsh_mask); int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *nh, size_t size); diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index d108ae0bd0ee..ffc72a741a50 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -107,16 +107,15 @@ int ovs_flow_tbl_count(const struct flow_table *table) static void flow_free(struct sw_flow *flow) { - int cpu; + unsigned int cpu; if (ovs_identifier_is_key(&flow->id)) kfree(flow->id.unmasked_key); if (flow->sf_acts) ovs_nla_free_flow_actions((struct sw_flow_actions __force *) flow->sf_acts); - /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; - cpu = cpumask_next(cpu, flow->cpu_used_mask)) { + + for_each_cpu(cpu, flow->cpu_used_mask) { if (flow->stats[cpu]) kmem_cache_free(flow_stats_cache, (struct sw_flow_stats __force *)flow->stats[cpu]); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a7017d7f0927..494d628d10a5 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *, static int prb_queue_frozen(struct tpacket_kbdq_core *); static void prb_open_block(struct tpacket_kbdq_core *, struct tpacket_block_desc *); -static void prb_retire_rx_blk_timer_expired(struct timer_list *); -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *); static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_clear_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); @@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) return proto; } -static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) -{ - timer_delete_sync(&pkc->retire_blk_timer); -} - static void prb_shutdown_retire_blk_timer(struct packet_sock *po, struct sk_buff_head *rb_queue) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); - - spin_lock_bh(&rb_queue->lock); - pkc->delete_blk_timer = 1; - spin_unlock_bh(&rb_queue->lock); - - prb_del_retire_blk_timer(pkc); -} - -static void prb_setup_retire_blk_timer(struct packet_sock *po) -{ - struct tpacket_kbdq_core *pkc; - - pkc = GET_PBDQC_FROM_RB(&po->rx_ring); - timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired, - 0); - pkc->retire_blk_timer.expires = jiffies; + hrtimer_cancel(&pkc->retire_blk_timer); } static int 
prb_calc_retire_blk_tmo(struct packet_sock *po, @@ -669,57 +648,36 @@ static void init_prb_bdqc(struct packet_sock *po, p1->knum_blocks = req_u->req3.tp_block_nr; p1->hdrlen = po->tp_hdrlen; p1->version = po->tp_version; - p1->last_kactive_blk_num = 0; po->stats.stats3.tp_freeze_q_cnt = 0; if (req_u->req3.tp_retire_blk_tov) - p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; + p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov); else - p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, - req_u->req3.tp_block_size); - p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); + p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po, + req_u->req3.tp_block_size)); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; rwlock_init(&p1->blk_fill_in_prog_lock); p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); - prb_setup_retire_blk_timer(po); + hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime, + HRTIMER_MODE_REL_SOFT); prb_open_block(p1, pbd); } -/* Do NOT update the last_blk_num first. - * Assumes sk_buff_head lock is held. - */ -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) -{ - mod_timer(&pkc->retire_blk_timer, - jiffies + pkc->tov_in_jiffies); - pkc->last_kactive_blk_num = pkc->kactive_blk_num; -} - /* - * Timer logic: - * 1) We refresh the timer only when we open a block. - * By doing this we don't waste cycles refreshing the timer - * on packet-by-packet basis. - * * With a 1MB block-size, on a 1Gbps line, it will take * i) ~8 ms to fill a block + ii) memcpy etc. * In this cut we are not accounting for the memcpy time. * - * So, if the user sets the 'tmo' to 10ms then the timer - * will never fire while the block is still getting filled - * (which is what we want). However, the user could choose - * to close a block early and that's fine. - * - * But when the timer does fire, we check whether or not to refresh it. * Since the tmo granularity is in msecs, it is not too expensive * to refresh the timer, lets say every '8' msecs. * Either the user can set the 'tmo' or we can derive it based on * a) line-speed and b) block-size. * prb_calc_retire_blk_tmo() calculates the tmo. - * */ -static void prb_retire_rx_blk_timer_expired(struct timer_list *t) +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t) { struct packet_sock *po = timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer); @@ -732,9 +690,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) frozen = prb_queue_frozen(pkc); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); - if (unlikely(pkc->delete_blk_timer)) - goto out; - /* We only need to plug the race when the block is partially filled. * tpacket_rcv: * lock(); increment BLOCK_NUM_PKTS; unlock() @@ -750,46 +705,31 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) write_unlock(&pkc->blk_fill_in_prog_lock); } - if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { - if (!frozen) { - if (!BLOCK_NUM_PKTS(pbd)) { - /* An empty block. Just refresh the timer. */ - goto refresh_timer; - } + if (!frozen) { + if (BLOCK_NUM_PKTS(pbd)) { + /* Not an empty block. Need retire the block. */ prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); - if (!prb_dispatch_next_block(pkc, po)) - goto refresh_timer; - else - goto out; - } else { - /* Case 1. Queue was frozen because user-space was - * lagging behind. 
+ prb_dispatch_next_block(pkc, po); + } + } else { + /* Case 1. Queue was frozen because user-space was + * lagging behind. + */ + if (!prb_curr_blk_in_use(pbd)) { + /* Case 2. queue was frozen,user-space caught up, + * now the link went idle && the timer fired. + * We don't have a block to close.So we open this + * block and restart the timer. + * opening a block thaws the queue,restarts timer + * Thawing/timer-refresh is a side effect. */ - if (prb_curr_blk_in_use(pbd)) { - /* - * Ok, user-space is still behind. - * So just refresh the timer. - */ - goto refresh_timer; - } else { - /* Case 2. queue was frozen,user-space caught up, - * now the link went idle && the timer fired. - * We don't have a block to close.So we open this - * block and restart the timer. - * opening a block thaws the queue,restarts timer - * Thawing/timer-refresh is a side effect. - */ - prb_open_block(pkc, pbd); - goto out; - } + prb_open_block(pkc, pbd); } } -refresh_timer: - _prb_refresh_rx_retire_blk_timer(pkc); - -out: + hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime); spin_unlock(&po->sk.sk_receive_queue.lock); + return HRTIMER_RESTART; } static void prb_flush_block(struct tpacket_kbdq_core *pkc1, @@ -883,11 +823,18 @@ static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) } /* - * Side effect of opening a block: + * prb_open_block is called by tpacket_rcv or timer callback. * - * 1) prb_queue is thawed. - * 2) retire_blk_timer is refreshed. + * Reasons why NOT update hrtimer in prb_open_block: + * 1) It will increase complexity to distinguish the two caller scenario. + * 2) hrtimer_cancel and hrtimer_start need to be called if you want to update + * TMO of an already enqueued hrtimer, leading to complex shutdown logic. * + * One side effect of NOT update hrtimer when called by tpacket_rcv is that + * a newly opened block triggered by tpacket_rcv may be retired earlier than + * expected. On the other hand, if timeout is updated in prb_open_block, the + * frequent reception of network packets that leads to prb_open_block being + * called may cause hrtimer to be removed and enqueued repeatedly. */ static void prb_open_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1) @@ -921,7 +868,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1, pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; prb_thaw_queue(pkc1); - _prb_refresh_rx_retire_blk_timer(pkc1); smp_wmb(); } @@ -2265,7 +2211,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, drop_n_acct: atomic_inc(&po->tp_drops); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; drop_n_restore: @@ -3333,11 +3279,12 @@ out_unlock: * Bind a packet socket to a device */ -static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, +static int packet_bind_spkt(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; - char name[sizeof(uaddr->sa_data_min) + 1]; + struct sockaddr *sa = (struct sockaddr *)uaddr; + char name[sizeof(sa->sa_data) + 1]; /* * Check legality @@ -3348,13 +3295,13 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. 
*/ - memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); - name[sizeof(uaddr->sa_data_min)] = 0; + memcpy(name, sa->sa_data, sizeof(sa->sa_data)); + name[sizeof(sa->sa_data)] = 0; return packet_do_bind(sk, name, 0, 0); } -static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int packet_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; struct sock *sk = sock->sk; @@ -3634,11 +3581,11 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; - memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min)); + memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) - strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min)); + strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); rcu_read_unlock(); return sizeof(*uaddr); diff --git a/net/packet/diag.c b/net/packet/diag.c index 6ce1dcc284d9..c8f43e0c1925 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -83,7 +83,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type, pdr.pdr_frame_nr = ring->frame_max + 1; if (ver > TPACKET_V2) { - pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov; + pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime); pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv; pdr.pdr_features = ring->prb_bdqc.feature_req_word; } else { diff --git a/net/packet/internal.h b/net/packet/internal.h index 1e743d0316fd..b76e645cd78d 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -20,15 +20,10 @@ struct tpacket_kbdq_core { unsigned int feature_req_word; unsigned int hdrlen; unsigned char reset_pending_on_curr_blk; - unsigned char delete_blk_timer; unsigned short kactive_blk_num; unsigned short blk_sizeof_priv; - /* last_kactive_blk_num: - * trick to see if user-space has caught up - * in order to avoid refreshing timer when every single pkt arrives. 
- */ - unsigned short last_kactive_blk_num; + unsigned short version; char *pkblk_start; char *pkblk_end; @@ -38,6 +33,7 @@ struct tpacket_kbdq_core { uint64_t knxt_seq_num; char *prev; char *nxt_offset; + struct sk_buff *skb; rwlock_t blk_fill_in_prog_lock; @@ -45,12 +41,10 @@ struct tpacket_kbdq_core { /* Default is set to 8ms */ #define DEFAULT_PRB_RETIRE_TOV (8) - unsigned short retire_blk_tov; - unsigned short version; - unsigned long tov_in_jiffies; + ktime_t interval_ktime; /* timer to retire an outstanding block */ - struct timer_list retire_blk_timer; + struct hrtimer retire_blk_timer; }; struct pgv { diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index a27efa4faa4e..238a9638d2b0 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -22,7 +22,7 @@ #include <net/phonet/pn_dev.h> /* Transport protocol registration */ -static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; +static const struct phonet_protocol __rcu *proto_tab[PHONET_NPROTO] __read_mostly; static const struct phonet_protocol *phonet_proto_get(unsigned int protocol) { @@ -482,7 +482,7 @@ void phonet_proto_unregister(unsigned int protocol, const struct phonet_protocol *pp) { mutex_lock(&proto_tab_lock); - BUG_ON(proto_tab[protocol] != pp); + BUG_ON(rcu_access_pointer(proto_tab[protocol]) != pp); RCU_INIT_POINTER(proto_tab[protocol], NULL); mutex_unlock(&proto_tab_lock); synchronize_rcu(); diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 62527e1ebb88..120e711ea78c 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -376,7 +376,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) case PNS_PEP_CTRL_REQ: if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); break; } __skb_pull(skb, 4); @@ -397,7 +397,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) } if (pn->rx_credits == 0) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = -ENOBUFS; break; } @@ -567,7 +567,7 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb) } if (pn->rx_credits == 0) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = NET_RX_DROP; break; } @@ -882,7 +882,8 @@ drop: return newsk; } -static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len) +static int pep_sock_connect(struct sock *sk, struct sockaddr_unsized *addr, + int len) { struct pep_sock *pn = pep_sk(sk); int err; diff --git a/net/phonet/socket.c b/net/phonet/socket.c index ea4d5e6533db..4423d483c630 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -153,7 +153,7 @@ EXPORT_SYMBOL(pn_sock_unhash); static DEFINE_MUTEX(port_mutex); -static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len) +static int pn_socket_bind(struct socket *sock, struct sockaddr_unsized *addr, int len) { struct sock *sk = sock->sk; struct pn_sock *pn = pn_sk(sk); @@ -206,16 +206,16 @@ static int pn_socket_autobind(struct socket *sock) memset(&sa, 0, sizeof(sa)); sa.spn_family = AF_PHONET; - err = pn_socket_bind(sock, (struct sockaddr *)&sa, - sizeof(struct sockaddr_pn)); + err = pn_socket_bind(sock, (struct sockaddr_unsized *)&sa, + sizeof(struct sockaddr_pn)); if (err != -EINVAL) return err; BUG_ON(!pn_port(pn_sk(sock->sk)->sobject)); return 0; /* socket was already bound */ } -static int pn_socket_connect(struct socket *sock, struct sockaddr *addr, - int len, int flags) +static int pn_socket_connect(struct socket *sock, struct sockaddr_unsized *addr, + int len, int flags) { struct sock *sk = 
sock->sk; struct pn_sock *pn = pn_sk(sk); @@ -587,7 +587,7 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v) from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, - atomic_read(&sk->sk_drops)); + sk_drops_read(sk)); } seq_pad(seq, '\n'); return 0; @@ -602,7 +602,7 @@ const struct seq_operations pn_sock_seq_ops = { #endif static struct { - struct sock *sk[256]; + struct sock __rcu *sk[256]; } pnres; /* @@ -654,7 +654,7 @@ int pn_sock_unbind_res(struct sock *sk, u8 res) return -EPERM; mutex_lock(&resource_mutex); - if (pnres.sk[res] == sk) { + if (rcu_access_pointer(pnres.sk[res]) == sk) { RCU_INIT_POINTER(pnres.sk[res], NULL); ret = 0; } @@ -673,7 +673,7 @@ void pn_sock_unbind_all_res(struct sock *sk) mutex_lock(&resource_mutex); for (res = 0; res < 256; res++) { - if (pnres.sk[res] == sk) { + if (rcu_access_pointer(pnres.sk[res]) == sk) { RCU_INIT_POINTER(pnres.sk[res], NULL); match++; } @@ -688,7 +688,7 @@ void pn_sock_unbind_all_res(struct sock *sk) } #ifdef CONFIG_PROC_FS -static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos) +static struct sock __rcu **pn_res_get_idx(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); unsigned int i; @@ -697,7 +697,7 @@ static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos) return NULL; for (i = 0; i < 256; i++) { - if (pnres.sk[i] == NULL) + if (rcu_access_pointer(pnres.sk[i]) == NULL) continue; if (!pos) return pnres.sk + i; @@ -706,7 +706,7 @@ static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos) return NULL; } -static struct sock **pn_res_get_next(struct seq_file *seq, struct sock **sk) +static struct sock __rcu **pn_res_get_next(struct seq_file *seq, struct sock __rcu **sk) { struct net *net = seq_file_net(seq); unsigned int i; @@ -728,7 +728,7 @@ static void *pn_res_seq_start(struct seq_file *seq, loff_t *pos) static void *pn_res_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct sock **sk; + struct sock __rcu **sk; if (v == SEQ_START_TOKEN) sk = pn_res_get_idx(seq, 0); @@ -747,11 +747,12 @@ static void pn_res_seq_stop(struct seq_file *seq, void *v) static int pn_res_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 63); - if (v == SEQ_START_TOKEN) + if (v == SEQ_START_TOKEN) { seq_puts(seq, "rs uid inode"); - else { - struct sock **psk = v; - struct sock *sk = *psk; + } else { + struct sock __rcu **psk = v; + struct sock *sk = rcu_dereference_protected(*psk, + lockdep_is_held(&resource_mutex)); seq_printf(seq, "%02X %5u %lu", (int) (psk - pnres.sk), diff --git a/net/psp/Kconfig b/net/psp/Kconfig new file mode 100644 index 000000000000..371e8771f3bd --- /dev/null +++ b/net/psp/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# PSP configuration +# +config INET_PSP + bool "PSP Security Protocol support" + depends on INET + select SKB_DECRYPTED + select SOCK_VALIDATE_XMIT + help + Enable kernel support for the PSP Security Protocol (PSP). + For more information see: + https://raw.githubusercontent.com/google/psp/main/doc/PSP_Arch_Spec.pdf + + If unsure, say N. 
diff --git a/net/psp/Makefile b/net/psp/Makefile new file mode 100644 index 000000000000..eb5ff3c5bfb2 --- /dev/null +++ b/net/psp/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_INET_PSP) += psp.o + +psp-y := psp_main.o psp_nl.o psp_sock.o psp-nl-gen.o diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c new file mode 100644 index 000000000000..22a48d0fa378 --- /dev/null +++ b/net/psp/psp-nl-gen.c @@ -0,0 +1,139 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "psp-nl-gen.h" + +#include <uapi/linux/psp.h> + +/* Common nested types */ +const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1] = { + [PSP_A_KEYS_KEY] = { .type = NLA_BINARY, }, + [PSP_A_KEYS_SPI] = { .type = NLA_U32, }, +}; + +/* PSP_CMD_DEV_GET - do */ +static const struct nla_policy psp_dev_get_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* PSP_CMD_DEV_SET - do */ +static const struct nla_policy psp_dev_set_nl_policy[PSP_A_DEV_PSP_VERSIONS_ENA + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_DEV_PSP_VERSIONS_ENA] = NLA_POLICY_MASK(NLA_U32, 0xf), +}; + +/* PSP_CMD_KEY_ROTATE - do */ +static const struct nla_policy psp_key_rotate_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* PSP_CMD_RX_ASSOC - do */ +static const struct nla_policy psp_rx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = { + [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3), + [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, }, +}; + +/* PSP_CMD_TX_ASSOC - do */ +static const struct nla_policy psp_tx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = { + [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3), + [PSP_A_ASSOC_TX_KEY] = NLA_POLICY_NESTED(psp_keys_nl_policy), + [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, }, +}; + +/* PSP_CMD_GET_STATS - do */ +static const struct nla_policy psp_get_stats_nl_policy[PSP_A_STATS_DEV_ID + 1] = { + [PSP_A_STATS_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* Ops table for psp */ +static const struct genl_split_ops psp_nl_ops[] = { + { + .cmd = PSP_CMD_DEV_GET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_get_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_get_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_DEV_GET, + .dumpit = psp_nl_dev_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = PSP_CMD_DEV_SET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_set_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_set_nl_policy, + .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_KEY_ROTATE, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_key_rotate_doit, + .post_doit = psp_device_unlock, + .policy = psp_key_rotate_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_RX_ASSOC, + .pre_doit = psp_assoc_device_get_locked, + .doit = psp_nl_rx_assoc_doit, + .post_doit = psp_device_unlock, + .policy = psp_rx_assoc_nl_policy, + .maxattr = PSP_A_ASSOC_SOCK_FD, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_TX_ASSOC, + .pre_doit = 
psp_assoc_device_get_locked, + .doit = psp_nl_tx_assoc_doit, + .post_doit = psp_device_unlock, + .policy = psp_tx_assoc_nl_policy, + .maxattr = PSP_A_ASSOC_SOCK_FD, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_GET_STATS, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_get_stats_doit, + .post_doit = psp_device_unlock, + .policy = psp_get_stats_nl_policy, + .maxattr = PSP_A_STATS_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_GET_STATS, + .dumpit = psp_nl_get_stats_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, +}; + +static const struct genl_multicast_group psp_nl_mcgrps[] = { + [PSP_NLGRP_MGMT] = { "mgmt", }, + [PSP_NLGRP_USE] = { "use", }, +}; + +struct genl_family psp_nl_family __ro_after_init = { + .name = PSP_FAMILY_NAME, + .version = PSP_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = psp_nl_ops, + .n_split_ops = ARRAY_SIZE(psp_nl_ops), + .mcgrps = psp_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(psp_nl_mcgrps), +}; diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h new file mode 100644 index 000000000000..599c5f1c82f2 --- /dev/null +++ b/net/psp/psp-nl-gen.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ + +#ifndef _LINUX_PSP_GEN_H +#define _LINUX_PSP_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/psp.h> + +/* Common nested types */ +extern const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1]; + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +int psp_assoc_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_get_stats_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_get_stats_dumpit(struct sk_buff *skb, struct netlink_callback *cb); + +enum { + PSP_NLGRP_MGMT, + PSP_NLGRP_USE, +}; + +extern struct genl_family psp_nl_family; + +#endif /* _LINUX_PSP_GEN_H */ diff --git a/net/psp/psp.h b/net/psp/psp.h new file mode 100644 index 000000000000..9f19137593a0 --- /dev/null +++ b/net/psp/psp.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __PSP_PSP_H +#define __PSP_PSP_H + +#include <linux/list.h> +#include <linux/lockdep.h> +#include <linux/mutex.h> +#include <net/netns/generic.h> +#include <net/psp.h> +#include <net/sock.h> + +extern struct xarray psp_devs; +extern struct mutex psp_devs_lock; + +void psp_dev_free(struct psp_dev *psd); +int psp_dev_check_access(struct psp_dev *psd, struct net *net); + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); + +struct psp_assoc *psp_assoc_create(struct psp_dev *psd); +struct psp_dev *psp_dev_get_for_sock(struct sock *sk); +void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas); +int 
psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, + struct psp_key_parsed *key, + struct netlink_ext_ack *extack); +int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, + u32 version, struct psp_key_parsed *key, + struct netlink_ext_ack *extack); +void psp_assocs_key_rotated(struct psp_dev *psd); + +static inline void psp_dev_get(struct psp_dev *psd) +{ + refcount_inc(&psd->refcnt); +} + +static inline bool psp_dev_tryget(struct psp_dev *psd) +{ + return refcount_inc_not_zero(&psd->refcnt); +} + +static inline void psp_dev_put(struct psp_dev *psd) +{ + if (refcount_dec_and_test(&psd->refcnt)) + psp_dev_free(psd); +} + +static inline bool psp_dev_is_registered(struct psp_dev *psd) +{ + lockdep_assert_held(&psd->lock); + return !!psd->ops; +} + +#endif /* __PSP_PSP_H */ diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c new file mode 100644 index 000000000000..a8534124f626 --- /dev/null +++ b/net/psp/psp_main.c @@ -0,0 +1,323 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/bitfield.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/xarray.h> +#include <net/net_namespace.h> +#include <net/psp.h> +#include <net/udp.h> + +#include "psp.h" +#include "psp-nl-gen.h" + +DEFINE_XARRAY_ALLOC1(psp_devs); +struct mutex psp_devs_lock; + +/** + * DOC: PSP locking + * + * psp_devs_lock protects the psp_devs xarray. + * Ordering is take the psp_devs_lock and then the instance lock. + * Each instance is protected by RCU, and has a refcount. + * When driver unregisters the instance gets flushed, but struct sticks around. + */ + +/** + * psp_dev_check_access() - check if user in a given net ns can access PSP dev + * @psd: PSP device structure user is trying to access + * @net: net namespace user is in + * + * Return: 0 if PSP device should be visible in @net, errno otherwise. + */ +int psp_dev_check_access(struct psp_dev *psd, struct net *net) +{ + if (dev_net(psd->main_netdev) == net) + return 0; + return -ENOENT; +} + +/** + * psp_dev_create() - create and register PSP device + * @netdev: main netdevice + * @psd_ops: driver callbacks + * @psd_caps: device capabilities + * @priv_ptr: back-pointer to driver private data + * + * Return: pointer to allocated PSP device, or ERR_PTR. 
+ */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, + struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, + void *priv_ptr) +{ + struct psp_dev *psd; + static u32 last_id; + int err; + + if (WARN_ON(!psd_caps->versions || + !psd_ops->set_config || + !psd_ops->key_rotate || + !psd_ops->rx_spi_alloc || + !psd_ops->tx_key_add || + !psd_ops->tx_key_del || + !psd_ops->get_stats)) + return ERR_PTR(-EINVAL); + + psd = kzalloc(sizeof(*psd), GFP_KERNEL); + if (!psd) + return ERR_PTR(-ENOMEM); + + psd->main_netdev = netdev; + psd->ops = psd_ops; + psd->caps = psd_caps; + psd->drv_priv = priv_ptr; + + mutex_init(&psd->lock); + INIT_LIST_HEAD(&psd->active_assocs); + INIT_LIST_HEAD(&psd->prev_assocs); + INIT_LIST_HEAD(&psd->stale_assocs); + refcount_set(&psd->refcnt, 1); + + mutex_lock(&psp_devs_lock); + err = xa_alloc_cyclic(&psp_devs, &psd->id, psd, xa_limit_16b, + &last_id, GFP_KERNEL); + if (err) { + mutex_unlock(&psp_devs_lock); + kfree(psd); + return ERR_PTR(err); + } + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_ADD_NTF); + + rcu_assign_pointer(netdev->psp_dev, psd); + + mutex_unlock(&psd->lock); + + return psd; +} +EXPORT_SYMBOL(psp_dev_create); + +void psp_dev_free(struct psp_dev *psd) +{ + mutex_lock(&psp_devs_lock); + xa_erase(&psp_devs, psd->id); + mutex_unlock(&psp_devs_lock); + + mutex_destroy(&psd->lock); + kfree_rcu(psd, rcu); +} + +/** + * psp_dev_unregister() - unregister PSP device + * @psd: PSP device structure + */ +void psp_dev_unregister(struct psp_dev *psd) +{ + struct psp_assoc *pas, *next; + + mutex_lock(&psp_devs_lock); + mutex_lock(&psd->lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_DEL_NTF); + + /* Wait until psp_dev_free() to call xa_erase() to prevent a + * different psd from being added to the xarray with this id, while + * there are still references to this psd being held. + */ + xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); + mutex_unlock(&psp_devs_lock); + + list_splice_init(&psd->active_assocs, &psd->prev_assocs); + list_splice_init(&psd->prev_assocs, &psd->stale_assocs); + list_for_each_entry_safe(pas, next, &psd->stale_assocs, assocs_list) + psp_dev_tx_key_del(psd, pas); + + rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); + + psd->ops = NULL; + psd->drv_priv = NULL; + + mutex_unlock(&psd->lock); + + psp_dev_put(psd); +} +EXPORT_SYMBOL(psp_dev_unregister); + +unsigned int psp_key_size(u32 version) +{ + switch (version) { + case PSP_VERSION_HDR0_AES_GCM_128: + case PSP_VERSION_HDR0_AES_GMAC_128: + return 16; + case PSP_VERSION_HDR0_AES_GCM_256: + case PSP_VERSION_HDR0_AES_GMAC_256: + return 32; + default: + return 0; + } +} +EXPORT_SYMBOL(psp_key_size); + +static void psp_write_headers(struct net *net, struct sk_buff *skb, __be32 spi, + u8 ver, unsigned int udp_len, __be16 sport) +{ + struct udphdr *uh = udp_hdr(skb); + struct psphdr *psph = (struct psphdr *)(uh + 1); + + uh->dest = htons(PSP_DEFAULT_UDP_PORT); + uh->source = udp_flow_src_port(net, skb, 0, 0, false); + uh->check = 0; + uh->len = htons(udp_len); + + psph->nexthdr = IPPROTO_TCP; + psph->hdrlen = PSP_HDRLEN_NOOPT; + psph->crypt_offset = 0; + psph->verfl = FIELD_PREP(PSPHDR_VERFL_VERSION, ver) | + FIELD_PREP(PSPHDR_VERFL_ONE, 1); + psph->spi = spi; + memset(&psph->iv, 0, sizeof(psph->iv)); +} + +/* Encapsulate a TCP packet with PSP by adding the UDP+PSP headers and filling + * them in. 
+ */ +bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, + u8 ver, __be16 sport) +{ + u32 network_len = skb_network_header_len(skb); + u32 ethr_len = skb_mac_header_len(skb); + u32 bufflen = ethr_len + network_len; + + if (skb_cow_head(skb, PSP_ENCAP_HLEN)) + return false; + + skb_push(skb, PSP_ENCAP_HLEN); + skb->mac_header -= PSP_ENCAP_HLEN; + skb->network_header -= PSP_ENCAP_HLEN; + skb->transport_header -= PSP_ENCAP_HLEN; + memmove(skb->data, skb->data + PSP_ENCAP_HLEN, bufflen); + + if (skb->protocol == htons(ETH_P_IP)) { + ip_hdr(skb)->protocol = IPPROTO_UDP; + be16_add_cpu(&ip_hdr(skb)->tot_len, PSP_ENCAP_HLEN); + ip_hdr(skb)->check = 0; + ip_hdr(skb)->check = + ip_fast_csum((u8 *)ip_hdr(skb), ip_hdr(skb)->ihl); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + ipv6_hdr(skb)->nexthdr = IPPROTO_UDP; + be16_add_cpu(&ipv6_hdr(skb)->payload_len, PSP_ENCAP_HLEN); + } else { + return false; + } + + skb_set_inner_ipproto(skb, IPPROTO_TCP); + skb_set_inner_transport_header(skb, skb_transport_offset(skb) + + PSP_ENCAP_HLEN); + skb->encapsulation = 1; + psp_write_headers(net, skb, spi, ver, + skb->len - skb_transport_offset(skb), sport); + + return true; +} +EXPORT_SYMBOL(psp_dev_encapsulate); + +/* Receive handler for PSP packets. + * + * Presently it accepts only already-authenticated packets and does not + * support optional fields, such as virtualization cookies. The caller should + * ensure that skb->data is pointing to the mac header, and that skb->mac_len + * is set. This function does not currently adjust skb->csum (CHECKSUM_COMPLETE + * is not supported). + */ +int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) +{ + int l2_hlen = 0, l3_hlen, encap; + struct psp_skb_ext *pse; + struct psphdr *psph; + struct ethhdr *eth; + struct udphdr *uh; + __be16 proto; + bool is_udp; + + eth = (struct ethhdr *)skb->data; + proto = __vlan_get_protocol(skb, eth->h_proto, &l2_hlen); + if (proto == htons(ETH_P_IP)) + l3_hlen = sizeof(struct iphdr); + else if (proto == htons(ETH_P_IPV6)) + l3_hlen = sizeof(struct ipv6hdr); + else + return -EINVAL; + + if (unlikely(!pskb_may_pull(skb, l2_hlen + l3_hlen + PSP_ENCAP_HLEN))) + return -EINVAL; + + if (proto == htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)(skb->data + l2_hlen); + + is_udp = iph->protocol == IPPROTO_UDP; + l3_hlen = iph->ihl * 4; + if (l3_hlen != sizeof(struct iphdr) && + !pskb_may_pull(skb, l2_hlen + l3_hlen + PSP_ENCAP_HLEN)) + return -EINVAL; + } else { + struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + l2_hlen); + + is_udp = ipv6h->nexthdr == IPPROTO_UDP; + } + + if (unlikely(!is_udp)) + return -EINVAL; + + uh = (struct udphdr *)(skb->data + l2_hlen + l3_hlen); + if (unlikely(uh->dest != htons(PSP_DEFAULT_UDP_PORT))) + return -EINVAL; + + pse = skb_ext_add(skb, SKB_EXT_PSP); + if (!pse) + return -EINVAL; + + psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen + + sizeof(struct udphdr)); + pse->spi = psph->spi; + pse->dev_id = dev_id; + pse->generation = generation; + pse->version = FIELD_GET(PSPHDR_VERFL_VERSION, psph->verfl); + + encap = PSP_ENCAP_HLEN; + encap += strip_icv ? 
PSP_TRL_SIZE : 0; + + if (proto == htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)(skb->data + l2_hlen); + + iph->protocol = psph->nexthdr; + iph->tot_len = htons(ntohs(iph->tot_len) - encap); + iph->check = 0; + iph->check = ip_fast_csum((u8 *)iph, iph->ihl); + } else { + struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + l2_hlen); + + ipv6h->nexthdr = psph->nexthdr; + ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) - encap); + } + + memmove(skb->data + PSP_ENCAP_HLEN, skb->data, l2_hlen + l3_hlen); + skb_pull(skb, PSP_ENCAP_HLEN); + + if (strip_icv) + pskb_trim(skb, skb->len - PSP_TRL_SIZE); + + return 0; +} +EXPORT_SYMBOL(psp_dev_rcv); + +static int __init psp_init(void) +{ + mutex_init(&psp_devs_lock); + + return genl_register_family(&psp_nl_family); +} + +subsys_initcall(psp_init); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c new file mode 100644 index 000000000000..6afd7707ec12 --- /dev/null +++ b/net/psp/psp_nl.c @@ -0,0 +1,598 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/ethtool.h> +#include <linux/skbuff.h> +#include <linux/xarray.h> +#include <net/genetlink.h> +#include <net/psp.h> +#include <net/sock.h> + +#include "psp-nl-gen.h" +#include "psp.h" + +/* Netlink helpers */ + +static struct sk_buff *psp_nl_reply_new(struct genl_info *info) +{ + struct sk_buff *rsp; + void *hdr; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return NULL; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + nlmsg_free(rsp); + return NULL; + } + + return rsp; +} + +static int psp_nl_reply_send(struct sk_buff *rsp, struct genl_info *info) +{ + /* Note that this *only* works with a single message per skb! */ + nlmsg_end(rsp, (struct nlmsghdr *)rsp->data); + + return genlmsg_reply(rsp, info); +} + +/* Device stuff */ + +static struct psp_dev * +psp_device_get_and_lock(struct net *net, struct nlattr *dev_id) +{ + struct psp_dev *psd; + int err; + + mutex_lock(&psp_devs_lock); + psd = xa_load(&psp_devs, nla_get_u32(dev_id)); + if (!psd) { + mutex_unlock(&psp_devs_lock); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + err = psp_dev_check_access(psd, net); + if (err) { + mutex_unlock(&psd->lock); + return ERR_PTR(err); + } + + return psd; +} + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + if (GENL_REQ_ATTR_CHECK(info, PSP_A_DEV_ID)) + return -EINVAL; + + info->user_ptr[0] = psp_device_get_and_lock(genl_info_net(info), + info->attrs[PSP_A_DEV_ID]); + return PTR_ERR_OR_ZERO(info->user_ptr[0]); +} + +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + + mutex_unlock(&psd->lock); + if (socket) + sockfd_put(socket); +} + +static int +psp_nl_dev_fill(struct psp_dev *psd, struct sk_buff *rsp, + const struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(rsp, PSP_A_DEV_IFINDEX, psd->main_netdev->ifindex) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_CAP, psd->caps->versions) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_ENA, psd->config.versions)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + 
+ if (!genl_has_listeners(&psp_nl_family, dev_net(psd->main_netdev), + PSP_NLGRP_MGMT)) + return; + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + genl_info_init_ntf(&info, &psp_nl_family, cmd); + if (psp_nl_dev_fill(psd, ntf, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_MGMT, GFP_KERNEL); +} + +int psp_nl_dev_get_doit(struct sk_buff *req, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct sk_buff *rsp; + int err; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + err = psp_nl_dev_fill(psd, rsp, info); + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +psp_nl_dev_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb, + struct psp_dev *psd) +{ + if (psp_dev_check_access(psd, sock_net(rsp->sk))) + return 0; + + return psp_nl_dev_fill(psd, rsp, genl_info_dump(cb)); +} + +int psp_nl_dev_get_dumpit(struct sk_buff *rsp, struct netlink_callback *cb) +{ + struct psp_dev *psd; + int err = 0; + + mutex_lock(&psp_devs_lock); + xa_for_each_start(&psp_devs, cb->args[0], psd, cb->args[0]) { + mutex_lock(&psd->lock); + err = psp_nl_dev_get_dumpit_one(rsp, cb, psd); + mutex_unlock(&psd->lock); + if (err) + break; + } + mutex_unlock(&psp_devs_lock); + + return err; +} + +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct psp_dev_config new_config; + struct sk_buff *rsp; + int err; + + memcpy(&new_config, &psd->config, sizeof(new_config)); + + if (info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]) { + new_config.versions = + nla_get_u32(info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]); + if (new_config.versions & ~psd->caps->versions) { + NL_SET_ERR_MSG(info->extack, "Requested PSP versions not supported by the device"); + return -EINVAL; + } + } else { + NL_SET_ERR_MSG(info->extack, "No settings present"); + return -EINVAL; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + if (memcmp(&new_config, &psd->config, sizeof(new_config))) { + err = psd->ops->set_config(psd, &new_config, info->extack); + if (err) + goto err_free_rsp; + + memcpy(&psd->config, &new_config, sizeof(new_config)); + } + + psp_nl_notify_dev(psd, PSP_CMD_DEV_CHANGE_NTF); + + return psp_nl_reply_send(rsp, info); + +err_free_rsp: + nlmsg_free(rsp); + return err; +} + +int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct genl_info ntf_info; + struct sk_buff *ntf, *rsp; + u8 prev_gen; + int err; + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + genl_info_init_ntf(&ntf_info, &psp_nl_family, PSP_CMD_KEY_ROTATE_NTF); + ntf = psp_nl_reply_new(&ntf_info); + if (!ntf) { + err = -ENOMEM; + goto err_free_rsp; + } + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(ntf, PSP_A_DEV_ID, psd->id)) { + err = -EMSGSIZE; + goto err_free_ntf; + } + + /* suggest the next gen number, driver can override */ + prev_gen = psd->generation; + psd->generation = (prev_gen + 1) & PSP_GEN_VALID_MASK; + + err = psd->ops->key_rotate(psd, info->extack); + if (err) + goto err_free_ntf; + + WARN_ON_ONCE((psd->generation && psd->generation == prev_gen) || + psd->generation & ~PSP_GEN_VALID_MASK); + + psp_assocs_key_rotated(psd); + psd->stats.rotations++; + + nlmsg_end(ntf, (struct nlmsghdr *)ntf->data); + 
genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_USE, GFP_KERNEL); + return psp_nl_reply_send(rsp, info); + +err_free_ntf: + nlmsg_free(ntf); +err_free_rsp: + nlmsg_free(rsp); + return err; +} + +/* Key etc. */ + +int psp_assoc_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket; + struct psp_dev *psd; + struct nlattr *id; + int fd, err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_SOCK_FD)) + return -EINVAL; + + fd = nla_get_u32(info->attrs[PSP_A_ASSOC_SOCK_FD]); + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + if (!sk_is_tcp(socket->sk)) { + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[PSP_A_ASSOC_SOCK_FD], + "Unsupported socket family and type"); + err = -EOPNOTSUPP; + goto err_sock_put; + } + + psd = psp_dev_get_for_sock(socket->sk); + if (psd) { + err = psp_dev_check_access(psd, genl_info_net(info)); + if (err) { + psp_dev_put(psd); + psd = NULL; + } + } + + if (!psd && GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_DEV_ID)) { + err = -EINVAL; + goto err_sock_put; + } + + id = info->attrs[PSP_A_ASSOC_DEV_ID]; + if (psd) { + mutex_lock(&psd->lock); + if (id && psd->id != nla_get_u32(id)) { + mutex_unlock(&psd->lock); + NL_SET_ERR_MSG_ATTR(info->extack, id, + "Device id vs socket mismatch"); + err = -EINVAL; + goto err_psd_put; + } + + psp_dev_put(psd); + } else { + psd = psp_device_get_and_lock(genl_info_net(info), id); + if (IS_ERR(psd)) { + err = PTR_ERR(psd); + goto err_sock_put; + } + } + + info->user_ptr[0] = psd; + info->user_ptr[1] = socket; + + return 0; + +err_psd_put: + psp_dev_put(psd); +err_sock_put: + sockfd_put(socket); + return err; +} + +static int +psp_nl_parse_key(struct genl_info *info, u32 attr, struct psp_key_parsed *key, + unsigned int key_sz) +{ + struct nlattr *nest = info->attrs[attr]; + struct nlattr *tb[PSP_A_KEYS_SPI + 1]; + u32 spi; + int err; + + err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, + psp_keys_nl_policy, info->extack); + if (err) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_KEY) || + NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_SPI)) + return -EINVAL; + + if (nla_len(tb[PSP_A_KEYS_KEY]) != key_sz) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], + "incorrect key length"); + return -EINVAL; + } + + spi = nla_get_u32(tb[PSP_A_KEYS_SPI]); + if (!(spi & PSP_SPI_KEY_ID)) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], + "invalid SPI: lower 31b must be non-zero"); + return -EINVAL; + } + + key->spi = cpu_to_be32(spi); + memcpy(key->key, nla_data(tb[PSP_A_KEYS_KEY]), key_sz); + + return 0; +} + +static int +psp_nl_put_key(struct sk_buff *skb, u32 attr, u32 version, + struct psp_key_parsed *key) +{ + int key_sz = psp_key_size(version); + void *nest; + + nest = nla_nest_start(skb, attr); + + if (nla_put_u32(skb, PSP_A_KEYS_SPI, be32_to_cpu(key->spi)) || + nla_put(skb, PSP_A_KEYS_KEY, key_sz, key->key)) { + nla_nest_cancel(skb, nest); + return -EMSGSIZE; + } + + nla_nest_end(skb, nest); + + return 0; +} + +int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + struct psp_key_parsed key; + struct psp_assoc *pas; + struct sk_buff *rsp; + u32 version; + int err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION)) + return -EINVAL; + + version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); + if (!(psd->caps->versions & (1 << version))) { + 
NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); + return -EOPNOTSUPP; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + pas = psp_assoc_create(psd); + if (!pas) { + err = -ENOMEM; + goto err_free_rsp; + } + pas->version = version; + + err = psd->ops->rx_spi_alloc(psd, version, &key, info->extack); + if (err) + goto err_free_pas; + + if (nla_put_u32(rsp, PSP_A_ASSOC_DEV_ID, psd->id) || + psp_nl_put_key(rsp, PSP_A_ASSOC_RX_KEY, version, &key)) { + err = -EMSGSIZE; + goto err_free_pas; + } + + err = psp_sock_assoc_set_rx(socket->sk, pas, &key, info->extack); + if (err) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_SOCK_FD]); + goto err_free_pas; + } + psp_assoc_put(pas); + + return psp_nl_reply_send(rsp, info); + +err_free_pas: + psp_assoc_put(pas); +err_free_rsp: + nlmsg_free(rsp); + return err; +} + +int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + struct psp_key_parsed key; + struct sk_buff *rsp; + unsigned int key_sz; + u32 version; + int err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION) || + GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_TX_KEY)) + return -EINVAL; + + version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); + if (!(psd->caps->versions & (1 << version))) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); + return -EOPNOTSUPP; + } + + key_sz = psp_key_size(version); + if (!key_sz) + return -EINVAL; + + err = psp_nl_parse_key(info, PSP_A_ASSOC_TX_KEY, &key, key_sz); + if (err < 0) + return err; + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + err = psp_sock_assoc_set_tx(socket->sk, psd, version, &key, + info->extack); + if (err) + goto err_free_msg; + + return psp_nl_reply_send(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +psp_nl_stats_fill(struct psp_dev *psd, struct sk_buff *rsp, + const struct genl_info *info) +{ + unsigned int required_cnt = sizeof(struct psp_dev_stats) / sizeof(u64); + struct psp_dev_stats stats; + void *hdr; + int i; + + memset(&stats, 0xff, sizeof(stats)); + psd->ops->get_stats(psd, &stats); + + for (i = 0; i < required_cnt; i++) + if (WARN_ON_ONCE(stats.required[i] == ETHTOOL_STAT_NOT_SET)) + return -EOPNOTSUPP; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, PSP_A_STATS_DEV_ID, psd->id) || + nla_put_uint(rsp, PSP_A_STATS_KEY_ROTATIONS, + psd->stats.rotations) || + nla_put_uint(rsp, PSP_A_STATS_STALE_EVENTS, psd->stats.stales) || + nla_put_uint(rsp, PSP_A_STATS_RX_PACKETS, stats.rx_packets) || + nla_put_uint(rsp, PSP_A_STATS_RX_BYTES, stats.rx_bytes) || + nla_put_uint(rsp, PSP_A_STATS_RX_AUTH_FAIL, stats.rx_auth_fail) || + nla_put_uint(rsp, PSP_A_STATS_RX_ERROR, stats.rx_error) || + nla_put_uint(rsp, PSP_A_STATS_RX_BAD, stats.rx_bad) || + nla_put_uint(rsp, PSP_A_STATS_TX_PACKETS, stats.tx_packets) || + nla_put_uint(rsp, PSP_A_STATS_TX_BYTES, stats.tx_bytes) || + nla_put_uint(rsp, PSP_A_STATS_TX_ERROR, stats.tx_error)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +int psp_nl_get_stats_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct sk_buff *rsp; + int err; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + err = psp_nl_stats_fill(psd, rsp, info); + if (err) + goto err_free_msg; + + return 
genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +psp_nl_stats_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb, + struct psp_dev *psd) +{ + if (psp_dev_check_access(psd, sock_net(rsp->sk))) + return 0; + + return psp_nl_stats_fill(psd, rsp, genl_info_dump(cb)); +} + +int psp_nl_get_stats_dumpit(struct sk_buff *rsp, struct netlink_callback *cb) +{ + struct psp_dev *psd; + int err = 0; + + mutex_lock(&psp_devs_lock); + xa_for_each_start(&psp_devs, cb->args[0], psd, cb->args[0]) { + mutex_lock(&psd->lock); + err = psp_nl_stats_get_dumpit_one(rsp, cb, psd); + mutex_unlock(&psd->lock); + if (err) + break; + } + mutex_unlock(&psp_devs_lock); + + return err; +} diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c new file mode 100644 index 000000000000..f785672b7df6 --- /dev/null +++ b/net/psp/psp_sock.c @@ -0,0 +1,294 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/file.h> +#include <linux/net.h> +#include <linux/rcupdate.h> +#include <linux/tcp.h> + +#include <net/ip.h> +#include <net/psp.h> +#include "psp.h" + +struct psp_dev *psp_dev_get_for_sock(struct sock *sk) +{ + struct psp_dev *psd = NULL; + struct dst_entry *dst; + + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) { + psd = rcu_dereference(dst_dev_rcu(dst)->psp_dev); + if (psd && !psp_dev_tryget(psd)) + psd = NULL; + } + rcu_read_unlock(); + + return psd; +} + +static struct sk_buff * +psp_validate_xmit(struct sock *sk, struct net_device *dev, struct sk_buff *skb) +{ + struct psp_assoc *pas; + bool good; + + rcu_read_lock(); + pas = psp_skb_get_assoc_rcu(skb); + good = !pas || rcu_access_pointer(dev->psp_dev) == pas->psd; + rcu_read_unlock(); + if (!good) { + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PSP_OUTPUT); + return NULL; + } + + return skb; +} + +struct psp_assoc *psp_assoc_create(struct psp_dev *psd) +{ + struct psp_assoc *pas; + + lockdep_assert_held(&psd->lock); + + pas = kzalloc(struct_size(pas, drv_data, psd->caps->assoc_drv_spc), + GFP_KERNEL_ACCOUNT); + if (!pas) + return NULL; + + pas->psd = psd; + pas->dev_id = psd->id; + pas->generation = psd->generation; + psp_dev_get(psd); + refcount_set(&pas->refcnt, 1); + + list_add_tail(&pas->assocs_list, &psd->active_assocs); + + return pas; +} + +static struct psp_assoc *psp_assoc_dummy(struct psp_assoc *pas) +{ + struct psp_dev *psd = pas->psd; + size_t sz; + + lockdep_assert_held(&psd->lock); + + sz = struct_size(pas, drv_data, psd->caps->assoc_drv_spc); + return kmemdup(pas, sz, GFP_KERNEL); +} + +static int psp_dev_tx_key_add(struct psp_dev *psd, struct psp_assoc *pas, + struct netlink_ext_ack *extack) +{ + return psd->ops->tx_key_add(psd, pas, extack); +} + +void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas) +{ + if (pas->tx.spi) + psd->ops->tx_key_del(psd, pas); + list_del(&pas->assocs_list); +} + +static void psp_assoc_free(struct work_struct *work) +{ + struct psp_assoc *pas = container_of(work, struct psp_assoc, work); + struct psp_dev *psd = pas->psd; + + mutex_lock(&psd->lock); + if (psd->ops) + psp_dev_tx_key_del(psd, pas); + mutex_unlock(&psd->lock); + psp_dev_put(psd); + kfree(pas); +} + +static void psp_assoc_free_queue(struct rcu_head *head) +{ + struct psp_assoc *pas = container_of(head, struct psp_assoc, rcu); + + INIT_WORK(&pas->work, psp_assoc_free); + schedule_work(&pas->work); +} + +/** + * psp_assoc_put() - release a reference on a PSP association + * @pas: association to release + */ +void psp_assoc_put(struct psp_assoc *pas) +{ + if (pas && 
refcount_dec_and_test(&pas->refcnt)) + call_rcu(&pas->rcu, psp_assoc_free_queue); +} + +void psp_sk_assoc_free(struct sock *sk) +{ + struct psp_assoc *pas = rcu_dereference_protected(sk->psp_assoc, 1); + + rcu_assign_pointer(sk->psp_assoc, NULL); + psp_assoc_put(pas); +} + +int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, + struct psp_key_parsed *key, + struct netlink_ext_ack *extack) +{ + int err; + + memcpy(&pas->rx, key, sizeof(*key)); + + lock_sock(sk); + + if (psp_sk_assoc(sk)) { + NL_SET_ERR_MSG(extack, "Socket already has PSP state"); + err = -EBUSY; + goto exit_unlock; + } + + refcount_inc(&pas->refcnt); + rcu_assign_pointer(sk->psp_assoc, pas); + err = 0; + +exit_unlock: + release_sock(sk); + + return err; +} + +static int psp_sock_recv_queue_check(struct sock *sk, struct psp_assoc *pas) +{ + struct psp_skb_ext *pse; + struct sk_buff *skb; + + skb_rbtree_walk(skb, &tcp_sk(sk)->out_of_order_queue) { + pse = skb_ext_find(skb, SKB_EXT_PSP); + if (!psp_pse_matches_pas(pse, pas)) + return -EBUSY; + } + + skb_queue_walk(&sk->sk_receive_queue, skb) { + pse = skb_ext_find(skb, SKB_EXT_PSP); + if (!psp_pse_matches_pas(pse, pas)) + return -EBUSY; + } + return 0; +} + +int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, + u32 version, struct psp_key_parsed *key, + struct netlink_ext_ack *extack) +{ + struct inet_connection_sock *icsk; + struct psp_assoc *pas, *dummy; + int err; + + lock_sock(sk); + + pas = psp_sk_assoc(sk); + if (!pas) { + NL_SET_ERR_MSG(extack, "Socket has no Rx key"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->psd != psd) { + NL_SET_ERR_MSG(extack, "Rx key from different device"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->version != version) { + NL_SET_ERR_MSG(extack, + "PSP version mismatch with existing state"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->tx.spi) { + NL_SET_ERR_MSG(extack, "Tx key already set"); + err = -EBUSY; + goto exit_unlock; + } + + err = psp_sock_recv_queue_check(sk, pas); + if (err) { + NL_SET_ERR_MSG(extack, "Socket has incompatible segments already in the recv queue"); + goto exit_unlock; + } + + /* Pass a fake association to drivers to make sure they don't + * try to store pointers to it. For re-keying we'll need to + * re-allocate the assoc structures. + */ + dummy = psp_assoc_dummy(pas); + if (!dummy) { + err = -ENOMEM; + goto exit_unlock; + } + + memcpy(&dummy->tx, key, sizeof(*key)); + err = psp_dev_tx_key_add(psd, dummy, extack); + if (err) + goto exit_free_dummy; + + memcpy(pas->drv_data, dummy->drv_data, psd->caps->assoc_drv_spc); + memcpy(&pas->tx, key, sizeof(*key)); + + WRITE_ONCE(sk->sk_validate_xmit_skb, psp_validate_xmit); + tcp_write_collapse_fence(sk); + pas->upgrade_seq = tcp_sk(sk)->rcv_nxt; + + icsk = inet_csk(sk); + icsk->icsk_ext_hdr_len += psp_sk_overhead(sk); + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); + +exit_free_dummy: + kfree(dummy); +exit_unlock: + release_sock(sk); + return err; +} + +void psp_assocs_key_rotated(struct psp_dev *psd) +{ + struct psp_assoc *pas, *next; + + /* Mark the stale associations as invalid, they will no longer + * be able to Rx any traffic. 
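The association objects above are reference counted: psp_assoc_create() starts the count at one, each user (the socket, a time-wait socket) takes another reference, and the final psp_assoc_put() hands the object to an RCU callback which in turn schedules a workqueue item, because the real free path takes the device mutex and may sleep. A minimal user-space sketch of the get/put pairing, with C11 atomics standing in for refcount_t and a direct call standing in for call_rcu() plus schedule_work():

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct assoc {
        atomic_int refcnt;
        int id;
};

static void assoc_free_deferred(struct assoc *pas)
{
        /* In the kernel this runs from a workqueue after an RCU grace
         * period; in this sketch we simply free right away. */
        printf("freeing assoc %d\n", pas->id);
        free(pas);
}

static struct assoc *assoc_get(struct assoc *pas)
{
        atomic_fetch_add(&pas->refcnt, 1);
        return pas;
}

static void assoc_put(struct assoc *pas)
{
        if (pas && atomic_fetch_sub(&pas->refcnt, 1) == 1)
                assoc_free_deferred(pas);       /* last reference dropped */
}

int main(void)
{
        struct assoc *pas = calloc(1, sizeof(*pas));

        atomic_init(&pas->refcnt, 1);   /* creator's reference */
        pas->id = 42;
        assoc_get(pas);                 /* e.g. the socket takes a reference */
        assoc_put(pas);                 /* socket releases it */
        assoc_put(pas);                 /* creator drops the last reference */
        return 0;
}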
+ */ + list_for_each_entry_safe(pas, next, &psd->prev_assocs, assocs_list) { + pas->generation |= ~PSP_GEN_VALID_MASK; + psd->stats.stales++; + } + list_splice_init(&psd->prev_assocs, &psd->stale_assocs); + list_splice_init(&psd->active_assocs, &psd->prev_assocs); + + /* TODO: we should inform the sockets that got shut down */ +} + +void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) +{ + struct psp_assoc *pas = psp_sk_assoc(sk); + + if (pas) + refcount_inc(&pas->refcnt); + rcu_assign_pointer(tw->psp_assoc, pas); + tw->tw_validate_xmit_skb = psp_validate_xmit; +} + +void psp_twsk_assoc_free(struct inet_timewait_sock *tw) +{ + struct psp_assoc *pas = rcu_dereference_protected(tw->psp_assoc, 1); + + rcu_assign_pointer(tw->psp_assoc, NULL); + psp_assoc_put(pas); +} + +void psp_reply_set_decrypted(const struct sock *sk, struct sk_buff *skb) +{ + struct psp_assoc *pas; + + rcu_read_lock(); + pas = psp_sk_get_assoc_rcu(sk); + if (pas && pas->tx.spi) + skb->decrypted = 1; + rcu_read_unlock(); +} +EXPORT_IPV6_MOD_GPL(psp_reply_set_decrypted); diff --git a/net/qrtr/af_qrtr.c b/net/qrtr/af_qrtr.c index 00c51cf693f3..dab839f61ee9 100644 --- a/net/qrtr/af_qrtr.c +++ b/net/qrtr/af_qrtr.c @@ -824,7 +824,7 @@ static int qrtr_autobind(struct socket *sock) } /* Bind socket to specified sockaddr. */ -static int qrtr_bind(struct socket *sock, struct sockaddr *saddr, int len) +static int qrtr_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); struct qrtr_sock *ipc = qrtr_sk(sock->sk); @@ -1084,7 +1084,7 @@ out: return rc; } -static int qrtr_connect(struct socket *sock, struct sockaddr *saddr, +static int qrtr_connect(struct socket *sock, struct sockaddr_unsized *saddr, int len, int flags) { DECLARE_SOCKADDR(struct sockaddr_qrtr *, addr, saddr); diff --git a/net/qrtr/ns.c b/net/qrtr/ns.c index 3de9350cbf30..bfcc1a453f23 100644 --- a/net/qrtr/ns.c +++ b/net/qrtr/ns.c @@ -714,7 +714,7 @@ int qrtr_ns_init(void) sq.sq_port = QRTR_PORT_CTRL; qrtr_ns.local_node = sq.sq_node; - ret = kernel_bind(qrtr_ns.sock, (struct sockaddr *)&sq, sizeof(sq)); + ret = kernel_bind(qrtr_ns.sock, (struct sockaddr_unsized *)&sq, sizeof(sq)); if (ret < 0) { pr_err("failed to bind to socket\n"); goto err_wq; diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 086a13170e09..b396c673dfaf 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -242,7 +242,7 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) - mask |= POLLERR; + mask |= EPOLLERR; read_unlock_irqrestore(&rs->rs_recv_lock, flags); /* clear state any time we wake a seen-congested socket */ @@ -533,7 +533,7 @@ out: } -static int rds_connect(struct socket *sock, struct sockaddr *uaddr, +static int rds_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; diff --git a/net/rds/bind.c b/net/rds/bind.c index 97a29172a8ee..f800d920d969 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -160,7 +160,7 @@ void rds_remove_bound(struct rds_sock *rs) rs->rs_bound_addr = in6addr_any; } -int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +int rds_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); diff --git a/net/rds/connection.c b/net/rds/connection.c index 
d62f486ab29f..68bc88cce84e 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -57,16 +57,17 @@ static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr, static u32 rds6_hash_secret __read_mostly; static u32 rds_hash_secret __read_mostly; - u32 lhash, fhash, hash; + __be32 lhash, fhash; + u32 hash; net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret)); - lhash = (__force u32)laddr->s6_addr32[3]; + lhash = laddr->s6_addr32[3]; #if IS_ENABLED(CONFIG_IPV6) - fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret); + fhash = (__force __be32)__ipv6_addr_jhash(faddr, rds6_hash_secret); #else - fhash = (__force u32)faddr->s6_addr32[3]; + fhash = faddr->s6_addr32[3]; #endif hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret); diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c index 28c1b0022178..bd861191157b 100644 --- a/net/rds/ib_frmr.c +++ b/net/rds/ib_frmr.c @@ -133,12 +133,15 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_dma_len, &off, PAGE_SIZE); - if (unlikely(ret != ibmr->sg_dma_len)) - return ret < 0 ? ret : -EINVAL; + if (unlikely(ret != ibmr->sg_dma_len)) { + ret = ret < 0 ? ret : -EINVAL; + goto out_inc; + } - if (cmpxchg(&frmr->fr_state, - FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE) - return -EBUSY; + if (cmpxchg(&frmr->fr_state, FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE) { + ret = -EBUSY; + goto out_inc; + } atomic_inc(&ibmr->ic->i_fastreg_inuse_count); @@ -166,11 +169,10 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) /* Failure here can be because of -ENOMEM as well */ rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE); - atomic_inc(&ibmr->ic->i_fastreg_wrs); if (printk_ratelimit()) pr_warn("RDS/IB: %s returned error(%d)\n", __func__, ret); - goto out; + goto out_inc; } /* Wait for the registration to complete in order to prevent an invalid @@ -179,8 +181,10 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr) */ wait_event(frmr->fr_reg_done, !frmr->fr_reg); -out: + return ret; +out_inc: + atomic_inc(&ibmr->ic->i_fastreg_wrs); return ret; } diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index ea5e9aee4959..5884de8c6f45 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -108,7 +108,6 @@ struct rds_ib_mr_pool { }; extern struct workqueue_struct *rds_ib_mr_wq; -extern bool prefer_frmr; struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, int npages); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index d1cfceeff133..6585164c7059 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -672,7 +672,8 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, int rds_ib_mr_init(void) { - rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM, 0); + rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!rds_ib_mr_wq) return -ENOMEM; return 0; diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index e53b7f266bd7..4248dfa816eb 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1034,7 +1034,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, rds_ib_stats_inc(s_ib_rx_ring_empty); if (rds_ib_ring_low(&ic->i_recv_ring)) { - rds_ib_recv_refill(conn, 0, GFP_NOWAIT | __GFP_NOWARN); + rds_ib_recv_refill(conn, 0, GFP_NOWAIT); rds_ib_stats_inc(s_ib_rx_refill_from_cq); } } diff --git a/net/rds/message.c b/net/rds/message.c index 7af59d2443e5..199a899a43e9 100644 --- a/net/rds/message.c +++ 
b/net/rds/message.c @@ -44,8 +44,8 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), -[RDS_EXTHDR_NPATHS] = sizeof(u16), -[RDS_EXTHDR_GEN_NUM] = sizeof(u32), +[RDS_EXTHDR_NPATHS] = sizeof(__be16), +[RDS_EXTHDR_GEN_NUM] = sizeof(__be32), }; void rds_message_addref(struct rds_message *rm) diff --git a/net/rds/rds.h b/net/rds/rds.h index dc360252c515..a029e5fcdea7 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -93,7 +93,7 @@ enum { /* Max number of multipaths per RDS connection. Must be a power of 2 */ #define RDS_MPATH_WORKERS 8 -#define RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \ +#define RDS_MPATH_HASH(rs, n) (jhash_1word(ntohs((rs)->rs_bound_port), \ (rs)->rs_hash_initval) & ((n) - 1)) #define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr)) @@ -735,7 +735,7 @@ extern wait_queue_head_t rds_poll_waitq; /* bind.c */ -int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int rds_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); void rds_remove_bound(struct rds_sock *rs); struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, __u32 scope_id); diff --git a/net/rds/recv.c b/net/rds/recv.c index 5627f80013f8..66205d6924bf 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -202,8 +202,8 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, unsigned int pos = 0, type, len; union { struct rds_ext_header_version version; - u16 rds_npaths; - u32 rds_gen_num; + __be16 rds_npaths; + __be32 rds_gen_num; } buffer; u32 new_peer_gen_num = 0; diff --git a/net/rds/send.c b/net/rds/send.c index 42d991bc8543..0b3d0ef2f008 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1454,8 +1454,8 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport, if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) && cp->cp_conn->c_trans->t_mp_capable) { - u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS); - u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num); + __be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS); + __be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_NPATHS, &npaths, diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index a0046e99d6df..92891b0d224d 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -145,7 +145,7 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) addrlen = sizeof(sin); } - ret = kernel_bind(sock, addr, addrlen); + ret = kernel_bind(sock, (struct sockaddr_unsized *)addr, addrlen); if (ret) { rdsdebug("bind failed with %d at address %pI6c\n", ret, &conn->c_laddr); @@ -173,7 +173,7 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) * own the socket */ rds_tcp_set_callbacks(sock, cp); - ret = kernel_connect(sock, addr, addrlen, O_NONBLOCK); + ret = kernel_connect(sock, (struct sockaddr_unsized *)addr, addrlen, O_NONBLOCK); rdsdebug("connect to address %pI6c returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 91e34af3fe5d..820d3e20de19 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -290,7 +290,7 @@ struct socket *rds_tcp_listen_init(struct net *net, bool isv6) addr_len = sizeof(*sin); } - ret = kernel_bind(sock, (struct sockaddr *)&ss, addr_len); + ret = kernel_bind(sock, (struct sockaddr_unsized *)&ss, addr_len); if (ret < 0) { 
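Several of the RDS changes above re-type on-the-wire fields (the NPATHS and GEN_NUM extension headers, the bound port fed to RDS_MPATH_HASH) as __be16/__be32 and convert them explicitly, so the value fed to the hash is the same regardless of host endianness. A small user-space sketch of the idea; ntohs() and a mask stand in for the kernel's jhash-based bucket selection, and NWORKERS is a stand-in for RDS_MPATH_WORKERS:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define NWORKERS 8      /* must be a power of two, like RDS_MPATH_WORKERS */

/* The port is carried in network byte order (__be16 in the kernel); convert
 * to host order before using it as hash input so every host picks the same
 * bucket for the same connection. */
static unsigned int mpath_bucket(uint16_t port_be)
{
        return ntohs(port_be) & (NWORKERS - 1);
}

int main(void)
{
        uint16_t port_be = htons(18634);

        printf("bucket %u\n", mpath_bucket(port_be));
        return 0;
}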
rdsdebug("could not bind %s listener socket: %d\n", isv6 ? "IPv6" : "IPv4", ret); diff --git a/net/rfkill/input.c b/net/rfkill/input.c index 598d0a61bda7..53d286b10843 100644 --- a/net/rfkill/input.c +++ b/net/rfkill/input.c @@ -159,7 +159,7 @@ static void rfkill_schedule_global_op(enum rfkill_sched_op op) rfkill_op_pending = true; if (op == RFKILL_GLOBAL_OP_EPO && !rfkill_is_epo_lock_active()) { /* bypass the limiter for EPO */ - mod_delayed_work(system_wq, &rfkill_op_work, 0); + mod_delayed_work(system_percpu_wq, &rfkill_op_work, 0); rfkill_last_scheduled = jiffies; } else rfkill_schedule_ratelimited(); diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c index 41e657e97761..cf2dcec6ce5a 100644 --- a/net/rfkill/rfkill-gpio.c +++ b/net/rfkill/rfkill-gpio.c @@ -94,10 +94,10 @@ static const struct dmi_system_id rfkill_gpio_deny_table[] = { static int rfkill_gpio_probe(struct platform_device *pdev) { struct rfkill_gpio_data *rfkill; - struct gpio_desc *gpio; + const char *type_name = NULL; const char *name_property; const char *type_property; - const char *type_name; + struct gpio_desc *gpio; int ret; if (dmi_check_system(rfkill_gpio_deny_table)) diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 4e72b636a46a..fd67494f2815 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -170,7 +170,7 @@ void rose_kill_by_neigh(struct rose_neigh *neigh) if (rose->neighbour == neigh) { rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); rose->neighbour = NULL; } } @@ -212,7 +212,7 @@ start: if (rose->device == dev) { rose_disconnect(sk, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); if (rose->neighbour) - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); netdev_put(rose->device, &rose->dev_tracker); rose->device = NULL; } @@ -655,7 +655,7 @@ static int rose_release(struct socket *sock) break; case ROSE_STATE_2: - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); release_sock(sk); rose_disconnect(sk, 0, -1, -1); lock_sock(sk); @@ -693,7 +693,7 @@ static int rose_release(struct socket *sock) return 0; } -static int rose_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int rose_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); @@ -765,7 +765,8 @@ out_release: return err; } -static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) +static int rose_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, + int flags) { struct sock *sk = sock->sk; struct rose_sock *rose = rose_sk(sk); @@ -823,6 +824,7 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le rose->lci = rose_new_lci(rose->neighbour); if (!rose->lci) { err = -ENETUNREACH; + rose_neigh_put(rose->neighbour); goto out_release; } @@ -834,12 +836,14 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le dev = rose_dev_first(); if (!dev) { err = -ENETUNREACH; + rose_neigh_put(rose->neighbour); goto out_release; } user = ax25_findbyuid(current_euid()); if (!user) { err = -EINVAL; + rose_neigh_put(rose->neighbour); dev_put(dev); goto out_release; } @@ -874,8 +878,6 @@ static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_le rose->state = ROSE_STATE_1; - rose->neighbour->use++; - rose_write_internal(sk, ROSE_CALL_REQUEST); rose_start_heartbeat(sk); rose_start_t1timer(sk); @@ -1077,7 +1079,7 @@ int 
rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct ros GFP_ATOMIC); make_rose->facilities = facilities; - make_rose->neighbour->use++; + rose_neigh_hold(make_rose->neighbour); if (rose_sk(sk)->defer) { make_rose->state = ROSE_STATE_5; diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c index 3e99181e759f..0276b393f0e5 100644 --- a/net/rose/rose_in.c +++ b/net/rose/rose_in.c @@ -56,7 +56,7 @@ static int rose_state1_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, ECONNREFUSED, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; default: @@ -79,12 +79,12 @@ static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; case ROSE_CLEAR_CONFIRMATION: rose_disconnect(sk, 0, -1, -1); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; default: @@ -121,7 +121,7 @@ static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; case ROSE_RR: @@ -234,7 +234,7 @@ static int rose_state4_machine(struct sock *sk, struct sk_buff *skb, int framety case ROSE_CLEAR_REQUEST: rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); break; default: @@ -254,7 +254,7 @@ static int rose_state5_machine(struct sock *sk, struct sk_buff *skb, int framety if (frametype == ROSE_CLEAR_REQUEST) { rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); rose_disconnect(sk, 0, skb->data[3], skb->data[4]); - rose_sk(sk)->neighbour->use--; + rose_neigh_put(rose_sk(sk)->neighbour); } return 0; diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index b72bf8a08d48..a1e9b05ef6f5 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -93,11 +93,11 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route, rose_neigh->ax25 = NULL; rose_neigh->dev = dev; rose_neigh->count = 0; - rose_neigh->use = 0; rose_neigh->dce_mode = 0; rose_neigh->loopback = 0; rose_neigh->number = rose_neigh_no++; rose_neigh->restarted = 0; + refcount_set(&rose_neigh->use, 1); skb_queue_head_init(&rose_neigh->queue); @@ -178,6 +178,7 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route, } } rose_neigh->count++; + rose_neigh_hold(rose_neigh); goto out; } @@ -187,6 +188,7 @@ static int __must_check rose_add_node(struct rose_route_struct *rose_route, rose_node->neighbour[rose_node->count] = rose_neigh; rose_node->count++; rose_neigh->count++; + rose_neigh_hold(rose_neigh); } out: @@ -234,20 +236,12 @@ static void rose_remove_neigh(struct rose_neigh *rose_neigh) if ((s = rose_neigh_list) == rose_neigh) { rose_neigh_list = rose_neigh->next; - if (rose_neigh->ax25) - ax25_cb_put(rose_neigh->ax25); - kfree(rose_neigh->digipeat); - kfree(rose_neigh); return; } while (s != NULL && s->next != NULL) { if (s->next == rose_neigh) { s->next = rose_neigh->next; - if (rose_neigh->ax25) - ax25_cb_put(rose_neigh->ax25); - kfree(rose_neigh->digipeat); - kfree(rose_neigh); return; } @@ -263,10 +257,10 @@ static void 
rose_remove_route(struct rose_route *rose_route) struct rose_route *s; if (rose_route->neigh1 != NULL) - rose_route->neigh1->use--; + rose_neigh_put(rose_route->neigh1); if (rose_route->neigh2 != NULL) - rose_route->neigh2->use--; + rose_neigh_put(rose_route->neigh2); if ((s = rose_route_list) == rose_route) { rose_route_list = rose_route->next; @@ -330,9 +324,12 @@ static int rose_del_node(struct rose_route_struct *rose_route, for (i = 0; i < rose_node->count; i++) { if (rose_node->neighbour[i] == rose_neigh) { rose_neigh->count--; + rose_neigh_put(rose_neigh); - if (rose_neigh->count == 0 && rose_neigh->use == 0) + if (rose_neigh->count == 0) { rose_remove_neigh(rose_neigh); + rose_neigh_put(rose_neigh); + } rose_node->count--; @@ -381,11 +378,11 @@ void rose_add_loopback_neigh(void) sn->ax25 = NULL; sn->dev = NULL; sn->count = 0; - sn->use = 0; sn->dce_mode = 1; sn->loopback = 1; sn->number = rose_neigh_no++; sn->restarted = 1; + refcount_set(&sn->use, 1); skb_queue_head_init(&sn->queue); @@ -436,6 +433,7 @@ int rose_add_loopback_node(const rose_address *address) rose_node_list = rose_node; rose_loopback_neigh->count++; + rose_neigh_hold(rose_loopback_neigh); out: spin_unlock_bh(&rose_node_list_lock); @@ -467,6 +465,7 @@ void rose_del_loopback_node(const rose_address *address) rose_remove_node(rose_node); rose_loopback_neigh->count--; + rose_neigh_put(rose_loopback_neigh); out: spin_unlock_bh(&rose_node_list_lock); @@ -506,6 +505,7 @@ void rose_rt_device_down(struct net_device *dev) memmove(&t->neighbour[i], &t->neighbour[i + 1], sizeof(t->neighbour[0]) * (t->count - i)); + rose_neigh_put(s); } if (t->count <= 0) @@ -513,6 +513,7 @@ void rose_rt_device_down(struct net_device *dev) } rose_remove_neigh(s); + rose_neigh_put(s); } spin_unlock_bh(&rose_neigh_list_lock); spin_unlock_bh(&rose_node_list_lock); @@ -548,6 +549,7 @@ static int rose_clear_routes(void) { struct rose_neigh *s, *rose_neigh; struct rose_node *t, *rose_node; + int i; spin_lock_bh(&rose_node_list_lock); spin_lock_bh(&rose_neigh_list_lock); @@ -558,17 +560,21 @@ static int rose_clear_routes(void) while (rose_node != NULL) { t = rose_node; rose_node = rose_node->next; - if (!t->loopback) + + if (!t->loopback) { + for (i = 0; i < t->count; i++) + rose_neigh_put(t->neighbour[i]); rose_remove_node(t); + } } while (rose_neigh != NULL) { s = rose_neigh; rose_neigh = rose_neigh->next; - if (s->use == 0 && !s->loopback) { - s->count = 0; + if (!s->loopback) { rose_remove_neigh(s); + rose_neigh_put(s); } } @@ -684,6 +690,7 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, for (i = 0; i < node->count; i++) { if (node->neighbour[i]->restarted) { res = node->neighbour[i]; + rose_neigh_hold(node->neighbour[i]); goto out; } } @@ -695,6 +702,7 @@ struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, for (i = 0; i < node->count; i++) { if (!rose_ftimer_running(node->neighbour[i])) { res = node->neighbour[i]; + rose_neigh_hold(node->neighbour[i]); goto out; } failed = 1; @@ -784,13 +792,13 @@ static void rose_del_route_by_neigh(struct rose_neigh *rose_neigh) } if (rose_route->neigh1 == rose_neigh) { - rose_route->neigh1->use--; + rose_neigh_put(rose_route->neigh1); rose_route->neigh1 = NULL; rose_transmit_clear_request(rose_route->neigh2, rose_route->lci2, ROSE_OUT_OF_ORDER, 0); } if (rose_route->neigh2 == rose_neigh) { - rose_route->neigh2->use--; + rose_neigh_put(rose_route->neigh2); rose_route->neigh2 = NULL; rose_transmit_clear_request(rose_route->neigh1, rose_route->lci1, 
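The rose_neigh changes above replace the open-coded 'use' counter with a proper refcount: the entry is created holding one reference, every routing-node slot, route and connected socket holds another via rose_neigh_hold(), and rose_neigh_put() frees the entry once the last reference is dropped. The /proc display (rose_neigh_show(), further below) therefore reconstructs the old 'use' column by subtracting the node-slot references and the base reference. A tiny sketch of that arithmetic with made-up numbers:

#include <stdio.h>

/* total refs = 1 (creation) + count (node slots) + active users */
static int display_use(int refs, int count)
{
        return refs - count - 1;
}

int main(void)
{
        int count = 1;          /* one routing-node slot references the neighbour */
        int active = 2;         /* two sockets currently using it */
        int refs = 1 + count + active;

        printf("use column: %d\n", display_use(refs, count));  /* prints 2 */
        return 0;
}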
ROSE_OUT_OF_ORDER, 0); } @@ -919,7 +927,7 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) rose_clear_queues(sk); rose->cause = ROSE_NETWORK_CONGESTION; rose->diagnostic = 0; - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); rose->neighbour = NULL; rose->lci = 0; rose->state = ROSE_STATE_0; @@ -1044,12 +1052,12 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) if ((new_lci = rose_new_lci(new_neigh)) == 0) { rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 71); - goto out; + goto put_neigh; } if ((rose_route = kmalloc(sizeof(*rose_route), GFP_ATOMIC)) == NULL) { rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 120); - goto out; + goto put_neigh; } rose_route->lci1 = lci; @@ -1062,8 +1070,8 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) rose_route->lci2 = new_lci; rose_route->neigh2 = new_neigh; - rose_route->neigh1->use++; - rose_route->neigh2->use++; + rose_neigh_hold(rose_route->neigh1); + rose_neigh_hold(rose_route->neigh2); rose_route->next = rose_route_list; rose_route_list = rose_route; @@ -1075,6 +1083,8 @@ int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) rose_transmit_link(skb, rose_route->neigh2); res = 1; +put_neigh: + rose_neigh_put(new_neigh); out: spin_unlock_bh(&rose_route_list_lock); spin_unlock_bh(&rose_neigh_list_lock); @@ -1190,7 +1200,7 @@ static int rose_neigh_show(struct seq_file *seq, void *v) (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(buf, &rose_neigh->callsign), rose_neigh->dev ? rose_neigh->dev->name : "???", rose_neigh->count, - rose_neigh->use, + refcount_read(&rose_neigh->use) - rose_neigh->count - 1, (rose_neigh->dce_mode) ? "DCE" : "DTE", (rose_neigh->restarted) ? "yes" : "no", ax25_display_timer(&rose_neigh->t0timer) / HZ, @@ -1295,18 +1305,22 @@ void __exit rose_rt_free(void) struct rose_neigh *s, *rose_neigh = rose_neigh_list; struct rose_node *t, *rose_node = rose_node_list; struct rose_route *u, *rose_route = rose_route_list; + int i; while (rose_neigh != NULL) { s = rose_neigh; rose_neigh = rose_neigh->next; rose_remove_neigh(s); + rose_neigh_put(s); } while (rose_node != NULL) { t = rose_node; rose_node = rose_node->next; + for (i = 0; i < t->count; i++) + rose_neigh_put(t->neighbour[i]); rose_remove_node(t); } diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c index 020369c49587..bb60a1654d61 100644 --- a/net/rose/rose_timer.c +++ b/net/rose/rose_timer.c @@ -180,7 +180,7 @@ static void rose_timer_expiry(struct timer_list *t) break; case ROSE_STATE_2: /* T3 */ - rose->neighbour->use--; + rose_neigh_put(rose->neighbour); rose_disconnect(sk, ETIMEDOUT, -1, -1); break; diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 36df0274d7b7..0c2c68c4b07e 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -127,7 +127,7 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx, /* * bind a local address to an RxRPC socket */ -static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) +static int rxrpc_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) { struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr; struct rxrpc_local *local; @@ -481,7 +481,7 @@ EXPORT_SYMBOL(rxrpc_kernel_set_notifications); * - this just targets it at a specific destination; no actual connection * negotiation takes place */ -static int rxrpc_connect(struct socket *sock, struct sockaddr *addr, +static int rxrpc_connect(struct socket *sock, struct sockaddr_unsized *addr, int addr_len, int flags) { struct sockaddr_rxrpc *srx = 
(struct sockaddr_rxrpc *)addr; diff --git a/net/rxrpc/rxgk.c b/net/rxrpc/rxgk.c index 1e19c605bcc8..dce5a3d8a964 100644 --- a/net/rxrpc/rxgk.c +++ b/net/rxrpc/rxgk.c @@ -475,7 +475,7 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, struct krb5_buffer metadata; unsigned int offset = sp->offset, len = sp->len; size_t data_offset = 0, data_len = len; - u32 ac; + u32 ac = 0; int ret = -ENOMEM; _enter(""); @@ -499,9 +499,10 @@ static int rxgk_verify_packet_integrity(struct rxrpc_call *call, ret = rxgk_verify_mic_skb(gk->krb5, gk->rx_Kc, &metadata, skb, &offset, &len, &ac); kfree(hdr); - if (ret == -EPROTO) { - rxrpc_abort_eproto(call, skb, ac, - rxgk_abort_1_verify_mic_eproto); + if (ret < 0) { + if (ret != -ENOMEM) + rxrpc_abort_eproto(call, skb, ac, + rxgk_abort_1_verify_mic_eproto); } else { sp->offset = offset; sp->len = len; @@ -524,15 +525,16 @@ static int rxgk_verify_packet_encrypted(struct rxrpc_call *call, struct rxgk_header hdr; unsigned int offset = sp->offset, len = sp->len; int ret; - u32 ac; + u32 ac = 0; _enter(""); ret = rxgk_decrypt_skb(gk->krb5, gk->rx_enc, skb, &offset, &len, &ac); - if (ret == -EPROTO) - rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); - if (ret < 0) + if (ret < 0) { + if (ret != -ENOMEM) + rxrpc_abort_eproto(call, skb, ac, rxgk_abort_2_decrypt_eproto); goto error; + } if (len < sizeof(hdr)) { ret = rxrpc_abort_eproto(call, skb, RXGK_PACKETSHORT, diff --git a/net/rxrpc/rxgk_app.c b/net/rxrpc/rxgk_app.c index b94b77a1c317..30275cb5ba3e 100644 --- a/net/rxrpc/rxgk_app.c +++ b/net/rxrpc/rxgk_app.c @@ -54,6 +54,10 @@ int rxgk_yfs_decode_ticket(struct rxrpc_connection *conn, struct sk_buff *skb, _enter(""); + if (ticket_len < 10 * sizeof(__be32)) + return rxrpc_abort_conn(conn, skb, RXGK_INCONSISTENCY, -EPROTO, + rxgk_abort_resp_short_yfs_tkt); + /* Get the session key length */ ret = skb_copy_bits(skb, ticket_offset, tmp, sizeof(tmp)); if (ret < 0) @@ -187,7 +191,7 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, struct key *server_key; unsigned int ticket_offset, ticket_len; u32 kvno, enctype; - int ret, ec; + int ret, ec = 0; struct { __be32 kvno; @@ -195,22 +199,23 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, __be32 token_len; } container; + if (token_len < sizeof(container)) + goto short_packet; + /* Decode the RXGK_TokenContainer object. This tells us which server * key we should be using. We can then fetch the key, get the secret * and set up the crypto to extract the token. 
*/ if (skb_copy_bits(skb, token_offset, &container, sizeof(container)) < 0) - return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, - rxgk_abort_resp_tok_short); + goto short_packet; kvno = ntohl(container.kvno); enctype = ntohl(container.enctype); ticket_len = ntohl(container.token_len); ticket_offset = token_offset + sizeof(container); - if (xdr_round_up(ticket_len) > token_len - 3 * 4) - return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, - rxgk_abort_resp_tok_short); + if (xdr_round_up(ticket_len) > token_len - sizeof(container)) + goto short_packet; _debug("KVNO %u", kvno); _debug("ENC %u", enctype); @@ -236,9 +241,11 @@ int rxgk_extract_token(struct rxrpc_connection *conn, struct sk_buff *skb, &ticket_offset, &ticket_len, &ec); crypto_free_aead(token_enc); token_enc = NULL; - if (ret < 0) - return rxrpc_abort_conn(conn, skb, ec, ret, - rxgk_abort_resp_tok_dec); + if (ret < 0) { + if (ret != -ENOMEM) + return rxrpc_abort_conn(conn, skb, ec, ret, + rxgk_abort_resp_tok_dec); + } ret = conn->security->default_decode_ticket(conn, skb, ticket_offset, ticket_len, _key); @@ -283,4 +290,8 @@ temporary_error: * also come out this way if the ticket decryption fails. */ return ret; + +short_packet: + return rxrpc_abort_conn(conn, skb, RXGK_PACKETSHORT, -EPROTO, + rxgk_abort_resp_tok_short); } diff --git a/net/rxrpc/rxgk_common.h b/net/rxrpc/rxgk_common.h index 7370a5655985..80164d89e19c 100644 --- a/net/rxrpc/rxgk_common.h +++ b/net/rxrpc/rxgk_common.h @@ -88,11 +88,16 @@ int rxgk_decrypt_skb(const struct krb5_enctype *krb5, *_offset += offset; *_len = len; break; + case -EBADMSG: /* Checksum mismatch. */ case -EPROTO: - case -EBADMSG: *_error_code = RXGK_SEALEDINCON; break; + case -EMSGSIZE: + *_error_code = RXGK_PACKETSHORT; + break; + case -ENOPKG: /* Would prefer RXGK_BADETYPE, but not available for YFS. */ default: + *_error_code = RXGK_INCONSISTENCY; break; } @@ -127,11 +132,16 @@ int rxgk_verify_mic_skb(const struct krb5_enctype *krb5, *_offset += offset; *_len = len; break; + case -EBADMSG: /* Checksum mismatch */ case -EPROTO: - case -EBADMSG: *_error_code = RXGK_SEALEDINCON; break; + case -EMSGSIZE: + *_error_code = RXGK_PACKETSHORT; + break; + case -ENOPKG: /* Would prefer RXGK_BADETYPE, but not available for YFS. 
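rxgk_decrypt_skb() and rxgk_verify_mic_skb() above extend the errno-to-abort-code mapping: -EMSGSIZE now becomes RXGK_PACKETSHORT and the default case (including -ENOPKG) becomes an explicit RXGK_INCONSISTENCY instead of leaving the abort code unset. A stand-alone sketch of the mapping; the numeric enum values here are placeholders, not the protocol's real abort codes:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative values only; the real RXGK_* constants are protocol-defined. */
enum { RXGK_SEALEDINCON = 1, RXGK_PACKETSHORT = 2, RXGK_INCONSISTENCY = 3 };

static uint32_t krb5_err_to_abort(int err)
{
        switch (err) {
        case -EBADMSG:          /* checksum mismatch */
        case -EPROTO:
                return RXGK_SEALEDINCON;
        case -EMSGSIZE:
                return RXGK_PACKETSHORT;
        case -ENOPKG:           /* unsupported encryption type */
        default:
                return RXGK_INCONSISTENCY;
        }
}

int main(void)
{
        printf("%u %u %u\n",
               krb5_err_to_abort(-EBADMSG),
               krb5_err_to_abort(-EMSGSIZE),
               krb5_err_to_abort(-ENOPKG));
        return 0;
}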
*/ default: + *_error_code = RXGK_INCONSISTENCY; break; } diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 0377301156b0..98ea76fae70f 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -211,7 +211,7 @@ static int rxperf_open_socket(void) ret = rxrpc_sock_set_security_keyring(socket->sk, rxperf_sec_keyring); - ret = kernel_bind(socket, (struct sockaddr *)&srx, sizeof(srx)); + ret = kernel_bind(socket, (struct sockaddr_unsized *)&srx, sizeof(srx)); if (ret < 0) goto error_2; @@ -630,7 +630,7 @@ static int __init rxperf_init(void) pr_info("Server registering\n"); - rxperf_workqueue = alloc_workqueue("rxperf", 0, 0); + rxperf_workqueue = alloc_workqueue("rxperf", WQ_PERCPU, 0); if (!rxperf_workqueue) goto error_workqueue; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9e468e463467..ff6be5cfe2b0 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1585,7 +1585,7 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, } _bstats_update(&a->tcfa_bstats, bytes, packets); - a->tcfa_qstats.drops += drops; + atomic_add(drops, &a->tcfa_drops); if (hw) _bstats_update(&a->tcfa_bstats_hw, bytes, packets); } @@ -1594,8 +1594,9 @@ EXPORT_SYMBOL(tcf_action_update_stats); int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { - int err = 0; + struct gnet_stats_queue qstats = {0}; struct gnet_dump d; + int err = 0; if (p == NULL) goto errout; @@ -1619,14 +1620,17 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, if (err < 0) goto errout; + qstats.drops = atomic_read(&p->tcfa_drops); + qstats.overlimits = atomic_read(&p->tcfa_overlimits); + if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfa_bstats, false) < 0 || gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw, &p->tcfa_bstats_hw, false) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, - &p->tcfa_qstats, - p->tcfa_qstats.qlen) < 0) + &qstats, + qstats.qlen) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 396b576390d0..c2b5bc19e091 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -47,12 +47,10 @@ TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb, filter = rcu_dereference(prog->filter); if (at_ingress) { __skb_push(skb, skb->mac_len); - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(filter, skb); + filter_res = bpf_prog_run_data_pointers(filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(filter, skb); + filter_res = bpf_prog_run_data_pointers(filter, skb); } if (unlikely(!skb->tstamp && skb->tstamp_type)) skb->tstamp_type = SKB_CLOCK_REALTIME; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 3e89927d7116..26ba8c2d20ab 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -195,13 +195,15 @@ static inline int tcf_connmark_dump(struct sk_buff *skb, struct tc_action *a, const struct tcf_connmark_info *ci = to_connmark(a); unsigned char *b = skb_tail_pointer(skb); const struct tcf_connmark_parms *parms; - struct tc_connmark opt = { - .index = ci->tcf_index, - .refcnt = refcount_read(&ci->tcf_refcnt) - ref, - .bindcnt = atomic_read(&ci->tcf_bindcnt) - bind, - }; + struct tc_connmark opt; struct tcf_t t; + memset(&opt, 0, sizeof(opt)); + + opt.index = ci->tcf_index; + opt.refcnt = refcount_read(&ci->tcf_refcnt) - ref; + opt.bindcnt = atomic_read(&ci->tcf_bindcnt) - bind; + rcu_read_lock(); parms = 
rcu_dereference(ci->parms); diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 6749a4a9a9cd..2b6ac7069dc1 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -948,9 +948,9 @@ static int tcf_ct_act_nat(struct sk_buff *skb, return err & NF_VERDICT_MASK; if (action & BIT(NF_NAT_MANIP_SRC)) - tc_skb_cb(skb)->post_ct_snat = 1; + qdisc_skb_cb(skb)->post_ct_snat = 1; if (action & BIT(NF_NAT_MANIP_DST)) - tc_skb_cb(skb)->post_ct_dnat = 1; + qdisc_skb_cb(skb)->post_ct_dnat = 1; return err; #else @@ -986,7 +986,7 @@ TC_INDIRECT_SCOPE int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, tcf_action_update_bstats(&c->common, skb); if (clear) { - tc_skb_cb(skb)->post_ct = false; + qdisc_skb_cb(skb)->post_ct = false; ct = nf_ct_get(skb, &ctinfo); if (ct) { nf_ct_put(ct); @@ -1097,7 +1097,7 @@ do_nat: out_push: skb_push_rcsum(skb, nh_ofs); - tc_skb_cb(skb)->post_ct = true; + qdisc_skb_cb(skb)->post_ct = true; tc_skb_cb(skb)->zone = p->zone; out_clear: if (defrag) diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 107c6d83dc5c..1dfdda6c2d4c 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -644,13 +644,15 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, unsigned char *b = skb_tail_pointer(skb); struct tcf_ife_info *ife = to_ife(a); struct tcf_ife_params *p; - struct tc_ife opt = { - .index = ife->tcf_index, - .refcnt = refcount_read(&ife->tcf_refcnt) - ref, - .bindcnt = atomic_read(&ife->tcf_bindcnt) - bind, - }; + struct tc_ife opt; struct tcf_t t; + memset(&opt, 0, sizeof(opt)); + + opt.index = ife->tcf_index; + opt.refcnt = refcount_read(&ife->tcf_refcnt) - ref; + opt.bindcnt = atomic_read(&ife->tcf_bindcnt) - bind; + spin_lock_bh(&ife->tcf_lock); opt.action = ife->tcf_action; p = rcu_dereference_protected(ife->params, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 5f01f567c934..f27b583def78 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -29,31 +29,6 @@ static LIST_HEAD(mirred_list); static DEFINE_SPINLOCK(mirred_list_lock); -#define MIRRED_NEST_LIMIT 4 - -#ifndef CONFIG_PREEMPT_RT -static u8 tcf_mirred_nest_level_inc_return(void) -{ - return __this_cpu_inc_return(softnet_data.xmit.sched_mirred_nest); -} - -static void tcf_mirred_nest_level_dec(void) -{ - __this_cpu_dec(softnet_data.xmit.sched_mirred_nest); -} - -#else -static u8 tcf_mirred_nest_level_inc_return(void) -{ - return current->net_xmit.sched_mirred_nest++; -} - -static void tcf_mirred_nest_level_dec(void) -{ - current->net_xmit.sched_mirred_nest--; -} -#endif - static bool tcf_mirred_is_act_redirect(int action) { return action == TCA_EGRESS_REDIR || action == TCA_INGRESS_REDIR; } @@ -439,44 +414,53 @@ TC_INDIRECT_SCOPE int tcf_mirred_act(struct sk_buff *skb, { struct tcf_mirred *m = to_mirred(a); int retval = READ_ONCE(m->tcf_action); - unsigned int nest_level; + struct netdev_xmit *xmit; bool m_mac_header_xmit; struct net_device *dev; - int m_eaction; + int i, m_eaction; u32 blockid; - nest_level = tcf_mirred_nest_level_inc_return(); - if (unlikely(nest_level > MIRRED_NEST_LIMIT)) { +#ifdef CONFIG_PREEMPT_RT + xmit = &current->net_xmit; +#else + xmit = this_cpu_ptr(&softnet_data.xmit); +#endif + if (unlikely(xmit->sched_mirred_nest >= MIRRED_NEST_LIMIT)) { net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n", netdev_name(skb->dev)); - retval = TC_ACT_SHOT; - goto dec_nest_level; + return TC_ACT_SHOT; } tcf_lastuse_update(&m->tcf_tm); tcf_action_update_bstats(&m->common, skb); blockid =
READ_ONCE(m->tcfm_blockid); - if (blockid) { - retval = tcf_blockcast(skb, m, blockid, res, retval); - goto dec_nest_level; - } + if (blockid) + return tcf_blockcast(skb, m, blockid, res, retval); dev = rcu_dereference_bh(m->tcfm_dev); if (unlikely(!dev)) { pr_notice_once("tc mirred: target device is gone\n"); tcf_action_inc_overlimit_qstats(&m->common); - goto dec_nest_level; + return retval; } + for (i = 0; i < xmit->sched_mirred_nest; i++) { + if (xmit->sched_mirred_dev[i] != dev) + continue; + pr_notice_once("tc mirred: loop on device %s\n", + netdev_name(dev)); + tcf_action_inc_overlimit_qstats(&m->common); + return retval; + } + + xmit->sched_mirred_dev[xmit->sched_mirred_nest++] = dev; m_mac_header_xmit = READ_ONCE(m->tcfm_mac_header_xmit); m_eaction = READ_ONCE(m->tcfm_eaction); retval = tcf_mirred_to_dev(skb, m, dev, m_mac_header_xmit, m_eaction, retval); - -dec_nest_level: - tcf_mirred_nest_level_dec(); + xmit->sched_mirred_nest--; return retval; } diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index f3abe0545989..8e69a919b4fe 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -72,7 +72,6 @@ static int reset_policy(struct tc_action *a, const struct nlattr *defdata, d = to_defact(a); spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch); - memset(d->tcfd_defdata, 0, SIMP_MAX_DATA); nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); spin_unlock_bh(&d->tcf_lock); if (goto_ch) diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index dc0229693461..a9e0c1326e2a 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -27,19 +27,18 @@ TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb, struct tcf_result *res) { struct tcf_skbmod *d = to_skbmod(a); - int action, max_edit_len, err; struct tcf_skbmod_params *p; + int max_edit_len, err; u64 flags; tcf_lastuse_update(&d->tcf_tm); bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); - action = READ_ONCE(d->tcf_action); - if (unlikely(action == TC_ACT_SHOT)) + p = rcu_dereference_bh(d->skbmod_p); + if (unlikely(p->action == TC_ACT_SHOT)) goto drop; max_edit_len = skb_mac_header_len(skb); - p = rcu_dereference_bh(d->skbmod_p); flags = p->flags; /* tcf_skbmod_init() guarantees "flags" to be one of the following: @@ -85,7 +84,7 @@ TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb, INET_ECN_set_ce(skb); out: - return action; + return p->action; drop: qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); @@ -193,7 +192,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, } p->flags = lflags; - + p->action = parm->action; if (ovr) spin_lock_bh(&d->tcf_lock); /* Protected by tcf_lock if overwriting existing action. 
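Instead of only bounding the recursion depth, tcf_mirred_act() above now records each device it redirects through in sched_mirred_dev[] and refuses to redirect to a device already on the chain, catching mirred loops early. A user-space sketch of that bounded visited-set check; the struct and helper names are invented, and the real code also pops the entry again once the redirect completes:

#include <stdbool.h>
#include <stdio.h>

#define NEST_LIMIT 4    /* same value the original MIRRED_NEST_LIMIT used */

struct xmit_state {
        const void *visited[NEST_LIMIT];
        unsigned int depth;
};

/* Returns false if redirecting to @dev would exceed the nesting limit or
 * revisit a device already on the current redirect chain. */
static bool may_redirect(struct xmit_state *x, const void *dev)
{
        unsigned int i;

        if (x->depth >= NEST_LIMIT)
                return false;
        for (i = 0; i < x->depth; i++)
                if (x->visited[i] == dev)
                        return false;           /* loop detected */
        x->visited[x->depth++] = dev;
        return true;
}

int main(void)
{
        struct xmit_state x = { .depth = 0 };
        int a, b;                               /* stand-ins for two devices */

        printf("%d\n", may_redirect(&x, &a));   /* 1 */
        printf("%d\n", may_redirect(&x, &b));   /* 1 */
        printf("%d\n", may_redirect(&x, &a));   /* 0: already visited */
        return 0;
}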
*/ @@ -248,10 +247,9 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, opt.index = d->tcf_index; opt.refcnt = refcount_read(&d->tcf_refcnt) - ref; opt.bindcnt = atomic_read(&d->tcf_bindcnt) - bind; - spin_lock_bh(&d->tcf_lock); - opt.action = d->tcf_action; - p = rcu_dereference_protected(d->skbmod_p, - lockdep_is_held(&d->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(d->skbmod_p); + opt.action = p->action; opt.flags = p->flags; if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -269,10 +267,10 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD)) goto nla_put_failure; - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 2cef4b08befb..876b30c5709e 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -29,13 +29,11 @@ TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb, { struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; - int action; params = rcu_dereference_bh(t->params); tcf_lastuse_update(&t->tcf_tm); tcf_action_update_bstats(&t->common, skb); - action = READ_ONCE(t->tcf_action); switch (params->tcft_action) { case TCA_TUNNEL_KEY_ACT_RELEASE: @@ -51,7 +49,7 @@ TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb, break; } - return action; + return params->action; } static const struct nla_policy @@ -532,6 +530,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; + params_new->action = parm->action; spin_lock_bh(&t->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(t->params, params_new, @@ -726,10 +725,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t tm; - spin_lock_bh(&t->tcf_lock); - params = rcu_dereference_protected(t->params, - lockdep_is_held(&t->tcf_lock)); - opt.action = t->tcf_action; + rcu_read_lock(); + params = rcu_dereference(t->params); + opt.action = params->action; opt.t_action = params->tcft_action; if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) @@ -766,12 +764,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), &tm, TCA_TUNNEL_KEY_PAD)) goto nla_put_failure; - spin_unlock_bh(&t->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&t->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 383bf18b6862..a74621797d69 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -25,7 +25,6 @@ TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb, { struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; - int action; int err; u16 tci; @@ -38,8 +37,6 @@ TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb, if (skb_at_tc_ingress(skb)) skb_push_rcsum(skb, skb->mac_len); - action = READ_ONCE(v->tcf_action); - p = rcu_dereference_bh(v->vlan_p); switch (p->tcfv_action) { @@ -97,7 +94,7 @@ out: skb_pull_rcsum(skb, skb->mac_len); skb_reset_mac_len(skb); - return action; + return p->action; drop: tcf_action_inc_drop_qstats(&v->common); @@ -255,6 +252,7 @@ static int tcf_vlan_init(struct net 
*net, struct nlattr *nla, ETH_ALEN); } + p->action = parm->action; spin_lock_bh(&v->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); @@ -297,9 +295,9 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&v->tcf_lock); - opt.action = v->tcf_action; - p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(v->vlan_p); + opt.action = p->action; opt.v_action = p->tcfv_action; if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -325,12 +323,12 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; - spin_unlock_bh(&v->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&v->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ecec0a1e1c1a..ebca4b926dcf 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1866,15 +1866,15 @@ int tcf_classify(struct sk_buff *skb, struct tc_skb_cb *cb = tc_skb_cb(skb); ext = tc_skb_ext_alloc(skb); - if (WARN_ON_ONCE(!ext)) { + if (!ext) { tcf_set_drop_reason(skb, SKB_DROP_REASON_NOMEM); return TC_ACT_SHOT; } ext->chain = last_executed_chain; ext->mru = cb->mru; - ext->post_ct = cb->post_ct; - ext->post_ct_snat = cb->post_ct_snat; - ext->post_ct_dnat = cb->post_ct_dnat; + ext->post_ct = qdisc_skb_cb(skb)->post_ct; + ext->post_ct_snat = qdisc_skb_cb(skb)->post_ct_snat; + ext->post_ct_dnat = qdisc_skb_cb(skb)->post_ct_dnat; ext->zone = cb->zone; } } diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 7fbe42f0e5c2..a32754a2658b 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -97,12 +97,10 @@ TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb, } else if (at_ingress) { /* It is safe to push/pull even if skb_shared() */ __skb_push(skb, skb->mac_len); - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(prog->filter, skb); + filter_res = bpf_prog_run_data_pointers(prog->filter, skb); __skb_pull(skb, skb->mac_len); } else { - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(prog->filter, skb); + filter_res = bpf_prog_run_data_pointers(prog->filter, skb); } if (unlikely(!skb->tstamp && skb->tstamp_type)) skb->tstamp_type = SKB_CLOCK_REALTIME; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 099ff6a3e1f5..7669371c1354 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -326,7 +326,7 @@ TC_INDIRECT_SCOPE int fl_classify(struct sk_buff *skb, struct tcf_result *res) { struct cls_fl_head *head = rcu_dereference_bh(tp->root); - bool post_ct = tc_skb_cb(skb)->post_ct; + bool post_ct = qdisc_skb_cb(skb)->post_ct; u16 zone = tc_skb_cb(skb)->zone; struct fl_flow_key skb_key; struct fl_flow_mask *mask; diff --git a/net/sched/em_canid.c b/net/sched/em_canid.c index 5337bc462755..2d27f91d8441 100644 --- a/net/sched/em_canid.c +++ b/net/sched/em_canid.c @@ -99,6 +99,9 @@ static int em_canid_match(struct sk_buff *skb, struct tcf_ematch *m, int i; const struct can_filter *lp; + if (!pskb_may_pull(skb, CAN_MTU)) + return 0; + can_id = em_canid_get_id(skb); if (can_id & CAN_EFF_FLAG) { diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c index 64b637f18bc7..48c1bce74f49 100644 --- a/net/sched/em_cmp.c +++ b/net/sched/em_cmp.c @@ -22,9 +22,12 @@ 
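The ematch fixes above harden packet field access: em_canid now checks the skb is long enough to hold a CAN frame, and em_cmp, em_nbyte and em_text check that tcf_get_base_ptr() actually found the requested layer before applying their configured offset. A small sketch of that check-then-offset pattern over a plain buffer; the helper name and signature are invented for illustration:

#include <stddef.h>
#include <stdio.h>

/* Resolve the layer base first, bail out if it is absent, then apply the
 * offset and bounds-check the access before returning a pointer. */
static const unsigned char *get_field(const unsigned char *base, size_t base_len,
                                      size_t off, size_t field_len)
{
        if (!base)                      /* layer not present in this packet */
                return NULL;
        if (off > base_len || field_len > base_len - off)
                return NULL;            /* would read past the data */
        return base + off;
}

int main(void)
{
        unsigned char pkt[20] = { 0 };

        printf("%d\n", get_field(pkt, sizeof(pkt), 4, 2) != NULL);      /* 1 */
        printf("%d\n", get_field(NULL, 0, 4, 2) != NULL);               /* 0: no layer */
        printf("%d\n", get_field(pkt, sizeof(pkt), 19, 2) != NULL);     /* 0: out of bounds */
        return 0;
}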
static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em, struct tcf_pkt_info *info) { struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data; - unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off; + unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer); u32 val = 0; + if (!ptr) + return 0; + ptr += cmp->off; if (!tcf_valid_offset(skb, ptr, cmp->align)) return 0; diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c index 4f9f21a05d5e..c65ffa5fff94 100644 --- a/net/sched/em_nbyte.c +++ b/net/sched/em_nbyte.c @@ -42,6 +42,8 @@ static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em, struct nbyte_data *nbyte = (struct nbyte_data *) em->data; unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer); + if (!ptr) + return 0; ptr += nbyte->hdr.off; if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len)) diff --git a/net/sched/em_text.c b/net/sched/em_text.c index 6b3d0af72c39..692e2be1793e 100644 --- a/net/sched/em_text.c +++ b/net/sched/em_text.c @@ -29,12 +29,19 @@ static int em_text_match(struct sk_buff *skb, struct tcf_ematch *m, struct tcf_pkt_info *info) { struct text_match *tm = EM_TEXT_PRIV(m); + unsigned char *ptr; int from, to; - from = tcf_get_base_ptr(skb, tm->from_layer) - skb->data; + ptr = tcf_get_base_ptr(skb, tm->from_layer); + if (!ptr) + return 0; + from = ptr - skb->data; from += tm->from_offset; - to = tcf_get_base_ptr(skb, tm->to_layer) - skb->data; + ptr = tcf_get_base_ptr(skb, tm->to_layer); + if (!ptr) + return 0; + to = ptr - skb->data; to += tm->to_offset; return skb_find_text(skb, from, to, tm->config) != UINT_MAX; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index d7c767b861a4..f56b18c8aebf 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -431,7 +431,7 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && - !memcmp(&rtab->data, nla_data(tab), 1024)) { + !memcmp(&rtab->data, nla_data(tab), TC_RTAB_SIZE)) { rtab->refcnt++; return rtab; } @@ -441,7 +441,7 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, if (rtab) { rtab->rate = *r; rtab->refcnt = 1; - memcpy(rtab->data, nla_data(tab), 1024); + memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE); if (r->linklayer == TC_LINKLAYER_UNAWARE) r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; @@ -1599,6 +1599,11 @@ static int __tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, NL_SET_ERR_MSG(extack, "Failed to find specified qdisc"); return -ENOENT; } + if (p->flags & TCQ_F_INGRESS) { + NL_SET_ERR_MSG(extack, + "Cannot add children to ingress/clsact qdisc"); + return -EOPNOTSUPP; + } q = qdisc_leaf(p, clid, extack); if (IS_ERR(q)) return PTR_ERR(q); diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index dbcfb948c867..4a64d6397b6f 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -1398,15 +1398,15 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) const struct skb_shared_info *shinfo = skb_shinfo(skb); unsigned int hdr_len, last_len = 0; u32 off = skb_network_offset(skb); + u16 segs = qdisc_pkt_segs(skb); u32 len = qdisc_pkt_len(skb); - u16 segs = 1; q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8); - if (!shinfo->gso_size) + if (segs == 1) return cake_calc_overhead(q, len, off); - /* borrowed from qdisc_pkt_len_init() */ + /* borrowed from qdisc_pkt_len_segs_init() */ if (!skb->encapsulation) hdr_len = skb_transport_offset(skb); else 
@@ -1430,12 +1430,6 @@ static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb) hdr_len += sizeof(struct udphdr); } - if (unlikely(shinfo->gso_type & SKB_GSO_DODGY)) - segs = DIV_ROUND_UP(skb->len - hdr_len, - shinfo->gso_size); - else - segs = shinfo->gso_segs; - len = shinfo->gso_size + hdr_len; last_len = skb->len - shinfo->gso_size * (segs - 1); @@ -1597,7 +1591,6 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); sch->q.qlen--; - qdisc_tree_reduce_backlog(sch, 1, len); cake_heapify(q, 0); @@ -1743,14 +1736,14 @@ static void cake_reconfigure(struct Qdisc *sch); static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { + u32 idx, tin, prev_qlen, prev_backlog, drop_id; struct cake_sched_data *q = qdisc_priv(sch); - int len = qdisc_pkt_len(skb); - int ret; + int len = qdisc_pkt_len(skb), ret; struct sk_buff *ack = NULL; ktime_t now = ktime_get(); struct cake_tin_data *b; struct cake_flow *flow; - u32 idx; + bool same_flow = false; /* choose flow to insert into */ idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); @@ -1760,6 +1753,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, __qdisc_drop(skb, to_free); return ret; } + tin = (u32)(b - q->tins); idx--; flow = &b->flows[idx]; @@ -1787,7 +1781,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(len > b->max_skblen)) b->max_skblen = len; - if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { + if (qdisc_pkt_segs(skb) > 1 && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); unsigned int slen = 0, numsegs = 0; @@ -1799,6 +1793,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; + qdisc_skb_cb(segs)->pkt_segs = 1; cobalt_set_enqueue_time(segs, now); get_cobalt_cb(segs)->adjusted_len = cake_overhead(q, segs); @@ -1822,6 +1817,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, consume_skb(skb); } else { /* not splitting */ + int ack_pkt_len = 0; + cobalt_set_enqueue_time(skb, now); get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); flow_queue_add(flow, skb); @@ -1832,13 +1829,13 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (ack) { b->ack_drops++; sch->qstats.drops++; - b->bytes += qdisc_pkt_len(ack); - len -= qdisc_pkt_len(ack); + ack_pkt_len = qdisc_pkt_len(ack); + b->bytes += ack_pkt_len; q->buffer_used += skb->truesize - ack->truesize; if (q->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, ack, now, true); - qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack)); + qdisc_tree_reduce_backlog(sch, 1, ack_pkt_len); consume_skb(ack); } else { sch->q.qlen++; @@ -1847,11 +1844,11 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* stats */ b->packets++; - b->bytes += len; - b->backlogs[idx] += len; - b->tin_backlog += len; - sch->qstats.backlog += len; - q->avg_window_bytes += len; + b->bytes += len - ack_pkt_len; + b->backlogs[idx] += len - ack_pkt_len; + b->tin_backlog += len - ack_pkt_len; + sch->qstats.backlog += len - ack_pkt_len; + q->avg_window_bytes += len - ack_pkt_len; } if (q->overflow_timeout) @@ -1926,15 +1923,29 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (q->buffer_used > q->buffer_max_used) q->buffer_max_used = 
q->buffer_used; - if (q->buffer_used > q->buffer_limit) { - u32 dropped = 0; + if (q->buffer_used <= q->buffer_limit) + return NET_XMIT_SUCCESS; - while (q->buffer_used > q->buffer_limit) { - dropped++; - cake_drop(sch, to_free); - } - b->drop_overlimit += dropped; + prev_qlen = sch->q.qlen; + prev_backlog = sch->qstats.backlog; + + while (q->buffer_used > q->buffer_limit) { + drop_id = cake_drop(sch, to_free); + if ((drop_id >> 16) == tin && + (drop_id & 0xFFFF) == idx) + same_flow = true; } + + prev_qlen -= sch->q.qlen; + prev_backlog -= sch->qstats.backlog; + b->drop_overlimit += prev_qlen; + + if (same_flow) { + qdisc_tree_reduce_backlog(sch, prev_qlen - 1, + prev_backlog - len); + return NET_XMIT_CN; + } + qdisc_tree_reduce_backlog(sch, prev_qlen, prev_backlog); return NET_XMIT_SUCCESS; } @@ -2178,7 +2189,7 @@ retry: b->tin_dropped++; qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); - kfree_skb_reason(skb, reason); + qdisc_dequeue_drop(sch, skb, reason); if (q->rate_flags & CAKE_FLAG_INGRESS) goto retry; } @@ -2719,6 +2730,8 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, int i, j, err; sch->limit = 10240; + sch->flags |= TCQ_F_DEQUEUE_DROPS; + q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; q->flow_mode = CAKE_FLOW_TRIPLE; diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index c93761040c6e..c6551578f1cf 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -52,7 +52,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); + qdisc_dequeue_drop(sch, skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -101,9 +101,9 @@ static const struct nla_policy codel_policy[TCA_CODEL_MAX + 1] = { static int codel_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CODEL_MAX + 1]; - unsigned int qlen, dropped = 0; int err; err = nla_parse_nested_deprecated(tb, TCA_CODEL_MAX, opt, @@ -142,15 +142,17 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->params.ecn, !!nla_get_u32(tb[TCA_CODEL_ECN])); - qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, true); - dropped += qdisc_pkt_len(skb); - qdisc_qstats_backlog_dec(sch, skb); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_qdisc_drop(skb, sch); } - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; @@ -180,6 +182,8 @@ static int codel_init(struct Qdisc *sch, struct nlattr *opt, else sch->flags &= ~TCQ_F_CAN_BYPASS; + sch->flags |= TCQ_F_DEQUEUE_DROPS; + return 0; } diff --git a/net/sched/sch_dualpi2.c b/net/sched/sch_dualpi2.c index 845375ebd4ea..6d7e6389758d 100644 --- a/net/sched/sch_dualpi2.c +++ b/net/sched/sch_dualpi2.c @@ -475,6 +475,7 @@ static int dualpi2_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, * (3) Enqueue fragment & set ts in dualpi2_enqueue_skb */ qdisc_skb_cb(nskb)->pkt_len = nskb->len; + qdisc_skb_cb(nskb)->pkt_segs = 1; dualpi2_skb_cb(nskb)->classified = dualpi2_skb_cb(skb)->classified; dualpi2_skb_cb(nskb)->ect = dualpi2_skb_cb(skb)->ect; @@ -927,7 +928,8 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, q->sch = sch; dualpi2_reset_default(sch); - hrtimer_setup(&q->pi2_timer, 
dualpi2_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + hrtimer_setup(&q->pi2_timer, dualpi2_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS_PINNED_SOFT); if (opt && nla_len(opt)) { err = dualpi2_change(sch, opt, extack); @@ -937,7 +939,7 @@ static int dualpi2_init(struct Qdisc *sch, struct nlattr *opt, } hrtimer_start(&q->pi2_timer, next_pi2_timeout(q), - HRTIMER_MODE_ABS_PINNED); + HRTIMER_MODE_ABS_PINNED_SOFT); return 0; } diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index 037f764822b9..82635dd2cfa5 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -651,6 +651,12 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); + for (i = nbands; i < oldbands; i++) { + if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) + list_del_init(&q->classes[i].alist); + qdisc_purge_queue(q->classes[i].qdisc); + } + WRITE_ONCE(q->nbands, nbands); for (i = nstrict; i < q->nstrict; i++) { if (q->classes[i].qdisc->q.qlen) { @@ -658,11 +664,6 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, q->classes[i].deficit = quanta[i]; } } - for (i = q->nbands; i < oldbands; i++) { - if (i >= q->nstrict && q->classes[i].qdisc->q.qlen) - list_del_init(&q->classes[i].alist); - qdisc_purge_queue(q->classes[i].qdisc); - } WRITE_ONCE(q->nstrict, nstrict); memcpy(q->prio2band, priomap, sizeof(priomap)); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 902ff5470607..6e5f2f4f2415 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -480,7 +480,10 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow, struct sk_buff *skb) { if (skb == flow->head) { - flow->head = skb->next; + struct sk_buff *next = skb->next; + + prefetch(next); + flow->head = next; } else { rb_erase(&skb->rbnode, &flow->t_root); skb->dev = qdisc_dev(sch); @@ -497,6 +500,7 @@ static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow, skb_mark_not_on_list(skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; + qdisc_bstats_update(sch, skb); } static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) @@ -711,6 +715,7 @@ begin: goto begin; } prefetch(&skb->end); + fq_dequeue_skb(sch, f, skb); if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { INET_ECN_set_ce(skb); q->stat_ce_mark++; @@ -718,7 +723,6 @@ begin: if (--f->qlen == 0) q->inactive_flows++; q->band_pkt_count[fq_skb_cb(skb)->band]--; - fq_dequeue_skb(sch, f, skb); } else { head->first = f->next; /* force a pass through old_flows to prevent starvation */ @@ -776,7 +780,6 @@ begin: f->time_next_packet = now + len; } out: - qdisc_bstats_update(sch, skb); return skb; } @@ -1013,11 +1016,11 @@ static int fq_load_priomap(struct fq_sched_data *q, static int fq_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct fq_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_MAX + 1]; - int err, drop_count = 0; - unsigned drop_len = 0; u32 fq_log; + int err; err = nla_parse_nested_deprecated(tb, TCA_FQ_MAX, opt, fq_policy, NULL); @@ -1135,16 +1138,18 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, err = fq_resize(sch, fq_log); sch_tree_lock(sch); } + while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); if (!skb) break; - drop_len += qdisc_pkt_len(skb); + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); - drop_count++; } - qdisc_tree_reduce_backlog(sch, drop_count, drop_len); + qdisc_tree_reduce_backlog(sch, 
dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return err; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 2a0f3a513bfa..dc187c7f06b1 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -275,7 +275,7 @@ static void drop_func(struct sk_buff *skb, void *ctx) { struct Qdisc *sch = ctx; - kfree_skb_reason(skb, SKB_DROP_REASON_QDISC_CONGESTED); + qdisc_dequeue_drop(sch, skb, SKB_DROP_REASON_QDISC_CONGESTED); qdisc_qstats_drop(sch); } @@ -366,6 +366,7 @@ static const struct nla_policy fq_codel_policy[TCA_FQ_CODEL_MAX + 1] = { static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct fq_codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; u32 quantum = 0; @@ -443,13 +444,14 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, q->memory_usage > q->memory_limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); - q->cstats.drop_len += qdisc_pkt_len(skb); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); - q->cstats.drop_count++; } - qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len); - q->cstats.drop_count = 0; - q->cstats.drop_len = 0; + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; @@ -517,6 +519,9 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, sch->flags |= TCQ_F_CAN_BYPASS; else sch->flags &= ~TCQ_F_CAN_BYPASS; + + sch->flags |= TCQ_F_DEQUEUE_DROPS; + return 0; alloc_failure: diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index b0e34daf1f75..7b96bc3ff891 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -287,10 +287,9 @@ begin: static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct fq_pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_PIE_MAX + 1]; - unsigned int len_dropped = 0; - unsigned int num_dropped = 0; int err; err = nla_parse_nested(tb, TCA_FQ_PIE_MAX, opt, fq_pie_policy, extack); @@ -368,11 +367,14 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); - len_dropped += qdisc_pkt_len(skb); - num_dropped += 1; + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); } - qdisc_tree_reduce_backlog(sch, num_dropped, len_dropped); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 1e008a228ebd..852e603c1755 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -180,9 +180,10 @@ static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q) static void try_bulk_dequeue_skb(struct Qdisc *q, struct sk_buff *skb, const struct netdev_queue *txq, - int *packets) + int *packets, int budget) { int bytelimit = qdisc_avail_bulklimit(txq) - skb->len; + int cnt = 0; while (bytelimit > 0) { struct sk_buff *nskb = q->dequeue(q); @@ -193,8 +194,10 @@ static void try_bulk_dequeue_skb(struct Qdisc *q, bytelimit -= nskb->len; /* covers GSO len */ skb->next = nskb; skb = nskb; - (*packets)++; /* GSO counts as one pkt */ + if (++cnt >= budget) + break; } + (*packets) += cnt; skb_mark_not_on_list(skb); } @@ -228,7 +231,7 @@ static void 
try_bulk_dequeue_skb_slow(struct Qdisc *q, * A requeued skb (via q->gso_skb) can also be a SKB list. */ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, - int *packets) + int *packets, int budget) { const struct netdev_queue *txq = q->dev_queue; struct sk_buff *skb = NULL; @@ -295,7 +298,7 @@ validate: if (skb) { bulk: if (qdisc_may_bulk(q)) - try_bulk_dequeue_skb(q, skb, txq, packets); + try_bulk_dequeue_skb(q, skb, txq, packets, budget); else try_bulk_dequeue_skb_slow(q, skb, packets); } @@ -387,7 +390,7 @@ bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q, * >0 - queue is not empty. * */ -static inline bool qdisc_restart(struct Qdisc *q, int *packets) +static inline bool qdisc_restart(struct Qdisc *q, int *packets, int budget) { spinlock_t *root_lock = NULL; struct netdev_queue *txq; @@ -396,7 +399,7 @@ static inline bool qdisc_restart(struct Qdisc *q, int *packets) bool validate; /* Dequeue packet */ - skb = dequeue_skb(q, &validate, packets); + skb = dequeue_skb(q, &validate, packets, budget); if (unlikely(!skb)) return false; @@ -414,7 +417,7 @@ void __qdisc_run(struct Qdisc *q) int quota = READ_ONCE(net_hotdata.dev_tx_weight); int packets; - while (qdisc_restart(q, &packets)) { + while (qdisc_restart(q, &packets, quota)) { quota -= packets; if (quota <= 0) { if (q->flags & TCQ_F_NOLOCK) @@ -666,7 +669,6 @@ struct Qdisc noop_qdisc = { .ops = &noop_qdisc_ops, .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, - .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), .gso_skb = { .next = (struct sk_buff *)&noop_qdisc.gso_skb, .prev = (struct sk_buff *)&noop_qdisc.gso_skb, @@ -679,7 +681,6 @@ struct Qdisc noop_qdisc = { .qlen = 0, .lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock), }, - .owner = -1, }; EXPORT_SYMBOL(noop_qdisc); @@ -971,10 +972,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, } } - spin_lock_init(&sch->busylock); - lockdep_set_class(&sch->busylock, - dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); - /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); lockdep_set_class(&sch->seqlock, @@ -985,7 +982,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, sch->enqueue = ops->enqueue; sch->dequeue = ops->dequeue; sch->dev_queue = dev_queue; - sch->owner = -1; netdev_hold(dev, &sch->dev_tracker, GFP_KERNEL); refcount_set(&sch->refcnt, 1); diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 5aa434b46707..2d4855e28a28 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -508,9 +508,9 @@ static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { static int hhf_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct hhf_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_HHF_MAX + 1]; - unsigned int qlen, prev_backlog; int err; u64 non_hh_quantum; u32 new_quantum = q->quantum; @@ -561,15 +561,17 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt, usecs_to_jiffies(us)); } - qlen = sch->q.qlen; - prev_backlog = sch->qstats.backlog; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, false); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); } - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, - prev_backlog - sch->qstats.backlog); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git 
a/net/sched/sch_htb.c b/net/sched/sch_htb.c index c968ea763774..b5e40c51655a 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -592,7 +592,7 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) */ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) { - WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen); + WARN_ON(cl->level || !cl->leaf.q); if (!cl->prio_activity) { cl->prio_activity = 1 << cl->prio; diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index eafc316ae319..32a5f3304046 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -429,6 +429,7 @@ static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff *segs; netdev_features_t features = netif_skb_features(skb); + qdisc_skb_cb(skb)->pkt_segs = 1; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) { diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index ad46ee3ed5a9..0a377313b6a9 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -141,9 +141,9 @@ static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = { static int pie_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { + unsigned int dropped_pkts = 0, dropped_bytes = 0; struct pie_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_PIE_MAX + 1]; - unsigned int qlen, dropped = 0; int err; err = nla_parse_nested_deprecated(tb, TCA_PIE_MAX, opt, pie_policy, @@ -193,15 +193,17 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR])); /* Drop excess packets if new limit is lower */ - qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { struct sk_buff *skb = qdisc_dequeue_internal(sch, true); - dropped += qdisc_pkt_len(skb); - qdisc_qstats_backlog_dec(sch, skb); + if (!skb) + break; + + dropped_pkts++; + dropped_bytes += qdisc_pkt_len(skb); rtnl_qdisc_drop(skb, sch); } - qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped); + qdisc_tree_reduce_backlog(sch, dropped_pkts, dropped_bytes); sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 2255355e51d3..d920f57dc6d7 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1250,7 +1250,7 @@ static int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, } } - gso_segs = skb_is_gso(skb) ? 
skb_shinfo(skb)->gso_segs : 1; + gso_segs = qdisc_pkt_segs(skb); err = qdisc_enqueue(skb, cl->qdisc, to_free); if (unlikely(err != NET_XMIT_SUCCESS)) { pr_debug("qfq_enqueue: enqueue failed %d\n", err); diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 39b735386996..300d577b3286 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -595,6 +595,7 @@ static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch, skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; + qdisc_skb_cb(segs)->pkt_segs = 1; slen += segs->len; /* FIXME: we should be segmenting to a smaller size diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 4c977f049670..f2340164f579 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -221,6 +221,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch, skb_mark_not_on_list(segs); seg_len = segs->len; qdisc_skb_cb(segs)->pkt_len = seg_len; + qdisc_skb_cb(segs)->pkt_segs = 1; ret = qdisc_enqueue(segs, q->qdisc, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 24d5a35ce894..e947646a380c 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -7,9 +7,9 @@ menuconfig IP_SCTP tristate "The SCTP Protocol" depends on INET depends on IPV6 || IPV6=n - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA1 + select CRYPTO_LIB_SHA1 + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_UTILS select NET_CRC32C select NET_UDP_TUNNEL help @@ -49,46 +49,25 @@ config SCTP_DBG_OBJCNT 'cat /proc/net/sctp/sctp_dbg_objcnt' If unsure, say N + choice - prompt "Default SCTP cookie HMAC encoding" - default SCTP_DEFAULT_COOKIE_HMAC_MD5 + prompt "Default SCTP cookie authentication method" + default SCTP_DEFAULT_COOKIE_HMAC_SHA256 help - This option sets the default sctp cookie hmac algorithm - when in doubt select 'md5' + This option sets the default SCTP cookie authentication method, for + when a method hasn't been explicitly selected via the + net.sctp.cookie_hmac_alg sysctl. -config SCTP_DEFAULT_COOKIE_HMAC_MD5 - bool "Enable optional MD5 hmac cookie generation" - help - Enable optional MD5 hmac based SCTP cookie generation - select SCTP_COOKIE_HMAC_MD5 + If unsure, choose the default (HMAC-SHA256). 
-config SCTP_DEFAULT_COOKIE_HMAC_SHA1 - bool "Enable optional SHA1 hmac cookie generation" - help - Enable optional SHA1 hmac based SCTP cookie generation - select SCTP_COOKIE_HMAC_SHA1 +config SCTP_DEFAULT_COOKIE_HMAC_SHA256 + bool "HMAC-SHA256" config SCTP_DEFAULT_COOKIE_HMAC_NONE - bool "Use no hmac alg in SCTP cookie generation" - help - Use no hmac algorithm in SCTP cookie generation + bool "None" endchoice -config SCTP_COOKIE_HMAC_MD5 - bool "Enable optional MD5 hmac cookie generation" - help - Enable optional MD5 hmac based SCTP cookie generation - select CRYPTO_HMAC if SCTP_COOKIE_HMAC_MD5 - select CRYPTO_MD5 if SCTP_COOKIE_HMAC_MD5 - -config SCTP_COOKIE_HMAC_SHA1 - bool "Enable optional SHA1 hmac cookie generation" - help - Enable optional SHA1 hmac based SCTP cookie generation - select CRYPTO_HMAC if SCTP_COOKIE_HMAC_SHA1 - select CRYPTO_SHA1 if SCTP_COOKIE_HMAC_SHA1 - config INET_SCTP_DIAG depends on INET_DIAG def_tristate INET_DIAG diff --git a/net/sctp/auth.c b/net/sctp/auth.c index c58fffc86a0c..82aad477590e 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -12,36 +12,37 @@ * Vlad Yasevich <vladislav.yasevich@hp.com> */ -#include <crypto/hash.h> +#include <crypto/sha1.h> +#include <crypto/sha2.h> #include <linux/slab.h> #include <linux/types.h> -#include <linux/scatterlist.h> #include <net/sctp/sctp.h> #include <net/sctp/auth.h> -static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { +static const struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { { /* id 0 is reserved. as all 0 */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_0, }, { .hmac_id = SCTP_AUTH_HMAC_ID_SHA1, - .hmac_name = "hmac(sha1)", - .hmac_len = SCTP_SHA1_SIG_SIZE, + .hmac_len = SHA1_DIGEST_SIZE, }, { /* id 2 is reserved as well */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2, }, -#if IS_ENABLED(CONFIG_CRYPTO_SHA256) { .hmac_id = SCTP_AUTH_HMAC_ID_SHA256, - .hmac_name = "hmac(sha256)", - .hmac_len = SCTP_SHA256_SIG_SIZE, + .hmac_len = SHA256_DIGEST_SIZE, } -#endif }; +static bool sctp_hmac_supported(__u16 hmac_id) +{ + return hmac_id < ARRAY_SIZE(sctp_hmac_list) && + sctp_hmac_list[hmac_id].hmac_len != 0; +} void sctp_auth_key_put(struct sctp_auth_bytes *key) { @@ -444,76 +445,7 @@ struct sctp_shared_key *sctp_auth_get_shkey( return NULL; } -/* - * Initialize all the possible digest transforms that we can use. Right - * now, the supported digests are SHA1 and SHA256. We do this here once - * because of the restrictiong that transforms may only be allocated in - * user context. This forces us to pre-allocated all possible transforms - * at the endpoint init time. - */ -int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp) -{ - struct crypto_shash *tfm = NULL; - __u16 id; - - /* If the transforms are already allocated, we are done */ - if (ep->auth_hmacs) - return 0; - - /* Allocated the array of pointers to transorms */ - ep->auth_hmacs = kcalloc(SCTP_AUTH_NUM_HMACS, - sizeof(struct crypto_shash *), - gfp); - if (!ep->auth_hmacs) - return -ENOMEM; - - for (id = 0; id < SCTP_AUTH_NUM_HMACS; id++) { - - /* See is we support the id. Supported IDs have name and - * length fields set, so that we can allocated and use - * them. We can safely just check for name, for without the - * name, we can't allocate the TFM. 
- */ - if (!sctp_hmac_list[id].hmac_name) - continue; - - /* If this TFM has been allocated, we are all set */ - if (ep->auth_hmacs[id]) - continue; - - /* Allocate the ID */ - tfm = crypto_alloc_shash(sctp_hmac_list[id].hmac_name, 0, 0); - if (IS_ERR(tfm)) - goto out_err; - - ep->auth_hmacs[id] = tfm; - } - - return 0; - -out_err: - /* Clean up any successful allocations */ - sctp_auth_destroy_hmacs(ep->auth_hmacs); - ep->auth_hmacs = NULL; - return -ENOMEM; -} - -/* Destroy the hmac tfm array */ -void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[]) -{ - int i; - - if (!auth_hmacs) - return; - - for (i = 0; i < SCTP_AUTH_NUM_HMACS; i++) { - crypto_free_shash(auth_hmacs[i]); - } - kfree(auth_hmacs); -} - - -struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) +const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) { return &sctp_hmac_list[hmac_id]; } @@ -521,7 +453,8 @@ struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) /* Get an hmac description information that we can use to build * the AUTH chunk */ -struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) +const struct sctp_hmac * +sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) { struct sctp_hmac_algo_param *hmacs; __u16 n_elt; @@ -543,26 +476,10 @@ struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) sizeof(struct sctp_paramhdr)) >> 1; for (i = 0; i < n_elt; i++) { id = ntohs(hmacs->hmac_ids[i]); - - /* Check the id is in the supported range. And - * see if we support the id. Supported IDs have name and - * length fields set, so that we can allocate and use - * them. We can safely just check for name, for without the - * name, we can't allocate the TFM. - */ - if (id > SCTP_AUTH_HMAC_ID_MAX || - !sctp_hmac_list[id].hmac_name) { - id = 0; - continue; - } - - break; + if (sctp_hmac_supported(id)) + return &sctp_hmac_list[id]; } - - if (id == 0) - return NULL; - - return &sctp_hmac_list[id]; + return NULL; } static int __sctp_auth_find_hmacid(__be16 *hmacs, int n_elts, __be16 hmac_id) @@ -606,7 +523,6 @@ int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc, void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, struct sctp_hmac_algo_param *hmacs) { - struct sctp_endpoint *ep; __u16 id; int i; int n_params; @@ -617,16 +533,9 @@ void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, n_params = (ntohs(hmacs->param_hdr.length) - sizeof(struct sctp_paramhdr)) >> 1; - ep = asoc->ep; for (i = 0; i < n_params; i++) { id = ntohs(hmacs->hmac_ids[i]); - - /* Check the id is in the supported range */ - if (id > SCTP_AUTH_HMAC_ID_MAX) - continue; - - /* If this TFM has been allocated, use this id */ - if (ep->auth_hmacs[id]) { + if (sctp_hmac_supported(id)) { asoc->default_hmac_id = id; break; } @@ -709,10 +618,9 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, struct sctp_shared_key *ep_key, gfp_t gfp) { struct sctp_auth_bytes *asoc_key; - struct crypto_shash *tfm; __u16 key_id, hmac_id; - unsigned char *end; int free_key = 0; + size_t data_len; __u8 *digest; /* Extract the info we need: @@ -733,19 +641,17 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, free_key = 1; } - /* set up scatter list */ - end = skb_tail_pointer(skb); - - tfm = asoc->ep->auth_hmacs[hmac_id]; - + data_len = skb_tail_pointer(skb) - (unsigned char *)auth; digest = (u8 *)(&auth->auth_hdr + 1); - if (crypto_shash_setkey(tfm, &asoc_key->data[0], asoc_key->len)) - goto free; - - crypto_shash_tfm_digest(tfm, (u8 *)auth, 
end - (unsigned char *)auth, - digest); + if (hmac_id == SCTP_AUTH_HMAC_ID_SHA1) { + hmac_sha1_usingrawkey(asoc_key->data, asoc_key->len, + (const u8 *)auth, data_len, digest); + } else { + WARN_ON_ONCE(hmac_id != SCTP_AUTH_HMAC_ID_SHA256); + hmac_sha256_usingrawkey(asoc_key->data, asoc_key->len, + (const u8 *)auth, data_len, digest); + } -free: if (free_key) sctp_auth_key_put(asoc_key); } @@ -788,14 +694,11 @@ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep, for (i = 0; i < hmacs->shmac_num_idents; i++) { id = hmacs->shmac_idents[i]; - if (id > SCTP_AUTH_HMAC_ID_MAX) + if (!sctp_hmac_supported(id)) return -EOPNOTSUPP; if (SCTP_AUTH_HMAC_ID_SHA1 == id) has_sha1 = 1; - - if (!sctp_hmac_list[id].hmac_name) - return -EOPNOTSUPP; } if (!has_sha1) @@ -1021,8 +924,6 @@ int sctp_auth_deact_key_id(struct sctp_endpoint *ep, int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp) { - int err = -ENOMEM; - /* Allocate space for HMACS and CHUNKS authentication * variables. There are arrays that we encode directly * into parameters to make the rest of the operations easier. @@ -1060,13 +961,6 @@ int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp) ep->auth_chunk_list = auth_chunks; } - /* Allocate and initialize transorms arrays for supported - * HMACs. - */ - err = sctp_auth_init_hmacs(ep, gfp); - if (err) - goto nomem; - return 0; nomem: @@ -1075,7 +969,7 @@ nomem: kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; - return err; + return -ENOMEM; } void sctp_auth_free(struct sctp_endpoint *ep) @@ -1084,6 +978,4 @@ void sctp_auth_free(struct sctp_endpoint *ep) kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; - sctp_auth_destroy_hmacs(ep->auth_hmacs); - ep->auth_hmacs = NULL; } diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index fd4f8243cc35..c655b571ca01 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -184,7 +184,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, * DATA. 
*/ if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) { - struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); + const struct sctp_hmac *hmac_desc = + sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) + diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 23359e522273..2afb376299fe 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -73,19 +73,26 @@ static int inet_diag_msg_sctpladdrs_fill(struct sk_buff *skb, struct nlattr *attr; void *info = NULL; + rcu_read_lock(); list_for_each_entry_rcu(laddr, address_list, list) addrcnt++; + rcu_read_unlock(); attr = nla_reserve(skb, INET_DIAG_LOCALS, addrlen * addrcnt); if (!attr) return -EMSGSIZE; info = nla_data(attr); + rcu_read_lock(); list_for_each_entry_rcu(laddr, address_list, list) { memcpy(info, &laddr->a, sizeof(laddr->a)); memset(info + sizeof(laddr->a), 0, addrlen - sizeof(laddr->a)); info += addrlen; + + if (!--addrcnt) + break; } + rcu_read_unlock(); return 0; } @@ -173,7 +180,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) goto errout; @@ -223,14 +230,15 @@ struct sctp_comm_param { bool net_admin; }; -static size_t inet_assoc_attr_size(struct sctp_association *asoc) +static size_t inet_assoc_attr_size(struct sock *sk, + struct sctp_association *asoc) { int addrlen = sizeof(struct sockaddr_storage); int addrcnt = 0; struct sctp_sockaddr_entry *laddr; list_for_each_entry_rcu(laddr, &asoc->base.bind_addr.address_list, - list) + list, lockdep_sock_is_held(sk)) addrcnt++; return nla_total_size(sizeof(struct sctp_info)) @@ -256,11 +264,14 @@ static int sctp_sock_dump_one(struct sctp_endpoint *ep, struct sctp_transport *t if (err) return err; - rep = nlmsg_new(inet_assoc_attr_size(assoc), GFP_KERNEL); - if (!rep) + lock_sock(sk); + + rep = nlmsg_new(inet_assoc_attr_size(sk, assoc), GFP_KERNEL); + if (!rep) { + release_sock(sk); return -ENOMEM; + } - lock_sock(sk); if (ep != assoc->ep) { err = -EAGAIN; goto out; diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 7e77b450697c..31e989dfe846 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -35,6 +35,15 @@ /* Forward declarations for internal helpers. */ static void sctp_endpoint_bh_rcv(struct work_struct *work); +static void gen_cookie_auth_key(struct hmac_sha256_key *key) +{ + u8 raw_key[SCTP_COOKIE_KEY_SIZE]; + + get_random_bytes(raw_key, sizeof(raw_key)); + hmac_sha256_preparekey(key, raw_key, sizeof(raw_key)); + memzero_explicit(raw_key, sizeof(raw_key)); +} + /* * Initialize the base fields of the endpoint structure. */ @@ -45,10 +54,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, struct net *net = sock_net(sk); struct sctp_shared_key *null_key; - ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp); - if (!ep->digest) - return NULL; - ep->asconf_enable = net->sctp.addip_enable; ep->auth_enable = net->sctp.auth_enable; if (ep->auth_enable) { @@ -90,8 +95,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Get the receive buffer policy for this endpoint */ ep->rcvbuf_policy = net->sctp.rcvbuf_policy; - /* Initialize the secret key used with cookie. 
*/ - get_random_bytes(ep->secret_key, sizeof(ep->secret_key)); + /* Generate the cookie authentication key. */ + gen_cookie_auth_key(&ep->cookie_auth_key); /* SCTP-AUTH extensions*/ INIT_LIST_HEAD(&ep->endpoint_shared_keys); @@ -118,7 +123,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, nomem_shkey: sctp_auth_free(ep); nomem: - kfree(ep->digest); return NULL; } @@ -205,9 +209,6 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) return; } - /* Free the digest buffer */ - kfree(ep->digest); - /* SCTP-AUTH: Free up AUTH releated data such as shared keys * chunks and hmacs arrays that were allocated */ @@ -218,7 +219,7 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) sctp_inq_free(&ep->base.inqueue); sctp_bind_addr_free(&ep->base.bind_addr); - memset(ep->secret_key, 0, sizeof(ep->secret_key)); + memzero_explicit(&ep->cookie_auth_key, sizeof(ep->cookie_auth_key)); sk = ep->base.sk; /* Remove and free the port */ diff --git a/net/sctp/input.c b/net/sctp/input.c index 2dc2666988fb..e119e460ccde 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -117,7 +117,7 @@ int sctp_rcv(struct sk_buff *skb) * it's better to just linearize it otherwise crc computing * takes longer. */ - if ((!is_gso && skb_linearize(skb)) || + if (((!is_gso || skb_cloned(skb)) && skb_linearize(skb)) || !pskb_may_pull(skb, sizeof(struct sctphdr))) goto discard_it; @@ -190,7 +190,7 @@ int sctp_rcv(struct sk_buff *skb) goto discard_release; nf_reset_ct(skb); - if (sk_filter(sk, skb)) + if (sk_filter(sk, skb) || skb->len < sizeof(struct sctp_chunkhdr)) goto discard_release; /* Create an SCTP packet structure. */ diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index 5c1652181805..f5a7d5a38755 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -169,13 +169,14 @@ next_chunk: chunk->head_skb = chunk->skb; /* skbs with "cover letter" */ - if (chunk->head_skb && chunk->skb->data_len == chunk->skb->len) + if (chunk->head_skb && chunk->skb->data_len == chunk->skb->len) { + if (WARN_ON(!skb_shinfo(chunk->skb)->frag_list)) { + __SCTP_INC_STATS(dev_net(chunk->skb->dev), + SCTP_MIB_IN_PKT_DISCARDS); + sctp_chunk_free(chunk); + goto next_chunk; + } chunk->skb = skb_shinfo(chunk->skb)->frag_list; - - if (WARN_ON(!chunk->skb)) { - __SCTP_INC_STATS(dev_net(chunk->skb->dev), SCTP_MIB_IN_PKT_DISCARDS); - sctp_chunk_free(chunk); - goto next_chunk; } } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 3336dcfb4515..069b7e45d8bd 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -547,7 +547,9 @@ static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk) { addr->v6.sin6_family = AF_INET6; addr->v6.sin6_port = 0; + addr->v6.sin6_flowinfo = 0; addr->v6.sin6_addr = sk->sk_v6_rcv_saddr; + addr->v6.sin6_scope_id = 0; } /* Initialize sk->sk_rcv_saddr from sctp_addr. */ @@ -775,54 +777,6 @@ static enum sctp_scope sctp_v6_scope(union sctp_addr *addr) return retval; } -/* Create and initialize a new sk for the socket to be returned by accept(). 
*/ -static struct sock *sctp_v6_create_accept_sk(struct sock *sk, - struct sctp_association *asoc, - bool kern) -{ - struct sock *newsk; - struct ipv6_pinfo *newnp, *np = inet6_sk(sk); - struct sctp6_sock *newsctp6sk; - - newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, kern); - if (!newsk) - goto out; - - sock_init_data(NULL, newsk); - - sctp_copy_sock(newsk, sk, asoc); - sock_reset_flag(sk, SOCK_ZAPPED); - - newsctp6sk = (struct sctp6_sock *)newsk; - inet_sk(newsk)->pinet6 = &newsctp6sk->inet6; - - sctp_sk(newsk)->v4mapped = sctp_sk(sk)->v4mapped; - - newnp = inet6_sk(newsk); - - memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - newnp->ipv6_mc_list = NULL; - newnp->ipv6_ac_list = NULL; - newnp->ipv6_fl_list = NULL; - - sctp_v6_copy_ip_options(sk, newsk); - - /* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname() - * and getpeername(). - */ - sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk); - - newsk->sk_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - - if (newsk->sk_prot->init(newsk)) { - sk_common_release(newsk); - newsk = NULL; - } - -out: - return newsk; -} - /* Format a sockaddr for return to user space. This makes sure the return is * AF_INET or AF_INET6 depending on the SCTP_I_WANT_MAPPED_V4_ADDR option. */ @@ -1169,7 +1123,6 @@ static struct sctp_pf sctp_pf_inet6 = { .bind_verify = sctp_inet6_bind_verify, .send_verify = sctp_inet6_send_verify, .supported_addrs = sctp_inet6_supported_addrs, - .create_accept_sk = sctp_v6_create_accept_sk, .addr_to_user = sctp_v6_addr_to_user, .to_sk_saddr = sctp_v6_to_sk_saddr, .to_sk_daddr = sctp_v6_to_sk_daddr, diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 74bff317e205..1ed281f3c355 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -52,21 +52,21 @@ static const struct snmp_mib sctp_snmp_list[] = { SNMP_MIB_ITEM("SctpInPktBacklog", SCTP_MIB_IN_PKT_BACKLOG), SNMP_MIB_ITEM("SctpInPktDiscards", SCTP_MIB_IN_PKT_DISCARDS), SNMP_MIB_ITEM("SctpInDataChunkDiscards", SCTP_MIB_IN_DATA_CHUNK_DISCARDS), - SNMP_MIB_SENTINEL }; /* Display sctp snmp mib statistics(/proc/net/sctp/snmp). 
*/ static int sctp_snmp_seq_show(struct seq_file *seq, void *v) { - unsigned long buff[SCTP_MIB_MAX]; + unsigned long buff[ARRAY_SIZE(sctp_snmp_list)]; + const int cnt = ARRAY_SIZE(sctp_snmp_list); struct net *net = seq->private; int i; - memset(buff, 0, sizeof(unsigned long) * SCTP_MIB_MAX); + memset(buff, 0, sizeof(buff)); - snmp_get_cpu_field_batch(buff, sctp_snmp_list, - net->sctp.sctp_statistics); - for (i = 0; sctp_snmp_list[i].name; i++) + snmp_get_cpu_field_batch_cnt(buff, sctp_snmp_list, cnt, + net->sctp.sctp_statistics); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name, buff[i]); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index a5ccada55f2b..2c3398f75d76 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -34,6 +34,7 @@ #include <linux/memblock.h> #include <linux/highmem.h> #include <linux/slab.h> +#include <net/flow.h> #include <net/net_namespace.h> #include <net/protocol.h> #include <net/ip.h> @@ -437,7 +438,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4->flowi4_dscp = dscp; fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); @@ -579,38 +580,6 @@ static int sctp_v4_is_ce(const struct sk_buff *skb) return INET_ECN_is_ce(ip_hdr(skb)->tos); } -/* Create and initialize a new sk for the socket returned by accept(). */ -static struct sock *sctp_v4_create_accept_sk(struct sock *sk, - struct sctp_association *asoc, - bool kern) -{ - struct sock *newsk = sk_alloc(sock_net(sk), PF_INET, GFP_KERNEL, - sk->sk_prot, kern); - struct inet_sock *newinet; - - if (!newsk) - goto out; - - sock_init_data(NULL, newsk); - - sctp_copy_sock(newsk, sk, asoc); - sock_reset_flag(newsk, SOCK_ZAPPED); - - sctp_v4_copy_ip_options(sk, newsk); - - newinet = inet_sk(newsk); - - newinet->inet_daddr = asoc->peer.primary_addr.v4.sin_addr.s_addr; - - if (newsk->sk_prot->init(newsk)) { - sk_common_release(newsk); - newsk = NULL; - } - -out: - return newsk; -} - static int sctp_v4_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) { /* No address mapping for V4 sockets */ @@ -1118,7 +1087,6 @@ static struct sctp_pf sctp_pf_inet = { .bind_verify = sctp_inet_bind_verify, .send_verify = sctp_inet_send_verify, .supported_addrs = sctp_inet_supported_addrs, - .create_accept_sk = sctp_v4_create_accept_sk, .addr_to_user = sctp_v4_addr_to_user, .to_sk_saddr = sctp_v4_to_sk_saddr, .to_sk_daddr = sctp_v4_to_sk_daddr, @@ -1334,14 +1302,9 @@ static int __net_init sctp_defaults_init(struct net *net) /* Whether Cookie Preservative is enabled(1) or not(0) */ net->sctp.cookie_preserve_enable = 1; - /* Default sctp sockets to use md5 as their hmac alg */ -#if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5) - net->sctp.sctp_hmac_alg = "md5"; -#elif defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1) - net->sctp.sctp_hmac_alg = "sha1"; -#else - net->sctp.sctp_hmac_alg = NULL; -#endif + /* Whether cookie authentication is enabled(1) or not(0) */ + net->sctp.cookie_auth_enable = + !IS_ENABLED(CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE); /* Max.Burst - 4 */ net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 3ead591c72fd..2c0017d058d4 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -30,7 +30,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME 
": " fmt -#include <crypto/hash.h> +#include <crypto/utils.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/ip.h> @@ -1319,7 +1319,7 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc, __u16 key_id) { struct sctp_authhdr auth_hdr; - struct sctp_hmac *hmac_desc; + const struct sctp_hmac *hmac_desc; struct sctp_chunk *retval; /* Get the first hmac that the peer told us to use */ @@ -1674,8 +1674,10 @@ static struct sctp_cookie_param *sctp_pack_cookie( * out on the network. */ retval = kzalloc(*cookie_len, GFP_ATOMIC); - if (!retval) - goto nodata; + if (!retval) { + *cookie_len = 0; + return NULL; + } cookie = (struct sctp_signed_cookie *) retval->body; @@ -1706,26 +1708,14 @@ static struct sctp_cookie_param *sctp_pack_cookie( memcpy((__u8 *)(cookie + 1) + ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len); - if (sctp_sk(ep->base.sk)->hmac) { - struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; - int err; - - /* Sign the message. */ - err = crypto_shash_setkey(tfm, ep->secret_key, - sizeof(ep->secret_key)) ?: - crypto_shash_tfm_digest(tfm, (u8 *)&cookie->c, bodysize, - cookie->signature); - if (err) - goto free_cookie; + /* Sign the cookie, if cookie authentication is enabled. */ + if (sctp_sk(ep->base.sk)->cookie_auth_enable) { + static_assert(sizeof(cookie->mac) == SHA256_DIGEST_SIZE); + hmac_sha256(&ep->cookie_auth_key, (const u8 *)&cookie->c, + bodysize, cookie->mac); } return retval; - -free_cookie: - kfree(retval); -nodata: - *cookie_len = 0; - return NULL; } /* Unpack the cookie from COOKIE ECHO chunk, recreating the association. */ @@ -1740,7 +1730,6 @@ struct sctp_association *sctp_unpack_cookie( struct sctp_signed_cookie *cookie; struct sk_buff *skb = chunk->skb; struct sctp_cookie *bear_cookie; - __u8 *digest = ep->digest; enum sctp_scope scope; unsigned int len; ktime_t kt; @@ -1770,30 +1759,19 @@ struct sctp_association *sctp_unpack_cookie( cookie = chunk->subh.cookie_hdr; bear_cookie = &cookie->c; - if (!sctp_sk(ep->base.sk)->hmac) - goto no_hmac; + /* Verify the cookie's MAC, if cookie authentication is enabled. */ + if (sctp_sk(ep->base.sk)->cookie_auth_enable) { + u8 mac[SHA256_DIGEST_SIZE]; - /* Check the signature. 
*/ - { - struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; - int err; - - err = crypto_shash_setkey(tfm, ep->secret_key, - sizeof(ep->secret_key)) ?: - crypto_shash_tfm_digest(tfm, (u8 *)bear_cookie, bodysize, - digest); - if (err) { - *error = -SCTP_IERROR_NOMEM; + hmac_sha256(&ep->cookie_auth_key, (const u8 *)bear_cookie, + bodysize, mac); + static_assert(sizeof(cookie->mac) == sizeof(mac)); + if (crypto_memneq(mac, cookie->mac, sizeof(mac))) { + *error = -SCTP_IERROR_BAD_SIG; goto fail; } } - if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { - *error = -SCTP_IERROR_BAD_SIG; - goto fail; - } - -no_hmac: /* IG Section 2.35.2: * 3) Compare the port numbers and the verification tag contained * within the COOKIE ECHO chunk to the actual port numbers and the diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index a0524ba8d787..3755ba079d07 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -30,6 +30,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <crypto/utils.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/ip.h> @@ -885,7 +886,8 @@ enum sctp_disposition sctp_sf_do_5_1D_ce(struct net *net, return SCTP_DISPOSITION_CONSUME; nomem_authev: - sctp_ulpevent_free(ai_ev); + if (ai_ev) + sctp_ulpevent_free(ai_ev); nomem_aiev: sctp_ulpevent_free(ev); nomem_ev: @@ -4361,7 +4363,7 @@ static enum sctp_ierror sctp_sf_authenticate( struct sctp_shared_key *sh_key = NULL; struct sctp_authhdr *auth_hdr; __u8 *save_digest, *digest; - struct sctp_hmac *hmac; + const struct sctp_hmac *hmac; unsigned int sig_len; __u16 key_id; @@ -4416,7 +4418,7 @@ static enum sctp_ierror sctp_sf_authenticate( sh_key, GFP_ATOMIC); /* Discard the packet if the digests do not match */ - if (memcmp(save_digest, digest, sig_len)) { + if (crypto_memneq(save_digest, digest, sig_len)) { kfree(save_digest); return SCTP_IERROR_BAD_SIG; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 4921416434f9..d808096f5ab1 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -37,7 +37,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <crypto/hash.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/wait.h> @@ -307,7 +306,8 @@ static struct sctp_transport *sctp_addr_id2transport(struct sock *sk, * sockaddr_in6 [RFC 2553]), * addr_len - the size of the address structure. */ -static int sctp_bind(struct sock *sk, struct sockaddr *addr, int addr_len) +static int sctp_bind(struct sock *sk, struct sockaddr_unsized *addr, + int addr_len) { int retval = 0; @@ -1054,13 +1054,13 @@ static int sctp_setsockopt_bindx(struct sock *sk, struct sockaddr *addrs, } } -static int sctp_bind_add(struct sock *sk, struct sockaddr *addrs, - int addrlen) +static int sctp_bind_add(struct sock *sk, struct sockaddr_unsized *addrs, + int addrlen) { int err; lock_sock(sk); - err = sctp_setsockopt_bindx(sk, addrs, addrlen, SCTP_BINDX_ADD_ADDR); + err = sctp_setsockopt_bindx(sk, (struct sockaddr *)addrs, addrlen, SCTP_BINDX_ADD_ADDR); release_sock(sk); return err; } @@ -1554,8 +1554,6 @@ static void sctp_close(struct sock *sk, long timeout) spin_unlock_bh(&net->sctp.addr_wq_lock); sock_put(sk); - - SCTP_DBG_OBJCNT_DEC(sock); } /* Handle EPIPE error. 
*/ @@ -4823,7 +4821,7 @@ static int sctp_connect(struct sock *sk, struct sockaddr *addr, return err; } -int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr, +int sctp_inet_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { if (addr_len < sizeof(uaddr->sa_family)) @@ -4832,7 +4830,7 @@ int sctp_inet_connect(struct socket *sock, struct sockaddr *uaddr, if (uaddr->sa_family == AF_UNSPEC) return -EOPNOTSUPP; - return sctp_connect(sock->sk, uaddr, addr_len, flags); + return sctp_connect(sock->sk, (struct sockaddr *)uaddr, addr_len, flags); } /* Only called when shutdown a listening SCTP socket. */ @@ -4845,6 +4843,75 @@ static int sctp_disconnect(struct sock *sk, int flags) return 0; } +static struct sock *sctp_clone_sock(struct sock *sk, + struct sctp_association *asoc, + enum sctp_socket_type type) +{ + struct sock *newsk = sk_clone(sk, GFP_KERNEL, false); + struct inet_sock *newinet; + struct sctp_sock *newsp; + int err = -ENOMEM; + + if (!newsk) + return ERR_PTR(err); + + /* sk_clone() sets refcnt to 2 */ + sock_put(newsk); + + newinet = inet_sk(newsk); + newsp = sctp_sk(newsk); + + newsp->pf->to_sk_daddr(&asoc->peer.primary_addr, newsk); + newinet->inet_dport = htons(asoc->peer.port); + + newsp->pf->copy_ip_options(sk, newsk); + atomic_set(&newinet->inet_id, get_random_u16()); + + inet_set_bit(MC_LOOP, newsk); + newinet->mc_ttl = 1; + newinet->mc_index = 0; + newinet->mc_list = NULL; + +#if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family == AF_INET6) { + struct ipv6_pinfo *newnp = inet6_sk(newsk); + + newinet->pinet6 = &((struct sctp6_sock *)newsk)->inet6; + newinet->ipv6_fl_list = NULL; + + memcpy(newnp, inet6_sk(sk), sizeof(struct ipv6_pinfo)); + newnp->ipv6_mc_list = NULL; + newnp->ipv6_ac_list = NULL; + } +#endif + + newsp->do_auto_asconf = 0; + skb_queue_head_init(&newsp->pd_lobby); + + newsp->ep = sctp_endpoint_new(newsk, GFP_KERNEL); + if (!newsp->ep) + goto out_release; + + SCTP_DBG_OBJCNT_INC(sock); + sk_sockets_allocated_inc(newsk); + sock_prot_inuse_add(sock_net(sk), newsk->sk_prot, 1); + + err = sctp_sock_migrate(sk, newsk, asoc, type); + if (err) + goto out_release; + + /* Set newsk security attributes from original sk and connection + * security attribute from asoc. + */ + security_sctp_sk_clone(asoc, sk, newsk); + + return newsk; + +out_release: + sk_common_release(newsk); + return ERR_PTR(err); +} + /* 4.1.4 accept() - TCP Style Syntax * * Applications use accept() call to remove an established SCTP @@ -4854,18 +4921,13 @@ static int sctp_disconnect(struct sock *sk, int flags) */ static struct sock *sctp_accept(struct sock *sk, struct proto_accept_arg *arg) { - struct sctp_sock *sp; - struct sctp_endpoint *ep; - struct sock *newsk = NULL; struct sctp_association *asoc; - long timeo; + struct sock *newsk = NULL; int error = 0; + long timeo; lock_sock(sk); - sp = sctp_sk(sk); - ep = sp->ep; - if (!sctp_style(sk, TCP)) { error = -EOPNOTSUPP; goto out; @@ -4886,20 +4948,12 @@ static struct sock *sctp_accept(struct sock *sk, struct proto_accept_arg *arg) /* We treat the list of associations on the endpoint as the accept * queue and pick the first association on the list. */ - asoc = list_entry(ep->asocs.next, struct sctp_association, asocs); + asoc = list_entry(sctp_sk(sk)->ep->asocs.next, + struct sctp_association, asocs); - newsk = sp->pf->create_accept_sk(sk, asoc, arg->kern); - if (!newsk) { - error = -ENOMEM; - goto out; - } - - /* Populate the fields of the newsk from the oldsk and migrate the - * asoc to the newsk. 
- */ - error = sctp_sock_migrate(sk, newsk, asoc, SCTP_SOCKET_TCP); - if (error) { - sk_common_release(newsk); + newsk = sctp_clone_sock(sk, asoc, SCTP_SOCKET_TCP); + if (IS_ERR(newsk)) { + error = PTR_ERR(newsk); newsk = NULL; } @@ -4987,7 +5041,7 @@ static int sctp_init_sock(struct sock *sk) sp->default_rcv_context = 0; sp->max_burst = net->sctp.max_burst; - sp->sctp_hmac_alg = net->sctp.sctp_hmac_alg; + sp->cookie_auth_enable = net->sctp.cookie_auth_enable; /* Initialize default setup parameters. These parameters * can be modified with the SCTP_INITMSG socket option or @@ -5079,8 +5133,6 @@ static int sctp_init_sock(struct sock *sk) if (!sp->ep) return -ENOMEM; - sp->hmac = NULL; - sk->sk_destruct = sctp_destruct_sock; SCTP_DBG_OBJCNT_INC(sock); @@ -5112,23 +5164,16 @@ static void sctp_destroy_sock(struct sock *sk) sp->do_auto_asconf = 0; list_del(&sp->auto_asconf_list); } + sctp_endpoint_free(sp->ep); + sk_sockets_allocated_dec(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); -} - -/* Triggered when there are no references on the socket anymore */ -static void sctp_destruct_common(struct sock *sk) -{ - struct sctp_sock *sp = sctp_sk(sk); - - /* Free up the HMAC transform. */ - crypto_free_shash(sp->hmac); + SCTP_DBG_OBJCNT_DEC(sock); } static void sctp_destruct_sock(struct sock *sk) { - sctp_destruct_common(sk); inet_sock_destruct(sk); } @@ -5628,11 +5673,11 @@ static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optv /* Helper routine to branch off an association to a new socket. */ static int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, - struct socket **sockp) + struct socket **sockp) { struct sctp_association *asoc = sctp_id2assoc(sk, id); - struct sctp_sock *sp = sctp_sk(sk); struct socket *sock; + struct sock *newsk; int err = 0; /* Do not peel off from one netns to another one. */ @@ -5648,30 +5693,24 @@ static int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, if (!sctp_style(sk, UDP)) return -EINVAL; - /* Create a new socket. */ - err = sock_create(sk->sk_family, SOCK_SEQPACKET, IPPROTO_SCTP, &sock); - if (err < 0) + err = sock_create_lite(sk->sk_family, SOCK_SEQPACKET, IPPROTO_SCTP, &sock); + if (err) return err; - sctp_copy_sock(sock->sk, sk, asoc); - - /* Make peeled-off sockets more like 1-1 accepted sockets. - * Set the daddr and initialize id to something more random and also - * copy over any ip options. - */ - sp->pf->to_sk_daddr(&asoc->peer.primary_addr, sock->sk); - sp->pf->copy_ip_options(sk, sock->sk); - - /* Populate the fields of the newsk from the oldsk and migrate the - * asoc to the newsk. - */ - err = sctp_sock_migrate(sk, sock->sk, asoc, - SCTP_SOCKET_UDP_HIGH_BANDWIDTH); - if (err) { + newsk = sctp_clone_sock(sk, asoc, SCTP_SOCKET_UDP_HIGH_BANDWIDTH); + if (IS_ERR(newsk)) { sock_release(sock); - sock = NULL; + *sockp = NULL; + return PTR_ERR(newsk); } + lock_sock_nested(newsk, SINGLE_DEPTH_NESTING); + __inet_accept(sk->sk_socket, sock, newsk); + release_sock(newsk); + + sock->ops = sk->sk_socket->ops; + __module_get(sock->ops->owner); + *sockp = sock; return err; @@ -8530,22 +8569,8 @@ static int sctp_listen_start(struct sock *sk, int backlog) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_endpoint *ep = sp->ep; - struct crypto_shash *tfm = NULL; - char alg[32]; int err; - /* Allocate HMAC for generating cookie. 
*/ - if (!sp->hmac && sp->sctp_hmac_alg) { - sprintf(alg, "hmac(%s)", sp->sctp_hmac_alg); - tfm = crypto_alloc_shash(alg, 0, 0); - if (IS_ERR(tfm)) { - net_info_ratelimited("failed to load transform for %s: %ld\n", - sp->sctp_hmac_alg, PTR_ERR(tfm)); - return -ENOSYS; - } - sctp_sk(sk)->hmac = tfm; - } - /* * If a bind() or sctp_bindx() is not called prior to a listen() * call that allows new associations to be accepted, the system @@ -9468,71 +9493,6 @@ done: sctp_skb_set_owner_r(skb, sk); } -void sctp_copy_sock(struct sock *newsk, struct sock *sk, - struct sctp_association *asoc) -{ - struct inet_sock *inet = inet_sk(sk); - struct inet_sock *newinet; - struct sctp_sock *sp = sctp_sk(sk); - - newsk->sk_type = sk->sk_type; - newsk->sk_bound_dev_if = sk->sk_bound_dev_if; - newsk->sk_flags = sk->sk_flags; - newsk->sk_tsflags = sk->sk_tsflags; - newsk->sk_no_check_tx = sk->sk_no_check_tx; - newsk->sk_no_check_rx = sk->sk_no_check_rx; - newsk->sk_reuse = sk->sk_reuse; - sctp_sk(newsk)->reuse = sp->reuse; - - newsk->sk_shutdown = sk->sk_shutdown; - newsk->sk_destruct = sk->sk_destruct; - newsk->sk_family = sk->sk_family; - newsk->sk_protocol = IPPROTO_SCTP; - newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; - newsk->sk_sndbuf = sk->sk_sndbuf; - newsk->sk_rcvbuf = sk->sk_rcvbuf; - newsk->sk_lingertime = sk->sk_lingertime; - newsk->sk_rcvtimeo = READ_ONCE(sk->sk_rcvtimeo); - newsk->sk_sndtimeo = READ_ONCE(sk->sk_sndtimeo); - newsk->sk_rxhash = sk->sk_rxhash; - - newinet = inet_sk(newsk); - - /* Initialize sk's sport, dport, rcv_saddr and daddr for - * getsockname() and getpeername() - */ - newinet->inet_sport = inet->inet_sport; - newinet->inet_saddr = inet->inet_saddr; - newinet->inet_rcv_saddr = inet->inet_rcv_saddr; - newinet->inet_dport = htons(asoc->peer.port); - newinet->pmtudisc = inet->pmtudisc; - atomic_set(&newinet->inet_id, get_random_u16()); - - newinet->uc_ttl = inet->uc_ttl; - inet_set_bit(MC_LOOP, newsk); - newinet->mc_ttl = 1; - newinet->mc_index = 0; - newinet->mc_list = NULL; - - if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) - net_enable_timestamp(); - - /* Set newsk security attributes from original sk and connection - * security attribute from asoc. - */ - security_sctp_sk_clone(asoc, sk, newsk); -} - -static inline void sctp_copy_descendant(struct sock *sk_to, - const struct sock *sk_from) -{ - size_t ancestor_size = sizeof(struct inet_sock); - - ancestor_size += sk_from->sk_prot->obj_size; - ancestor_size -= offsetof(struct sctp_sock, pd_lobby); - __inet_sk_copy_descendant(sk_to, sk_from, ancestor_size); -} - /* Populate the fields of the newsk from the oldsk and migrate the assoc * and its messages to the newsk. */ @@ -9549,19 +9509,10 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, struct sctp_bind_hashbucket *head; int err; - /* Migrate socket buffer sizes and all the socket level options to the - * new socket. - */ - newsk->sk_sndbuf = oldsk->sk_sndbuf; - newsk->sk_rcvbuf = oldsk->sk_rcvbuf; - /* Brute force copy old sctp opt. */ - sctp_copy_descendant(newsk, oldsk); - /* Restore the ep value that was overwritten with the above structure * copy. */ newsp->ep = newep; - newsp->hmac = NULL; /* Hook this new socket in to the bind_hash list. 
*/ head = &sctp_port_hashtable[sctp_phashfn(sock_net(oldsk), @@ -9581,16 +9532,6 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, if (err) return err; - /* New ep's auth_hmacs should be set if old ep's is set, in case - * that net->sctp.auth_enable has been changed to 0 by users and - * new ep's auth_hmacs couldn't be set in sctp_endpoint_init(). - */ - if (oldsp->ep->auth_hmacs) { - err = sctp_auth_init_hmacs(newsp->ep, GFP_KERNEL); - if (err) - return err; - } - sctp_auto_asconf_init(newsp); /* Move any messages in the old socket's receive queue that are for the @@ -9723,7 +9664,6 @@ struct proto sctp_prot = { static void sctp_v6_destruct_sock(struct sock *sk) { - sctp_destruct_common(sk); inet6_sock_destruct(sk); } diff --git a/net/sctp/stream.c b/net/sctp/stream.c index f205556c5b24..0615e4426341 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -54,7 +54,7 @@ static void sctp_stream_shrink_out(struct sctp_stream *stream, __u16 outcnt) static void sctp_stream_free_ext(struct sctp_stream *stream, __u16 sid) { - struct sctp_sched_ops *sched; + const struct sctp_sched_ops *sched; if (!SCTP_SO(stream, sid)->ext) return; @@ -130,7 +130,7 @@ out: int sctp_stream_init(struct sctp_stream *stream, __u16 outcnt, __u16 incnt, gfp_t gfp) { - struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i, ret = 0; gfp |= __GFP_NOWARN; @@ -182,7 +182,7 @@ int sctp_stream_init_ext(struct sctp_stream *stream, __u16 sid) void sctp_stream_free(struct sctp_stream *stream) { - struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i; sched->unsched_all(stream); @@ -207,7 +207,7 @@ void sctp_stream_clear(struct sctp_stream *stream) void sctp_stream_update(struct sctp_stream *stream, struct sctp_stream *new) { - struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); sched->unsched_all(stream); sctp_stream_outq_migrate(stream, new, new->outcnt); diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 54afbe4fb087..50f8b5240359 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -91,7 +91,7 @@ static void sctp_sched_fcfs_unsched_all(struct sctp_stream *stream) { } -static struct sctp_sched_ops sctp_sched_fcfs = { +static const struct sctp_sched_ops sctp_sched_fcfs = { .set = sctp_sched_fcfs_set, .get = sctp_sched_fcfs_get, .init = sctp_sched_fcfs_init, @@ -111,10 +111,10 @@ static void sctp_sched_ops_fcfs_init(void) /* API to other parts of the stack */ -static struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1]; +static const struct sctp_sched_ops *sctp_sched_ops[SCTP_SS_MAX + 1]; void sctp_sched_ops_register(enum sctp_sched_type sched, - struct sctp_sched_ops *sched_ops) + const struct sctp_sched_ops *sched_ops) { sctp_sched_ops[sched] = sched_ops; } @@ -130,7 +130,7 @@ void sctp_sched_ops_init(void) static void sctp_sched_free_sched(struct sctp_stream *stream) { - struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *soute; int i; @@ -148,9 +148,9 @@ static void sctp_sched_free_sched(struct sctp_stream *stream) int sctp_sched_set_sched(struct sctp_association *asoc, enum sctp_sched_type sched) { - struct sctp_sched_ops *old = asoc->outqueue.sched; + const struct 
sctp_sched_ops *old = asoc->outqueue.sched; struct sctp_datamsg *msg = NULL; - struct sctp_sched_ops *n; + const struct sctp_sched_ops *n; struct sctp_chunk *ch; int i, ret = 0; @@ -263,14 +263,14 @@ void sctp_sched_dequeue_common(struct sctp_outq *q, struct sctp_chunk *ch) int sctp_sched_init_sid(struct sctp_stream *stream, __u16 sid, gfp_t gfp) { - struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + const struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); struct sctp_stream_out_ext *ext = SCTP_SO(stream, sid)->ext; INIT_LIST_HEAD(&ext->outq); return sched->init_sid(stream, sid, gfp); } -struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) +const struct sctp_sched_ops *sctp_sched_ops_from_stream(struct sctp_stream *stream) { struct sctp_association *asoc; diff --git a/net/sctp/stream_sched_fc.c b/net/sctp/stream_sched_fc.c index 4bd18a497a6d..776c6de46c22 100644 --- a/net/sctp/stream_sched_fc.c +++ b/net/sctp/stream_sched_fc.c @@ -188,7 +188,7 @@ static void sctp_sched_fc_unsched_all(struct sctp_stream *stream) list_del_init(&soute->fc_list); } -static struct sctp_sched_ops sctp_sched_fc = { +static const struct sctp_sched_ops sctp_sched_fc = { .set = sctp_sched_fc_set, .get = sctp_sched_fc_get, .init = sctp_sched_fc_init, @@ -206,7 +206,7 @@ void sctp_sched_ops_fc_init(void) sctp_sched_ops_register(SCTP_SS_FC, &sctp_sched_fc); } -static struct sctp_sched_ops sctp_sched_wfq = { +static const struct sctp_sched_ops sctp_sched_wfq = { .set = sctp_sched_wfq_set, .get = sctp_sched_wfq_get, .init = sctp_sched_fc_init, diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c index 4d4d9da331f4..fb6c55e5615d 100644 --- a/net/sctp/stream_sched_prio.c +++ b/net/sctp/stream_sched_prio.c @@ -300,7 +300,7 @@ static void sctp_sched_prio_unsched_all(struct sctp_stream *stream) sctp_sched_prio_unsched(soute); } -static struct sctp_sched_ops sctp_sched_prio = { +static const struct sctp_sched_ops sctp_sched_prio = { .set = sctp_sched_prio_set, .get = sctp_sched_prio_get, .init = sctp_sched_prio_init, diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c index 1f235e7f643a..9157b653f196 100644 --- a/net/sctp/stream_sched_rr.c +++ b/net/sctp/stream_sched_rr.c @@ -171,7 +171,7 @@ static void sctp_sched_rr_unsched_all(struct sctp_stream *stream) sctp_sched_rr_unsched(stream, soute); } -static struct sctp_sched_ops sctp_sched_rr = { +static const struct sctp_sched_ops sctp_sched_rr = { .set = sctp_sched_rr_set, .get = sctp_sched_rr_get, .init = sctp_sched_rr_init, diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index ee3eac338a9d..15e7db9a3ab2 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -174,7 +174,7 @@ static struct ctl_table sctp_net_table[] = { }, { .procname = "cookie_hmac_alg", - .data = &init_net.sctp.sctp_hmac_alg, + .data = &init_net.sctp.cookie_auth_enable, .maxlen = 8, .mode = 0644, .proc_handler = proc_sctp_do_hmac_alg, @@ -388,10 +388,8 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(ctl->data, struct net, - sctp.sctp_hmac_alg); + sctp.cookie_auth_enable); struct ctl_table tbl; - bool changed = false; - char *none = "none"; char tmp[8] = {0}; int ret; @@ -399,35 +397,26 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, if (write) { tbl.data = tmp; - tbl.maxlen = sizeof(tmp); - } else { - tbl.data = net->sctp.sctp_hmac_alg ? 
: none; - tbl.maxlen = strlen(tbl.data); - } - - ret = proc_dostring(&tbl, write, buffer, lenp, ppos); - if (write && ret == 0) { -#ifdef CONFIG_CRYPTO_MD5 - if (!strncmp(tmp, "md5", 3)) { - net->sctp.sctp_hmac_alg = "md5"; - changed = true; + tbl.maxlen = sizeof(tmp) - 1; + ret = proc_dostring(&tbl, 1, buffer, lenp, ppos); + if (ret) + return ret; + if (!strcmp(tmp, "sha256")) { + net->sctp.cookie_auth_enable = 1; + return 0; } -#endif -#ifdef CONFIG_CRYPTO_SHA1 - if (!strncmp(tmp, "sha1", 4)) { - net->sctp.sctp_hmac_alg = "sha1"; - changed = true; + if (!strcmp(tmp, "none")) { + net->sctp.cookie_auth_enable = 0; + return 0; } -#endif - if (!strncmp(tmp, "none", 4)) { - net->sctp.sctp_hmac_alg = NULL; - changed = true; - } - if (!changed) - ret = -EINVAL; + return -EINVAL; } - - return ret; + if (net->sctp.cookie_auth_enable) + tbl.data = (char *)"sha256"; + else + tbl.data = (char *)"none"; + tbl.maxlen = strlen(tbl.data); + return proc_dostring(&tbl, 0, buffer, lenp, ppos); } static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 4d258a6e8033..0c56d9673cc1 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -37,10 +37,10 @@ /* 1st Level Abstractions. */ /* Initialize a new transport from provided memory. */ -static struct sctp_transport *sctp_transport_init(struct net *net, - struct sctp_transport *peer, - const union sctp_addr *addr, - gfp_t gfp) +static void sctp_transport_init(struct net *net, + struct sctp_transport *peer, + const union sctp_addr *addr, + gfp_t gfp) { /* Copy in the address. */ peer->af_specific = sctp_get_af_specific(addr->sa.sa_family); @@ -83,8 +83,6 @@ static struct sctp_transport *sctp_transport_init(struct net *net, get_random_bytes(&peer->hb_nonce, sizeof(peer->hb_nonce)); refcount_set(&peer->refcnt, 1); - - return peer; } /* Allocate and initialize a new transport. */ @@ -96,20 +94,13 @@ struct sctp_transport *sctp_transport_new(struct net *net, transport = kzalloc(sizeof(*transport), gfp); if (!transport) - goto fail; + return NULL; - if (!sctp_transport_init(net, transport, addr, gfp)) - goto fail_init; + sctp_transport_init(net, transport, addr, gfp); SCTP_DBG_OBJCNT_INC(transport); return transport; - -fail_init: - kfree(transport); - -fail: - return NULL; } /* This transport is no longer needed. Free up if possible, or @@ -495,6 +486,7 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) if (tp->rttvar || tp->srtt) { struct net *net = tp->asoc->base.net; + unsigned int rto_beta, rto_alpha; /* 6.3.1 C3) When a new RTT measurement R' is made, set * RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'| * SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R' @@ -506,10 +498,14 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) * For example, assuming the default value of RTO.Alpha of * 1/8, rto_alpha would be expressed as 3. 
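 * (Illustrative aside, not part of the patch: a worked example of the shift
 * form used here. With the default rto_alpha of 3 (1/8) and, assuming the
 * kernel's default rto_beta of 2 (1/4), an old srtt of 800, an old rttvar of
 * 200 and a new measurement rtt of 720 give
 *   rttvar = 200 - (200 >> 2) + (|800 - 720| >> 2) = 200 - 50 + 20 = 170,
 *   srtt   = 800 - (800 >> 3) + (720 >> 3)         = 800 - 100 + 90 = 790,
 * i.e. 3/4 * 200 + 1/4 * 80 and 7/8 * 800 + 1/8 * 720. The rto_beta < 32 and
 * rto_alpha < 32 checks added below merely keep the shift counts within the
 * width of the 32-bit operands.)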
*/ - tp->rttvar = tp->rttvar - (tp->rttvar >> net->sctp.rto_beta) - + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta); - tp->srtt = tp->srtt - (tp->srtt >> net->sctp.rto_alpha) - + (rtt >> net->sctp.rto_alpha); + rto_beta = READ_ONCE(net->sctp.rto_beta); + if (rto_beta < 32) + tp->rttvar = tp->rttvar - (tp->rttvar >> rto_beta) + + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> rto_beta); + rto_alpha = READ_ONCE(net->sctp.rto_alpha); + if (rto_alpha < 32) + tp->srtt = tp->srtt - (tp->srtt >> rto_alpha) + + (rtt >> rto_alpha); } else { /* 6.3.1 C2) When the first RTT measurement R is made, set * SRTT <- R, RTTVAR <- R/2. diff --git a/net/shaper/shaper_nl_gen.c b/net/shaper/shaper_nl_gen.c index 204c8ae8c7b1..e8cccc4c1180 100644 --- a/net/shaper/shaper_nl_gen.c +++ b/net/shaper/shaper_nl_gen.c @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/net_shaper.yaml */ /* YNL-GEN kernel source */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #include <net/netlink.h> #include <net/genetlink.h> diff --git a/net/shaper/shaper_nl_gen.h b/net/shaper/shaper_nl_gen.h index cb7f9026fc23..ec41c90431a4 100644 --- a/net/shaper/shaper_nl_gen.h +++ b/net/shaper/shaper_nl_gen.h @@ -2,6 +2,7 @@ /* Do not edit directly, auto-generated from: */ /* Documentation/netlink/specs/net_shaper.yaml */ /* YNL-GEN kernel header */ +/* To regenerate run: tools/net/ynl/ynl-regen.sh */ #ifndef _LINUX_NET_SHAPER_GEN_H #define _LINUX_NET_SHAPER_GEN_H diff --git a/net/smc/Kconfig b/net/smc/Kconfig index ba5e6a2dd2fd..325addf83cc6 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -1,8 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config SMC tristate "SMC socket protocol family" - depends on INET && INFINIBAND - depends on m || ISM != m + depends on INET && INFINIBAND && DIBS help SMC-R provides a "sockets over RDMA" solution making use of RDMA over Converged Ethernet (RoCE) technology to upgrade @@ -21,15 +20,12 @@ config SMC_DIAG if unsure, say Y. -config SMC_LO - bool "SMC intra-OS shortcut with loopback-ism" - depends on SMC - default n +config SMC_HS_CTRL_BPF + bool "Generic eBPF hook for SMC handshake flow" + depends on SMC && BPF_SYSCALL + default y help - SMC_LO enables the creation of an Emulated-ISM device named - loopback-ism in SMC and makes use of it for transferring data - when communication occurs within the same OS. This helps in - convenient testing of SMC-D since loopback-ism is independent - of architecture or hardware. - - if unsure, say N. + SMC_HS_CTRL_BPF enables support to register a generic eBPF hook for SMC + handshake flow, which offers much greater flexibility in modifying the behavior + of the SMC protocol stack compared to a complete kernel-based approach. Select + this option if you want to filter the handshake process via eBPF programs.
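The help text above describes the hook only in prose; the kernel side is the smc_hs_ctrl struct_ops that net/smc/smc_hs_bpf.c registers later in this diff. A minimal sketch of the matching BPF-side program follows, assuming the usual libbpf struct_ops conventions (vmlinux.h, SEC("struct_ops"), BPF_PROG) and the smc_hs_ctrl layout introduced by this series; the program and map names are illustrative only, not part of the patch:

/* Hypothetical BPF-side counterpart of the smc_hs_ctrl struct_ops. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

SEC("struct_ops/smc_syn_option")
int BPF_PROG(smc_syn_option, struct tcp_sock *tp)
{
	return 1;	/* request the SMC experimental TCP option on active opens */
}

SEC("struct_ops/smc_synack_option")
int BPF_PROG(smc_synack_option, const struct tcp_sock *tp,
	     struct inet_request_sock *ireq)
{
	return 1;	/* and answer it on passive opens */
}

SEC(".struct_ops")
struct smc_hs_ctrl sample_hs_ctrl = {
	.name		= "sample",
	.syn_option	= (void *)smc_syn_option,
	.synack_option	= (void *)smc_synack_option,
};

Since smc_bpf_hs_ctrl_reg() in this series rejects link-based attachment (it returns -EOPNOTSUPP when a link is passed), the map is declared as a plain ".struct_ops" section rather than ".struct_ops.link".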
\ No newline at end of file diff --git a/net/smc/Makefile b/net/smc/Makefile index 60f1c87d5212..5368634c5dd6 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -6,4 +6,4 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o smc-y += smc_tracepoint.o smc_inet.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o -smc-$(CONFIG_SMC_LO) += smc_loopback.o +smc-$(CONFIG_SMC_HS_CTRL_BPF) += smc_hs_bpf.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 9311c38f7abe..f97f77b041d9 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -16,8 +16,7 @@ * based on prototype from Frank Blaschka */ -#define KMSG_COMPONENT "smc" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt +#define pr_fmt(fmt) "smc: " fmt #include <linux/module.h> #include <linux/socket.h> @@ -57,8 +56,8 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" -#include "smc_loopback.h" #include "smc_inet.h" +#include "smc_hs_bpf.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group * creation on server @@ -422,7 +421,7 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, return sk; } -int smc_bind(struct socket *sock, struct sockaddr *uaddr, +int smc_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; @@ -1097,8 +1096,7 @@ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, } /* Check for VLAN ID and register it on ISM device just for CLC handshake */ -static int smc_connect_ism_vlan_setup(struct smc_sock *smc, - struct smc_init_info *ini) +static int smc_connect_ism_vlan_setup(struct smc_init_info *ini) { if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id)) return SMC_CLC_DECL_ISMVLANERR; @@ -1113,7 +1111,7 @@ static int smc_find_proposal_devices(struct smc_sock *smc, /* check if there is an ism device available */ if (!(ini->smcd_version & SMC_V1) || smc_find_ism_device(smc, ini) || - smc_connect_ism_vlan_setup(smc, ini)) + smc_connect_ism_vlan_setup(ini)) ini->smcd_version &= ~SMC_V1; /* else ISM V1 is supported for this connection */ @@ -1158,8 +1156,7 @@ static int smc_find_proposal_devices(struct smc_sock *smc, /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is * used, the VLAN ID will be registered again during the connection setup. 
*/ -static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, - struct smc_init_info *ini) +static int smc_connect_ism_vlan_cleanup(struct smc_init_info *ini) { if (!smcd_indicated(ini->smc_type_v1)) return 0; @@ -1582,13 +1579,13 @@ static int __smc_connect(struct smc_sock *smc) goto vlan_cleanup; SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc); - smc_connect_ism_vlan_cleanup(smc, ini); + smc_connect_ism_vlan_cleanup(ini); kfree(buf); kfree(ini); return 0; vlan_cleanup: - smc_connect_ism_vlan_cleanup(smc, ini); + smc_connect_ism_vlan_cleanup(ini); kfree(buf); fallback: kfree(ini); @@ -1645,7 +1642,7 @@ out: release_sock(&smc->sk); } -int smc_connect(struct socket *sock, struct sockaddr *addr, +int smc_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sock *sk = sock->sk; @@ -1697,7 +1694,7 @@ int smc_connect(struct socket *sock, struct sockaddr *addr, rc = -EALREADY; goto out; } - rc = kernel_connect(smc->clcsock, addr, alen, flags); + rc = kernel_connect(smc->clcsock, (struct sockaddr_unsized *)addr, alen, flags); if (rc && rc != -EINPROGRESS) goto out; @@ -2143,7 +2140,7 @@ static void smc_check_ism_v2_match(struct smc_init_info *ini, } } -static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini) +static void smc_init_info_store_rc(u32 rc, struct smc_init_info *ini) { if (!ini->rc) ini->rc = rc; @@ -2206,7 +2203,7 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, mutex_unlock(&smcd_dev_list.mutex); if (!ini->ism_dev[0]) { - smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini); + smc_init_info_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini); goto not_found; } @@ -2223,7 +2220,7 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, ini->ism_selected = i; rc = smc_listen_ism_init(new_smc, ini); if (rc) { - smc_find_ism_store_rc(rc, ini); + smc_init_info_store_rc(rc, ini); /* try next active ISM device */ continue; } @@ -2263,7 +2260,7 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc, return; /* V1 ISM device found */ not_found: - smc_find_ism_store_rc(rc, ini); + smc_init_info_store_rc(rc, ini); ini->smcd_version &= ~SMC_V1; ini->ism_dev[0] = NULL; ini->is_smcd = false; @@ -2314,7 +2311,7 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce); rc = smc_find_rdma_device(new_smc, ini); if (rc) { - smc_find_ism_store_rc(rc, ini); + smc_init_info_store_rc(rc, ini); goto not_found; } if (!ini->smcrv2.uses_gateway) @@ -2331,7 +2328,7 @@ static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc, if (!rc) return; ini->smcr_version = smcr_version; - smc_find_ism_store_rc(rc, ini); + smc_init_info_store_rc(rc, ini); not_found: ini->smcr_version &= ~SMC_V2; @@ -2378,7 +2375,7 @@ static int smc_listen_find_device(struct smc_sock *new_smc, /* check for matching IP prefix and subnet length (V1) */ prfx_rc = smc_listen_prfx_check(new_smc, pclc); if (prfx_rc) - smc_find_ism_store_rc(prfx_rc, ini); + smc_init_info_store_rc(prfx_rc, ini); /* get vlan id from IP device */ if (smc_vlan_by_tcpsk(new_smc->clcsock, ini)) @@ -2405,7 +2402,7 @@ static int smc_listen_find_device(struct smc_sock *new_smc, int rc; rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini); - smc_find_ism_store_rc(rc, ini); + smc_init_info_store_rc(rc, ini); return (!rc) ? 
0 : ini->rc; } return prfx_rc; @@ -2568,8 +2565,9 @@ static void smc_listen_work(struct work_struct *work) goto out_decl; } - smc_listen_out_connected(new_smc); SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); + /* smc_listen_out() will release smcsk */ + smc_listen_out_connected(new_smc); goto out_free; out_unlock: @@ -3536,15 +3534,15 @@ static int __init smc_init(void) rc = -ENOMEM; - smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); + smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", WQ_PERCPU, 0); if (!smc_tcp_ls_wq) goto out_pnet; - smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); + smc_hs_wq = alloc_workqueue("smc_hs_wq", WQ_PERCPU, 0); if (!smc_hs_wq) goto out_alloc_tcp_ls_wq; - smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0); + smc_close_wq = alloc_workqueue("smc_close_wq", WQ_PERCPU, 0); if (!smc_close_wq) goto out_alloc_hs_wq; @@ -3592,28 +3590,28 @@ static int __init smc_init(void) goto out_sock; } - rc = smc_loopback_init(); - if (rc) { - pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc); - goto out_ib; - } - rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_lo; + goto out_ib; } rc = smc_inet_init(); if (rc) { pr_err("%s: smc_inet_init fails with %d\n", __func__, rc); goto out_ulp; } + rc = bpf_smc_hs_ctrl_init(); + if (rc) { + pr_err("%s: bpf_smc_hs_ctrl_init fails with %d\n", __func__, + rc); + goto out_inet; + } static_branch_enable(&tcp_have_smc); return 0; +out_inet: + smc_inet_exit(); out_ulp: tcp_unregister_ulp(&smc_ulp_ops); -out_lo: - smc_loopback_exit(); out_ib: smc_ib_unregister_client(); out_sock: @@ -3652,7 +3650,6 @@ static void __exit smc_exit(void) tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); - smc_loopback_exit(); smc_ib_unregister_client(); smc_ism_exit(); destroy_workqueue(smc_close_wq); diff --git a/net/smc/smc.h b/net/smc/smc.h index 2c9084963739..9e6af72784ba 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -42,9 +42,9 @@ void smc_unhash_sk(struct sock *sk); void smc_release_cb(struct sock *sk); int smc_release(struct socket *sock); -int smc_bind(struct socket *sock, struct sockaddr *uaddr, +int smc_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len); -int smc_connect(struct socket *sock, struct sockaddr *addr, +int smc_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags); int smc_accept(struct socket *sock, struct socket *new_sock, struct proto_accept_arg *arg); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 5a4db151fe95..87c87edadde7 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -426,8 +426,6 @@ smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc) { struct smc_clc_msg_hdr *hdr = &dclc->hdr; - if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D) - return false; if (hdr->version == SMC_V1) { if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline)) return false; @@ -511,10 +509,10 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl) } /* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */ -static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, +static int smc_clc_prfx_set4_rcu(struct net_device *dev, __be32 ipv4, struct smc_clc_msg_proposal_prefix *prop) { - struct in_device *in_dev = __in_dev_get_rcu(dst->dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); const struct in_ifaddr *ifa; if (!in_dev) @@ -532,12 +530,12 @@ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, 
__be32 ipv4, } /* fill CLC proposal msg with ipv6 prefixes from device */ -static int smc_clc_prfx_set6_rcu(struct dst_entry *dst, +static int smc_clc_prfx_set6_rcu(struct net_device *dev, struct smc_clc_msg_proposal_prefix *prop, struct smc_clc_ipv6_prefix *ipv6_prfx) { #if IS_ENABLED(CONFIG_IPV6) - struct inet6_dev *in6_dev = __in6_dev_get(dst->dev); + struct inet6_dev *in6_dev = __in6_dev_get(dev); struct inet6_ifaddr *ifa; int cnt = 0; @@ -566,41 +564,44 @@ static int smc_clc_prfx_set(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop, struct smc_clc_ipv6_prefix *ipv6_prfx) { - struct dst_entry *dst = sk_dst_get(clcsock->sk); struct sockaddr_storage addrs; struct sockaddr_in6 *addr6; struct sockaddr_in *addr; + struct net_device *dev; + struct dst_entry *dst; int rc = -ENOENT; - if (!dst) { - rc = -ENOTCONN; - goto out; - } - if (!dst->dev) { - rc = -ENODEV; - goto out_rel; - } /* get address to which the internal TCP socket is bound */ if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0) - goto out_rel; + goto out; + /* analyze IP specific data of net_device belonging to TCP socket */ addr6 = (struct sockaddr_in6 *)&addrs; + rcu_read_lock(); + + dst = __sk_dst_get(clcsock->sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + if (!dev) { + rc = -ENODEV; + goto out_unlock; + } + if (addrs.ss_family == PF_INET) { /* IPv4 */ addr = (struct sockaddr_in *)&addrs; - rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop); + rc = smc_clc_prfx_set4_rcu(dev, addr->sin_addr.s_addr, prop); } else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) { /* mapped IPv4 address - peer is IPv4 only */ - rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3], + rc = smc_clc_prfx_set4_rcu(dev, addr6->sin6_addr.s6_addr32[3], prop); } else { /* IPv6 */ - rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx); + rc = smc_clc_prfx_set6_rcu(dev, prop, ipv6_prfx); } + +out_unlock: rcu_read_unlock(); -out_rel: - dst_release(dst); out: return rc; } @@ -656,26 +657,26 @@ static int smc_clc_prfx_match6_rcu(struct net_device *dev, int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop) { - struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct net_device *dev; + struct dst_entry *dst; int rc; - if (!dst) { - rc = -ENOTCONN; - goto out; - } - if (!dst->dev) { + rcu_read_lock(); + + dst = __sk_dst_get(clcsock->sk); + dev = dst ? 
dst_dev_rcu(dst) : NULL; + if (!dev) { rc = -ENODEV; - goto out_rel; + goto out; } - rcu_read_lock(); + if (!prop->ipv6_prefixes_cnt) - rc = smc_clc_prfx_match4_rcu(dst->dev, prop); + rc = smc_clc_prfx_match4_rcu(dev, prop); else - rc = smc_clc_prfx_match6_rcu(dst->dev, prop); - rcu_read_unlock(); -out_rel: - dst_release(dst); + rc = smc_clc_prfx_match6_rcu(dev, prop); out: + rcu_read_unlock(); + return rc; } @@ -889,6 +890,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) return SMC_CLC_DECL_CNFERR; } pclc_base->hdr.typev1 = SMC_TYPE_N; + ini->smc_type_v1 = SMC_TYPE_N; } else { pclc_base->iparea_offset = htons(sizeof(*pclc_smcd)); plen += sizeof(*pclc_prfx) + @@ -915,7 +917,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) /* add SMC-D specifics */ if (ini->ism_dev[0]) { smcd = ini->ism_dev[0]; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); pclc_smcd->ism.gid = htonll(smcd_gid.gid); pclc_smcd->ism.chid = htons(smc_ism_get_chid(ini->ism_dev[0])); @@ -965,7 +967,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) if (ini->ism_offered_cnt) { for (i = 1; i <= ini->ism_offered_cnt; i++) { smcd = ini->ism_dev[i]; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); gidchids[entry].chid = htons(smc_ism_get_chid(ini->ism_dev[i])); gidchids[entry].gid = htonll(smcd_gid.gid); @@ -1058,7 +1060,7 @@ smcd_clc_prep_confirm_accept(struct smc_connection *conn, /* SMC-D specific settings */ memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); clc->hdr.typev1 = SMC_TYPE_D; clc->d0.gid = htonll(smcd_gid.gid); clc->d0.token = htonll(conn->rmb_desc->token); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 262746e304dd..e4eabc83719e 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -85,7 +85,7 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) * otherwise there is a risk of out-of-sync link groups. */ if (!lgr->freeing) { - mod_delayed_work(system_wq, &lgr->free_work, + mod_delayed_work(system_percpu_wq, &lgr->free_work, (!lgr->is_smcd && lgr->role == SMC_CLNT) ? 
SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); @@ -555,7 +555,7 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) goto errattr; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, smcd_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; @@ -810,6 +810,8 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->clearing = 0; lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu; lnk->link_id = smcr_next_link_id(lgr); + lnk->max_send_wr = lgr->max_send_wr; + lnk->max_recv_wr = lgr->max_recv_wr; lnk->lgr = lgr; smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ lnk->link_idx = link_idx; @@ -836,27 +838,39 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, rc = smc_llc_link_init(lnk); if (rc) goto out; - rc = smc_wr_alloc_link_mem(lnk); - if (rc) - goto clear_llc_lnk; rc = smc_ib_create_protection_domain(lnk); if (rc) - goto free_link_mem; - rc = smc_ib_create_queue_pair(lnk); - if (rc) - goto dealloc_pd; + goto clear_llc_lnk; + do { + rc = smc_ib_create_queue_pair(lnk); + if (rc) + goto dealloc_pd; + rc = smc_wr_alloc_link_mem(lnk); + if (!rc) + break; + else if (rc != -ENOMEM) /* give up */ + goto destroy_qp; + /* retry with smaller ... */ + lnk->max_send_wr /= 2; + lnk->max_recv_wr /= 2; + /* ... unless droping below old SMC_WR_BUF_SIZE */ + if (lnk->max_send_wr < 16 || lnk->max_recv_wr < 48) + goto destroy_qp; + smc_ib_destroy_queue_pair(lnk); + } while (1); + rc = smc_wr_create_link(lnk); if (rc) - goto destroy_qp; + goto free_link_mem; lnk->state = SMC_LNK_ACTIVATING; return 0; +free_link_mem: + smc_wr_free_link_mem(lnk); destroy_qp: smc_ib_destroy_queue_pair(lnk); dealloc_pd: smc_ib_dealloc_protection_domain(lnk); -free_link_mem: - smc_wr_free_link_mem(lnk); clear_llc_lnk: smc_llc_link_clear(lnk, false); out: @@ -896,7 +910,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) rc = SMC_CLC_DECL_MEM; goto ism_put_vlan; } - lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0, + lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", WQ_PERCPU, 0, SMC_LGR_ID_SIZE, &lgr->id); if (!lgr->tx_wq) { rc = -ENOMEM; @@ -924,7 +938,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) if (ini->is_smcd) { /* SMC-D specific settings */ smcd = ini->ism_dev[ini->ism_selected]; - get_device(smcd->ops->get_dev(smcd)); + get_device(&smcd->dibs->dev); lgr->peer_gid.gid = ini->ism_peer_gid[ini->ism_selected].gid; lgr->peer_gid.gid_ext = @@ -1474,7 +1488,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - put_device(lgr->smcd->ops->get_dev(lgr->smcd)); + put_device(&lgr->smcd->dibs->dev); } smc_lgr_put(lgr); /* theoretically last lgr_put */ } @@ -1883,35 +1897,32 @@ static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, /* Determine vlan of internal TCP socket. */ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) { - struct dst_entry *dst = sk_dst_get(clcsock->sk); struct netdev_nested_priv priv; struct net_device *ndev; + struct dst_entry *dst; int rc = 0; ini->vlan_id = 0; - if (!dst) { - rc = -ENOTCONN; - goto out; - } - if (!dst->dev) { + + rcu_read_lock(); + + dst = __sk_dst_get(clcsock->sk); + ndev = dst ? 
dst_dev_rcu(dst) : NULL; + if (!ndev) { rc = -ENODEV; - goto out_rel; + goto out; } - ndev = dst->dev; if (is_vlan_dev(ndev)) { ini->vlan_id = vlan_dev_vlan_id(ndev); - goto out_rel; + goto out; } priv.data = (void *)&ini->vlan_id; - rtnl_lock(); - netdev_walk_all_lower_dev(ndev, smc_vlan_by_tcpsk_walk, &priv); - rtnl_unlock(); - -out_rel: - dst_release(dst); + netdev_walk_all_lower_dev_rcu(ndev, smc_vlan_by_tcpsk_walk, &priv); out: + rcu_read_unlock(); + return rc; } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 48a1b1dcb576..5c18f08a4c8a 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -13,6 +13,7 @@ #define _SMC_CORE_H #include <linux/atomic.h> +#include <linux/types.h> #include <linux/smc.h> #include <linux/pci.h> #include <rdma/ib_verbs.h> @@ -33,6 +34,8 @@ * distributions may modify it to a value between * 16-255 as needed. */ +#define SMCR_MAX_SEND_WR_DEF 16 /* Default number of work requests per send queue */ +#define SMCR_MAX_RECV_WR_DEF 48 /* Default number of work requests per recv queue */ struct smc_lgr_list { /* list of link group definition */ struct list_head list; @@ -172,6 +175,8 @@ struct smc_link { struct completion llc_testlink_resp; /* wait for rx of testlink */ int llc_testlink_time; /* testlink interval */ atomic_t conn_cnt; /* connections on this link */ + u16 max_send_wr; + u16 max_recv_wr; }; /* For now we just allow one parallel link per link group. The SMC protocol @@ -221,6 +226,10 @@ struct smc_buf_desc { /* virtually contiguous */ }; struct { /* SMC-D */ + /* SMC-D tx buffer */ + bool is_attached; + /* no need for explicit writes */ + /* SMC-D rx buffer: */ unsigned short sba_idx; /* SBA index number */ u64 token; @@ -361,6 +370,10 @@ struct smc_link_group { /* max conn can be assigned to lgr */ u8 max_links; /* max links can be added in lgr */ + u16 max_send_wr; + /* number of WR buffers on send */ + u16 max_recv_wr; + /* number of WR buffers on recv */ }; struct { /* SMC-D */ struct smcd_gid peer_gid; diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 8ed2f6689b01..bf0beaa23bdb 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -175,7 +175,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, dinfo.linkid = *((u32 *)conn->lgr->id); dinfo.peer_gid = conn->lgr->peer_gid.gid; dinfo.peer_gid_ext = conn->lgr->peer_gid.gid_ext; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); dinfo.my_gid = smcd_gid.gid; dinfo.my_gid_ext = smcd_gid.gid_ext; dinfo.token = conn->rmb_desc->token; diff --git a/net/smc/smc_hs_bpf.c b/net/smc/smc_hs_bpf.c new file mode 100644 index 000000000000..063d23d85850 --- /dev/null +++ b/net/smc/smc_hs_bpf.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Generic hook for SMC handshake flow. + * + * Copyright IBM Corp. 2016 + * Copyright (c) 2025, Alibaba Inc. + * + * Author: D. 
Wythe <alibuda@linux.alibaba.com> + */ + +#include <linux/bpf_verifier.h> +#include <linux/bpf.h> +#include <linux/btf.h> +#include <linux/rculist.h> + +#include "smc_hs_bpf.h" + +static DEFINE_SPINLOCK(smc_hs_ctrl_list_lock); +static LIST_HEAD(smc_hs_ctrl_list); + +static int smc_hs_ctrl_reg(struct smc_hs_ctrl *ctrl) +{ + int ret = 0; + + spin_lock(&smc_hs_ctrl_list_lock); + /* already exist or duplicate name */ + if (smc_hs_ctrl_find_by_name(ctrl->name)) + ret = -EEXIST; + else + list_add_tail_rcu(&ctrl->list, &smc_hs_ctrl_list); + spin_unlock(&smc_hs_ctrl_list_lock); + return ret; +} + +static void smc_hs_ctrl_unreg(struct smc_hs_ctrl *ctrl) +{ + spin_lock(&smc_hs_ctrl_list_lock); + list_del_rcu(&ctrl->list); + spin_unlock(&smc_hs_ctrl_list_lock); + + /* Ensure that all readers to complete */ + synchronize_rcu(); +} + +struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name) +{ + struct smc_hs_ctrl *ctrl; + + list_for_each_entry_rcu(ctrl, &smc_hs_ctrl_list, list) { + if (strcmp(ctrl->name, name) == 0) + return ctrl; + } + return NULL; +} + +static int __smc_bpf_stub_set_tcp_option(struct tcp_sock *tp) { return 1; } +static int __smc_bpf_stub_set_tcp_option_cond(const struct tcp_sock *tp, + struct inet_request_sock *ireq) +{ + return 1; +} + +static struct smc_hs_ctrl __smc_bpf_hs_ctrl = { + .syn_option = __smc_bpf_stub_set_tcp_option, + .synack_option = __smc_bpf_stub_set_tcp_option_cond, +}; + +static int smc_bpf_hs_ctrl_init(struct btf *btf) { return 0; } + +static int smc_bpf_hs_ctrl_reg(void *kdata, struct bpf_link *link) +{ + if (link) + return -EOPNOTSUPP; + + return smc_hs_ctrl_reg(kdata); +} + +static void smc_bpf_hs_ctrl_unreg(void *kdata, struct bpf_link *link) +{ + smc_hs_ctrl_unreg(kdata); +} + +static int smc_bpf_hs_ctrl_init_member(const struct btf_type *t, + const struct btf_member *member, + void *kdata, const void *udata) +{ + const struct smc_hs_ctrl *u_ctrl; + struct smc_hs_ctrl *k_ctrl; + u32 moff; + + u_ctrl = (const struct smc_hs_ctrl *)udata; + k_ctrl = (struct smc_hs_ctrl *)kdata; + + moff = __btf_member_bit_offset(t, member) / 8; + switch (moff) { + case offsetof(struct smc_hs_ctrl, name): + if (bpf_obj_name_cpy(k_ctrl->name, u_ctrl->name, + sizeof(u_ctrl->name)) <= 0) + return -EINVAL; + return 1; + case offsetof(struct smc_hs_ctrl, flags): + if (u_ctrl->flags & ~SMC_HS_CTRL_ALL_FLAGS) + return -EINVAL; + k_ctrl->flags = u_ctrl->flags; + return 1; + default: + break; + } + + return 0; +} + +static const struct bpf_func_proto * +bpf_smc_hs_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + return bpf_base_func_proto(func_id, prog); +} + +static const struct bpf_verifier_ops smc_bpf_verifier_ops = { + .get_func_proto = bpf_smc_hs_func_proto, + .is_valid_access = bpf_tracing_btf_ctx_access, +}; + +static struct bpf_struct_ops bpf_smc_hs_ctrl_ops = { + .name = "smc_hs_ctrl", + .init = smc_bpf_hs_ctrl_init, + .reg = smc_bpf_hs_ctrl_reg, + .unreg = smc_bpf_hs_ctrl_unreg, + .cfi_stubs = &__smc_bpf_hs_ctrl, + .verifier_ops = &smc_bpf_verifier_ops, + .init_member = smc_bpf_hs_ctrl_init_member, + .owner = THIS_MODULE, +}; + +int bpf_smc_hs_ctrl_init(void) +{ + return register_bpf_struct_ops(&bpf_smc_hs_ctrl_ops, smc_hs_ctrl); +} diff --git a/net/smc/smc_hs_bpf.h b/net/smc/smc_hs_bpf.h new file mode 100644 index 000000000000..f5f1807c079e --- /dev/null +++ b/net/smc/smc_hs_bpf.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Shared Memory Communications over RDMA (SMC-R) and RoCE + * + * Generic hook for SMC handshake flow. 
+ * + * Copyright IBM Corp. 2016 + * Copyright (c) 2025, Alibaba Inc. + * + * Author: D. Wythe <alibuda@linux.alibaba.com> + */ + +#ifndef __SMC_HS_CTRL +#define __SMC_HS_CTRL + +#include <net/smc.h> + +/* Find hs_ctrl by the target name, which required to be a c-string. + * Return NULL if no such ctrl was found,otherwise, return a valid ctrl. + * + * Note: Caller MUST ensure it's was invoked under rcu_read_lock. + */ +struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name); + +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) +int bpf_smc_hs_ctrl_init(void); +#else +static inline int bpf_smc_hs_ctrl_init(void) { return 0; } +#endif /* CONFIG_SMC_HS_CTRL_BPF */ + +#endif /* __SMC_HS_CTRL */ diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index 53828833a3f7..1154907c5c05 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -669,11 +669,6 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .recv_cq = lnk->smcibdev->roce_cq_recv, .srq = NULL, .cap = { - /* include unsolicited rdma_writes as well, - * there are max. 2 RDMA_WRITE per 1 WR_SEND - */ - .max_send_wr = SMC_WR_BUF_CNT * 3, - .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = lnk->wr_rx_sge_cnt, .max_inline_data = 0, @@ -683,6 +678,11 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) }; int rc; + /* include unsolicited rdma_writes as well, + * there are max. 2 RDMA_WRITE per 1 WR_SEND + */ + qp_attr.cap.max_send_wr = 3 * lnk->lgr->max_send_wr; + qp_attr.cap.max_recv_wr = lnk->lgr->max_recv_wr; lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr); rc = PTR_ERR_OR_ZERO(lnk->roce_qp); if (IS_ERR(lnk->roce_qp)) @@ -742,6 +742,9 @@ bool smc_ib_is_sg_need_sync(struct smc_link *lnk, unsigned int i; bool ret = false; + if (!lnk->smcibdev->ibdev->dma_device) + return ret; + /* for now there is just one DMA address */ for_each_sg(buf_slot->sgt[lnk->link_idx].sgl, sg, buf_slot->sgt[lnk->link_idx].nents, i) { @@ -971,13 +974,17 @@ static int smc_ib_add_dev(struct ib_device *ibdev) smcibdev->pnetid[i])) smc_pnetid_by_table_ib(smcibdev, i + 1); smc_copy_netdev_ifindex(smcibdev, i); - pr_warn_ratelimited("smc: ib device %s port %d has pnetid " - "%.16s%s\n", - smcibdev->ibdev->name, i + 1, - smcibdev->pnetid[i], - smcibdev->pnetid_by_user[i] ? - " (user defined)" : - ""); + if (smc_pnet_is_pnetid_set(smcibdev->pnetid[i])) + pr_warn_ratelimited("smc: ib device %s port %d has pnetid %.16s%s\n", + smcibdev->ibdev->name, i + 1, + smcibdev->pnetid[i], + smcibdev->pnetid_by_user[i] ? + " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: ib device %s port %d has no pnetid\n", + smcibdev->ibdev->name, i + 1); + } schedule_work(&smcibdev->port_event_work); return 0; diff --git a/net/smc/smc_inet.c b/net/smc/smc_inet.c index a944e7dcb8b9..a94084b4a498 100644 --- a/net/smc/smc_inet.c +++ b/net/smc/smc_inet.c @@ -56,7 +56,6 @@ static struct inet_protosw smc_inet_protosw = { .protocol = IPPROTO_SMC, .prot = &smc_inet_prot, .ops = &smc_inet_stream_ops, - .flags = INET_PROTOSW_ICSK, }; #if IS_ENABLED(CONFIG_IPV6) @@ -104,27 +103,15 @@ static struct inet_protosw smc_inet6_protosw = { .protocol = IPPROTO_SMC, .prot = &smc_inet6_prot, .ops = &smc_inet6_stream_ops, - .flags = INET_PROTOSW_ICSK, }; #endif /* CONFIG_IPV6 */ -static unsigned int smc_sync_mss(struct sock *sk, u32 pmtu) -{ - /* No need pass it through to clcsock, mss can always be set by - * sock_create_kern or smc_setsockopt. 
- */ - return 0; -} - static int smc_inet_init_sock(struct sock *sk) { struct net *net = sock_net(sk); /* init common smc sock */ smc_sk_init(net, sk, IPPROTO_SMC); - - inet_csk(sk)->icsk_sync_mss = smc_sync_mss; - /* create clcsock */ return smc_create_clcsk(net, sk, sk->sk_family); } diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 84f98e18c7db..7b228ca2f96a 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -17,7 +17,7 @@ #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" -#include "linux/ism.h" +#include "linux/dibs.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), @@ -27,21 +27,24 @@ struct smcd_dev_list smcd_dev_list = { static bool smc_ism_v2_capable; static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; -#if IS_ENABLED(CONFIG_ISM) -static void smcd_register_dev(struct ism_dev *ism); -static void smcd_unregister_dev(struct ism_dev *ism); -static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); -static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, +static void smcd_register_dev(struct dibs_dev *dibs); +static void smcd_unregister_dev(struct dibs_dev *dibs); +static void smcd_handle_event(struct dibs_dev *dibs, + const struct dibs_event *event); +static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, u16 dmbemask); -static struct ism_client smc_ism_client = { - .name = "SMC-D", - .add = smcd_register_dev, - .remove = smcd_unregister_dev, +static struct dibs_client_ops smc_client_ops = { + .add_dev = smcd_register_dev, + .del_dev = smcd_unregister_dev, .handle_event = smcd_handle_event, .handle_irq = smcd_handle_irq, }; -#endif + +static struct dibs_client smc_dibs_client = { + .name = "SMC-D", + .ops = &smc_client_ops, +}; static void smc_ism_create_system_eid(void) { @@ -68,8 +71,12 @@ static void smc_ism_create_system_eid(void) int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) { - return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, - vlan_id); + struct dibs_dev *dibs = smcd->dibs; + uuid_t ism_rgid; + + copy_to_dibsgid(&ism_rgid, peer_gid); + return dibs->ops->query_remote_gid(dibs, &ism_rgid, vlan_id ? 1 : 0, + vlan_id); } void smc_ism_get_system_eid(u8 **eid) @@ -82,7 +89,7 @@ void smc_ism_get_system_eid(u8 **eid) u16 smc_ism_get_chid(struct smcd_dev *smcd) { - return smcd->ops->get_chid(smcd); + return smcd->dibs->ops->get_fabric_id(smcd->dibs); } /* HW supports ISM V2 and thus System EID is defined */ @@ -131,7 +138,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; - if (!smcd->ops->add_vlan_id) + if (!smcd->dibs->ops->add_vlan_id) return -EOPNOTSUPP; /* create new vlan entry, in case we need it */ @@ -154,7 +161,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) /* no existing entry found. 
* add new entry to device; might fail, e.g., if HW limit reached */ - if (smcd->ops->add_vlan_id(smcd, vlanid)) { + if (smcd->dibs->ops->add_vlan_id(smcd->dibs, vlanid)) { kfree(new_vlan); rc = -EIO; goto out; @@ -178,7 +185,7 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; - if (!smcd->ops->del_vlan_id) + if (!smcd->dibs->ops->del_vlan_id) return -EOPNOTSUPP; spin_lock_irqsave(&smcd->lock, flags); @@ -196,7 +203,7 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) } /* Found and the last reference just gone */ - if (smcd->ops->del_vlan_id(smcd, vlanid)) + if (smcd->dibs->ops->del_vlan_id(smcd->dibs, vlanid)) rc = -EIO; list_del(&vlan->list); kfree(vlan); @@ -205,43 +212,43 @@ out: return rc; } -int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) +void smc_ism_unregister_dmb(struct smcd_dev *smcd, + struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; - int rc = 0; + struct dibs_dmb dmb; if (!dmb_desc->dma_addr) - return rc; + return; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; - dmb.sba_idx = dmb_desc->sba_idx; + dmb.idx = dmb_desc->sba_idx; dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; - rc = smcd->ops->unregister_dmb(smcd, &dmb); - if (!rc || rc == ISM_ERROR) { - dmb_desc->cpu_addr = NULL; - dmb_desc->dma_addr = 0; - } - return rc; + smcd->dibs->ops->unregister_dmb(smcd->dibs, &dmb); + + return; } int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dev *dibs; + struct dibs_dmb dmb; int rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_len = dmb_len; - dmb.sba_idx = dmb_desc->sba_idx; + dmb.idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; - dmb.rgid = lgr->peer_gid.gid; - rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, lgr->smcd->client); + copy_to_dibsgid(&dmb.rgid, &lgr->peer_gid); + + dibs = lgr->smcd->dibs; + rc = dibs->ops->register_dmb(dibs, &dmb, &smc_dibs_client); if (!rc) { - dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->sba_idx = dmb.idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; @@ -256,38 +263,39 @@ bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd) * merging sndbuf with peer DMB to avoid * data copies between them. 
*/ - return (smcd->ops->support_dmb_nocopy && - smcd->ops->support_dmb_nocopy(smcd)); + return (smcd->dibs->ops->support_mmapped_rdmb && + smcd->dibs->ops->support_mmapped_rdmb(smcd->dibs)); } int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dmb dmb; int rc = 0; - if (!dev->ops->attach_dmb) + if (!dev->dibs->ops->attach_dmb) return -EINVAL; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = token; - rc = dev->ops->attach_dmb(dev, &dmb); + rc = dev->dibs->ops->attach_dmb(dev->dibs, &dmb); if (!rc) { - dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->sba_idx = dmb.idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; dmb_desc->len = dmb.dmb_len; + dmb_desc->is_attached = true; } return rc; } int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token) { - if (!dev->ops->detach_dmb) + if (!dev->dibs->ops->detach_dmb) return -EINVAL; - return dev->ops->detach_dmb(dev, token); + return dev->dibs->ops->detach_dmb(dev->dibs, token); } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, @@ -297,12 +305,12 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, char smc_pnet[SMC_MAX_PNETID_LEN + 1]; struct smc_pci_dev smc_pci_dev; struct nlattr *port_attrs; + struct dibs_dev *dibs; struct nlattr *attrs; - struct ism_dev *ism; int use_cnt = 0; void *nlh; - ism = smcd->priv; + dibs = smcd->dibs; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_DEV_SMCD); @@ -317,7 +325,7 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - smc_set_pci_values(to_pci_dev(ism->dev.parent), &smc_pci_dev); + smc_set_pci_values(to_pci_dev(dibs->dev.parent), &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) @@ -367,7 +375,7 @@ static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, list_for_each_entry(smcd, &dev_list->list, list) { if (num < snum) goto next; - if (smc_ism_is_loopback(smcd)) + if (smc_ism_is_loopback(smcd->dibs)) goto next; if (smc_nl_handle_smcd_dev(smcd, skb, cb)) goto errout; @@ -385,11 +393,10 @@ int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -#if IS_ENABLED(CONFIG_ISM) struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; - struct ism_event event; + struct dibs_event event; }; #define ISM_EVENT_REQUEST 0x0001 @@ -409,25 +416,27 @@ union smcd_sw_event_info { static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) { - struct smcd_gid peer_gid = { .gid = wrk->event.tok, - .gid_ext = 0 }; + struct dibs_dev *dibs = wrk->smcd->dibs; union smcd_sw_event_info ev_info; + struct smcd_gid peer_gid; + uuid_t ism_rgid; - ev_info.info = wrk->event.info; - switch (wrk->event.code) { + copy_to_smcdgid(&peer_gid, &wrk->event.gid); + ev_info.info = wrk->event.data; + switch (wrk->event.subtype) { case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */ smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id); break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ if (ev_info.code == ISM_EVENT_REQUEST && - wrk->smcd->ops->signal_event) { + dibs->ops->signal_event) { ev_info.code = ISM_EVENT_RESPONSE; - wrk->smcd->ops->signal_event(wrk->smcd, - &peer_gid, - ISM_EVENT_REQUEST_IR, - ISM_EVENT_CODE_TESTLINK, - 
ev_info.info); - } + copy_to_dibsgid(&ism_rgid, &peer_gid); + dibs->ops->signal_event(dibs, &ism_rgid, + ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_TESTLINK, + ev_info.info); + } break; } } @@ -437,41 +446,39 @@ static void smc_ism_event_work(struct work_struct *work) { struct smc_ism_event_work *wrk = container_of(work, struct smc_ism_event_work, work); - struct smcd_gid smcd_gid = { .gid = wrk->event.tok, - .gid_ext = 0 }; + struct smcd_gid smcd_gid; + + copy_to_smcdgid(&smcd_gid, &wrk->event.gid); switch (wrk->event.type) { - case ISM_EVENT_GID: /* GID event, token is peer GID */ + case DIBS_DEV_EVENT: /* GID event, token is peer GID */ smc_smcd_terminate(wrk->smcd, &smcd_gid, VLAN_VID_MASK); break; - case ISM_EVENT_DMB: + case DIBS_BUF_EVENT: break; - case ISM_EVENT_SWR: /* Software defined event */ + case DIBS_SW_EVENT: /* Software defined event */ smcd_handle_sw_event(wrk); break; } kfree(wrk); } -static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, - const struct smcd_ops *ops, int max_dmbs) +static struct smcd_dev *smcd_alloc_dev(const char *name, int max_dmbs) { struct smcd_dev *smcd; - smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL); + smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); if (!smcd) return NULL; - smcd->conn = devm_kcalloc(parent, max_dmbs, - sizeof(struct smc_connection *), GFP_KERNEL); + smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), + GFP_KERNEL); if (!smcd->conn) - return NULL; + goto free_smcd; smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); if (!smcd->event_wq) - return NULL; - - smcd->ops = ops; + goto free_conn; spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); @@ -479,28 +486,37 @@ static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, INIT_LIST_HEAD(&smcd->lgr_list); init_waitqueue_head(&smcd->lgrs_deleted); return smcd; + +free_conn: + kfree(smcd->conn); +free_smcd: + kfree(smcd); + return NULL; } -static void smcd_register_dev(struct ism_dev *ism) +static void smcd_register_dev(struct dibs_dev *dibs) { - const struct smcd_ops *ops = ism_get_smcd_ops(); struct smcd_dev *smcd, *fentry; + int max_dmbs; - if (!ops) - return; + max_dmbs = dibs->ops->max_dmbs(); - smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops, - ISM_NR_DMBS); + smcd = smcd_alloc_dev(dev_name(&dibs->dev), max_dmbs); if (!smcd) return; - smcd->priv = ism; - smcd->client = &smc_ism_client; - ism_set_priv(ism, &smc_ism_client, smcd); - if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) + + smcd->dibs = dibs; + dibs_set_priv(dibs, &smc_dibs_client, smcd); + + if (smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); - if (smcd->ops->supports_v2()) + if (smc_ism_is_loopback(dibs) || + (dibs->ops->add_vlan_id && + !dibs->ops->add_vlan_id(dibs, ISM_RESERVED_VLANID))) { smc_ism_set_v2_capable(); + } + mutex_lock(&smcd_dev_list.mutex); /* sort list: * - devices without pnetid before devices with pnetid; @@ -509,7 +525,7 @@ static void smcd_register_dev(struct ism_dev *ism) if (!smcd->pnetid[0]) { fentry = list_first_entry_or_null(&smcd_dev_list.list, struct smcd_dev, list); - if (fentry && smc_ism_is_loopback(fentry)) + if (fentry && smc_ism_is_loopback(fentry->dibs)) list_add(&smcd->list, &fentry->list); else list_add(&smcd->list, &smcd_dev_list.list); @@ -518,25 +534,32 @@ static void smcd_register_dev(struct ism_dev *ism) } mutex_unlock(&smcd_dev_list.mutex); - pr_warn_ratelimited("smc: adding smcd device %s with pnetid 
%.16s%s\n", - dev_name(&ism->dev), smcd->pnetid, - smcd->pnetid_by_user ? " (user defined)" : ""); - + if (smc_pnet_is_pnetid_set(smcd->pnetid)) + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&dibs->dev), smcd->pnetid, + smcd->pnetid_by_user ? + " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", + dev_name(&dibs->dev)); return; } -static void smcd_unregister_dev(struct ism_dev *ism) +static void smcd_unregister_dev(struct dibs_dev *dibs) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ism->dev)); + dev_name(&dibs->dev)); smcd->going_away = 1; smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); destroy_workqueue(smcd->event_wq); + kfree(smcd->conn); + kfree(smcd); } /* SMCD Device event handler. Called from ISM device interrupt handler. @@ -550,9 +573,10 @@ static void smcd_unregister_dev(struct ism_dev *ism) * Context: * - Function called in IRQ context from ISM device driver event handler. */ -static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) +static void smcd_handle_event(struct dibs_dev *dibs, + const struct dibs_event *event) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); struct smc_ism_event_work *wrk; if (smcd->going_away) @@ -574,10 +598,10 @@ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) * Context: * - Function called in IRQ context from ISM device driver IRQ handler. */ -static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, +static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, u16 dmbemask) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); struct smc_connection *conn = NULL; unsigned long flags; @@ -587,27 +611,26 @@ static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } -#endif int smc_ism_signal_shutdown(struct smc_link_group *lgr) { int rc = 0; -#if IS_ENABLED(CONFIG_ISM) union smcd_sw_event_info ev_info; + uuid_t ism_rgid; if (lgr->peer_shutdown) return 0; - if (!lgr->smcd->ops->signal_event) + if (!lgr->smcd->dibs->ops->signal_event) return 0; memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; ev_info.code = ISM_EVENT_REQUEST; - rc = lgr->smcd->ops->signal_event(lgr->smcd, &lgr->peer_gid, + copy_to_dibsgid(&ism_rgid, &lgr->peer_gid); + rc = lgr->smcd->dibs->ops->signal_event(lgr->smcd->dibs, &ism_rgid, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_SHUTDOWN, ev_info.info); -#endif return rc; } @@ -618,15 +641,11 @@ int smc_ism_init(void) smc_ism_v2_capable = false; smc_ism_create_system_eid(); -#if IS_ENABLED(CONFIG_ISM) - rc = ism_register_client(&smc_ism_client); -#endif + rc = dibs_register_client(&smc_dibs_client); return rc; } void smc_ism_exit(void) { -#if IS_ENABLED(CONFIG_ISM) - ism_unregister_client(&smc_ism_client); -#endif + dibs_unregister_client(&smc_dibs_client); } diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 6763133dd8d0..a1575e31df73 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -12,6 +12,7 @@ #include <linux/uio.h> #include <linux/types.h> #include <linux/mutex.h> 
+#include <linux/dibs.h> #include "smc.h" @@ -47,7 +48,8 @@ int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, struct smc_buf_desc *dmb_desc); -int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); +void smc_ism_unregister_dmb(struct smcd_dev *dev, + struct smc_buf_desc *dmb_desc); bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd); int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, struct smc_buf_desc *dmb_desc); @@ -67,7 +69,9 @@ static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, { int rc; - rc = smcd->ops->move_data(smcd, dmb_tok, idx, sf, offset, data, len); + rc = smcd->dibs->ops->move_data(smcd->dibs, dmb_tok, idx, sf, offset, + data, len); + return rc < 0 ? rc : 0; } @@ -84,14 +88,36 @@ static inline bool __smc_ism_is_emulated(u16 chid) static inline bool smc_ism_is_emulated(struct smcd_dev *smcd) { - u16 chid = smcd->ops->get_chid(smcd); + u16 chid = smcd->dibs->ops->get_fabric_id(smcd->dibs); return __smc_ism_is_emulated(chid); } -static inline bool smc_ism_is_loopback(struct smcd_dev *smcd) +static inline bool smc_ism_is_loopback(struct dibs_dev *dibs) { - return (smcd->ops->get_chid(smcd) == 0xFFFF); + return (dibs->ops->get_fabric_id(dibs) == DIBS_LOOPBACK_FABRIC); +} + +static inline void copy_to_smcdgid(struct smcd_gid *sgid, uuid_t *dibs_gid) +{ + __be64 temp; + + memcpy(&temp, dibs_gid, sizeof(sgid->gid)); + sgid->gid = ntohll(temp); + memcpy(&temp, (uint8_t *)dibs_gid + sizeof(sgid->gid), + sizeof(sgid->gid_ext)); + sgid->gid_ext = ntohll(temp); +} + +static inline void copy_to_dibsgid(uuid_t *dibs_gid, struct smcd_gid *sgid) +{ + __be64 temp; + + temp = htonll(sgid->gid); + memcpy(dibs_gid, &temp, sizeof(sgid->gid)); + temp = htonll(sgid->gid_ext); + memcpy((uint8_t *)dibs_gid + sizeof(sgid->gid), &temp, + sizeof(sgid->gid_ext)); } #endif diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index f865c58c3aa7..f5d5eb617526 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -2157,6 +2157,8 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) init_waitqueue_head(&lgr->llc_msg_waiter); init_rwsem(&lgr->llc_conf_mutex); lgr->llc_testlink_time = READ_ONCE(net->smc.sysctl_smcr_testlink_time); + lgr->max_send_wr = (u16)(READ_ONCE(net->smc.sysctl_smcr_max_send_wr)); + lgr->max_recv_wr = (u16)(READ_ONCE(net->smc.sysctl_smcr_max_recv_wr)); } /* called after lgr was removed from lgr_list */ diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c deleted file mode 100644 index 0eb00bbefd17..000000000000 --- a/net/smc/smc_loopback.c +++ /dev/null @@ -1,421 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Shared Memory Communications Direct over loopback-ism device. - * - * Functions for loopback-ism device. - * - * Copyright (c) 2024, Alibaba Inc. 
- * - * Author: Wen Gu <guwen@linux.alibaba.com> - * Tony Lu <tonylu@linux.alibaba.com> - * - */ - -#include <linux/device.h> -#include <linux/types.h> -#include <net/smc.h> - -#include "smc_cdc.h" -#include "smc_ism.h" -#include "smc_loopback.h" - -#define SMC_LO_V2_CAPABLE 0x1 /* loopback-ism acts as ISMv2 */ -#define SMC_LO_SUPPORT_NOCOPY 0x1 -#define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) - -static const char smc_lo_dev_name[] = "loopback-ism"; -static struct smc_lo_dev *lo_dev; - -static void smc_lo_generate_ids(struct smc_lo_dev *ldev) -{ - struct smcd_gid *lgid = &ldev->local_gid; - uuid_t uuid; - - uuid_gen(&uuid); - memcpy(&lgid->gid, &uuid, sizeof(lgid->gid)); - memcpy(&lgid->gid_ext, (u8 *)&uuid + sizeof(lgid->gid), - sizeof(lgid->gid_ext)); - - ldev->chid = SMC_LO_RESERVED_CHID; -} - -static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, - u32 vid_valid, u32 vid) -{ - struct smc_lo_dev *ldev = smcd->priv; - - /* rgid should be the same as lgid */ - if (!ldev || rgid->gid != ldev->local_gid.gid || - rgid->gid_ext != ldev->local_gid.gid_ext) - return -ENETUNREACH; - return 0; -} - -static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, - void *client_priv) -{ - struct smc_lo_dmb_node *dmb_node, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - int sba_idx, rc; - - /* check space for new dmb */ - for_each_clear_bit(sba_idx, ldev->sba_idx_mask, SMC_LO_MAX_DMBS) { - if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask)) - break; - } - if (sba_idx == SMC_LO_MAX_DMBS) - return -ENOSPC; - - dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL); - if (!dmb_node) { - rc = -ENOMEM; - goto err_bit; - } - - dmb_node->sba_idx = sba_idx; - dmb_node->len = dmb->dmb_len; - dmb_node->cpu_addr = kzalloc(dmb_node->len, GFP_KERNEL | - __GFP_NOWARN | __GFP_NORETRY | - __GFP_NOMEMALLOC); - if (!dmb_node->cpu_addr) { - rc = -ENOMEM; - goto err_node; - } - dmb_node->dma_addr = SMC_DMA_ADDR_INVALID; - refcount_set(&dmb_node->refcnt, 1); - -again: - /* add new dmb into hash table */ - get_random_bytes(&dmb_node->token, sizeof(dmb_node->token)); - write_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_node->token) { - if (tmp_node->token == dmb_node->token) { - write_unlock_bh(&ldev->dmb_ht_lock); - goto again; - } - } - hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token); - write_unlock_bh(&ldev->dmb_ht_lock); - atomic_inc(&ldev->dmb_cnt); - - dmb->sba_idx = dmb_node->sba_idx; - dmb->dmb_tok = dmb_node->token; - dmb->cpu_addr = dmb_node->cpu_addr; - dmb->dma_addr = dmb_node->dma_addr; - dmb->dmb_len = dmb_node->len; - - return 0; - -err_node: - kfree(dmb_node); -err_bit: - clear_bit(sba_idx, ldev->sba_idx_mask); - return rc; -} - -static void __smc_lo_unregister_dmb(struct smc_lo_dev *ldev, - struct smc_lo_dmb_node *dmb_node) -{ - /* remove dmb from hash table */ - write_lock_bh(&ldev->dmb_ht_lock); - hash_del(&dmb_node->list); - write_unlock_bh(&ldev->dmb_ht_lock); - - clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask); - kvfree(dmb_node->cpu_addr); - kfree(dmb_node); - - if (atomic_dec_and_test(&ldev->dmb_cnt)) - wake_up(&ldev->ldev_release); -} - -static int smc_lo_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb from hash table */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { - if (tmp_node->token == dmb->dmb_tok) { - dmb_node = tmp_node; - 
break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (refcount_dec_and_test(&dmb_node->refcnt)) - __smc_lo_unregister_dmb(ldev, dmb_node); - return 0; -} - -static int smc_lo_support_dmb_nocopy(struct smcd_dev *smcd) -{ - return SMC_LO_SUPPORT_NOCOPY; -} - -static int smc_lo_attach_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb_node according to dmb->dmb_tok */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { - if (tmp_node->token == dmb->dmb_tok) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (!refcount_inc_not_zero(&dmb_node->refcnt)) - /* the dmb is being unregistered, but has - * not been removed from the hash table. - */ - return -EINVAL; - - /* provide dmb information */ - dmb->sba_idx = dmb_node->sba_idx; - dmb->dmb_tok = dmb_node->token; - dmb->cpu_addr = dmb_node->cpu_addr; - dmb->dma_addr = dmb_node->dma_addr; - dmb->dmb_len = dmb_node->len; - return 0; -} - -static int smc_lo_detach_dmb(struct smcd_dev *smcd, u64 token) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb_node according to dmb->dmb_tok */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, token) { - if (tmp_node->token == token) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (refcount_dec_and_test(&dmb_node->refcnt)) - __smc_lo_unregister_dmb(ldev, dmb_node); - return 0; -} - -static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, - unsigned int idx, bool sf, unsigned int offset, - void *data, unsigned int size) -{ - struct smc_lo_dmb_node *rmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - struct smc_connection *conn; - - if (!sf) - /* since sndbuf is merged with peer DMB, there is - * no need to copy data from sndbuf to peer DMB. 
- */ - return 0; - - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) { - if (tmp_node->token == dmb_tok) { - rmb_node = tmp_node; - break; - } - } - if (!rmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - memcpy((char *)rmb_node->cpu_addr + offset, data, size); - read_unlock_bh(&ldev->dmb_ht_lock); - - conn = smcd->conn[rmb_node->sba_idx]; - if (!conn || conn->killed) - return -EPIPE; - tasklet_schedule(&conn->rx_tsklet); - return 0; -} - -static void smc_lo_get_local_gid(struct smcd_dev *smcd, - struct smcd_gid *smcd_gid) -{ - struct smc_lo_dev *ldev = smcd->priv; - - smcd_gid->gid = ldev->local_gid.gid; - smcd_gid->gid_ext = ldev->local_gid.gid_ext; -} - -static u16 smc_lo_get_chid(struct smcd_dev *smcd) -{ - return ((struct smc_lo_dev *)smcd->priv)->chid; -} - -static struct device *smc_lo_get_dev(struct smcd_dev *smcd) -{ - return &((struct smc_lo_dev *)smcd->priv)->dev; -} - -static const struct smcd_ops lo_ops = { - .query_remote_gid = smc_lo_query_rgid, - .register_dmb = smc_lo_register_dmb, - .unregister_dmb = smc_lo_unregister_dmb, - .support_dmb_nocopy = smc_lo_support_dmb_nocopy, - .attach_dmb = smc_lo_attach_dmb, - .detach_dmb = smc_lo_detach_dmb, - .add_vlan_id = NULL, - .del_vlan_id = NULL, - .set_vlan_required = NULL, - .reset_vlan_required = NULL, - .signal_event = NULL, - .move_data = smc_lo_move_data, - .get_local_gid = smc_lo_get_local_gid, - .get_chid = smc_lo_get_chid, - .get_dev = smc_lo_get_dev, -}; - -static struct smcd_dev *smcd_lo_alloc_dev(const struct smcd_ops *ops, - int max_dmbs) -{ - struct smcd_dev *smcd; - - smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); - if (!smcd) - return NULL; - - smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), - GFP_KERNEL); - if (!smcd->conn) - goto out_smcd; - - smcd->ops = ops; - - spin_lock_init(&smcd->lock); - spin_lock_init(&smcd->lgr_lock); - INIT_LIST_HEAD(&smcd->vlan); - INIT_LIST_HEAD(&smcd->lgr_list); - init_waitqueue_head(&smcd->lgrs_deleted); - return smcd; - -out_smcd: - kfree(smcd); - return NULL; -} - -static int smcd_lo_register_dev(struct smc_lo_dev *ldev) -{ - struct smcd_dev *smcd; - - smcd = smcd_lo_alloc_dev(&lo_ops, SMC_LO_MAX_DMBS); - if (!smcd) - return -ENOMEM; - ldev->smcd = smcd; - smcd->priv = ldev; - smc_ism_set_v2_capable(); - mutex_lock(&smcd_dev_list.mutex); - list_add(&smcd->list, &smcd_dev_list.list); - mutex_unlock(&smcd_dev_list.mutex); - pr_warn_ratelimited("smc: adding smcd device %s\n", - dev_name(&ldev->dev)); - return 0; -} - -static void smcd_lo_unregister_dev(struct smc_lo_dev *ldev) -{ - struct smcd_dev *smcd = ldev->smcd; - - pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ldev->dev)); - smcd->going_away = 1; - smc_smcd_terminate_all(smcd); - mutex_lock(&smcd_dev_list.mutex); - list_del_init(&smcd->list); - mutex_unlock(&smcd_dev_list.mutex); - kfree(smcd->conn); - kfree(smcd); -} - -static int smc_lo_dev_init(struct smc_lo_dev *ldev) -{ - smc_lo_generate_ids(ldev); - rwlock_init(&ldev->dmb_ht_lock); - hash_init(ldev->dmb_ht); - atomic_set(&ldev->dmb_cnt, 0); - init_waitqueue_head(&ldev->ldev_release); - - return smcd_lo_register_dev(ldev); -} - -static void smc_lo_dev_exit(struct smc_lo_dev *ldev) -{ - smcd_lo_unregister_dev(ldev); - if (atomic_read(&ldev->dmb_cnt)) - wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); -} - -static void smc_lo_dev_release(struct device *dev) -{ - struct smc_lo_dev *ldev = - container_of(dev, struct smc_lo_dev, dev); - - kfree(ldev); -} - 
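
Illustrative aside, not part of the patch: the removed smc_loopback DMB code above protects each DMB node with a refcount, taking it with refcount_inc_not_zero() on attach (so an attach that races with the final put fails cleanly instead of reviving the node) and freeing on the last refcount_dec_and_test() in detach/unregister. The simplified userspace sketch below shows only that lifetime rule using C11 atomics; every name in it is invented for illustration.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct dmb_obj {
	atomic_int refcnt;	/* starts at 1 when "registered" */
	void *buf;
};

/* analogue of refcount_inc_not_zero(): only take a reference if the
 * object is not already on its way to being freed */
static bool dmb_get(struct dmb_obj *d)
{
	int old = atomic_load(&d->refcnt);

	while (old != 0) {
		if (atomic_compare_exchange_weak(&d->refcnt, &old, old + 1))
			return true;
	}
	return false;
}

/* analogue of refcount_dec_and_test(): free on the last put */
static void dmb_put(struct dmb_obj *d)
{
	if (atomic_fetch_sub(&d->refcnt, 1) == 1) {
		free(d->buf);
		free(d);
	}
}

int main(void)
{
	struct dmb_obj *d = calloc(1, sizeof(*d));

	d->buf = malloc(4096);
	atomic_init(&d->refcnt, 1);		/* "register" */
	printf("attach: %s\n", dmb_get(d) ? "ok" : "busy");
	dmb_put(d);				/* "detach" */
	dmb_put(d);				/* "unregister": last put frees */
	return 0;
}
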
-static int smc_lo_dev_probe(void) -{ - struct smc_lo_dev *ldev; - int ret; - - ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); - if (!ldev) - return -ENOMEM; - - ldev->dev.parent = NULL; - ldev->dev.release = smc_lo_dev_release; - device_initialize(&ldev->dev); - dev_set_name(&ldev->dev, smc_lo_dev_name); - - ret = smc_lo_dev_init(ldev); - if (ret) - goto free_dev; - - lo_dev = ldev; /* global loopback device */ - return 0; - -free_dev: - put_device(&ldev->dev); - return ret; -} - -static void smc_lo_dev_remove(void) -{ - if (!lo_dev) - return; - - smc_lo_dev_exit(lo_dev); - put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */ -} - -int smc_loopback_init(void) -{ - return smc_lo_dev_probe(); -} - -void smc_loopback_exit(void) -{ - smc_lo_dev_remove(); -} diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h deleted file mode 100644 index 04dc6808d2e1..000000000000 --- a/net/smc/smc_loopback.h +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Shared Memory Communications Direct over loopback-ism device. - * - * SMC-D loopback-ism device structure definitions. - * - * Copyright (c) 2024, Alibaba Inc. - * - * Author: Wen Gu <guwen@linux.alibaba.com> - * Tony Lu <tonylu@linux.alibaba.com> - * - */ - -#ifndef _SMC_LOOPBACK_H -#define _SMC_LOOPBACK_H - -#include <linux/device.h> -#include <net/smc.h> - -#if IS_ENABLED(CONFIG_SMC_LO) -#define SMC_LO_MAX_DMBS 5000 -#define SMC_LO_DMBS_HASH_BITS 12 -#define SMC_LO_RESERVED_CHID 0xFFFF - -struct smc_lo_dmb_node { - struct hlist_node list; - u64 token; - u32 len; - u32 sba_idx; - void *cpu_addr; - dma_addr_t dma_addr; - refcount_t refcnt; -}; - -struct smc_lo_dev { - struct smcd_dev *smcd; - struct device dev; - u16 chid; - struct smcd_gid local_gid; - atomic_t dmb_cnt; - rwlock_t dmb_ht_lock; - DECLARE_BITMAP(sba_idx_mask, SMC_LO_MAX_DMBS); - DECLARE_HASHTABLE(dmb_ht, SMC_LO_DMBS_HASH_BITS); - wait_queue_head_t ldev_release; -}; - -int smc_loopback_init(void); -void smc_loopback_exit(void); -#else -static inline int smc_loopback_init(void) -{ - return 0; -} - -static inline void smc_loopback_exit(void) -{ -} -#endif - -#endif /* _SMC_LOOPBACK_H */ diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 76ad29e31d60..a3a1e1fde8eb 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -169,7 +169,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) pr_warn_ratelimited("smc: smcd device %s " "erased user defined pnetid " "%.16s\n", - dev_name(smcd->ops->get_dev(smcd)), + dev_name(&smcd->dibs->dev), smcd->pnetid); memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); smcd->pnetid_by_user = false; @@ -332,8 +332,11 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (!strncmp(dev_name(smcd_dev->ops->get_dev(smcd_dev)), - smcd_name, IB_DEVICE_NAME_MAX - 1)) + if (!strncmp(dev_name(&smcd_dev->dibs->dev), smcd_name, + IB_DEVICE_NAME_MAX - 1) || + (smcd_dev->dibs->dev.parent && + !strncmp(dev_name(smcd_dev->dibs->dev.parent), smcd_name, + IB_DEVICE_NAME_MAX - 1))) goto out; } smcd_dev = NULL; @@ -413,7 +416,6 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, bool smcddev_applied = true; bool ibdev_applied = true; struct smcd_dev *smcd; - struct device *dev; bool new_ibdev; /* try to apply the pnetid to active devices */ @@ -431,10 +433,8 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, if (smcd) { smcddev_applied = 
smc_pnet_apply_smcd(smcd, pnet_name); if (smcddev_applied) { - dev = smcd->ops->get_dev(smcd); - pr_warn_ratelimited("smc: smcd device %s " - "applied user defined pnetid " - "%.16s\n", dev_name(dev), + pr_warn_ratelimited("smc: smcd device %s applied user defined pnetid %.16s\n", + dev_name(&smcd->dibs->dev), smcd->pnetid); } } @@ -450,7 +450,7 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, return -ENOMEM; new_pe->type = SMC_PNET_IB; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); - strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX); + strscpy(new_pe->ib_name, ib_name); new_pe->ib_port = ib_port; new_ibdev = true; @@ -1126,37 +1126,38 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, */ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { - struct dst_entry *dst = sk_dst_get(sk); - - if (!dst) - goto out; - if (!dst->dev) - goto out_rel; + struct net_device *dev; + struct dst_entry *dst; - smc_pnet_find_roce_by_pnetid(dst->dev, ini); + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + dev_hold(dev); + rcu_read_unlock(); -out_rel: - dst_release(dst); -out: - return; + if (dev) { + smc_pnet_find_roce_by_pnetid(dev, ini); + dev_put(dev); + } } void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini) { - struct dst_entry *dst = sk_dst_get(sk); + struct net_device *dev; + struct dst_entry *dst; ini->ism_dev[0] = NULL; - if (!dst) - goto out; - if (!dst->dev) - goto out_rel; - smc_pnet_find_ism_by_pnetid(dst->dev, ini); + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + dev_hold(dev); + rcu_read_unlock(); -out_rel: - dst_release(dst); -out: - return; + if (dev) { + smc_pnet_find_ism_by_pnetid(dev, ini); + dev_put(dev); + } } /* Lookup and apply a pnet table entry to the given ib device. 
@@ -1192,7 +1193,6 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) */ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { - const char *ib_name = dev_name(smcddev->ops->get_dev(smcddev)); struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; @@ -1205,7 +1205,13 @@ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && - !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + (!strncmp(tmp_pe->ib_name, + dev_name(&smcddev->dibs->dev), + sizeof(tmp_pe->ib_name)) || + (smcddev->dibs->dev.parent && + !strncmp(tmp_pe->ib_name, + dev_name(smcddev->dibs->dev.parent), + sizeof(tmp_pe->ib_name))))) { smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); rc = 0; break; diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 2fab6456f765..b1efed546243 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -12,12 +12,14 @@ #include <linux/init.h> #include <linux/sysctl.h> +#include <linux/bpf.h> #include <net/net_namespace.h> #include "smc.h" #include "smc_core.h" #include "smc_llc.h" #include "smc_sysctl.h" +#include "smc_hs_bpf.h" static int min_sndbuf = SMC_BUF_MIN_SIZE; static int min_rcvbuf = SMC_BUF_MIN_SIZE; @@ -29,6 +31,71 @@ static int links_per_lgr_min = SMC_LINKS_ADD_LNK_MIN; static int links_per_lgr_max = SMC_LINKS_ADD_LNK_MAX; static int conns_per_lgr_min = SMC_CONN_PER_LGR_MIN; static int conns_per_lgr_max = SMC_CONN_PER_LGR_MAX; +static unsigned int smcr_max_wr_min = 2; +static unsigned int smcr_max_wr_max = 2048; + +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) +static int smc_net_replace_smc_hs_ctrl(struct net *net, const char *name) +{ + struct smc_hs_ctrl *ctrl = NULL; + + rcu_read_lock(); + /* null or empty name ask to clear current ctrl */ + if (name && name[0]) { + ctrl = smc_hs_ctrl_find_by_name(name); + if (!ctrl) { + rcu_read_unlock(); + return -EINVAL; + } + /* no change, just return */ + if (ctrl == rcu_dereference(net->smc.hs_ctrl)) { + rcu_read_unlock(); + return 0; + } + if (!bpf_try_module_get(ctrl, ctrl->owner)) { + rcu_read_unlock(); + return -EBUSY; + } + } + /* xhcg old ctrl with the new one atomically */ + ctrl = unrcu_pointer(xchg(&net->smc.hs_ctrl, RCU_INITIALIZER(ctrl))); + /* release old ctrl */ + if (ctrl) + bpf_module_put(ctrl, ctrl->owner); + + rcu_read_unlock(); + return 0; +} + +static int proc_smc_hs_ctrl(const struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = container_of(ctl->data, struct net, smc.hs_ctrl); + char val[SMC_HS_CTRL_NAME_MAX]; + const struct ctl_table tbl = { + .data = val, + .maxlen = SMC_HS_CTRL_NAME_MAX, + }; + struct smc_hs_ctrl *ctrl; + int ret; + + rcu_read_lock(); + ctrl = rcu_dereference(net->smc.hs_ctrl); + if (ctrl) + memcpy(val, ctrl->name, sizeof(ctrl->name)); + else + val[0] = '\0'; + rcu_read_unlock(); + + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); + if (ret) + return ret; + + if (write) + ret = smc_net_replace_smc_hs_ctrl(net, val); + return ret; +} +#endif /* CONFIG_SMC_HS_CTRL_BPF */ static struct ctl_table smc_table[] = { { @@ -99,6 +166,33 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "smcr_max_send_wr", + .data = &init_net.smc.sysctl_smcr_max_send_wr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &smcr_max_wr_min, + .extra2 = &smcr_max_wr_max, + }, + { + .procname 
= "smcr_max_recv_wr", + .data = &init_net.smc.sysctl_smcr_max_recv_wr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &smcr_max_wr_min, + .extra2 = &smcr_max_wr_max, + }, +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) + { + .procname = "hs_ctrl", + .data = &init_net.smc.hs_ctrl, + .mode = 0644, + .maxlen = SMC_HS_CTRL_NAME_MAX, + .proc_handler = proc_smc_hs_ctrl, + }, +#endif /* CONFIG_SMC_HS_CTRL_BPF */ }; int __net_init smc_sysctl_net_init(struct net *net) @@ -109,6 +203,16 @@ int __net_init smc_sysctl_net_init(struct net *net) table = smc_table; if (!net_eq(net, &init_net)) { int i; +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) + struct smc_hs_ctrl *ctrl; + + rcu_read_lock(); + ctrl = rcu_dereference(init_net.smc.hs_ctrl); + if (ctrl && ctrl->flags & SMC_HS_CTRL_FLAG_INHERITABLE && + bpf_try_module_get(ctrl, ctrl->owner)) + rcu_assign_pointer(net->smc.hs_ctrl, ctrl); + rcu_read_unlock(); +#endif /* CONFIG_SMC_HS_CTRL_BPF */ table = kmemdup(table, sizeof(smc_table), GFP_KERNEL); if (!table) @@ -130,6 +234,8 @@ int __net_init smc_sysctl_net_init(struct net *net) WRITE_ONCE(net->smc.sysctl_rmem, net_smc_rmem_init); net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER; net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER; + net->smc.sysctl_smcr_max_send_wr = SMCR_MAX_SEND_WR_DEF; + net->smc.sysctl_smcr_max_recv_wr = SMCR_MAX_RECV_WR_DEF; /* disable handshake limitation by default */ net->smc.limit_smc_hs = 0; @@ -139,6 +245,9 @@ err_reg: if (!net_eq(net, &init_net)) kfree(table); err_alloc: +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) + smc_net_replace_smc_hs_ctrl(net, NULL); +#endif /* CONFIG_SMC_HS_CTRL_BPF */ return -ENOMEM; } @@ -148,6 +257,10 @@ void __net_exit smc_sysctl_net_exit(struct net *net) table = net->smc.smc_hdr->ctl_table_arg; unregister_net_sysctl_table(net->smc.smc_hdr); +#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF) + smc_net_replace_smc_hs_ctrl(net, NULL); +#endif /* CONFIG_SMC_HS_CTRL_BPF */ + if (!net_eq(net, &init_net)) kfree(table); } diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h index eb2465ae1e15..8538915af7af 100644 --- a/net/smc/smc_sysctl.h +++ b/net/smc/smc_sysctl.h @@ -25,6 +25,8 @@ static inline int smc_sysctl_net_init(struct net *net) net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; net->smc.sysctl_max_links_per_lgr = SMC_LINKS_PER_LGR_MAX_PREFER; net->smc.sysctl_max_conns_per_lgr = SMC_CONN_PER_LGR_PREFER; + net->smc.sysctl_smcr_max_send_wr = SMCR_MAX_SEND_WR_DEF; + net->smc.sysctl_smcr_max_recv_wr = SMCR_MAX_RECV_WR_DEF; return 0; } diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 214ac3cbcf9a..3144b4b1fe29 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -426,6 +426,9 @@ static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len, int srcchunk, dstchunk; int rc; + if (conn->sndbuf_desc->is_attached) + return 0; + for (dstchunk = 0; dstchunk < 2; dstchunk++) { for (srcchunk = 0; srcchunk < 2; srcchunk++) { void *data = conn->sndbuf_desc->cpu_addr + src_off; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index b04a21b8c511..5feafa98ab1a 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -547,9 +547,9 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk) IB_QP_DEST_QPN, &init_attr); - lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, + lnk->wr_tx_cnt = min_t(size_t, lnk->max_send_wr, lnk->qp_attr.cap.max_send_wr); - lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, + lnk->wr_rx_cnt = min_t(size_t, lnk->max_recv_wr, lnk->qp_attr.cap.max_recv_wr); } @@ -741,50 
+741,51 @@ int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr) int smc_wr_alloc_link_mem(struct smc_link *link) { /* allocate link related memory */ - link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); + link->wr_tx_bufs = kcalloc(link->max_send_wr, + SMC_WR_BUF_SIZE, GFP_KERNEL); if (!link->wr_tx_bufs) goto no_mem; - link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, link->wr_rx_buflen, + link->wr_rx_bufs = kcalloc(link->max_recv_wr, link->wr_rx_buflen, GFP_KERNEL); if (!link->wr_rx_bufs) goto no_mem_wr_tx_bufs; - link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_ibs[0]), - GFP_KERNEL); + link->wr_tx_ibs = kcalloc(link->max_send_wr, + sizeof(link->wr_tx_ibs[0]), GFP_KERNEL); if (!link->wr_tx_ibs) goto no_mem_wr_rx_bufs; - link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_ibs = kcalloc(link->max_recv_wr, sizeof(link->wr_rx_ibs[0]), GFP_KERNEL); if (!link->wr_rx_ibs) goto no_mem_wr_tx_ibs; - link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT, + link->wr_tx_rdmas = kcalloc(link->max_send_wr, sizeof(link->wr_tx_rdmas[0]), GFP_KERNEL); if (!link->wr_tx_rdmas) goto no_mem_wr_rx_ibs; - link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT, + link->wr_tx_rdma_sges = kcalloc(link->max_send_wr, sizeof(link->wr_tx_rdma_sges[0]), GFP_KERNEL); if (!link->wr_tx_rdma_sges) goto no_mem_wr_tx_rdmas; - link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, sizeof(link->wr_tx_sges[0]), + link->wr_tx_sges = kcalloc(link->max_send_wr, sizeof(link->wr_tx_sges[0]), GFP_KERNEL); if (!link->wr_tx_sges) goto no_mem_wr_tx_rdma_sges; - link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, + link->wr_rx_sges = kcalloc(link->max_recv_wr, sizeof(link->wr_rx_sges[0]) * link->wr_rx_sge_cnt, GFP_KERNEL); if (!link->wr_rx_sges) goto no_mem_wr_tx_sges; - link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL); + link->wr_tx_mask = bitmap_zalloc(link->max_send_wr, GFP_KERNEL); if (!link->wr_tx_mask) goto no_mem_wr_rx_sges; - link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT, + link->wr_tx_pends = kcalloc(link->max_send_wr, sizeof(link->wr_tx_pends[0]), GFP_KERNEL); if (!link->wr_tx_pends) goto no_mem_wr_tx_mask; - link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT, + link->wr_tx_compl = kcalloc(link->max_send_wr, sizeof(link->wr_tx_compl[0]), GFP_KERNEL); if (!link->wr_tx_compl) @@ -905,7 +906,7 @@ int smc_wr_create_link(struct smc_link *lnk) goto dma_unmap; } smc_wr_init_sge(lnk); - bitmap_zero(lnk->wr_tx_mask, SMC_WR_BUF_CNT); + bitmap_zero(lnk->wr_tx_mask, lnk->max_send_wr); init_waitqueue_head(&lnk->wr_tx_wait); rc = percpu_ref_init(&lnk->wr_tx_refs, smcr_wr_tx_refs_free, 0, GFP_KERNEL); if (rc) diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index f3008dda222a..aa4533af9122 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -19,8 +19,6 @@ #include "smc.h" #include "smc_core.h" -#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */ - #define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ) #define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */ diff --git a/net/socket.c b/net/socket.c index 682969deaed3..136b98c54fb3 100644 --- a/net/socket.c +++ b/net/socket.c @@ -276,28 +276,41 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *k static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen, void __user *uaddr, int __user *ulen) { - int err; int len; BUG_ON(klen > sizeof(struct sockaddr_storage)); - err = get_user(len, ulen); - if (err) - return err; + + if (can_do_masked_user_access()) + ulen = masked_user_access_begin(ulen); + else if 
(!user_access_begin(ulen, 4)) + return -EFAULT; + + unsafe_get_user(len, ulen, efault_end); + if (len > klen) len = klen; - if (len < 0) - return -EINVAL; + /* + * "fromlen shall refer to the value before truncation.." + * 1003.1g + */ + if (len >= 0) + unsafe_put_user(klen, ulen, efault_end); + + user_access_end(); + if (len) { + if (len < 0) + return -EINVAL; if (audit_sockaddr(klen, kaddr)) return -ENOMEM; if (copy_to_user(uaddr, kaddr, len)) return -EFAULT; } - /* - * "fromlen shall refer to the value before truncation.." - * 1003.1g - */ - return __put_user(klen, ulen); + return 0; + +efault_end: + user_access_end(); + return -EFAULT; } static struct kmem_cache *sock_inode_cachep __ro_after_init; @@ -1176,6 +1189,9 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; + if (iocb->ki_flags & IOCB_NOSIGNAL) + msg.msg_flags |= MSG_NOSIGNAL; + res = __sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; @@ -1856,7 +1872,7 @@ int __sys_bind_socket(struct socket *sock, struct sockaddr_storage *address, addrlen); if (!err) err = READ_ONCE(sock->ops)->bind(sock, - (struct sockaddr *)address, + (struct sockaddr_unsized *)address, addrlen); return err; } @@ -1996,8 +2012,6 @@ static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_s int __user *upeer_addrlen, int flags) { struct proto_accept_arg arg = { }; - struct file *newfile; - int newfd; if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK)) return -EINVAL; @@ -2005,18 +2019,7 @@ static int __sys_accept4_file(struct file *file, struct sockaddr __user *upeer_s if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK)) flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK; - newfd = get_unused_fd_flags(flags); - if (unlikely(newfd < 0)) - return newfd; - - newfile = do_accept(file, &arg, upeer_sockaddr, upeer_addrlen, - flags); - if (IS_ERR(newfile)) { - put_unused_fd(newfd); - return PTR_ERR(newfile); - } - fd_install(newfd, newfile); - return newfd; + return FD_ADD(flags, do_accept(file, &arg, upeer_sockaddr, upeer_addrlen, flags)); } /* @@ -2083,8 +2086,8 @@ int __sys_connect_file(struct file *file, struct sockaddr_storage *address, if (err) goto out; - err = READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)address, - addrlen, sock->file->f_flags | file_flags); + err = READ_ONCE(sock->ops)->connect(sock, (struct sockaddr_unsized *)address, + addrlen, sock->file->f_flags | file_flags); out: return err; } @@ -2111,78 +2114,53 @@ SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr, return __sys_connect(fd, uservaddr, addrlen); } -/* - * Get the local address ('name') of a socket object. Move the obtained - * name to user space. 
- */ - -int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, - int __user *usockaddr_len) +int do_getsockname(struct socket *sock, int peer, + struct sockaddr __user *usockaddr, int __user *usockaddr_len) { - struct socket *sock; struct sockaddr_storage address; - CLASS(fd, f)(fd); int err; - if (fd_empty(f)) - return -EBADF; - sock = sock_from_file(fd_file(f)); - if (unlikely(!sock)) - return -ENOTSOCK; - - err = security_socket_getsockname(sock); + if (peer) + err = security_socket_getpeername(sock); + else + err = security_socket_getsockname(sock); if (err) return err; - - err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 0); + err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, peer); if (err < 0) return err; - /* "err" is actually length in this case */ return move_addr_to_user(&address, err, usockaddr, usockaddr_len); } -SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, - int __user *, usockaddr_len) -{ - return __sys_getsockname(fd, usockaddr, usockaddr_len); -} - /* - * Get the remote address ('name') of a socket object. Move the obtained - * name to user space. + * Get the remote or local address ('name') of a socket object. Move the + * obtained name to user space. */ - -int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, - int __user *usockaddr_len) +int __sys_getsockname(int fd, struct sockaddr __user *usockaddr, + int __user *usockaddr_len, int peer) { struct socket *sock; - struct sockaddr_storage address; CLASS(fd, f)(fd); - int err; if (fd_empty(f)) return -EBADF; sock = sock_from_file(fd_file(f)); if (unlikely(!sock)) return -ENOTSOCK; + return do_getsockname(sock, peer, usockaddr, usockaddr_len); +} - err = security_socket_getpeername(sock); - if (err) - return err; - - err = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 1); - if (err < 0) - return err; - - /* "err" is actually length in this case */ - return move_addr_to_user(&address, err, usockaddr, usockaddr_len); +SYSCALL_DEFINE3(getsockname, int, fd, struct sockaddr __user *, usockaddr, + int __user *, usockaddr_len) +{ + return __sys_getsockname(fd, usockaddr, usockaddr_len, 0); } SYSCALL_DEFINE3(getpeername, int, fd, struct sockaddr __user *, usockaddr, int __user *, usockaddr_len) { - return __sys_getpeername(fd, usockaddr, usockaddr_len); + return __sys_getsockname(fd, usockaddr, usockaddr_len, 1); } /* @@ -3146,12 +3124,12 @@ SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args) case SYS_GETSOCKNAME: err = __sys_getsockname(a0, (struct sockaddr __user *)a1, - (int __user *)a[2]); + (int __user *)a[2], 0); break; case SYS_GETPEERNAME: err = - __sys_getpeername(a0, (struct sockaddr __user *)a1, - (int __user *)a[2]); + __sys_getsockname(a0, (struct sockaddr __user *)a1, + (int __user *)a[2], 1); break; case SYS_SOCKETPAIR: err = __sys_socketpair(a0, a1, a[2], (int __user *)a[3]); @@ -3567,13 +3545,13 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, * Returns 0 or an error. */ -int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) +int kernel_bind(struct socket *sock, struct sockaddr_unsized *addr, int addrlen) { struct sockaddr_storage address; memcpy(&address, addr, addrlen); - return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr *)&address, + return READ_ONCE(sock->ops)->bind(sock, (struct sockaddr_unsized *)&address, addrlen); } EXPORT_SYMBOL(kernel_bind); @@ -3646,14 +3624,14 @@ EXPORT_SYMBOL(kernel_accept); * Returns 0 or an error code. 
*/ -int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, +int kernel_connect(struct socket *sock, struct sockaddr_unsized *addr, int addrlen, int flags) { struct sockaddr_storage address; memcpy(&address, addr, addrlen); - return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr *)&address, + return READ_ONCE(sock->ops)->connect(sock, (struct sockaddr_unsized *)&address, addrlen, flags); } EXPORT_SYMBOL(kernel_connect); diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c index 43b1f558b33d..fe0e76fdd1f1 100644 --- a/net/strparser/strparser.c +++ b/net/strparser/strparser.c @@ -127,7 +127,7 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, } if (!strp->skb_nextp) { - /* We are going to append to the frags_list of head. + /* We are going to append to the frag_list of head. * Need to unshare the frag_list. */ err = skb_unclone(head, GFP_ATOMIC); @@ -238,7 +238,7 @@ static int __strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb, strp_parser_err(strp, -EMSGSIZE, desc); break; } else if (len <= (ssize_t)head->len - - skb->len - stm->strp.offset) { + (ssize_t)skb->len - stm->strp.offset) { /* Length must be into new skb (and also * greater than zero) */ diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 2d8b67dac7b5..a570e7adf270 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -101,6 +101,20 @@ config SUNRPC_DEBUG If unsure, say Y. +config SUNRPC_DEBUG_TRACE + bool "RPC: Send dfprintk() output to the trace buffer" + depends on SUNRPC_DEBUG && TRACING + default n + help + dprintk() output can be voluminous, which can overwhelm the + kernel's logging facility as it must be sent to the console. + This option causes dprintk() output to go to the trace buffer + instead of the kernel log. + + This will cause warnings about trace_printk() being used to be + logged at boot time, so say N unless you are debugging a problem + with sunrpc-based clients or services. 
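
Illustrative aside, not part of the patch: the new SUNRPC_DEBUG_TRACE option routes dprintk()/dfprintk() output to the kernel trace buffer instead of the console log. The closest userspace analogue is writing debug messages to the tracefs trace_marker file rather than stderr; the sketch below shows that idea only and is not the sunrpc implementation (the helper name and flag are invented).

#include <stdarg.h>
#include <stdio.h>

/* Hypothetical helper: route debug output either to stderr (the "console")
 * or to the tracefs trace_marker file (the trace buffer), mirroring the
 * choice SUNRPC_DEBUG_TRACE makes for dprintk(). */
static void dbg_printf(int to_trace, const char *fmt, ...)
{
	static FILE *marker;
	va_list ap;

	if (to_trace && !marker)
		marker = fopen("/sys/kernel/tracing/trace_marker", "w");

	va_start(ap, fmt);
	vfprintf(to_trace && marker ? marker : stderr, fmt, ap);
	va_end(ap);
	if (to_trace && marker)
		fflush(marker);
}

int main(void)
{
	dbg_printf(0, "RPC: debug to the console log\n");
	dbg_printf(1, "RPC: debug to the trace buffer\n");
	return 0;
}
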
+ config SUNRPC_XPRT_RDMA tristate "RPC-over-RDMA transport" depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index cb32ab9a8395..7d2cdc2bd374 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -794,12 +794,12 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, struct gssx_res_accept_sec_context *res = data; u32 value_follows; int err; - struct page *scratch; + struct folio *scratch; - scratch = alloc_page(GFP_KERNEL); + scratch = folio_alloc(GFP_KERNEL, 0); if (!scratch) return -ENOMEM; - xdr_set_scratch_page(xdr, scratch); + xdr_set_scratch_folio(xdr, scratch); /* res->status */ err = gssx_dec_status(xdr, &res->status); @@ -844,6 +844,6 @@ int gssx_dec_accept_sec_context(struct rpc_rqst *rqstp, err = gssx_dec_option_array(xdr, &res->options); out_free: - __free_page(scratch); + folio_put(scratch); return err; } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index e82212f6b562..a8ec30759a18 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -724,7 +724,7 @@ svcauth_gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } - if (flavor != RPC_AUTH_GSS) { + if (flavor != RPC_AUTH_GSS || checksum.len < XDR_UNIT) { rqstp->rq_auth_stat = rpc_autherr_badverf; return SVC_DENIED; } diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8ca354ecfd02..58442ae1c2da 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1457,12 +1457,12 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, switch (sap->sa_family) { case AF_INET: err = kernel_bind(sock, - (struct sockaddr *)&rpc_inaddr_loopback, + (struct sockaddr_unsized *)&rpc_inaddr_loopback, sizeof(rpc_inaddr_loopback)); break; case AF_INET6: err = kernel_bind(sock, - (struct sockaddr *)&rpc_in6addr_loopback, + (struct sockaddr_unsized *)&rpc_in6addr_loopback, sizeof(rpc_in6addr_loopback)); break; default: @@ -1474,7 +1474,7 @@ static int rpc_sockname(struct net *net, struct sockaddr *sap, size_t salen, goto out_release; } - err = kernel_connect(sock, sap, salen, 0); + err = kernel_connect(sock, (struct sockaddr_unsized *)sap, salen, 0); if (err < 0) { dprintk("RPC: can't connect UDP socket (%d)\n", err); goto out_release; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 0bd1df2ebb47..379daefc4847 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -536,17 +536,16 @@ static int rpc_new_file(struct dentry *parent, inode = rpc_get_inode(dir->i_sb, S_IFREG | mode); if (unlikely(!inode)) { - dput(dentry); - inode_unlock(dir); + simple_done_creating(dentry); return -ENOMEM; } inode->i_ino = iunique(dir->i_sb, 100); if (i_fop) inode->i_fop = i_fop; rpc_inode_setowner(inode, private); - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); fsnotify_create(dir, dentry); - inode_unlock(dir); + simple_done_creating(dentry); return 0; } @@ -563,18 +562,17 @@ static struct dentry *rpc_new_dir(struct dentry *parent, inode = rpc_get_inode(dir->i_sb, S_IFDIR | mode); if (unlikely(!inode)) { - dput(dentry); - inode_unlock(dir); + simple_done_creating(dentry); return ERR_PTR(-ENOMEM); } inode->i_ino = iunique(dir->i_sb, 100); inc_nlink(dir); - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); fsnotify_mkdir(dir, dentry); - inode_unlock(dir); + simple_done_creating(dentry); - return dentry; + return 
dentry; // borrowed } static int rpc_populate(struct dentry *parent, @@ -657,8 +655,7 @@ int rpc_mkpipe_dentry(struct dentry *parent, const char *name, inode = rpc_get_inode(dir->i_sb, umode); if (unlikely(!inode)) { - dput(dentry); - inode_unlock(dir); + simple_done_creating(dentry); err = -ENOMEM; goto failed; } @@ -668,10 +665,10 @@ int rpc_mkpipe_dentry(struct dentry *parent, const char *name, rpci->private = private; rpci->pipe = pipe; rpc_inode_setowner(inode, private); - d_instantiate(dentry, inode); - pipe->dentry = dentry; + pipe->dentry = dentry; // borrowed + d_make_persistent(dentry, inode); fsnotify_create(dir, dentry); - inode_unlock(dir); + simple_done_creating(dentry); return 0; failed: @@ -1206,7 +1203,7 @@ static void rpc_kill_sb(struct super_block *sb) sb); mutex_unlock(&sn->pipefs_sb_lock); out: - kill_litter_super(sb); + kill_anon_super(sb); put_net(net); } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 73bc39281ef5..016f16ca5779 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -276,8 +276,6 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { - if (unlikely(current->flags & PF_EXITING)) - return -EINTR; schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; @@ -1076,7 +1074,6 @@ int rpc_malloc(struct rpc_task *task) rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize; return 0; } -EXPORT_SYMBOL_GPL(rpc_malloc); /** * rpc_free - free RPC buffer resources allocated via rpc_malloc @@ -1097,7 +1094,6 @@ void rpc_free(struct rpc_task *task) else kfree(buf); } -EXPORT_SYMBOL_GPL(rpc_free); /* * Creation and deletion of RPC task structures diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index 4e92e2a50168..d8d8842c7de5 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -86,7 +86,7 @@ xdr_partial_copy_from_skb(struct xdr_buf *xdr, struct xdr_skb_reader *desc) /* ACL likes to be lazy in allocating pages - ACLs * are small by default but can get huge. 
*/ if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) { - *ppage = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + *ppage = alloc_page(GFP_NOWAIT); if (unlikely(*ppage == NULL)) { if (copied == 0) return -ENOMEM; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index b1fab3a69544..4704dce7284e 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -352,7 +352,7 @@ static int svc_pool_map_get_node(unsigned int pidx) if (m->mode == SVC_POOL_PERNODE) return m->pool_to[pidx]; } - return NUMA_NO_NODE; + return numa_mem_id(); } /* * Set the given thread's cpus_allowed mask so that it @@ -436,7 +436,6 @@ void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net) svc_unregister(serv, net); rpcb_put_local(net); } -EXPORT_SYMBOL_GPL(svc_rpcb_cleanup); static int svc_uses_rpcbind(struct svc_serv *serv) { @@ -670,8 +669,8 @@ svc_rqst_free(struct svc_rqst *rqstp) folio_batch_release(&rqstp->rq_fbatch); kfree(rqstp->rq_bvec); svc_release_buffer(rqstp); - if (rqstp->rq_scratch_page) - put_page(rqstp->rq_scratch_page); + if (rqstp->rq_scratch_folio) + folio_put(rqstp->rq_scratch_folio); kfree(rqstp->rq_resp); kfree(rqstp->rq_argp); kfree(rqstp->rq_auth_data); @@ -692,8 +691,8 @@ svc_prepare_thread(struct svc_serv *serv, struct svc_pool *pool, int node) rqstp->rq_server = serv; rqstp->rq_pool = pool; - rqstp->rq_scratch_page = alloc_pages_node(node, GFP_KERNEL, 0); - if (!rqstp->rq_scratch_page) + rqstp->rq_scratch_folio = __folio_alloc_node(GFP_KERNEL, 0, node); + if (!rqstp->rq_scratch_folio) goto out_enomem; rqstp->rq_argp = kmalloc_node(serv->sv_xdrsize, GFP_KERNEL, node); @@ -1426,8 +1425,6 @@ svc_process_common(struct svc_rqst *rqstp) /* Call the function that processes the request. */ rc = process.dispatch(rqstp); - if (procp->pc_release) - procp->pc_release(rqstp); xdr_finish_decode(xdr); if (!rc) @@ -1526,6 +1523,14 @@ static void svc_drop(struct svc_rqst *rqstp) trace_svc_drop(rqstp); } +static void svc_release_rqst(struct svc_rqst *rqstp) +{ + const struct svc_procedure *procp = rqstp->rq_procinfo; + + if (procp && procp->pc_release) + procp->pc_release(rqstp); +} + /** * svc_process - Execute one RPC transaction * @rqstp: RPC transaction context @@ -1565,9 +1570,12 @@ void svc_process(struct svc_rqst *rqstp) if (unlikely(*p != rpc_call)) goto out_baddir; - if (!svc_process_common(rqstp)) + if (!svc_process_common(rqstp)) { + svc_release_rqst(rqstp); goto out_drop; + } svc_send(rqstp); + svc_release_rqst(rqstp); return; out_baddir: @@ -1635,6 +1643,7 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) if (!proc_error) { /* Processing error: drop the request */ xprt_free_bc_request(req); + svc_release_rqst(rqstp); return; } /* Finally, send the reply synchronously */ @@ -1648,6 +1657,7 @@ void svc_process_bc(struct rpc_rqst *req, struct svc_rqst *rqstp) timeout.to_maxval = timeout.to_initval; memcpy(&req->rq_snd_buf, &rqstp->rq_res, sizeof(req->rq_snd_buf)); task = rpc_run_bc_task(req, &timeout); + svc_release_rqst(rqstp); if (IS_ERR(task)) return; diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 8b1837228799..6973184ff667 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -1014,6 +1014,19 @@ static void svc_delete_xprt(struct svc_xprt *xprt) struct svc_serv *serv = xprt->xpt_server; struct svc_deferred_req *dr; + /* unregister with rpcbind for when transport type is TCP or UDP. 
+ */ + if (test_bit(XPT_RPCB_UNREG, &xprt->xpt_flags)) { + struct svc_sock *svsk = container_of(xprt, struct svc_sock, + sk_xprt); + struct socket *sock = svsk->sk_sock; + + if (svc_register(serv, xprt->xpt_net, sock->sk->sk_family, + sock->sk->sk_protocol, 0) < 0) + pr_warn("failed to unregister %s with rpcbind\n", + xprt->xpt_class->xcl_name); + } + if (test_and_set_bit(XPT_DEAD, &xprt->xpt_flags)) return; @@ -1102,6 +1115,7 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * svc_xprt_destroy_all - Destroy transports associated with @serv * @serv: RPC service to be shut down * @net: target network namespace + * @unregister: true if it is OK to unregister the destroyed xprts * * Server threads may still be running (especially in the case where the * service is still running in other network namespaces). @@ -1114,7 +1128,8 @@ static void svc_clean_up_xprts(struct svc_serv *serv, struct net *net) * threads, we may need to wait a little while and then check again to * see if they're done. */ -void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) +void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net, + bool unregister) { int delay = 0; @@ -1124,6 +1139,9 @@ void svc_xprt_destroy_all(struct svc_serv *serv, struct net *net) svc_clean_up_xprts(serv, net); msleep(delay++); } + + if (unregister) + svc_rpcb_cleanup(serv, net); } EXPORT_SYMBOL_GPL(svc_xprt_destroy_all); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 46c156b121db..d61cd9b40491 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -68,6 +68,17 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +/* + * For UDP: + * 1 for header page + * enough pages for RPCSVC_MAXPAYLOAD_UDP + * 1 in case payload is not aligned + * 1 for tail page + */ +enum { + SUNRPC_MAX_UDP_SENDPAGES = 1 + RPCSVC_MAXPAYLOAD_UDP / PAGE_SIZE + 1 + 1 +}; + /* To-do: to avoid tying up an nfsd thread while waiting for a * handshake request, the request could instead be deferred. 
*/ @@ -257,20 +268,47 @@ svc_tcp_sock_process_cmsg(struct socket *sock, struct msghdr *msg, } static int -svc_tcp_sock_recv_cmsg(struct svc_sock *svsk, struct msghdr *msg) +svc_tcp_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags) { union { struct cmsghdr cmsg; u8 buf[CMSG_SPACE(sizeof(u8))]; } u; - struct socket *sock = svsk->sk_sock; + u8 alert[2]; + struct kvec alert_kvec = { + .iov_base = alert, + .iov_len = sizeof(alert), + }; + struct msghdr msg = { + .msg_flags = *msg_flags, + .msg_control = &u, + .msg_controllen = sizeof(u), + }; int ret; - msg->msg_control = &u; - msg->msg_controllen = sizeof(u); + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, + alert_kvec.iov_len); + ret = sock_recvmsg(sock, &msg, MSG_DONTWAIT); + if (ret > 0 && + tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) { + iov_iter_revert(&msg.msg_iter, ret); + ret = svc_tcp_sock_process_cmsg(sock, &msg, &u.cmsg, -EAGAIN); + } + return ret; +} + +static int +svc_tcp_sock_recvmsg(struct svc_sock *svsk, struct msghdr *msg) +{ + int ret; + struct socket *sock = svsk->sk_sock; + ret = sock_recvmsg(sock, msg, MSG_DONTWAIT); - if (unlikely(msg->msg_controllen != sizeof(u))) - ret = svc_tcp_sock_process_cmsg(sock, msg, &u.cmsg, ret); + if (msg->msg_flags & MSG_CTRUNC) { + msg->msg_flags &= ~(MSG_CTRUNC | MSG_EOR); + if (ret == 0 || ret == -EIO) + ret = svc_tcp_sock_recv_cmsg(sock, &msg->msg_flags); + } return ret; } @@ -321,7 +359,7 @@ static ssize_t svc_tcp_read_msg(struct svc_rqst *rqstp, size_t buflen, iov_iter_advance(&msg.msg_iter, seek); buflen -= seek; } - len = svc_tcp_sock_recv_cmsg(svsk, &msg); + len = svc_tcp_sock_recvmsg(svsk, &msg); if (len > 0) svc_flush_bvec(bvec, len, seek); @@ -713,14 +751,14 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) if (svc_xprt_is_dead(xprt)) goto out_notconn; - count = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, xdr); + count = xdr_buf_to_bvec(svsk->sk_bvec, SUNRPC_MAX_UDP_SENDPAGES, xdr); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, count, rqstp->rq_res.len); err = sock_sendmsg(svsk->sk_sock, &msg); if (err == -ECONNREFUSED) { /* ICMP error on earlier request. 
*/ - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, count, rqstp->rq_res.len); err = sock_sendmsg(svsk->sk_sock, &msg); } @@ -809,6 +847,7 @@ static void svc_udp_init(struct svc_sock *svsk, struct svc_serv *serv) /* data might have come in before data_ready set up */ set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags); set_bit(XPT_CHNGBUF, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); /* make sure we get destination address info */ switch (svsk->sk_sk->sk_family) { @@ -1018,7 +1057,7 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk, iov.iov_base = ((char *)&svsk->sk_marker) + svsk->sk_tcplen; iov.iov_len = want; iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, want); - len = svc_tcp_sock_recv_cmsg(svsk, &msg); + len = svc_tcp_sock_recvmsg(svsk, &msg); if (len < 0) return len; svsk->sk_tcplen += len; @@ -1034,9 +1073,10 @@ static ssize_t svc_tcp_read_marker(struct svc_sock *svsk, return svc_sock_reclen(svsk); err_too_large: - net_notice_ratelimited("svc: %s %s RPC fragment too large: %d\n", - __func__, svsk->sk_xprt.xpt_server->sv_name, - svc_sock_reclen(svsk)); + net_notice_ratelimited("svc: %s oversized RPC fragment (%u octets) from %pISpc\n", + svsk->sk_xprt.xpt_server->sv_name, + svc_sock_reclen(svsk), + (struct sockaddr *)&svsk->sk_xprt.xpt_remote); svc_xprt_deferred_close(&svsk->sk_xprt); err_short: return -EAGAIN; @@ -1197,7 +1237,7 @@ err_noclose: * that the pages backing @xdr are unchanging. */ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, - rpc_fraghdr marker, int *sentp) + rpc_fraghdr marker) { struct msghdr msg = { .msg_flags = MSG_SPLICE_PAGES, @@ -1206,29 +1246,24 @@ static int svc_tcp_sendmsg(struct svc_sock *svsk, struct svc_rqst *rqstp, void *buf; int ret; - *sentp = 0; - /* The stream record marker is copied into a temporary page - * fragment buffer so that it can be included in rq_bvec. + * fragment buffer so that it can be included in sk_bvec. */ buf = page_frag_alloc(&svsk->sk_frag_cache, sizeof(marker), GFP_KERNEL); if (!buf) return -ENOMEM; memcpy(buf, &marker, sizeof(marker)); - bvec_set_virt(rqstp->rq_bvec, buf, sizeof(marker)); + bvec_set_virt(svsk->sk_bvec, buf, sizeof(marker)); - count = xdr_buf_to_bvec(rqstp->rq_bvec + 1, rqstp->rq_maxpages, + count = xdr_buf_to_bvec(svsk->sk_bvec + 1, rqstp->rq_maxpages, &rqstp->rq_res); - iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, svsk->sk_bvec, 1 + count, sizeof(marker) + rqstp->rq_res.len); ret = sock_sendmsg(svsk->sk_sock, &msg); page_frag_free(buf); - if (ret < 0) - return ret; - *sentp += ret; - return 0; + return ret; } /** @@ -1247,7 +1282,7 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) struct xdr_buf *xdr = &rqstp->rq_res; rpc_fraghdr marker = cpu_to_be32(RPC_LAST_STREAM_FRAGMENT | (u32)xdr->len); - int sent, err; + int sent; svc_tcp_release_ctxt(xprt, rqstp->rq_xprt_ctxt); rqstp->rq_xprt_ctxt = NULL; @@ -1255,9 +1290,9 @@ static int svc_tcp_sendto(struct svc_rqst *rqstp) mutex_lock(&xprt->xpt_mutex); if (svc_xprt_is_dead(xprt)) goto out_notconn; - err = svc_tcp_sendmsg(svsk, rqstp, marker, &sent); - trace_svcsock_tcp_send(xprt, err < 0 ? 
(long)err : sent); - if (err < 0 || sent != (xdr->len + sizeof(marker))) + sent = svc_tcp_sendmsg(svsk, rqstp, marker); + trace_svcsock_tcp_send(xprt, sent); + if (sent < 0 || sent != (xdr->len + sizeof(marker))) goto out_close; mutex_unlock(&xprt->xpt_mutex); return sent; @@ -1266,10 +1301,10 @@ out_notconn: mutex_unlock(&xprt->xpt_mutex); return -ENOTCONN; out_close: - pr_notice("rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", + pr_notice("rpc-srv/tcp: %s: %s %d when sending %zu bytes - shutting down socket\n", xprt->xpt_server->sv_name, - (err < 0) ? "got error" : "sent", - (err < 0) ? err : sent, xdr->len); + (sent < 0) ? "got error" : "sent", + sent, xdr->len + sizeof(marker)); svc_xprt_deferred_close(xprt); mutex_unlock(&xprt->xpt_mutex); return -EAGAIN; @@ -1328,6 +1363,7 @@ static void svc_tcp_init(struct svc_sock *svsk, struct svc_serv *serv) if (sk->sk_state == TCP_LISTEN) { strcpy(svsk->sk_xprt.xpt_remotebuf, "listener"); set_bit(XPT_LISTENER, &svsk->sk_xprt.xpt_flags); + set_bit(XPT_RPCB_UNREG, &svsk->sk_xprt.xpt_flags); sk->sk_data_ready = svc_tcp_listen_data_ready; set_bit(XPT_CONN, &svsk->sk_xprt.xpt_flags); } else { @@ -1368,6 +1404,20 @@ void svc_sock_update_bufs(struct svc_serv *serv) spin_unlock_bh(&serv->sv_lock); } +static int svc_sock_sendpages(struct svc_serv *serv, struct socket *sock, int flags) +{ + switch (sock->type) { + case SOCK_STREAM: + /* +1 for TCP record marker */ + if (flags & SVC_SOCK_TEMPORARY) + return svc_serv_maxpages(serv) + 1; + return 0; + case SOCK_DGRAM: + return SUNRPC_MAX_UDP_SENDPAGES; + } + return -EINVAL; +} + /* * Initialize socket for RPC use and create svc_sock struct */ @@ -1378,12 +1428,26 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, struct svc_sock *svsk; struct sock *inet; int pmap_register = !(flags & SVC_SOCK_ANONYMOUS); + int sendpages; unsigned long pages; + sendpages = svc_sock_sendpages(serv, sock, flags); + if (sendpages < 0) + return ERR_PTR(sendpages); + pages = svc_serv_maxpages(serv); svsk = kzalloc(struct_size(svsk, sk_pages, pages), GFP_KERNEL); if (!svsk) return ERR_PTR(-ENOMEM); + + if (sendpages) { + svsk->sk_bvec = kcalloc(sendpages, sizeof(*svsk->sk_bvec), GFP_KERNEL); + if (!svsk->sk_bvec) { + kfree(svsk); + return ERR_PTR(-ENOMEM); + } + } + svsk->sk_maxpages = pages; inet = sock->sk; @@ -1395,6 +1459,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv, inet->sk_protocol, ntohs(inet_sk(inet)->inet_sport)); if (err < 0) { + kfree(svsk->sk_bvec); kfree(svsk); return ERR_PTR(err); } @@ -1533,7 +1598,7 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, ip6_sock_set_v6only(sock->sk); if (type == SOCK_STREAM) sock->sk->sk_reuse = SK_CAN_REUSE; /* allow address reuse */ - error = kernel_bind(sock, sin, len); + error = kernel_bind(sock, (struct sockaddr_unsized *)sin, len); if (error < 0) goto bummer; @@ -1612,5 +1677,6 @@ static void svc_sock_free(struct svc_xprt *xprt) sock_release(sock); page_frag_cache_drain(&svsk->sk_frag_cache); + kfree(svsk->sk_bvec); kfree(svsk); } diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 09434e1143c5..8b01b7ae2690 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -389,7 +389,7 @@ static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, saddr = (struct sockaddr *)&xprt->addr; port = rpc_get_port(saddr); - /* buf_len is the len until the first occurence of either + /* buf_len is the len until the first occurrence of either * '\n' or '\0' */ buf_len = strcspn(buf, "\n"); diff --git 
a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index 1478c41c7e9d..3aac1456e23e 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -190,7 +190,7 @@ rpcrdma_alloc_sparse_pages(struct xdr_buf *buf) ppages = buf->pages + (buf->page_base >> PAGE_SHIFT); while (len > 0) { if (!*ppages) - *ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN); + *ppages = alloc_page(GFP_NOWAIT); if (!*ppages) return -ENOBUFS; ppages++; diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 3d7f1413df02..b7b318ad25c4 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -591,12 +591,18 @@ static void svc_rdma_detach(struct svc_xprt *xprt) rdma_disconnect(rdma->sc_cm_id); } -static void __svc_rdma_free(struct work_struct *work) +/** + * svc_rdma_free - Release class-specific transport resources + * @xprt: Generic svc transport object + */ +static void svc_rdma_free(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = - container_of(work, struct svcxprt_rdma, sc_work); + container_of(xprt, struct svcxprt_rdma, sc_xprt); struct ib_device *device = rdma->sc_cm_id->device; + might_sleep(); + /* This blocks until the Completion Queues are empty */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) ib_drain_qp(rdma->sc_qp); @@ -629,15 +635,6 @@ static void __svc_rdma_free(struct work_struct *work) kfree(rdma); } -static void svc_rdma_free(struct svc_xprt *xprt) -{ - struct svcxprt_rdma *rdma = - container_of(xprt, struct svcxprt_rdma, sc_xprt); - - INIT_WORK(&rdma->sc_work, __svc_rdma_free); - schedule_work(&rdma->sc_work); -} - static int svc_rdma_has_wspace(struct svc_xprt *xprt) { struct svcxprt_rdma *rdma = diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index c5f7bbf5775f..2e1fe6013361 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -407,9 +407,9 @@ xs_sock_recv_cmsg(struct socket *sock, unsigned int *msg_flags, int flags) iov_iter_kvec(&msg.msg_iter, ITER_DEST, &alert_kvec, 1, alert_kvec.iov_len); ret = sock_recvmsg(sock, &msg, flags); - if (ret > 0 && - tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) { - iov_iter_revert(&msg.msg_iter, ret); + if (ret > 0) { + if (tls_get_record_type(sock->sk, &u.cmsg) == TLS_RECORD_TYPE_ALERT) + iov_iter_revert(&msg.msg_iter, ret); ret = xs_sock_process_cmsg(sock, &msg, msg_flags, &u.cmsg, -EAGAIN); } @@ -1845,8 +1845,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); do { rpc_set_port((struct sockaddr *)&myaddr, port); - err = kernel_bind(sock, (struct sockaddr *)&myaddr, - transport->xprt.addrlen); + err = kernel_bind(sock, (struct sockaddr_unsized *)&myaddr, + transport->xprt.addrlen); if (err == 0) { if (transport->xprt.reuseport) transport->srcport = port; @@ -2005,7 +2005,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, xs_stream_start_connect(transport); - return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); + return kernel_connect(sock, (struct sockaddr_unsized *)xs_addr(xprt), xprt->addrlen, 0); } /** @@ -2405,7 +2405,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) /* Tell the socket layer to start connecting... 
*/ set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); - return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); + return kernel_connect(sock, (struct sockaddr_unsized *)xs_addr(xprt), + xprt->addrlen, O_NONBLOCK); } /** diff --git a/net/tipc/addr.c b/net/tipc/addr.c index fd0796269eed..6f5c54cbf8d9 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -79,7 +79,7 @@ void tipc_set_node_addr(struct net *net, u32 addr) pr_info("Node number set to %u\n", addr); } -char *tipc_nodeid2string(char *str, u8 *id) +int tipc_nodeid2string(char *str, u8 *id) { int i; u8 c; @@ -109,7 +109,7 @@ char *tipc_nodeid2string(char *str, u8 *id) if (i == NODE_ID_LEN) { memcpy(str, id, NODE_ID_LEN); str[NODE_ID_LEN] = 0; - return str; + return i; } /* Translate to hex string */ @@ -120,5 +120,5 @@ char *tipc_nodeid2string(char *str, u8 *id) for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--) str[i] = 0; - return str; + return i + 1; } diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 93f82398283d..a113cf7e1f89 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -130,6 +130,6 @@ static inline int in_own_node(struct net *net, u32 addr) bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr); void tipc_set_node_id(struct net *net, u8 *id); void tipc_set_node_addr(struct net *net, u32 addr); -char *tipc_nodeid2string(char *str, u8 *id); +int tipc_nodeid2string(char *str, u8 *id); #endif diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index ea5bb131ebd0..751904f10aab 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1797,7 +1797,7 @@ exit: * @b: bearer where the message has been received * * If the decryption is successful, the decrypted skb is returned directly or - * as the callback, the encryption header and auth tag will be trimed out + * as the callback, the encryption header and auth tag will be trimmed out * before forwarding to tipc_rcv() via the tipc_crypto_rcv_complete(). * Otherwise, the skb will be freed! 
* Note: RX key(s) can be re-aligned, or in case of no key suitable, TX diff --git a/net/tipc/link.c b/net/tipc/link.c index 3ee44d731700..931f55f781a1 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -495,11 +495,9 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, /* Set link name for unicast links only */ if (peer_id) { - tipc_nodeid2string(self_str, tipc_own_id(net)); - if (strlen(self_str) > 16) + if (tipc_nodeid2string(self_str, tipc_own_id(net)) > NODE_ID_LEN) sprintf(self_str, "%x", self); - tipc_nodeid2string(peer_str, peer_id); - if (strlen(peer_str) > 16) + if (tipc_nodeid2string(peer_str, peer_id) > NODE_ID_LEN) sprintf(peer_str, "%x", peer); } /* Peer i/f name will be completed by reset/activate message */ @@ -570,8 +568,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, u8 *peer_id, if (peer_id) { char peer_str[NODE_ID_STR_LEN] = {0,}; - tipc_nodeid2string(peer_str, peer_id); - if (strlen(peer_str) > 16) + if (tipc_nodeid2string(peer_str, peer_id) > NODE_ID_LEN) sprintf(peer_str, "%x", peer); /* Broadcast receiver link name: "broadcast-link:<peer>" */ snprintf(l->name, sizeof(l->name), "%s:%s", tipc_bclink_name, diff --git a/net/tipc/net.c b/net/tipc/net.c index 0e95572e56b4..7e65d0b0c4a8 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -145,7 +145,9 @@ void tipc_net_finalize_work(struct work_struct *work) { struct tipc_net *tn = container_of(work, struct tipc_net, work); + rtnl_lock(); tipc_net_finalize(tipc_link_net(tn->bcl), tn->trial_addr); + rtnl_unlock(); } void tipc_net_stop(struct net *net) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e028bf658499..817b07d95a91 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -710,7 +710,7 @@ int tipc_sk_bind(struct socket *sock, struct sockaddr *skaddr, int alen) return res; } -static int tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen) +static int tipc_bind(struct socket *sock, struct sockaddr_unsized *skaddr, int alen) { struct tipc_uaddr *ua = (struct tipc_uaddr *)skaddr; u32 atype = ua->addrtype; @@ -726,7 +726,7 @@ static int tipc_bind(struct socket *sock, struct sockaddr *skaddr, int alen) return -EACCES; } } - return tipc_sk_bind(sock, skaddr, alen); + return tipc_sk_bind(sock, (struct sockaddr *)skaddr, alen); } /** @@ -2366,7 +2366,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) { trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload2!"); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = TIPC_ERR_OVERLOAD; } @@ -2458,7 +2458,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload!"); /* Overload => reject message back to sender */ onode = tipc_own_addr(sock_net(sk)); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) { trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_ALL, "@sk_enqueue!"); @@ -2565,7 +2565,7 @@ static bool tipc_sockaddr_is_sane(struct sockaddr_tipc *addr) * * Return: 0 on success, errno otherwise */ -static int tipc_connect(struct socket *sock, struct sockaddr *dest, +static int tipc_connect(struct socket *sock, struct sockaddr_unsized *dest, int destlen, int flags) { struct sock *sk = sock->sk; @@ -3031,10 +3031,8 @@ static void tipc_sk_remove(struct tipc_sock *tsk) struct sock *sk = &tsk->sk; struct tipc_net *tn = net_generic(sock_net(sk), tipc_net_id); - if (!rhashtable_remove_fast(&tn->sk_rht, &tsk->node, 
tsk_rht_params)) { - WARN_ON(refcount_read(&sk->sk_refcnt) == 1); + if (!rhashtable_remove_fast(&tn->sk_rht, &tsk->node, tsk_rht_params)) __sock_put(sk); - } } static const struct rhashtable_params tsk_rht_params = { @@ -3657,7 +3655,7 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ, skb_queue_len(&sk->sk_write_queue)) || nla_put_u32(skb, TIPC_NLA_SOCK_STAT_DROP, - atomic_read(&sk->sk_drops))) + sk_drops_read(sk))) goto stat_msg_cancel; if (tsk->cong_link_cnt && diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index ffe577bf6b51..aad7f96b6009 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -57,7 +57,7 @@ * @conn_idr: identifier set of connection * @idr_lock: protect the connection identifier set * @idr_in_use: amount of allocated identifier entry - * @net: network namspace instance + * @net: network namespace instance * @awork: accept work item * @rcv_wq: receive workqueue * @send_wq: send workqueue @@ -83,7 +83,7 @@ struct tipc_topsrv { * @sock: socket handler associated with connection * @flags: indicates connection state * @server: pointer to connected server - * @sub_list: lsit to all pertaing subscriptions + * @sub_list: list to all pertaining subscriptions * @sub_lock: lock protecting the subscription list * @rwork: receive work item * @outqueue: pointer to first outbound message in queue diff --git a/net/tls/tls.h b/net/tls/tls.h index 774859b63f0d..2f86baeb71fc 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -128,8 +128,9 @@ struct tls_rec { char aad_space[TLS_AAD_SPACE_SIZE]; u8 iv_data[TLS_MAX_IV_SIZE]; + + /* Must be last --ends in a flexible-array member. */ struct aead_request aead_req; - u8 aead_req_ctx[]; }; int __net_init tls_proc_init(struct net *net); @@ -141,6 +142,7 @@ void update_sk_prot(struct sock *sk, struct tls_context *ctx); int wait_on_pending_writer(struct sock *sk, long *timeo); void tls_err_abort(struct sock *sk, int err); +void tls_strp_abort_strp(struct tls_strparser *strp, int err); int init_prot_info(struct tls_prot_info *prot, const struct tls_crypto_info *crypto_info, @@ -196,7 +198,7 @@ void tls_strp_msg_done(struct tls_strparser *strp); int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb); void tls_rx_msg_ready(struct tls_strparser *strp); -void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh); +bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh); int tls_strp_msg_cow(struct tls_sw_context_rx *ctx); struct sk_buff *tls_strp_msg_detach(struct tls_sw_context_rx *ctx); int tls_strp_msg_hold(struct tls_strparser *strp, struct sk_buff_head *dst); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index f672a62a9a52..82ea407e520a 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -123,17 +123,19 @@ static void tls_device_queue_ctx_destruction(struct tls_context *ctx) /* We assume that the socket is already connected */ static struct net_device *get_netdev_for_sock(struct sock *sk) { - struct dst_entry *dst = sk_dst_get(sk); - struct net_device *netdev = NULL; + struct net_device *dev, *lowest_dev = NULL; + struct dst_entry *dst; - if (likely(dst)) { - netdev = netdev_sk_get_lowest_dev(dst->dev, sk); - dev_hold(netdev); + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? 
dst_dev_rcu(dst) : NULL; + if (likely(dev)) { + lowest_dev = netdev_sk_get_lowest_dev(dev, sk); + dev_hold(lowest_dev); } + rcu_read_unlock(); - dst_release(dst); - - return netdev; + return lowest_dev; } static void destroy_record(struct tls_record_info *record) @@ -371,7 +373,8 @@ static int tls_do_allocation(struct sock *sk, if (!offload_ctx->open_record) { if (unlikely(!skb_page_frag_refill(prepend_size, pfrag, sk->sk_allocation))) { - READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk); + if (!sk->sk_bypass_prot_mem) + READ_ONCE(sk->sk_prot)->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); return -ENOMEM; } @@ -459,7 +462,7 @@ static int tls_push_data(struct sock *sk, /* TLS_HEADER_SIZE is not counted as part of the TLS record, and * we need to leave room for an authentication tag. */ - max_open_record_len = TLS_MAX_PAYLOAD_SIZE + + max_open_record_len = tls_ctx->tx_max_payload_len + prot->prepend_size; do { rc = tls_do_allocation(sk, ctx, pfrag, prot->prepend_size); @@ -721,8 +724,10 @@ tls_device_rx_resync_async(struct tls_offload_resync_async *resync_async, /* shouldn't get to wraparound: * too long in async stage, something bad happened */ - if (WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX)) + if (WARN_ON_ONCE(resync_async->rcd_delta == USHRT_MAX)) { + tls_offload_rx_resync_async_request_cancel(resync_async); return false; + } /* asynchronous stage: log all headers seq such that * req_seq <= seq <= end_seq, and wait for real resync request @@ -1410,7 +1415,7 @@ int __init tls_device_init(void) if (!dummy_page) return -ENOMEM; - destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0); + destruct_wq = alloc_workqueue("ktls_device_destruct", WQ_PERCPU, 0); if (!destruct_wq) { err = -ENOMEM; goto err_free_dummy; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index a3ccb3135e51..56ce0bc8317b 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -255,12 +255,9 @@ int tls_process_cmsg(struct sock *sk, struct msghdr *msg, if (msg->msg_flags & MSG_MORE) return -EINVAL; - rc = tls_handle_open_record(sk, msg->msg_flags); - if (rc) - return rc; - *record_type = *(unsigned char *)CMSG_DATA(cmsg); - rc = 0; + + rc = tls_handle_open_record(sk, msg->msg_flags); break; default: return -EINVAL; @@ -544,6 +541,28 @@ static int do_tls_getsockopt_no_pad(struct sock *sk, char __user *optval, return 0; } +static int do_tls_getsockopt_tx_payload_len(struct sock *sk, char __user *optval, + int __user *optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + u16 payload_len = ctx->tx_max_payload_len; + int len; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < sizeof(payload_len)) + return -EINVAL; + + if (put_user(sizeof(payload_len), optlen)) + return -EFAULT; + + if (copy_to_user(optval, &payload_len, sizeof(payload_len))) + return -EFAULT; + + return 0; +} + static int do_tls_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) { @@ -563,6 +582,9 @@ static int do_tls_getsockopt(struct sock *sk, int optname, case TLS_RX_EXPECT_NO_PAD: rc = do_tls_getsockopt_no_pad(sk, optval, optlen); break; + case TLS_TX_MAX_PAYLOAD_LEN: + rc = do_tls_getsockopt_tx_payload_len(sk, optval, optlen); + break; default: rc = -ENOPROTOOPT; break; @@ -812,6 +834,32 @@ static int do_tls_setsockopt_no_pad(struct sock *sk, sockptr_t optval, return rc; } +static int do_tls_setsockopt_tx_payload_len(struct sock *sk, sockptr_t optval, + unsigned int optlen) +{ + struct tls_context *ctx = tls_get_ctx(sk); + struct tls_sw_context_tx *sw_ctx = tls_sw_ctx_tx(ctx); 
+ u16 value; + bool tls_13 = ctx->prot_info.version == TLS_1_3_VERSION; + + if (sw_ctx && sw_ctx->open_rec) + return -EBUSY; + + if (sockptr_is_null(optval) || optlen != sizeof(value)) + return -EINVAL; + + if (copy_from_sockptr(&value, optval, sizeof(value))) + return -EFAULT; + + if (value < TLS_MIN_RECORD_SIZE_LIM - (tls_13 ? 1 : 0) || + value > TLS_MAX_PAYLOAD_SIZE) + return -EINVAL; + + ctx->tx_max_payload_len = value; + + return 0; +} + static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { @@ -833,6 +881,11 @@ static int do_tls_setsockopt(struct sock *sk, int optname, sockptr_t optval, case TLS_RX_EXPECT_NO_PAD: rc = do_tls_setsockopt_no_pad(sk, optval, optlen); break; + case TLS_TX_MAX_PAYLOAD_LEN: + lock_sock(sk); + rc = do_tls_setsockopt_tx_payload_len(sk, optval, optlen); + release_sock(sk); + break; default: rc = -ENOPROTOOPT; break; @@ -1022,6 +1075,7 @@ static int tls_init(struct sock *sk) ctx->tx_conf = TLS_BASE; ctx->rx_conf = TLS_BASE; + ctx->tx_max_payload_len = TLS_MAX_PAYLOAD_SIZE; update_sk_prot(sk, ctx); out: write_unlock_bh(&sk->sk_callback_lock); @@ -1111,6 +1165,12 @@ static int tls_get_info(struct sock *sk, struct sk_buff *skb, bool net_admin) goto nla_failure; } + err = nla_put_u16(skb, TLS_INFO_TX_MAX_PAYLOAD_LEN, + ctx->tx_max_payload_len); + + if (err) + goto nla_failure; + rcu_read_unlock(); nla_nest_end(skb, start); return 0; @@ -1132,6 +1192,7 @@ static size_t tls_get_info_size(const struct sock *sk, bool net_admin) nla_total_size(sizeof(u16)) + /* TLS_INFO_TXCONF */ nla_total_size(0) + /* TLS_INFO_ZC_RO_TX */ nla_total_size(0) + /* TLS_INFO_RX_NO_PAD */ + nla_total_size(sizeof(u16)) + /* TLS_INFO_TX_MAX_PAYLOAD_LEN */ 0; return size; diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c index 367666aa07b8..4012c4372d4c 100644 --- a/net/tls/tls_proc.c +++ b/net/tls/tls_proc.c @@ -27,17 +27,19 @@ static const struct snmp_mib tls_mib_list[] = { SNMP_MIB_ITEM("TlsTxRekeyOk", LINUX_MIB_TLSTXREKEYOK), SNMP_MIB_ITEM("TlsTxRekeyError", LINUX_MIB_TLSTXREKEYERROR), SNMP_MIB_ITEM("TlsRxRekeyReceived", LINUX_MIB_TLSRXREKEYRECEIVED), - SNMP_MIB_SENTINEL }; static int tls_statistics_seq_show(struct seq_file *seq, void *v) { - unsigned long buf[LINUX_MIB_TLSMAX] = {}; + unsigned long buf[ARRAY_SIZE(tls_mib_list)]; + const int cnt = ARRAY_SIZE(tls_mib_list); struct net *net = seq->private; int i; - snmp_get_cpu_field_batch(buf, tls_mib_list, net->mib.tls_statistics); - for (i = 0; tls_mib_list[i].name; i++) + memset(buf, 0, sizeof(buf)); + snmp_get_cpu_field_batch_cnt(buf, tls_mib_list, cnt, + net->mib.tls_statistics); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", tls_mib_list[i].name, buf[i]); return 0; diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 095cf31bae0b..98e12f0ff57e 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -13,7 +13,7 @@ static struct workqueue_struct *tls_strp_wq; -static void tls_strp_abort_strp(struct tls_strparser *strp, int err) +void tls_strp_abort_strp(struct tls_strparser *strp, int err) { if (strp->stopped) return; @@ -211,11 +211,17 @@ static int tls_strp_copyin_frag(struct tls_strparser *strp, struct sk_buff *skb, struct sk_buff *in_skb, unsigned int offset, size_t in_len) { + unsigned int nfrag = skb->len / PAGE_SIZE; size_t len, chunk; skb_frag_t *frag; int sz; - frag = &skb_shinfo(skb)->frags[skb->len / PAGE_SIZE]; + if (unlikely(nfrag >= skb_shinfo(skb)->nr_frags)) { + DEBUG_NET_WARN_ON_ONCE(1); + return -EMSGSIZE; + } + + frag = 
&skb_shinfo(skb)->frags[nfrag]; len = in_len; /* First make sure we got the header */ @@ -475,7 +481,7 @@ static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len) strp->stm.offset = offset; } -void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) +bool tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) { struct strp_msg *rxm; struct tls_msg *tlm; @@ -484,8 +490,11 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) DEBUG_NET_WARN_ON_ONCE(!strp->stm.full_len); if (!strp->copy_mode && force_refresh) { - if (WARN_ON(tcp_inq(strp->sk) < strp->stm.full_len)) - return; + if (unlikely(tcp_inq(strp->sk) < strp->stm.full_len)) { + WRITE_ONCE(strp->msg_ready, 0); + memset(&strp->stm, 0, sizeof(strp->stm)); + return false; + } tls_strp_load_anchor_with_queue(strp, strp->stm.full_len); } @@ -495,6 +504,8 @@ void tls_strp_msg_load(struct tls_strparser *strp, bool force_refresh) rxm->offset = strp->stm.offset; tlm = tls_msg(strp->anchor); tlm->control = strp->mark; + + return true; } /* Called with lock held on lower socket */ @@ -515,10 +526,8 @@ static int tls_strp_read_sock(struct tls_strparser *strp) tls_strp_load_anchor_with_queue(strp, inq); if (!strp->stm.full_len) { sz = tls_rx_msg_size(strp, strp->anchor); - if (sz < 0) { - tls_strp_abort_strp(strp, sz); + if (sz < 0) return sz; - } strp->stm.full_len = sz; diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 549d1ea01a72..9937d4c810f2 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1054,7 +1054,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, if (ret == -EINPROGRESS) num_async++; else if (ret != -EAGAIN) - goto send_end; + goto end; } } @@ -1079,7 +1079,7 @@ static int tls_sw_sendmsg_locked(struct sock *sk, struct msghdr *msg, orig_size = msg_pl->sg.size; full_record = false; try_to_copy = msg_data_left(msg); - record_room = TLS_MAX_PAYLOAD_SIZE - msg_pl->sg.size; + record_room = tls_ctx->tx_max_payload_len - msg_pl->sg.size; if (try_to_copy >= record_room) { try_to_copy = record_room; full_record = true; @@ -1112,8 +1112,11 @@ alloc_encrypted: goto send_end; tls_ctx->pending_open_record_frags = true; - if (sk_msg_full(msg_pl)) + if (sk_msg_full(msg_pl)) { full_record = true; + sk_msg_trim(sk, msg_en, + msg_pl->sg.size + prot->overhead_size); + } if (full_record || eor) goto copied; @@ -1149,6 +1152,13 @@ alloc_encrypted: } else if (ret != -EAGAIN) goto send_end; } + + /* Transmit if any encryptions have completed */ + if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { + cancel_delayed_work(&ctx->tx_work.work); + tls_tx_records(sk, msg->msg_flags); + } + continue; rollback_iter: copied -= try_to_copy; @@ -1204,6 +1214,12 @@ copied: goto send_end; } } + + /* Transmit if any encryptions have completed */ + if (test_and_clear_bit(BIT_TX_SCHEDULED, &ctx->tx_bitmask)) { + cancel_delayed_work(&ctx->tx_work.work); + tls_tx_records(sk, msg->msg_flags); + } } continue; @@ -1223,8 +1239,9 @@ trim_sgl: goto alloc_encrypted; } +send_end: if (!num_async) { - goto send_end; + goto end; } else if (num_zc || eor) { int err; @@ -1242,7 +1259,7 @@ trim_sgl: tls_tx_records(sk, msg->msg_flags); } -send_end: +end: ret = sk_stream_error(sk, msg->msg_flags, ret); return copied > 0 ? 
copied : ret; } @@ -1384,7 +1401,8 @@ tls_rx_rec_wait(struct sock *sk, struct sk_psock *psock, bool nonblock, return sock_intr_errno(timeo); } - tls_strp_msg_load(&ctx->strp, released); + if (unlikely(!tls_strp_msg_load(&ctx->strp, released))) + return tls_rx_rec_wait(sk, psock, nonblock, false); return 1; } @@ -1636,8 +1654,10 @@ static int tls_decrypt_sg(struct sock *sk, struct iov_iter *out_iov, if (unlikely(darg->async)) { err = tls_strp_msg_hold(&ctx->strp, &ctx->async_hold); - if (err) - __skb_queue_tail(&ctx->async_hold, darg->skb); + if (err) { + err = tls_decrypt_async_wait(ctx); + darg->async = false; + } return err; } @@ -1807,6 +1827,9 @@ int decrypt_skb(struct sock *sk, struct scatterlist *sgout) return tls_decrypt_sg(sk, NULL, sgout, &darg); } +/* All records returned from a recvmsg() call must have the same type. + * 0 is not a valid content type. Use it as "no type reported, yet". + */ static int tls_record_content_type(struct msghdr *msg, struct tls_msg *tlm, u8 *control) { @@ -2050,8 +2073,10 @@ int tls_sw_recvmsg(struct sock *sk, if (err < 0) goto end; + /* process_rx_list() will set @control if it processed any records */ copied = err; - if (len <= copied || (copied && control != TLS_RECORD_TYPE_DATA) || rx_more) + if (len <= copied || rx_more || + (control && control != TLS_RECORD_TYPE_DATA)) goto end; target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); @@ -2468,8 +2493,7 @@ int tls_rx_msg_size(struct tls_strparser *strp, struct sk_buff *skb) return data_len + TLS_HEADER_SIZE; read_failure: - tls_err_abort(strp->sk, ret); - + tls_strp_abort_strp(strp, ret); return ret; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6d7c110814ff..55cdebfa0da0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -733,19 +733,7 @@ static void unix_release_sock(struct sock *sk, int embrion) /* ---- Socket is dead now and most probably destroyed ---- */ - /* - * Fixme: BSD difference: In BSD all sockets connected to us get - * ECONNRESET and we die on the spot. In Linux we behave - * like files and pipes do and wait for the last - * dereference. - * - * Can't we simply set sock->err? - * - * What the above comment does talk about? 
--ANK(980817) - */ - - if (READ_ONCE(unix_tot_inflight)) - unix_gc(); /* Garbage collect fds */ + unix_schedule_gc(NULL); } struct unix_peercred { @@ -854,8 +842,8 @@ out: } static int unix_release(struct socket *); -static int unix_bind(struct socket *, struct sockaddr *, int); -static int unix_stream_connect(struct socket *, struct sockaddr *, +static int unix_bind(struct socket *, struct sockaddr_unsized *, int); +static int unix_stream_connect(struct socket *, struct sockaddr_unsized *, int addr_len, int flags); static int unix_socketpair(struct socket *, struct socket *); static int unix_accept(struct socket *, struct socket *, struct proto_accept_arg *arg); @@ -877,7 +865,7 @@ static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); -static int unix_dgram_connect(struct socket *, struct sockaddr *, +static int unix_dgram_connect(struct socket *, struct sockaddr_unsized *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t, @@ -1210,25 +1198,16 @@ static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, unix_mkname_bsd(sunaddr, addr_len); if (flags & SOCK_COREDUMP) { - const struct cred *cred; - struct cred *kcred; struct path root; - kcred = prepare_kernel_cred(&init_task); - if (!kcred) { - err = -ENOMEM; - goto fail; - } - task_lock(&init_task); get_fs_root(init_task.fs, &root); task_unlock(&init_task); - cred = override_creds(kcred); - err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path, - LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS | - LOOKUP_NO_MAGICLINKS, &path); - put_cred(revert_creds(cred)); + scoped_with_kernel_creds() + err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path, + LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS | + LOOKUP_NO_MAGICLINKS, &path); path_put(&root); if (err) goto fail; @@ -1387,7 +1366,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, * Get the parent directory, calculate the hash for last * component. */ - dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0); + dentry = start_creating_path(AT_FDCWD, addr->name->sun_path, &parent, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out; @@ -1399,7 +1378,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, idmap = mnt_idmap(parent.mnt); err = security_path_mknod(&parent, dentry, mode, 0); if (!err) - err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0); + err = vfs_mknod(idmap, d_inode(parent.dentry), dentry, mode, 0, NULL); if (err) goto out_path; err = mutex_lock_interruptible(&u->bindlock); @@ -1417,7 +1396,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, unix_table_double_unlock(net, old_hash, new_hash); unix_insert_bsd_socket(sk); mutex_unlock(&u->bindlock); - done_path_create(&parent, dentry); + end_creating_path(&parent, dentry); return 0; out_unlock: @@ -1427,7 +1406,7 @@ out_unlink: /* failed after successful mknod? unlink what we'd created... */ vfs_unlink(idmap, d_inode(parent.dentry), dentry, NULL); out_path: - done_path_create(&parent, dentry); + end_creating_path(&parent, dentry); out: unix_release_addr(addr); return err == -EEXIST ? 
-EADDRINUSE : err; @@ -1477,7 +1456,7 @@ out: return err; } -static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int unix_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk; @@ -1523,7 +1502,7 @@ static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) unix_state_unlock(sk2); } -static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, +static int unix_dgram_connect(struct socket *sock, struct sockaddr_unsized *addr, int alen, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; @@ -1642,7 +1621,7 @@ static long unix_wait_for_peer(struct sock *other, long timeo) return timeo; } -static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, +static int unix_stream_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; @@ -2110,8 +2089,6 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) return err; - wait_for_unix_gc(scm.fp); - if (msg->msg_flags & MSG_OOB) { err = -EOPNOTSUPP; goto out; @@ -2405,8 +2382,6 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (err < 0) return err; - wait_for_unix_gc(scm.fp); - if (msg->msg_flags & MSG_OOB) { err = -EOPNOTSUPP; #if IS_ENABLED(CONFIG_AF_UNIX_OOB) @@ -2954,6 +2929,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, u = unix_sk(sk); +redo: /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ @@ -2965,7 +2941,6 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, struct sk_buff *skb, *last; int chunk; -redo: unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) { err = -ECONNRESET; @@ -3015,7 +2990,6 @@ again: goto out; } - mutex_lock(&u->iolock); goto redo; unlock: unix_state_unlock(sk); @@ -3286,9 +3260,6 @@ EXPORT_SYMBOL_GPL(unix_outq_len); static int unix_open_file(struct sock *sk) { - struct file *f; - int fd; - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) return -EPERM; @@ -3298,18 +3269,7 @@ static int unix_open_file(struct sock *sk) if (!unix_sk(sk)->path.dentry) return -ENOENT; - fd = get_unused_fd_flags(O_CLOEXEC); - if (fd < 0) - return fd; - - f = dentry_open(&unix_sk(sk)->path, O_PATH, current_cred()); - if (IS_ERR(f)) { - put_unused_fd(fd); - return PTR_ERR(f); - } - - fd_install(fd, f); - return fd; + return FD_ADD(O_CLOEXEC, dentry_open(&unix_sk(sk)->path, O_PATH, current_cred())); } static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) diff --git a/net/unix/af_unix.h b/net/unix/af_unix.h index 59db179df9bb..c4f1b2da363d 100644 --- a/net/unix/af_unix.h +++ b/net/unix/af_unix.h @@ -24,14 +24,12 @@ struct unix_skb_parms { #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) /* GC for SCM_RIGHTS */ -extern unsigned int unix_tot_inflight; void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); void unix_del_edges(struct scm_fp_list *fpl); void unix_update_edges(struct unix_sock *receiver); int unix_prepare_fpl(struct scm_fp_list *fpl); void unix_destroy_fpl(struct scm_fp_list *fpl); -void unix_gc(void); -void wait_for_unix_gc(struct scm_fp_list *fpl); +void unix_schedule_gc(struct user_struct *user); /* SOCK_DIAG */ long unix_inq_len(struct sock *sk); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 01e2b9452c75..78323d43e63e 100644 
--- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -121,8 +121,13 @@ static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) return edge->successor->vertex; } -static bool unix_graph_maybe_cyclic; -static bool unix_graph_grouped; +enum { + UNIX_GRAPH_NOT_CYCLIC, + UNIX_GRAPH_MAYBE_CYCLIC, + UNIX_GRAPH_CYCLIC, +}; + +static unsigned char unix_graph_state; static void unix_update_graph(struct unix_vertex *vertex) { @@ -132,8 +137,7 @@ static void unix_update_graph(struct unix_vertex *vertex) if (!vertex) return; - unix_graph_maybe_cyclic = true; - unix_graph_grouped = false; + WRITE_ONCE(unix_graph_state, UNIX_GRAPH_MAYBE_CYCLIC); } static LIST_HEAD(unix_unvisited_vertices); @@ -145,6 +149,7 @@ enum unix_vertex_index { }; static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1; +static unsigned long unix_vertex_max_scc_index = UNIX_VERTEX_INDEX_START; static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { @@ -153,6 +158,7 @@ static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) if (!vertex) { vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); vertex->index = unix_vertex_unvisited_index; + vertex->scc_index = ++unix_vertex_max_scc_index; vertex->out_degree = 0; INIT_LIST_HEAD(&vertex->edges); INIT_LIST_HEAD(&vertex->scc_entry); @@ -194,7 +200,6 @@ static void unix_free_vertices(struct scm_fp_list *fpl) } static DEFINE_SPINLOCK(unix_gc_lock); -unsigned int unix_tot_inflight; void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) { @@ -220,7 +225,6 @@ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) } while (i < fpl->count_unix); receiver->scm_stat.nr_unix_fds += fpl->count_unix; - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix); out: WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); @@ -251,7 +255,6 @@ void unix_del_edges(struct scm_fp_list *fpl) receiver = fpl->edges[0].successor; receiver->scm_stat.nr_unix_fds -= fpl->count_unix; } - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); out: WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); @@ -297,6 +300,8 @@ int unix_prepare_fpl(struct scm_fp_list *fpl) if (!fpl->edges) goto err; + unix_schedule_gc(fpl->user); + return 0; err: @@ -402,9 +407,11 @@ static bool unix_scc_cyclic(struct list_head *scc) static LIST_HEAD(unix_visited_vertices); static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; -static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index, - struct sk_buff_head *hitlist) +static unsigned long __unix_walk_scc(struct unix_vertex *vertex, + unsigned long *last_index, + struct sk_buff_head *hitlist) { + unsigned long cyclic_sccs = 0; LIST_HEAD(vertex_stack); struct unix_edge *edge; LIST_HEAD(edge_stack); @@ -489,10 +496,15 @@ prev_vertex: scc_dead = unix_vertex_dead(v); } - if (scc_dead) + if (scc_dead) { unix_collect_skb(&scc, hitlist); - else if (!unix_graph_maybe_cyclic) - unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + } else { + if (unix_vertex_max_scc_index < vertex->scc_index) + unix_vertex_max_scc_index = vertex->scc_index; + + if (unix_scc_cyclic(&scc)) + cyclic_sccs++; + } list_del(&scc); } @@ -500,13 +512,18 @@ prev_vertex: /* Need backtracking ? 
*/ if (!list_empty(&edge_stack)) goto prev_vertex; + + return cyclic_sccs; } +static unsigned long unix_graph_cyclic_sccs; + static void unix_walk_scc(struct sk_buff_head *hitlist) { unsigned long last_index = UNIX_VERTEX_INDEX_START; + unsigned long cyclic_sccs = 0; - unix_graph_maybe_cyclic = false; + unix_vertex_max_scc_index = UNIX_VERTEX_INDEX_START; /* Visit every vertex exactly once. * __unix_walk_scc() moves visited vertices to unix_visited_vertices. @@ -515,18 +532,20 @@ static void unix_walk_scc(struct sk_buff_head *hitlist) struct unix_vertex *vertex; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); - __unix_walk_scc(vertex, &last_index, hitlist); + cyclic_sccs += __unix_walk_scc(vertex, &last_index, hitlist); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); - unix_graph_grouped = true; + WRITE_ONCE(unix_graph_cyclic_sccs, cyclic_sccs); + WRITE_ONCE(unix_graph_state, + cyclic_sccs ? UNIX_GRAPH_CYCLIC : UNIX_GRAPH_NOT_CYCLIC); } static void unix_walk_scc_fast(struct sk_buff_head *hitlist) { - unix_graph_maybe_cyclic = false; + unsigned long cyclic_sccs = unix_graph_cyclic_sccs; while (!list_empty(&unix_unvisited_vertices)) { struct unix_vertex *vertex; @@ -543,34 +562,38 @@ static void unix_walk_scc_fast(struct sk_buff_head *hitlist) scc_dead = unix_vertex_dead(vertex); } - if (scc_dead) + if (scc_dead) { + cyclic_sccs--; unix_collect_skb(&scc, hitlist); - else if (!unix_graph_maybe_cyclic) - unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + } list_del(&scc); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); + + WRITE_ONCE(unix_graph_cyclic_sccs, cyclic_sccs); + WRITE_ONCE(unix_graph_state, + cyclic_sccs ? UNIX_GRAPH_CYCLIC : UNIX_GRAPH_NOT_CYCLIC); } static bool gc_in_progress; -static void __unix_gc(struct work_struct *work) +static void unix_gc(struct work_struct *work) { struct sk_buff_head hitlist; struct sk_buff *skb; spin_lock(&unix_gc_lock); - if (!unix_graph_maybe_cyclic) { + if (unix_graph_state == UNIX_GRAPH_NOT_CYCLIC) { spin_unlock(&unix_gc_lock); goto skip_gc; } __skb_queue_head_init(&hitlist); - if (unix_graph_grouped) + if (unix_graph_state == UNIX_GRAPH_CYCLIC) unix_walk_scc_fast(&hitlist); else unix_walk_scc(&hitlist); @@ -587,36 +610,27 @@ skip_gc: WRITE_ONCE(gc_in_progress, false); } -static DECLARE_WORK(unix_gc_work, __unix_gc); - -void unix_gc(void) -{ - WRITE_ONCE(gc_in_progress, true); - queue_work(system_unbound_wq, &unix_gc_work); -} +static DECLARE_WORK(unix_gc_work, unix_gc); -#define UNIX_INFLIGHT_TRIGGER_GC 16000 -#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) +#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) -void wait_for_unix_gc(struct scm_fp_list *fpl) +void unix_schedule_gc(struct user_struct *user) { - /* If number of inflight sockets is insane, - * force a garbage collect right now. - * - * Paired with the WRITE_ONCE() in unix_inflight(), - * unix_notinflight(), and __unix_gc(). - */ - if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && - !READ_ONCE(gc_in_progress)) - unix_gc(); + if (READ_ONCE(unix_graph_state) == UNIX_GRAPH_NOT_CYCLIC) + return; /* Penalise users who want to send AF_UNIX sockets * but whose sockets have not been received yet. 
*/ - if (!fpl || !fpl->count_unix || - READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) + if (user && + READ_ONCE(user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) return; - if (READ_ONCE(gc_in_progress)) + if (!READ_ONCE(gc_in_progress)) { + WRITE_ONCE(gc_in_progress, true); + queue_work(system_dfl_wq, &unix_gc_work); + } + + if (user && READ_ONCE(unix_graph_cyclic_sccs)) flush_work(&unix_gc_work); } diff --git a/net/vmw_vsock/Kconfig b/net/vmw_vsock/Kconfig index 56356d2980c8..8e803c4828c4 100644 --- a/net/vmw_vsock/Kconfig +++ b/net/vmw_vsock/Kconfig @@ -72,7 +72,7 @@ config VIRTIO_VSOCKETS_COMMON config HYPERV_VSOCKETS tristate "Hyper-V transport for Virtual Sockets" - depends on VSOCKETS && HYPERV + depends on VSOCKETS && HYPERV_VMBUS help This module implements a Hyper-V transport for Virtual Sockets. diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index ead6a3c14b87..adcba1b7bf74 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -487,12 +487,26 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) goto err; } - if (vsk->transport) { - if (vsk->transport == new_transport) { - ret = 0; - goto err; - } + if (vsk->transport && vsk->transport == new_transport) { + ret = 0; + goto err; + } + + /* We increase the module refcnt to prevent the transport unloading + * while there are open sockets assigned to it. + */ + if (!new_transport || !try_module_get(new_transport->module)) { + ret = -ENODEV; + goto err; + } + /* It's safe to release the mutex after a successful try_module_get(). + * Whichever transport `new_transport` points at, it won't go away until + * the last module_put() below or in vsock_deassign_transport(). + */ + mutex_unlock(&vsock_register_mutex); + + if (vsk->transport) { /* transport->release() must be called with sock lock acquired. * This path can only be taken during vsock_connect(), where we * have already held the sock lock. In the other cases, this @@ -512,20 +526,6 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) vsk->peer_shutdown = 0; } - /* We increase the module refcnt to prevent the transport unloading - * while there are open sockets assigned to it. - */ - if (!new_transport || !try_module_get(new_transport->module)) { - ret = -ENODEV; - goto err; - } - - /* It's safe to release the mutex after a successful try_module_get(). - * Whichever transport `new_transport` points at, it won't go away until - * the last module_put() below or in vsock_deassign_transport(). - */ - mutex_unlock(&vsock_register_mutex); - if (sk->sk_type == SOCK_SEQPACKET) { if (!new_transport->seqpacket_allow || !new_transport->seqpacket_allow(remote_cid)) { @@ -689,7 +689,8 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk, unsigned int i; for (i = 0; i < MAX_PORT_RETRIES; i++) { - if (port <= LAST_RESERVED_PORT) + if (port == VMADDR_PORT_ANY || + port <= LAST_RESERVED_PORT) port = LAST_RESERVED_PORT + 1; new_addr.svm_port = port++; @@ -986,7 +987,7 @@ static int vsock_release(struct socket *sock) } static int -vsock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +vsock_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { int err; struct sock *sk; @@ -1028,12 +1029,7 @@ static int vsock_getname(struct socket *sock, vm_addr = &vsk->local_addr; } - /* sys_getsockname() and sys_getpeername() pass us a - * MAX_SOCK_ADDR-sized buffer and don't set addr_len. 
Unfortunately - * that macro is defined in socket.c instead of .h, so we hardcode its - * value here. - */ - BUILD_BUG_ON(sizeof(*vm_addr) > 128); + BUILD_BUG_ON(sizeof(*vm_addr) > sizeof(struct sockaddr_storage)); memcpy(addr, vm_addr, sizeof(*vm_addr)); err = sizeof(*vm_addr); @@ -1332,7 +1328,7 @@ out: } static int vsock_dgram_connect(struct socket *sock, - struct sockaddr *addr, int addr_len, int flags) + struct sockaddr_unsized *addr, int addr_len, int flags) { int err; struct sock *sk; @@ -1532,7 +1528,7 @@ static void vsock_connect_timeout(struct work_struct *work) sock_put(sk); } -static int vsock_connect(struct socket *sock, struct sockaddr *addr, +static int vsock_connect(struct socket *sock, struct sockaddr_unsized *addr, int addr_len, int flags) { int err; @@ -1653,7 +1649,7 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, * reschedule it, then ungrab the socket refcount to * keep it balanced. */ - if (mod_delayed_work(system_wq, &vsk->connect_work, + if (mod_delayed_work(system_percpu_wq, &vsk->connect_work, timeout)) sock_put(sk); @@ -1665,18 +1661,40 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, timeout = schedule_timeout(timeout); lock_sock(sk); - if (signal_pending(current)) { - err = sock_intr_errno(timeout); - sk->sk_state = sk->sk_state == TCP_ESTABLISHED ? TCP_CLOSING : TCP_CLOSE; - sock->state = SS_UNCONNECTED; - vsock_transport_cancel_pkt(vsk); - vsock_remove_connected(vsk); - goto out_wait; - } else if ((sk->sk_state != TCP_ESTABLISHED) && (timeout == 0)) { - err = -ETIMEDOUT; + /* Connection established. Whatever happens to socket once we + * release it, that's not connect()'s concern. No need to go + * into signal and timeout handling. Call it a day. + * + * Note that allowing to "reset" an already established socket + * here is racy and insecure. + */ + if (sk->sk_state == TCP_ESTABLISHED) + break; + + /* If connection was _not_ established and a signal/timeout came + * to be, we want the socket's state reset. User space may want + * to retry. + * + * sk_state != TCP_ESTABLISHED implies that socket is not on + * vsock_connected_table. We keep the binding and the transport + * assigned. + */ + if (signal_pending(current) || timeout == 0) { + err = timeout == 0 ? -ETIMEDOUT : sock_intr_errno(timeout); + + /* Listener might have already responded with + * VIRTIO_VSOCK_OP_RESPONSE. Its handling expects our + * sk_state == TCP_SYN_SENT, which hereby we break. + * In such case VIRTIO_VSOCK_OP_RST will follow. + */ sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; + + /* Try to cancel VIRTIO_VSOCK_OP_REQUEST skb sent out by + * transport->connect(). 
+ */ vsock_transport_cancel_pkt(vsk); + goto out_wait; } diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index b6569b0ca2bb..8c867023a2e5 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -926,7 +926,7 @@ static int __init virtio_vsock_init(void) { int ret; - virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", 0, 0); + virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", WQ_PERCPU, 0); if (!virtio_vsock_workqueue) return -ENOMEM; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index fe92e5fa95b4..dcc8a1d5851e 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -105,12 +105,14 @@ static int virtio_transport_fill_skb(struct sk_buff *skb, size_t len, bool zcopy) { + struct msghdr *msg = info->msg; + if (zcopy) - return __zerocopy_sg_from_iter(info->msg, NULL, skb, - &info->msg->msg_iter, len, NULL); + return __zerocopy_sg_from_iter(msg, NULL, skb, + &msg->msg_iter, len, NULL); virtio_vsock_skb_put(skb, len); - return skb_copy_datagram_from_iter(skb, 0, &info->msg->msg_iter, len); + return skb_copy_datagram_from_iter_full(skb, 0, &msg->msg_iter, len); } static void virtio_transport_init_hdr(struct sk_buff *skb, diff --git a/net/vmw_vsock/vsock_addr.c b/net/vmw_vsock/vsock_addr.c index 223b9660a759..a986aa6fff9b 100644 --- a/net/vmw_vsock/vsock_addr.c +++ b/net/vmw_vsock/vsock_addr.c @@ -57,7 +57,7 @@ bool vsock_addr_equals_addr(const struct sockaddr_vm *addr, } EXPORT_SYMBOL_GPL(vsock_addr_equals_addr); -int vsock_addr_cast(const struct sockaddr *addr, +int vsock_addr_cast(const struct sockaddr_unsized *addr, size_t len, struct sockaddr_vm **out_addr) { if (len < sizeof(**out_addr)) diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index 6e78927a598e..bc2ff918b315 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -139,7 +139,7 @@ static int __init vsock_loopback_init(void) struct vsock_loopback *vsock = &the_vsock_loopback; int ret; - vsock->workqueue = alloc_workqueue("vsock-loopback", 0, 0); + vsock->workqueue = alloc_workqueue("vsock-loopback", WQ_PERCPU, 0); if (!vsock->workqueue) return -ENOMEM; diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 193734b7f9dc..68221b1ab45e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -100,6 +100,11 @@ static u32 cfg80211_get_end_freq(const struct cfg80211_chan_def *chandef, punctured = 0) : (punctured >>= 1))) \ if (!(punctured & 1)) +#define for_each_s1g_subchan(chandef, freq_khz) \ + for (freq_khz = cfg80211_s1g_get_start_freq_khz(chandef); \ + freq_khz <= cfg80211_s1g_get_end_freq_khz(chandef); \ + freq_khz += MHZ_TO_KHZ(1)) + struct cfg80211_per_bw_puncturing_values { u8 len; const u16 *valid_values; @@ -336,8 +341,7 @@ static bool cfg80211_valid_center_freq(u32 center, bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) { - u32 control_freq, oper_freq; - int oper_width, control_width; + u32 control_freq, control_freq_khz, start_khz, end_khz; if (!chandef->chan) return false; @@ -363,27 +367,16 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_4: case NL80211_CHAN_WIDTH_8: case NL80211_CHAN_WIDTH_16: - if (chandef->chan->band != NL80211_BAND_S1GHZ) - return false; - - control_freq = ieee80211_channel_to_khz(chandef->chan); - oper_freq = ieee80211_chandef_to_khz(chandef); - control_width = nl80211_chan_width_to_mhz( - 
ieee80211_s1g_channel_width( - chandef->chan)); - oper_width = cfg80211_chandef_get_width(chandef); - - if (oper_width < 0 || control_width < 0) + if (!cfg80211_chandef_is_s1g(chandef)) return false; if (chandef->center_freq2) return false; - if (control_freq + MHZ_TO_KHZ(control_width) / 2 > - oper_freq + MHZ_TO_KHZ(oper_width) / 2) - return false; + control_freq_khz = ieee80211_channel_to_khz(chandef->chan); + start_khz = cfg80211_s1g_get_start_freq_khz(chandef); + end_khz = cfg80211_s1g_get_end_freq_khz(chandef); - if (control_freq - MHZ_TO_KHZ(control_width) / 2 < - oper_freq - MHZ_TO_KHZ(oper_width) / 2) + if (control_freq_khz < start_khz || control_freq_khz > end_khz) return false; break; case NL80211_CHAN_WIDTH_80P80: @@ -461,6 +454,9 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) !cfg80211_edmg_chandef_valid(chandef)) return false; + if (!cfg80211_chandef_is_s1g(chandef) && chandef->s1g_primary_2mhz) + return false; + return valid_puncturing_bitmap(chandef); } EXPORT_SYMBOL(cfg80211_chandef_valid); @@ -725,6 +721,10 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, { struct ieee80211_channel *c; + /* DFS is not required for S1G */ + if (cfg80211_chandef_is_s1g(chandef)) + return 0; + for_each_subchan(chandef, freq, cf) { c = ieee80211_get_channel_khz(wiphy, freq); if (!c) @@ -1130,6 +1130,55 @@ static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels, return true; } +static bool cfg80211_s1g_usable(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + u32 freq_khz; + const struct ieee80211_channel *chan; + u32 pri_khz = ieee80211_channel_to_khz(chandef->chan); + u32 end_khz = cfg80211_s1g_get_end_freq_khz(chandef); + u32 start_khz = cfg80211_s1g_get_start_freq_khz(chandef); + int width_mhz = cfg80211_chandef_get_width(chandef); + u32 prohibited_flags = IEEE80211_CHAN_DISABLED; + + if (width_mhz >= 16) + prohibited_flags |= IEEE80211_CHAN_NO_16MHZ; + if (width_mhz >= 8) + prohibited_flags |= IEEE80211_CHAN_NO_8MHZ; + if (width_mhz >= 4) + prohibited_flags |= IEEE80211_CHAN_NO_4MHZ; + + if (chandef->chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) + return false; + + if (pri_khz < start_khz || pri_khz > end_khz) + return false; + + for_each_s1g_subchan(chandef, freq_khz) { + chan = ieee80211_get_channel_khz(wiphy, freq_khz); + if (!chan || (chan->flags & prohibited_flags)) + return false; + } + + if (chandef->s1g_primary_2mhz) { + u32 sib_khz; + const struct ieee80211_channel *sibling; + + sibling = cfg80211_s1g_get_primary_sibling(wiphy, chandef); + if (!sibling) + return false; + + if (sibling->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) + return false; + + sib_khz = ieee80211_channel_to_khz(sibling); + if (sib_khz < start_khz || sib_khz > end_khz) + return false; + } + + return true; +} + bool _cfg80211_chandef_usable(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef, u32 prohibited_flags, @@ -1154,6 +1203,9 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy, ext_nss_cap = __le16_to_cpu(vht_cap->vht_mcs.tx_highest) & IEEE80211_VHT_EXT_NSS_BW_CAPABLE; + if (cfg80211_chandef_is_s1g(chandef)) + return cfg80211_s1g_usable(wiphy, chandef); + if (edmg_cap->channels && !cfg80211_edmg_usable(wiphy, chandef->edmg.channels, @@ -1165,21 +1217,6 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy, control_freq = chandef->chan->center_freq; switch (chandef->width) { - case NL80211_CHAN_WIDTH_1: - width = 1; - break; - case NL80211_CHAN_WIDTH_2: - width = 2; - break; - case NL80211_CHAN_WIDTH_4: - width = 4; - break; - 
case NL80211_CHAN_WIDTH_8: - width = 8; - break; - case NL80211_CHAN_WIDTH_16: - width = 16; - break; case NL80211_CHAN_WIDTH_5: width = 5; break; diff --git a/net/wireless/core.c b/net/wireless/core.c index a7e2931ffb2e..9a420d627d3c 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -34,6 +34,9 @@ /* name for sysfs, %d is appended */ #define PHY_NAME "phy" +/* maximum length of radio debugfs directory name */ +#define RADIO_DEBUGFSDIR_MAX_LEN 8 + MODULE_AUTHOR("Johannes Berg"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("wireless configuration support"); @@ -428,7 +431,7 @@ static void cfg80211_wiphy_work(struct work_struct *work) if (wk) { list_del_init(&wk->entry); if (!list_empty(&rdev->wiphy_work_list)) - queue_work(system_unbound_wq, work); + queue_work(system_dfl_wq, work); spin_unlock_irq(&rdev->wiphy_work_lock); trace_wiphy_work_run(&rdev->wiphy, wk); @@ -1018,6 +1021,15 @@ int wiphy_register(struct wiphy *wiphy) rdev->wiphy.features |= NL80211_FEATURE_SCAN_FLUSH; + if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW) + rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_CTWIN; + else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN) + rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_CTWINDOW; + if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS) + rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_OPPPS; + else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS) + rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_OPPPS; + rtnl_lock(); wiphy_lock(&rdev->wiphy); res = device_add(&rdev->wiphy.dev); @@ -1033,6 +1045,18 @@ int wiphy_register(struct wiphy *wiphy) /* add to debugfs */ rdev->wiphy.debugfsdir = debugfs_create_dir(wiphy_name(&rdev->wiphy), ieee80211_debugfs_dir); + if (wiphy->n_radio > 0) { + int idx; + char radio_name[RADIO_DEBUGFSDIR_MAX_LEN]; + + for (idx = 0; idx < wiphy->n_radio; idx++) { + scnprintf(radio_name, sizeof(radio_name), "radio%d", + idx); + wiphy->radio_cfg[idx].radio_debugfsdir = + debugfs_create_dir(radio_name, + rdev->wiphy.debugfsdir); + } + } cfg80211_debugfs_rdev_add(rdev); nl80211_notify_wiphy(rdev, NL80211_CMD_NEW_WIPHY); @@ -1042,12 +1066,12 @@ int wiphy_register(struct wiphy *wiphy) wiphy_regulatory_register(wiphy); if (wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) { - struct regulatory_request request; - - request.wiphy_idx = get_wiphy_idx(wiphy); - request.initiator = NL80211_REGDOM_SET_BY_DRIVER; - request.alpha2[0] = '9'; - request.alpha2[1] = '9'; + struct regulatory_request request = { + .wiphy_idx = get_wiphy_idx(wiphy), + .initiator = NL80211_REGDOM_SET_BY_DRIVER, + .alpha2[0] = '9', + .alpha2[1] = '9', + }; nl80211_send_reg_change_event(&request); } @@ -1356,6 +1380,7 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev, cfg80211_pmsr_wdev_down(wdev); + cfg80211_stop_radar_detection(wdev); cfg80211_stop_background_radar_detection(wdev); switch (wdev->iftype) { @@ -1689,7 +1714,7 @@ void wiphy_work_queue(struct wiphy *wiphy, struct wiphy_work *work) list_add_tail(&work->entry, &rdev->wiphy_work_list); spin_unlock_irqrestore(&rdev->wiphy_work_lock, flags); - queue_work(system_unbound_wq, &rdev->wiphy_work); + queue_work(system_dfl_wq, &rdev->wiphy_work); } EXPORT_SYMBOL_GPL(wiphy_work_queue); @@ -1778,6 +1803,62 @@ bool wiphy_delayed_work_pending(struct wiphy *wiphy, } EXPORT_SYMBOL_GPL(wiphy_delayed_work_pending); +enum hrtimer_restart wiphy_hrtimer_work_timer(struct hrtimer *t) +{ + struct wiphy_hrtimer_work *hrwork = + container_of(t, struct wiphy_hrtimer_work, timer); + + 
wiphy_work_queue(hrwork->wiphy, &hrwork->work); + + return HRTIMER_NORESTART; +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_timer); + +void wiphy_hrtimer_work_queue(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork, + ktime_t delay) +{ + trace_wiphy_hrtimer_work_queue(wiphy, &hrwork->work, delay); + + if (!delay) { + hrtimer_cancel(&hrwork->timer); + wiphy_work_queue(wiphy, &hrwork->work); + return; + } + + hrwork->wiphy = wiphy; + hrtimer_start_range_ns(&hrwork->timer, delay, + 1000 * NSEC_PER_USEC, HRTIMER_MODE_REL); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_queue); + +void wiphy_hrtimer_work_cancel(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork) +{ + lockdep_assert_held(&wiphy->mtx); + + hrtimer_cancel(&hrwork->timer); + wiphy_work_cancel(wiphy, &hrwork->work); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_cancel); + +void wiphy_hrtimer_work_flush(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork) +{ + lockdep_assert_held(&wiphy->mtx); + + hrtimer_cancel(&hrwork->timer); + wiphy_work_flush(wiphy, &hrwork->work); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_flush); + +bool wiphy_hrtimer_work_pending(struct wiphy *wiphy, + struct wiphy_hrtimer_work *hrwork) +{ + return hrtimer_is_queued(&hrwork->timer); +} +EXPORT_SYMBOL_GPL(wiphy_hrtimer_work_pending); + static int __init cfg80211_init(void) { int err; diff --git a/net/wireless/core.h b/net/wireless/core.h index b6bd7f4d6385..63dcf315dba7 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -489,6 +489,7 @@ cfg80211_start_background_radar_detection(struct cfg80211_registered_device *rde struct wireless_dev *wdev, struct cfg80211_chan_def *chandef); +void cfg80211_stop_radar_detection(struct wireless_dev *wdev); void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev); void cfg80211_background_cac_done_wk(struct work_struct *work); @@ -550,7 +551,8 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, bool signal_valid, unsigned long ts); enum ieee80211_ap_reg_power -cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len); +cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len, + u32 client_flags); #ifdef CONFIG_CFG80211_DEVELOPER_WARNINGS #define CFG80211_DEV_WARN_ON(cond) WARN_ON(cond) diff --git a/net/wireless/debugfs.c b/net/wireless/debugfs.c index 40e49074e2ee..f9e7fff1ef25 100644 --- a/net/wireless/debugfs.c +++ b/net/wireless/debugfs.c @@ -29,6 +29,24 @@ static const struct file_operations name## _ops = { \ .llseek = generic_file_llseek, \ } +#define DEBUGFS_RADIO_READONLY_FILE(name, buflen, fmt, value...) 
\ +static ssize_t name## _read(struct file *file, char __user *userbuf, \ + size_t count, loff_t *ppos) \ +{ \ + struct wiphy_radio_cfg *radio_cfg = file->private_data; \ + char buf[buflen]; \ + int res; \ + \ + res = scnprintf(buf, buflen, fmt "\n", ##value); \ + return simple_read_from_buffer(userbuf, count, ppos, buf, res); \ +} \ + \ +static const struct file_operations name## _ops = { \ + .read = name## _read, \ + .open = simple_open, \ + .llseek = generic_file_llseek, \ +} + DEBUGFS_READONLY_FILE(rts_threshold, 20, "%d", wiphy->rts_threshold); DEBUGFS_READONLY_FILE(fragmentation_threshold, 20, "%d", @@ -38,6 +56,9 @@ DEBUGFS_READONLY_FILE(short_retry_limit, 20, "%d", DEBUGFS_READONLY_FILE(long_retry_limit, 20, "%d", wiphy->retry_long); +DEBUGFS_RADIO_READONLY_FILE(radio_rts_threshold, 20, "%d", + radio_cfg->rts_threshold); + static int ht_print_chan(struct ieee80211_channel *chan, char *buf, int buf_size, int offset) { @@ -100,15 +121,27 @@ static const struct file_operations ht40allow_map_ops = { #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0444, phyd, &rdev->wiphy, &name## _ops) +#define DEBUGFS_RADIO_ADD(name, radio_idx) \ + debugfs_create_file(#name, 0444, radiod, \ + &rdev->wiphy.radio_cfg[radio_idx], \ + &name## _ops) + void cfg80211_debugfs_rdev_add(struct cfg80211_registered_device *rdev) { struct dentry *phyd = rdev->wiphy.debugfsdir; + struct dentry *radiod; + u8 i; DEBUGFS_ADD(rts_threshold); DEBUGFS_ADD(fragmentation_threshold); DEBUGFS_ADD(short_retry_limit); DEBUGFS_ADD(long_retry_limit); DEBUGFS_ADD(ht40allow_map); + + for (i = 0; i < rdev->wiphy.n_radio; i++) { + radiod = rdev->wiphy.radio_cfg[i].radio_debugfsdir; + DEBUGFS_RADIO_ADD(radio_rts_threshold, i); + } } struct debugfs_read_work { diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c index 2613d6ac0fda..46e4317cbd7e 100644 --- a/net/wireless/ethtool.c +++ b/net/wireless/ethtool.c @@ -23,7 +23,7 @@ void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) else strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); - strscpy(info->bus_info, dev_name(wiphy_dev(wdev->wiphy)), + strscpy(info->bus_info, dev_name(pdev), sizeof(info->bus_info)); } EXPORT_SYMBOL(cfg80211_get_drvinfo); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 46394eb2086f..3fc175f9f868 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -1295,6 +1295,25 @@ cfg80211_start_background_radar_detection(struct cfg80211_registered_device *rde return 0; } +void cfg80211_stop_radar_detection(struct wireless_dev *wdev) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + int link_id; + + for_each_valid_link(wdev, link_id) { + struct cfg80211_chan_def chandef; + + if (!wdev->links[link_id].cac_started) + continue; + + chandef = *wdev_chandef(wdev, link_id); + rdev_end_cac(rdev, wdev->netdev, link_id); + nl80211_radar_notify(rdev, &chandef, NL80211_RADAR_CAC_ABORTED, + wdev->netdev, GFP_KERNEL); + } +} + void cfg80211_stop_background_radar_detection(struct wireless_dev *wdev) { struct wiphy *wiphy = wdev->wiphy; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 89519aa52893..c961cd42a832 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -312,6 +312,26 @@ static int validate_supported_selectors(const struct nlattr *attr, return 0; } +static int validate_nan_cluster_id(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *data = nla_data(attr); + unsigned int len = nla_len(attr); + 
static const u8 cluster_id_prefix[4] = {0x50, 0x6f, 0x9a, 0x1}; + + if (len != ETH_ALEN) { + NL_SET_ERR_MSG_ATTR(extack, attr, "bad cluster id length"); + return -EINVAL; + } + + if (memcmp(data, cluster_id_prefix, sizeof(cluster_id_prefix))) { + NL_SET_ERR_MSG_ATTR(extack, attr, "invalid cluster id prefix"); + return -EINVAL; + } + + return 0; +} + /* policy for the attributes */ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; @@ -411,6 +431,14 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { [NL80211_TXRATE_HE_LTF] = NLA_POLICY_RANGE(NLA_U8, NL80211_RATE_INFO_HE_1XLTF, NL80211_RATE_INFO_HE_4XLTF), + [NL80211_TXRATE_EHT] = NLA_POLICY_EXACT_LEN(sizeof(struct nl80211_txrate_eht)), + [NL80211_TXRATE_EHT_GI] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_EHT_GI_0_8, + NL80211_RATE_INFO_EHT_GI_3_2), + [NL80211_TXRATE_EHT_LTF] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_EHT_1XLTF, + NL80211_RATE_INFO_EHT_8XLTF), + }; static const struct nla_policy @@ -492,6 +520,36 @@ nl80211_s1g_short_beacon[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1] = { IEEE80211_MAX_DATA_LEN), }; +static const struct nla_policy +nl80211_nan_band_conf_policy[NL80211_NAN_BAND_CONF_ATTR_MAX + 1] = { + [NL80211_NAN_BAND_CONF_BAND] = NLA_POLICY_MAX(NLA_U8, + NUM_NL80211_BANDS - 1), + [NL80211_NAN_BAND_CONF_FREQ] = { .type = NLA_U16 }, + [NL80211_NAN_BAND_CONF_RSSI_CLOSE] = NLA_POLICY_MIN(NLA_S8, -59), + [NL80211_NAN_BAND_CONF_RSSI_MIDDLE] = NLA_POLICY_MIN(NLA_S8, -74), + [NL80211_NAN_BAND_CONF_WAKE_DW] = NLA_POLICY_MAX(NLA_U8, 5), + [NL80211_NAN_BAND_CONF_DISABLE_SCAN] = { .type = NLA_FLAG }, +}; + +static const struct nla_policy +nl80211_nan_conf_policy[NL80211_NAN_CONF_ATTR_MAX + 1] = { + [NL80211_NAN_CONF_CLUSTER_ID] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_nan_cluster_id, + ETH_ALEN), + [NL80211_NAN_CONF_EXTRA_ATTRS] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN}, + [NL80211_NAN_CONF_VENDOR_ELEMS] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, + IEEE80211_MAX_DATA_LEN), + [NL80211_NAN_CONF_BAND_CONFIGS] = + NLA_POLICY_NESTED_ARRAY(nl80211_nan_band_conf_policy), + [NL80211_NAN_CONF_SCAN_PERIOD] = { .type = NLA_U16 }, + [NL80211_NAN_CONF_SCAN_DWELL_TIME] = NLA_POLICY_RANGE(NLA_U16, 50, 512), + [NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL] = + NLA_POLICY_RANGE(NLA_U8, 50, 200), + [NL80211_NAN_CONF_NOTIFY_DW] = { .type = NLA_FLAG }, +}; + static const struct netlink_range_validation nl80211_punct_bitmap_range = { .min = 0, .max = 0xffff, @@ -761,6 +819,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1), [NL80211_ATTR_BANDS] = { .type = NLA_U32 }, + [NL80211_ATTR_NAN_CONFIG] = NLA_POLICY_NESTED(nl80211_nan_conf_policy), [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, .len = FILS_MAX_KEK_LEN }, @@ -871,6 +930,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_S1G_LONG_BEACON_PERIOD] = NLA_POLICY_MIN(NLA_U8, 2), [NL80211_ATTR_S1G_SHORT_BEACON] = NLA_POLICY_NESTED(nl80211_s1g_short_beacon), + [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG }, + [NL80211_ATTR_S1G_PRIMARY_2MHZ] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -1219,21 +1280,6 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_NO_HE) && nla_put_flag(msg, 
NL80211_FREQUENCY_ATTR_NO_HE)) goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_1MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_1MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_2MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_2MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_4MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_4MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_8MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_8MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_16MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_16MHZ)) - goto nla_put_failure; if ((chan->flags & IEEE80211_CHAN_NO_320MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_320MHZ)) goto nla_put_failure; @@ -1259,6 +1305,15 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, nla_put_flag(msg, NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_4MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_4MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_8MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_8MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_16MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_16MHZ)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, @@ -2545,6 +2600,41 @@ fail: return -ENOBUFS; } +static int nl80211_put_nan_capa(struct wiphy *wiphy, struct sk_buff *msg) +{ + struct nlattr *nan_caps; + + nan_caps = nla_nest_start(msg, NL80211_ATTR_NAN_CAPABILITIES); + if (!nan_caps) + return -ENOBUFS; + + if (wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC && + nla_put_flag(msg, NL80211_NAN_CAPA_CONFIGURABLE_SYNC)) + goto fail; + + if ((wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_USERSPACE_DE) && + nla_put_flag(msg, NL80211_NAN_CAPA_USERSPACE_DE)) + goto fail; + + if (nla_put_u8(msg, NL80211_NAN_CAPA_OP_MODE, + wiphy->nan_capa.op_mode) || + nla_put_u8(msg, NL80211_NAN_CAPA_NUM_ANTENNAS, + wiphy->nan_capa.n_antennas) || + nla_put_u16(msg, NL80211_NAN_CAPA_MAX_CHANNEL_SWITCH_TIME, + wiphy->nan_capa.max_channel_switch_time) || + nla_put_u8(msg, NL80211_NAN_CAPA_CAPABILITIES, + wiphy->nan_capa.dev_capabilities)) + goto fail; + + nla_nest_end(msg, nan_caps); + + return 0; + +fail: + nla_nest_cancel(msg, nan_caps); + return -ENOBUFS; +} + struct nl80211_dump_wiphy_state { s64 filter_wiphy; long start; @@ -3019,6 +3109,40 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, rdev->wiphy.ext_features)) goto nla_put_failure; + if (rdev->wiphy.bss_param_support) { + struct nlattr *nested; + u32 parsup = rdev->wiphy.bss_param_support; + + nested = nla_nest_start(msg, NL80211_ATTR_BSS_PARAM); + if (!nested) + goto nla_put_failure; + + if ((parsup & WIPHY_BSS_PARAM_CTS_PROT) && + nla_put_flag(msg, NL80211_ATTR_BSS_CTS_PROT)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_SHORT_PREAMBLE) && + nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_PREAMBLE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_SHORT_SLOT_TIME) && + nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_SLOT_TIME)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_BASIC_RATES) && + nla_put_flag(msg, NL80211_ATTR_BSS_BASIC_RATES)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_AP_ISOLATE) && + nla_put_flag(msg, NL80211_ATTR_AP_ISOLATE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_HT_OPMODE) && + nla_put_flag(msg, NL80211_ATTR_BSS_HT_OPMODE)) + goto 
nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_P2P_CTWINDOW) && + nla_put_flag(msg, NL80211_ATTR_P2P_CTWINDOW)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_P2P_OPPPS) && + nla_put_flag(msg, NL80211_ATTR_P2P_OPPPS)) + goto nla_put_failure; + nla_nest_end(msg, nested); + } if (rdev->wiphy.bss_select_support) { struct nlattr *nested; u32 bss_select_support = rdev->wiphy.bss_select_support; @@ -3163,6 +3287,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (nl80211_put_radios(&rdev->wiphy, msg)) goto nla_put_failure; + state->split_start++; + break; + case 18: + if (nl80211_put_nan_capa(&rdev->wiphy, msg)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; @@ -3406,6 +3536,7 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, chandef->center_freq1 = KHZ_TO_MHZ(control_freq); chandef->freq1_offset = control_freq % 1000; chandef->center_freq2 = 0; + chandef->s1g_primary_2mhz = false; if (!chandef->chan) { NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ], @@ -3413,6 +3544,9 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, return -EINVAL; } + if (cfg80211_chandef_is_s1g(chandef)) + chandef->width = NL80211_CHAN_WIDTH_1; + if (attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { enum nl80211_channel_type chantype; @@ -3449,27 +3583,20 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, return -EINVAL; } } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) { - chandef->width = - nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); - if (chandef->chan->band == NL80211_BAND_S1GHZ) { - /* User input error for channel width doesn't match channel */ - if (chandef->width != ieee80211_s1g_channel_width(chandef->chan)) { - NL_SET_ERR_MSG_ATTR(extack, - attrs[NL80211_ATTR_CHANNEL_WIDTH], - "bad channel width"); - return -EINVAL; - } - } + chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); if (attrs[NL80211_ATTR_CENTER_FREQ1]) { chandef->center_freq1 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); - chandef->freq1_offset = - nla_get_u32_default(attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], - 0); + chandef->freq1_offset = nla_get_u32_default( + attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], 0); } + if (attrs[NL80211_ATTR_CENTER_FREQ2]) chandef->center_freq2 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]); + + chandef->s1g_primary_2mhz = nla_get_flag( + attrs[NL80211_ATTR_S1G_PRIMARY_2MHZ]); } if (info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) { @@ -4012,8 +4139,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev->wiphy.txq_quantum = old_txq_quantum; } - if (old_rts_threshold) - kfree(old_radio_rts_threshold); + kfree(old_radio_rts_threshold); return result; } @@ -4052,6 +4178,9 @@ int nl80211_send_chandef(struct sk_buff *msg, const struct cfg80211_chan_def *ch if (chandef->punctured && nla_put_u32(msg, NL80211_ATTR_PUNCT_BITMAP, chandef->punctured)) return -ENOBUFS; + if (chandef->s1g_primary_2mhz && + nla_put_flag(msg, NL80211_ATTR_S1G_PRIMARY_2MHZ)) + return -ENOBUFS; return 0; } @@ -5393,6 +5522,164 @@ static bool he_set_mcs_mask(struct genl_info *info, return true; } +static void eht_build_mcs_mask(struct genl_info *info, + const struct ieee80211_sta_eht_cap *eht_cap, + u8 mcs_nss_len, u16 *mcs_mask) +{ + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + u8 nss, mcs_7 = 0, mcs_9 = 0, mcs_11 = 0, mcs_13 = 0; + unsigned int link_id = nl80211_link_id(info->attrs); + + if (mcs_nss_len == 4) { + const 
struct ieee80211_eht_mcs_nss_supp_20mhz_only *mcs = + &eht_cap->eht_mcs_nss_supp.only_20mhz; + + mcs_7 = u8_get_bits(mcs->rx_tx_mcs7_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + + } else { + const struct ieee80211_eht_mcs_nss_supp_bw *mcs; + enum nl80211_chan_width width; + + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + width = wdev->u.ibss.chandef.width; + break; + case NL80211_IFTYPE_MESH_POINT: + width = wdev->u.mesh.chandef.width; + break; + case NL80211_IFTYPE_OCB: + width = wdev->u.ocb.chandef.width; + break; + default: + if (wdev->valid_links) + width = wdev->links[link_id].ap.chandef.width; + else + width = wdev->u.ap.preset_chandef.width; + break; + } + + switch (width) { + case NL80211_CHAN_WIDTH_320: + mcs = &eht_cap->eht_mcs_nss_supp.bw._320; + break; + case NL80211_CHAN_WIDTH_160: + mcs = &eht_cap->eht_mcs_nss_supp.bw._160; + break; + default: + mcs = &eht_cap->eht_mcs_nss_supp.bw._80; + break; + } + + mcs_7 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + } + + /* Enable MCS 14 for NSS 0 */ + if (eht_cap->eht_cap_elem.phy_cap_info[6] & + IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP) + mcs_mask[0] |= 0x4000; + + /* Enable MCS 15 for NSS 0 */ + mcs_mask[0] |= 0x8000; + + for (nss = 0; nss < NL80211_EHT_NSS_MAX; nss++) { + if (!mcs_7) + continue; + mcs_mask[nss] |= 0x00FF; + mcs_7--; + + if (!mcs_9) + continue; + mcs_mask[nss] |= 0x0300; + mcs_9--; + + if (!mcs_11) + continue; + mcs_mask[nss] |= 0x0C00; + mcs_11--; + + if (!mcs_13) + continue; + mcs_mask[nss] |= 0x3000; + mcs_13--; + } +} + +static bool eht_set_mcs_mask(struct genl_info *info, struct wireless_dev *wdev, + struct ieee80211_supported_band *sband, + struct nl80211_txrate_eht *txrate, + u16 mcs[NL80211_EHT_NSS_MAX]) +{ + const struct ieee80211_sta_he_cap *he_cap; + const struct ieee80211_sta_eht_cap *eht_cap; + u16 tx_mcs_mask[NL80211_EHT_NSS_MAX] = { 0 }; + u8 i, mcs_nss_len; + + he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype); + if (!he_cap) + return false; + + eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype); + if (!eht_cap) + return false; + + /* Checks for MCS 14 */ + if (txrate->mcs[0] & 0x4000) { + if (sband->band != NL80211_BAND_6GHZ) + return false; + + if (!(eht_cap->eht_cap_elem.phy_cap_info[6] & + IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP)) + return false; + } + + mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + wdev->iftype == + NL80211_IFTYPE_STATION); + + if (mcs_nss_len == 3) { + /* Supported iftypes for setting non-20 MHZ only EHT MCS */ + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_OCB: + break; + default: + return false; + } + } + + /* Build eht_mcs_mask from EHT and HE capabilities */ + eht_build_mcs_mask(info, eht_cap, mcs_nss_len, tx_mcs_mask); + + memset(mcs, 0, sizeof(u16) * NL80211_EHT_NSS_MAX); + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) { + if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i]) + mcs[i] = txrate->mcs[i]; + else + 
return false; + } + + return true; +} + static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, struct nlattr *attrs[], enum nl80211_attrs attr, @@ -5413,6 +5700,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, /* Default to all rates enabled */ for (i = 0; i < NUM_NL80211_BANDS; i++) { const struct ieee80211_sta_he_cap *he_cap; + const struct ieee80211_sta_eht_cap *eht_cap; + u8 mcs_nss_len; if (!default_all_enabled) break; @@ -5441,6 +5730,21 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[i].he_gi = 0xFF; mask->control[i].he_ltf = 0xFF; + + eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype); + if (!eht_cap) + continue; + + mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + wdev->iftype == + NL80211_IFTYPE_STATION); + + eht_build_mcs_mask(info, eht_cap, mcs_nss_len, + mask->control[i].eht_mcs); + + mask->control[i].eht_gi = 0xFF; + mask->control[i].eht_ltf = 0xFF; } /* if no rates are given set it back to the defaults */ @@ -5512,13 +5816,27 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[band].he_ltf = nla_get_u8(tb[NL80211_TXRATE_HE_LTF]); + if (tb[NL80211_TXRATE_EHT] && + !eht_set_mcs_mask(info, wdev, sband, + nla_data(tb[NL80211_TXRATE_EHT]), + mask->control[band].eht_mcs)) + return -EINVAL; + + if (tb[NL80211_TXRATE_EHT_GI]) + mask->control[band].eht_gi = + nla_get_u8(tb[NL80211_TXRATE_EHT_GI]); + if (tb[NL80211_TXRATE_EHT_LTF]) + mask->control[band].eht_ltf = + nla_get_u8(tb[NL80211_TXRATE_EHT_LTF]); + if (mask->control[band].legacy == 0) { - /* don't allow empty legacy rates if HT, VHT or HE + /* don't allow empty legacy rates if HT, VHT, HE or EHT * are not even supported. */ if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported || rdev->wiphy.bands[band]->vht_cap.vht_supported || - ieee80211_get_he_iftype_cap(sband, wdev->iftype))) + ieee80211_get_he_iftype_cap(sband, wdev->iftype) || + ieee80211_get_eht_iftype_cap(sband, wdev->iftype))) return -EINVAL; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) @@ -5533,6 +5851,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (mask->control[band].he_mcs[i]) goto out; + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) + if (mask->control[band].eht_mcs[i]) + goto out; + /* legacy and mcs rates may not be both empty */ return -EINVAL; } @@ -5546,7 +5868,7 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, enum nl80211_band band, struct cfg80211_bitrate_mask *beacon_rate) { - u32 count_ht, count_vht, count_he, i; + u32 count_ht, count_vht, count_he, count_eht, i; u32 rate = beacon_rate->control[band].legacy; /* Allow only one rate */ @@ -5592,8 +5914,21 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, return -EINVAL; } - if ((count_ht && count_vht && count_he) || - (!rate && !count_ht && !count_vht && !count_he)) + count_eht = 0; + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) { + if (hweight16(beacon_rate->control[band].eht_mcs[i]) > 1) { + return -EINVAL; + } else if (beacon_rate->control[band].eht_mcs[i]) { + count_eht++; + if (count_eht > 1) + return -EINVAL; + } + if (count_eht && rate) + return -EINVAL; + } + + if ((count_ht && count_vht && count_he && count_eht) || + (!rate && !count_ht && !count_vht && !count_he && !count_eht)) return -EINVAL; if (rate && @@ -5613,6 +5948,11 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, NL80211_EXT_FEATURE_BEACON_RATE_HE)) return -EINVAL; + if (count_eht && 
+ !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_EHT)) + return -EINVAL; + return 0; } @@ -6411,7 +6751,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) beacon_check.relax = true; beacon_check.reg_power = cfg80211_get_6ghz_power_type(params->beacon.tail, - params->beacon.tail_len); + params->beacon.tail_len, 0); if (!cfg80211_reg_check_beaconing(&rdev->wiphy, ¶ms->chandef, &beacon_check)) { err = -EINVAL; @@ -6590,7 +6930,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info) beacon_check.relax = true; beacon_check.reg_power = cfg80211_get_6ghz_power_type(params->beacon.tail, - params->beacon.tail_len); + params->beacon.tail_len, 0); if (!cfg80211_reg_check_beaconing(&rdev->wiphy, &wdev->links[link_id].ap.chandef, &beacon_check)) { @@ -7062,7 +7402,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, u32 seq, int flags, struct cfg80211_registered_device *rdev, struct net_device *dev, - const u8 *mac_addr, struct station_info *sinfo) + const u8 *mac_addr, struct station_info *sinfo, + bool link_stats) { void *hdr; struct nlattr *sinfoattr, *bss_param; @@ -7283,7 +7624,7 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, goto nla_put_failure; } - if (sinfo->valid_links) { + if (link_stats && sinfo->valid_links) { links = nla_nest_start(msg, NL80211_ATTR_MLO_LINKS); if (!links) goto nla_put_failure; @@ -7574,7 +7915,7 @@ static int nl80211_dump_station(struct sk_buff *skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wdev->netdev, mac_addr, - &sinfo) < 0) + &sinfo, false) < 0) goto out; sta_idx++; @@ -7635,7 +7976,7 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, info->snd_portid, info->snd_seq, 0, - rdev, dev, mac_addr, &sinfo) < 0) { + rdev, dev, mac_addr, &sinfo, false) < 0) { nlmsg_free(msg); return -ENOBUFS; } @@ -8829,6 +9170,9 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct bss_parameters params; + u32 bss_param_support = rdev->wiphy.bss_param_support; + u32 changed = 0; + bool strict; memset(¶ms, 0, sizeof(params)); params.link_id = nl80211_link_id_or_invalid(info->attrs); @@ -8841,26 +9185,54 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) params.p2p_ctwindow = -1; params.p2p_opp_ps = -1; - if (info->attrs[NL80211_ATTR_BSS_CTS_PROT]) + strict = nla_get_flag(info->attrs[NL80211_ATTR_BSS_PARAM]); + if (info->attrs[NL80211_ATTR_BSS_CTS_PROT]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_CTS_PROT)) + return -EINVAL; params.use_cts_prot = nla_get_u8(info->attrs[NL80211_ATTR_BSS_CTS_PROT]); - if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]) + changed |= WIPHY_BSS_PARAM_CTS_PROT; + } + if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_SHORT_PREAMBLE)) + return -EINVAL; params.use_short_preamble = nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]); - if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]) + changed |= WIPHY_BSS_PARAM_SHORT_PREAMBLE; + } + if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_SHORT_SLOT_TIME)) + return -EINVAL; params.use_short_slot_time = nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]); + changed |= 
WIPHY_BSS_PARAM_SHORT_SLOT_TIME; + } if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_BASIC_RATES)) + return -EINVAL; params.basic_rates = nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); params.basic_rates_len = nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); + changed |= WIPHY_BSS_PARAM_BASIC_RATES; } - if (info->attrs[NL80211_ATTR_AP_ISOLATE]) - params.ap_isolate = !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]); - if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE]) + if (info->attrs[NL80211_ATTR_AP_ISOLATE]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_AP_ISOLATE)) + return -EINVAL; + params.ap_isolate = + !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]); + changed |= WIPHY_BSS_PARAM_AP_ISOLATE; + } + if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_HT_OPMODE)) + return -EINVAL; params.ht_opmode = nla_get_u16(info->attrs[NL80211_ATTR_BSS_HT_OPMODE]); + changed |= WIPHY_BSS_PARAM_HT_OPMODE; + } if (info->attrs[NL80211_ATTR_P2P_CTWINDOW]) { if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) @@ -8868,8 +9240,9 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) params.p2p_ctwindow = nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]); if (params.p2p_ctwindow != 0 && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) + !(bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW)) return -EINVAL; + changed |= WIPHY_BSS_PARAM_P2P_CTWINDOW; } if (info->attrs[NL80211_ATTR_P2P_OPPPS]) { @@ -8878,9 +9251,11 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EINVAL; tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); + if (tmp && !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) + return -EINVAL; params.p2p_opp_ps = tmp; if (params.p2p_opp_ps && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) + !(rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) return -EINVAL; } @@ -8891,6 +9266,10 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; + changed &= rdev->wiphy.bss_param_support; + if (!changed) + return 0; + return rdev_change_bss(rdev, dev, ¶ms); } @@ -10070,8 +10449,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto out_free; } - /* ignore disabled channels */ + /* Ignore disabled / no primary channels */ if (chan->flags & IEEE80211_CHAN_DISABLED || + chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY || !cfg80211_wdev_channel_allowed(wdev, chan)) continue; @@ -10093,6 +10473,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) chan = &wiphy->bands[band]->channels[j]; if (chan->flags & IEEE80211_CHAN_DISABLED || + chan->flags & + IEEE80211_CHAN_S1G_NO_PRIMARY || !cfg80211_wdev_channel_allowed(wdev, chan)) continue; @@ -13397,7 +13779,9 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) break; case NL80211_IFTYPE_NAN: if (!wiphy_ext_feature_isset(wdev->wiphy, - NL80211_EXT_FEATURE_SECURE_NAN)) + NL80211_EXT_FEATURE_SECURE_NAN) && + !(wdev->wiphy->nan_capa.flags & + WIPHY_NAN_FLAGS_USERSPACE_DE)) return -EOPNOTSUPP; break; default: @@ -13458,7 +13842,9 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) break; case NL80211_IFTYPE_NAN: if (!wiphy_ext_feature_isset(wdev->wiphy, - NL80211_EXT_FEATURE_SECURE_NAN)) + 
NL80211_EXT_FEATURE_SECURE_NAN) && + !(wdev->wiphy->nan_capa.flags & + WIPHY_NAN_FLAGS_USERSPACE_DE)) return -EOPNOTSUPP; break; default: @@ -15105,6 +15491,216 @@ static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info) return 0; } +static struct ieee80211_channel *nl80211_get_nan_channel(struct wiphy *wiphy, + int freq) +{ + struct ieee80211_channel *chan; + struct cfg80211_chan_def def; + + /* Check if the frequency is valid for NAN */ + if (freq != 5220 && freq != 5745 && freq != 2437) + return NULL; + + chan = ieee80211_get_channel(wiphy, freq); + if (!chan) + return NULL; + + cfg80211_chandef_create(&def, chan, NL80211_CHAN_NO_HT); + + /* Check if the channel is allowed */ + if (cfg80211_reg_can_beacon(wiphy, &def, NL80211_IFTYPE_NAN)) + return chan; + + return NULL; +} + +static int nl80211_parse_nan_band_config(struct wiphy *wiphy, + struct nlattr **tb, + struct cfg80211_nan_band_config *cfg, + enum nl80211_band band) +{ + if (BIT(band) & ~(u32)wiphy->nan_supported_bands) + return -EINVAL; + + if (tb[NL80211_NAN_BAND_CONF_FREQ]) { + u16 freq = nla_get_u16(tb[NL80211_NAN_BAND_CONF_FREQ]); + + if (band != NL80211_BAND_5GHZ) + return -EINVAL; + + cfg->chan = nl80211_get_nan_channel(wiphy, freq); + if (!cfg->chan) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]) { + cfg->rssi_close = + nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]); + if (!tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]) { + cfg->rssi_middle = + nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]); + if (!cfg->rssi_close || cfg->rssi_middle >= cfg->rssi_close) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_WAKE_DW]) { + cfg->awake_dw_interval = + nla_get_u8(tb[NL80211_NAN_BAND_CONF_WAKE_DW]); + + if (band == NL80211_BAND_2GHZ && cfg->awake_dw_interval == 0) + return -EINVAL; + } + + cfg->disable_scan = + nla_get_flag(tb[NL80211_NAN_BAND_CONF_DISABLE_SCAN]); + return 0; +} + +static int nl80211_parse_nan_conf(struct wiphy *wiphy, + struct genl_info *info, + struct cfg80211_nan_conf *conf, + u32 *changed_flags) +{ + struct nlattr *attrs[NL80211_NAN_CONF_ATTR_MAX + 1]; + int err, rem; + u32 changed = 0; + struct nlattr *band_config; + + if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { + conf->master_pref = + nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); + + changed |= CFG80211_NAN_CONF_CHANGED_PREF; + } + + if (info->attrs[NL80211_ATTR_BANDS]) { + u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); + + if (bands & ~(u32)wiphy->nan_supported_bands) + return -EOPNOTSUPP; + + if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) + return -EINVAL; + + conf->bands = bands; + changed |= CFG80211_NAN_CONF_CHANGED_BANDS; + } + + conf->band_cfgs[NL80211_BAND_2GHZ].awake_dw_interval = 1; + if (conf->bands & BIT(NL80211_BAND_5GHZ) || !conf->bands) + conf->band_cfgs[NL80211_BAND_5GHZ].awake_dw_interval = 1; + + /* On 2.4 GHz band use channel 6 */ + conf->band_cfgs[NL80211_BAND_2GHZ].chan = + nl80211_get_nan_channel(wiphy, 2437); + if (!conf->band_cfgs[NL80211_BAND_2GHZ].chan) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_NAN_CONFIG]) + goto out; + + err = nla_parse_nested(attrs, NL80211_NAN_CONF_ATTR_MAX, + info->attrs[NL80211_ATTR_NAN_CONFIG], NULL, + info->extack); + if (err) + return err; + + changed |= CFG80211_NAN_CONF_CHANGED_CONFIG; + if (attrs[NL80211_NAN_CONF_CLUSTER_ID]) + conf->cluster_id = + nla_data(attrs[NL80211_NAN_CONF_CLUSTER_ID]); + + if (attrs[NL80211_NAN_CONF_EXTRA_ATTRS]) { + 
conf->extra_nan_attrs = + nla_data(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]); + conf->extra_nan_attrs_len = + nla_len(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]); + } + + if (attrs[NL80211_NAN_CONF_VENDOR_ELEMS]) { + conf->vendor_elems = + nla_data(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]); + conf->vendor_elems_len = + nla_len(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]); + } + + if (attrs[NL80211_NAN_CONF_BAND_CONFIGS]) { + nla_for_each_nested(band_config, + attrs[NL80211_NAN_CONF_BAND_CONFIGS], + rem) { + enum nl80211_band band; + struct cfg80211_nan_band_config *cfg; + struct nlattr *tb[NL80211_NAN_BAND_CONF_ATTR_MAX + 1]; + + err = nla_parse_nested(tb, + NL80211_NAN_BAND_CONF_ATTR_MAX, + band_config, NULL, + info->extack); + if (err) + return err; + + if (!tb[NL80211_NAN_BAND_CONF_BAND]) + return -EINVAL; + + band = nla_get_u8(tb[NL80211_NAN_BAND_CONF_BAND]); + if (conf->bands && !(conf->bands & BIT(band))) + return -EINVAL; + + cfg = &conf->band_cfgs[band]; + + err = nl80211_parse_nan_band_config(wiphy, tb, cfg, + band); + if (err) + return err; + } + } + + if (attrs[NL80211_NAN_CONF_SCAN_PERIOD]) + conf->scan_period = + nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_PERIOD]); + + if (attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME]) + conf->scan_dwell_time = + nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME]); + + if (attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]) + conf->discovery_beacon_interval = + nla_get_u8(attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]); + + if (attrs[NL80211_NAN_CONF_NOTIFY_DW]) + conf->enable_dw_notification = + nla_get_flag(attrs[NL80211_NAN_CONF_NOTIFY_DW]); + +out: + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && + (!conf->bands || conf->bands & BIT(NL80211_BAND_5GHZ))) { + /* If no 5GHz channel is specified use default, if possible */ + conf->band_cfgs[NL80211_BAND_5GHZ].chan = + nl80211_get_nan_channel(wiphy, 5745); + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan) + conf->band_cfgs[NL80211_BAND_5GHZ].chan = + nl80211_get_nan_channel(wiphy, 5220); + + /* Return error if user space asked explicitly for 5 GHz */ + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && + conf->bands & BIT(NL80211_BAND_5GHZ)) { + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[NL80211_ATTR_BANDS], + "5 GHz band operation is not allowed"); + return -EINVAL; + } + } + + if (changed_flags) + *changed_flags = changed; + + return 0; +} + static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -15121,23 +15717,13 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) if (rfkill_blocked(rdev->wiphy.rfkill)) return -ERFKILL; + /* Master preference is mandatory for START_NAN */ if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) return -EINVAL; - conf.master_pref = - nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); - - if (info->attrs[NL80211_ATTR_BANDS]) { - u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); - - if (bands & ~(u32)wdev->wiphy->nan_supported_bands) - return -EOPNOTSUPP; - - if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) - return -EINVAL; - - conf.bands = bands; - } + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL); + if (err) + return err; err = rdev_start_nan(rdev, wdev, &conf); if (err) @@ -15493,6 +16079,7 @@ static int nl80211_nan_change_config(struct sk_buff *skb, struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_nan_conf conf = {}; u32 changed = 0; + int err; if (wdev->iftype != NL80211_IFTYPE_NAN) return -EOPNOTSUPP; @@ -15500,27 +16087,9 @@ static int 
nl80211_nan_change_config(struct sk_buff *skb, if (!wdev_running(wdev)) return -ENOTCONN; - if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { - conf.master_pref = - nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); - if (conf.master_pref <= 1 || conf.master_pref == 255) - return -EINVAL; - - changed |= CFG80211_NAN_CONF_CHANGED_PREF; - } - - if (info->attrs[NL80211_ATTR_BANDS]) { - u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); - - if (bands & ~(u32)wdev->wiphy->nan_supported_bands) - return -EOPNOTSUPP; - - if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) - return -EINVAL; - - conf.bands = bands; - changed |= CFG80211_NAN_CONF_CHANGED_BANDS; - } + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed); + if (err) + return err; if (!changed) return -EINVAL; @@ -19680,7 +20249,7 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, return; if (nl80211_send_station(msg, NL80211_CMD_NEW_STATION, 0, 0, 0, - rdev, dev, mac_addr, sinfo) < 0) { + rdev, dev, mac_addr, sinfo, false) < 0) { nlmsg_free(msg); return; } @@ -19710,7 +20279,7 @@ void cfg80211_del_sta_sinfo(struct net_device *dev, const u8 *mac_addr, } if (nl80211_send_station(msg, NL80211_CMD_DEL_STATION, 0, 0, 0, - rdev, dev, mac_addr, sinfo) < 0) { + rdev, dev, mac_addr, sinfo, false) < 0) { nlmsg_free(msg); return; } @@ -21243,6 +21812,88 @@ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled) } EXPORT_SYMBOL(cfg80211_epcs_changed); +void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, + struct ieee80211_channel *chan, gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_next_nan_dw_notif(wdev, chan); + + if (!wdev->owner_nlportid) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, + NL80211_CMD_NAN_NEXT_DW_NOTIFICATION); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chan->center_freq)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_unicast(wiphy_net(wiphy), msg, wdev->owner_nlportid); + + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_next_nan_dw_notif); + +void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, + const u8 *cluster_id, bool new_cluster, + gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_nan_cluster_joined(wdev, cluster_id, new_cluster); + + memcpy(wdev->u.nan.cluster_id, cluster_id, ETH_ALEN); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NAN_CLUSTER_JOINED); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, cluster_id) || + (new_cluster && nla_put_flag(msg, NL80211_ATTR_NAN_NEW_CLUSTER))) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + if (!wdev->owner_nlportid) + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(wiphy), + msg, 0, NL80211_MCGRP_NAN, gfp); + else + genlmsg_unicast(wiphy_net(wiphy), msg, + wdev->owner_nlportid); + return; + + nla_put_failure: + nlmsg_free(msg); +} 
+EXPORT_SYMBOL(cfg80211_nan_cluster_joined); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 3b0ac3437f81..73cab51f6379 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1707,6 +1707,16 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); + if (is_s1g) { + if (max_bandwidth_khz < MHZ_TO_KHZ(16)) + bw_flags |= IEEE80211_CHAN_NO_16MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(8)) + bw_flags |= IEEE80211_CHAN_NO_8MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(4)) + bw_flags |= IEEE80211_CHAN_NO_4MHZ; + return bw_flags; + } + /* If we get a reg_rule we can assume that at least 5Mhz fit */ if (!cfg80211_does_bw_fit_range(freq_range, center_freq_khz, @@ -1717,59 +1727,19 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (is_s1g) { - /* S1G is strict about non overlapping channels. We can - * calculate which bandwidth is allowed per channel by finding - * the largest bandwidth which cleanly divides the freq_range. - */ - int edge_offset; - int ch_bw = max_bandwidth_khz; - - while (ch_bw) { - edge_offset = (center_freq_khz - ch_bw / 2) - - freq_range->start_freq_khz; - if (edge_offset % ch_bw == 0) { - switch (KHZ_TO_MHZ(ch_bw)) { - case 1: - bw_flags |= IEEE80211_CHAN_1MHZ; - break; - case 2: - bw_flags |= IEEE80211_CHAN_2MHZ; - break; - case 4: - bw_flags |= IEEE80211_CHAN_4MHZ; - break; - case 8: - bw_flags |= IEEE80211_CHAN_8MHZ; - break; - case 16: - bw_flags |= IEEE80211_CHAN_16MHZ; - break; - default: - /* If we got here, no bandwidths fit on - * this frequency, ie. band edge. 
- */ - bw_flags |= IEEE80211_CHAN_DISABLED; - break; - } - break; - } - ch_bw /= 2; - } - } else { - if (max_bandwidth_khz < MHZ_TO_KHZ(10)) - bw_flags |= IEEE80211_CHAN_NO_10MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(20)) - bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags |= IEEE80211_CHAN_NO_HT40; - if (max_bandwidth_khz < MHZ_TO_KHZ(80)) - bw_flags |= IEEE80211_CHAN_NO_80MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(160)) - bw_flags |= IEEE80211_CHAN_NO_160MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(320)) - bw_flags |= IEEE80211_CHAN_NO_320MHZ; - } + if (max_bandwidth_khz < MHZ_TO_KHZ(10)) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(20)) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(40)) + bw_flags |= IEEE80211_CHAN_NO_HT40; + if (max_bandwidth_khz < MHZ_TO_KHZ(80)) + bw_flags |= IEEE80211_CHAN_NO_80MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(160)) + bw_flags |= IEEE80211_CHAN_NO_160MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(320)) + bw_flags |= IEEE80211_CHAN_NO_320MHZ; + return bw_flags; } diff --git a/net/wireless/scan.c b/net/wireless/scan.c index a8339ed52404..7546647752fd 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1816,6 +1816,9 @@ static void cfg80211_update_hidden_bsses(struct cfg80211_internal_bss *known, WARN_ON(ies != old_ies); rcu_assign_pointer(bss->pub.beacon_ies, new_ies); + + bss->ts = known->ts; + bss->pub.ts_boottime = known->pub.ts_boottime; } } @@ -1882,6 +1885,10 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, { lockdep_assert_held(&rdev->bss_lock); + /* Update time stamps */ + known->ts = new->ts; + known->pub.ts_boottime = new->pub.ts_boottime; + /* Update IEs */ if (rcu_access_pointer(new->pub.proberesp_ies)) { const struct cfg80211_bss_ies *old; @@ -1916,7 +1923,8 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, */ f = rcu_access_pointer(new->pub.beacon_ies); - kfree_rcu((struct cfg80211_bss_ies *)f, rcu_head); + if (!new->pub.hidden_beacon_bss) + kfree_rcu((struct cfg80211_bss_ies *)f, rcu_head); return false; } @@ -1944,8 +1952,6 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, if (signal_valid) known->pub.signal = new->pub.signal; known->pub.capability = new->pub.capability; - known->ts = new->ts; - known->pub.ts_boottime = new->pub.ts_boottime; known->parent_tsf = new->parent_tsf; known->pub.chains = new->pub.chains; memcpy(known->pub.chain_signal, new->pub.chain_signal, @@ -2206,7 +2212,8 @@ struct cfg80211_inform_single_bss_data { }; enum ieee80211_ap_reg_power -cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len) +cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len, + u32 client_flags) { const struct ieee80211_he_6ghz_oper *he_6ghz_oper; struct ieee80211_he_operation *he_oper; @@ -2224,26 +2231,13 @@ cfg80211_get_6ghz_power_type(const u8 *elems, size_t elems_len) if (!he_6ghz_oper) return IEEE80211_REG_UNSET_AP; - switch (u8_get_bits(he_6ghz_oper->control, - IEEE80211_HE_6GHZ_OPER_CTRL_REG_INFO)) { - case IEEE80211_6GHZ_CTRL_REG_LPI_AP: - case IEEE80211_6GHZ_CTRL_REG_INDOOR_LPI_AP: - return IEEE80211_REG_LPI_AP; - case IEEE80211_6GHZ_CTRL_REG_SP_AP: - case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP: - case IEEE80211_6GHZ_CTRL_REG_INDOOR_SP_AP_OLD: - return IEEE80211_REG_SP_AP; - case IEEE80211_6GHZ_CTRL_REG_VLP_AP: - return IEEE80211_REG_VLP_AP; - default: - return IEEE80211_REG_UNSET_AP; - } + return cfg80211_6ghz_power_type(he_6ghz_oper->control, client_flags); } 
static bool cfg80211_6ghz_power_type_valid(const u8 *elems, size_t elems_len, const u32 flags) { - switch (cfg80211_get_6ghz_power_type(elems, elems_len)) { + switch (cfg80211_get_6ghz_power_type(elems, elems_len, flags)) { case IEEE80211_REG_LPI_AP: return true; case IEEE80211_REG_SP_AP: diff --git a/net/wireless/sme.c b/net/wireless/sme.c index 826ec0a6355f..3a028ff287fb 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -900,13 +900,16 @@ void __cfg80211_connect_result(struct net_device *dev, if (!wdev->u.client.ssid_len) { rcu_read_lock(); for_each_valid_link(cr, link) { + u32 ssid_len; + ssid = ieee80211_bss_get_elem(cr->links[link].bss, WLAN_EID_SSID); if (!ssid || !ssid->datalen) continue; - memcpy(wdev->u.client.ssid, ssid->data, ssid->datalen); + ssid_len = min(ssid->datalen, IEEE80211_MAX_SSID_LEN); + memcpy(wdev->u.client.ssid, ssid->data, ssid_len); wdev->u.client.ssid_len = ssid->datalen; break; } diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 62f26618f674..8d142856e385 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -137,7 +137,7 @@ static int wiphy_resume(struct device *dev) if (rdev->wiphy.registered && rdev->ops->resume) ret = rdev_resume(rdev); rdev->suspended = false; - queue_work(system_unbound_wq, &rdev->wiphy_work); + queue_work(system_dfl_wq, &rdev->wiphy_work); wiphy_unlock(&rdev->wiphy); if (ret) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 34c584a215e5..2b71f1d867a0 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -304,6 +304,27 @@ TRACE_EVENT(wiphy_delayed_work_queue, __entry->delay) ); +TRACE_EVENT(wiphy_hrtimer_work_queue, + TP_PROTO(struct wiphy *wiphy, struct wiphy_work *work, + ktime_t delay), + TP_ARGS(wiphy, work, delay), + TP_STRUCT__entry( + WIPHY_ENTRY + __field(void *, instance) + __field(void *, func) + __field(ktime_t, delay) + ), + TP_fast_assign( + WIPHY_ASSIGN; + __entry->instance = work; + __entry->func = work->func; + __entry->delay = delay; + ), + TP_printk(WIPHY_PR_FMT " instance=%p func=%pS delay=%llu", + WIPHY_PR_ARG, __entry->instance, __entry->func, + __entry->delay) +); + TRACE_EVENT(wiphy_work_worker_start, TP_PROTO(struct wiphy *wiphy), TP_ARGS(wiphy), @@ -3137,23 +3158,6 @@ DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_notify_new_peer_candidate, TP_ARGS(netdev, macaddr) ); -DECLARE_EVENT_CLASS(netdev_evt_only, - TP_PROTO(struct net_device *netdev), - TP_ARGS(netdev), - TP_STRUCT__entry( - NETDEV_ENTRY - ), - TP_fast_assign( - NETDEV_ASSIGN; - ), - TP_printk(NETDEV_PR_FMT , NETDEV_PR_ARG) -); - -DEFINE_EVENT(netdev_evt_only, cfg80211_send_rx_auth, - TP_PROTO(struct net_device *netdev), - TP_ARGS(netdev) -); - TRACE_EVENT(cfg80211_send_rx_assoc, TP_PROTO(struct net_device *netdev, const struct cfg80211_rx_assoc_resp_data *data), @@ -3480,21 +3484,6 @@ TRACE_EVENT(cfg80211_reg_can_beacon, __entry->prohibited_flags, __entry->permitting_flags) ); -TRACE_EVENT(cfg80211_chandef_dfs_required, - TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef), - TP_ARGS(wiphy, chandef), - TP_STRUCT__entry( - WIPHY_ENTRY - CHAN_DEF_ENTRY - ), - TP_fast_assign( - WIPHY_ASSIGN; - CHAN_DEF_ASSIGN(chandef); - ), - TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT, - WIPHY_PR_ARG, CHAN_DEF_PR_ARG) -); - TRACE_EVENT(cfg80211_ch_switch_notify, TP_PROTO(struct net_device *netdev, struct cfg80211_chan_def *chandef, @@ -3862,30 +3851,6 @@ DEFINE_EVENT(cfg80211_bss_evt, cfg80211_return_bss, TP_ARGS(pub) ); -TRACE_EVENT(cfg80211_return_uint, - TP_PROTO(unsigned int ret), - TP_ARGS(ret), - 
TP_STRUCT__entry( - __field(unsigned int, ret) - ), - TP_fast_assign( - __entry->ret = ret; - ), - TP_printk("ret: %d", __entry->ret) -); - -TRACE_EVENT(cfg80211_return_u32, - TP_PROTO(u32 ret), - TP_ARGS(ret), - TP_STRUCT__entry( - __field(u32, ret) - ), - TP_fast_assign( - __entry->ret = ret; - ), - TP_printk("ret: %u", __entry->ret) -); - TRACE_EVENT(cfg80211_report_wowlan_wakeup, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_wowlan_wakeup *wakeup), @@ -4222,6 +4187,41 @@ TRACE_EVENT(cfg80211_epcs_changed, WDEV_PR_ARG, __entry->enabled) ); +TRACE_EVENT(cfg80211_next_nan_dw_notif, + TP_PROTO(struct wireless_dev *wdev, + struct ieee80211_channel *chan), + TP_ARGS(wdev, chan), + TP_STRUCT__entry( + WDEV_ENTRY + CHAN_ENTRY + ), + TP_fast_assign( + WDEV_ASSIGN; + CHAN_ASSIGN(chan); + ), + TP_printk(WDEV_PR_FMT " " CHAN_PR_FMT, + WDEV_PR_ARG, CHAN_PR_ARG) +); + +TRACE_EVENT(cfg80211_nan_cluster_joined, + TP_PROTO(struct wireless_dev *wdev, + const u8 *cluster_id, + bool new_cluster), + TP_ARGS(wdev, cluster_id, new_cluster), + TP_STRUCT__entry( + WDEV_ENTRY + MAC_ENTRY(cluster_id) + __field(bool, new_cluster) + ), + TP_fast_assign( + WDEV_ASSIGN; + MAC_ASSIGN(cluster_id, cluster_id); + __entry->new_cluster = new_cluster; + ), + TP_printk(WDEV_PR_FMT " cluster_id %pMF%s", + WDEV_PR_ARG, __entry->cluster_id, + __entry->new_cluster ? " [new]" : "") +); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/wireless/util.c b/net/wireless/util.c index 240c68baa3d1..27e8a2f52f04 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -106,33 +106,6 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) } EXPORT_SYMBOL(ieee80211_channel_to_freq_khz); -enum nl80211_chan_width -ieee80211_s1g_channel_width(const struct ieee80211_channel *chan) -{ - if (WARN_ON(!chan || chan->band != NL80211_BAND_S1GHZ)) - return NL80211_CHAN_WIDTH_20_NOHT; - - /*S1G defines a single allowed channel width per channel. - * Extract that width here. - */ - if (chan->flags & IEEE80211_CHAN_1MHZ) - return NL80211_CHAN_WIDTH_1; - else if (chan->flags & IEEE80211_CHAN_2MHZ) - return NL80211_CHAN_WIDTH_2; - else if (chan->flags & IEEE80211_CHAN_4MHZ) - return NL80211_CHAN_WIDTH_4; - else if (chan->flags & IEEE80211_CHAN_8MHZ) - return NL80211_CHAN_WIDTH_8; - else if (chan->flags & IEEE80211_CHAN_16MHZ) - return NL80211_CHAN_WIDTH_16; - - pr_err("unknown channel width for channel at %dKHz?\n", - ieee80211_channel_to_khz(chan)); - - return NL80211_CHAN_WIDTH_1; -} -EXPORT_SYMBOL(ieee80211_s1g_channel_width); - int ieee80211_freq_khz_to_channel(u32 freq) { /* TODO: just handle MHz for now */ @@ -1230,28 +1203,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, dev->ieee80211_ptr->use_4addr = false; rdev_set_qos_map(rdev, dev, NULL); - switch (otype) { - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_P2P_GO: - cfg80211_stop_ap(rdev, dev, -1, true); - break; - case NL80211_IFTYPE_ADHOC: - cfg80211_leave_ibss(rdev, dev, false); - break; - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_P2P_CLIENT: - cfg80211_disconnect(rdev, dev, - WLAN_REASON_DEAUTH_LEAVING, true); - break; - case NL80211_IFTYPE_MESH_POINT: - /* mesh should be handled? 
*/ - break; - case NL80211_IFTYPE_OCB: - cfg80211_leave_ocb(rdev, dev); - break; - default: - break; - } + cfg80211_leave(rdev, dev->ieee80211_ptr); cfg80211_process_rdev_events(rdev); cfg80211_mlme_purge_registrations(dev->ieee80211_ptr); @@ -2584,7 +2536,7 @@ int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy, } } - return -ENOENT; + return -EINVAL; } EXPORT_SYMBOL(cfg80211_get_radio_idx_by_chan); @@ -2969,9 +2921,8 @@ cfg80211_get_iftype_ext_capa(struct wiphy *wiphy, enum nl80211_iftype type) } EXPORT_SYMBOL(cfg80211_get_iftype_ext_capa); -static bool -ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio, - u32 freq, u32 width) +bool ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio, + u32 freq, u32 width) { const struct wiphy_radio_freq_range *r; int i; @@ -2985,6 +2936,7 @@ ieee80211_radio_freq_range_valid(const struct wiphy_radio *radio, return false; } +EXPORT_SYMBOL(ieee80211_radio_freq_range_valid); bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, const struct cfg80211_chan_def *chandef) @@ -2992,7 +2944,7 @@ bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, u32 freq, width; freq = ieee80211_chandef_to_khz(chandef); - width = cfg80211_chandef_get_width(chandef); + width = MHZ_TO_KHZ(cfg80211_chandef_get_width(chandef)); if (!ieee80211_radio_freq_range_valid(radio, freq, width)) return false; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 655d1e0ae25f..af8762b24039 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -670,7 +670,7 @@ out: return 0; } -static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +static int x25_bind(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len) { struct sock *sk = sock->sk; struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; @@ -743,7 +743,7 @@ static int x25_wait_for_connection_establishment(struct sock *sk) return rc; } -static int x25_connect(struct socket *sock, struct sockaddr *uaddr, +static int x25_connect(struct socket *sock, struct sockaddr_unsized *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 9c3acecc14b1..f093c3453f64 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -36,6 +36,13 @@ #define TX_BATCH_SIZE 32 #define MAX_PER_SOCKET_BUDGET 32 +struct xsk_addrs { + u32 num_descs; + u64 addrs[MAX_SKB_FRAGS + 1]; +}; + +static struct kmem_cache *xsk_tx_generic_cache; + void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool) { if (pool->cached_need_wakeup & XDP_WAKEUP_RX) @@ -532,42 +539,93 @@ static int xsk_wakeup(struct xdp_sock *xs, u8 flags) return dev->netdev_ops->ndo_xsk_wakeup(dev, xs->queue_id, flags); } -static int xsk_cq_reserve_addr_locked(struct xsk_buff_pool *pool, u64 addr) +static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool) { - unsigned long flags; int ret; - spin_lock_irqsave(&pool->cq_lock, flags); - ret = xskq_prod_reserve_addr(pool->cq, addr); - spin_unlock_irqrestore(&pool->cq_lock, flags); + spin_lock(&pool->cq_cached_prod_lock); + ret = xskq_prod_reserve(pool->cq); + spin_unlock(&pool->cq_cached_prod_lock); return ret; } -static void xsk_cq_submit_locked(struct xsk_buff_pool *pool, u32 n) +static bool xsk_skb_destructor_is_addr(struct sk_buff *skb) { - unsigned long flags; + return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL; +} - spin_lock_irqsave(&pool->cq_lock, flags); - xskq_prod_submit_n(pool->cq, n); - spin_unlock_irqrestore(&pool->cq_lock, flags); +static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb) +{ + 
return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL); } -static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) +static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr) { - unsigned long flags; + skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL); +} - spin_lock_irqsave(&pool->cq_lock, flags); - xskq_prod_cancel_n(pool->cq, n); - spin_unlock_irqrestore(&pool->cq_lock, flags); +static void xsk_inc_num_desc(struct sk_buff *skb) +{ + struct xsk_addrs *xsk_addr; + + if (!xsk_skb_destructor_is_addr(skb)) { + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + xsk_addr->num_descs++; + } } static u32 xsk_get_num_desc(struct sk_buff *skb) { - return skb ? (long)skb_shinfo(skb)->destructor_arg : 0; + struct xsk_addrs *xsk_addr; + + if (xsk_skb_destructor_is_addr(skb)) + return 1; + + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + + return xsk_addr->num_descs; +} + +static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool, + struct sk_buff *skb) +{ + u32 num_descs = xsk_get_num_desc(skb); + struct xsk_addrs *xsk_addr; + u32 descs_processed = 0; + unsigned long flags; + u32 idx, i; + + spin_lock_irqsave(&pool->cq_prod_lock, flags); + idx = xskq_get_prod(pool->cq); + + if (unlikely(num_descs > 1)) { + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + + for (i = 0; i < num_descs; i++) { + xskq_prod_write_addr(pool->cq, idx + descs_processed, + xsk_addr->addrs[i]); + descs_processed++; + } + kmem_cache_free(xsk_tx_generic_cache, xsk_addr); + } else { + xskq_prod_write_addr(pool->cq, idx, + xsk_skb_destructor_get_addr(skb)); + descs_processed++; + } + xskq_prod_submit_n(pool->cq, descs_processed); + spin_unlock_irqrestore(&pool->cq_prod_lock, flags); +} + +static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n) +{ + spin_lock(&pool->cq_cached_prod_lock); + xskq_prod_cancel_n(pool->cq, n); + spin_unlock(&pool->cq_cached_prod_lock); } -static void xsk_destruct_skb(struct sk_buff *skb) +INDIRECT_CALLABLE_SCOPE +void xsk_destruct_skb(struct sk_buff *skb) { struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta; @@ -576,23 +634,33 @@ static void xsk_destruct_skb(struct sk_buff *skb) *compl->tx_timestamp = ktime_get_tai_fast_ns(); } - xsk_cq_submit_locked(xdp_sk(skb->sk)->pool, xsk_get_num_desc(skb)); + xsk_cq_submit_addr_locked(xdp_sk(skb->sk)->pool, skb); sock_wfree(skb); } -static void xsk_set_destructor_arg(struct sk_buff *skb) +static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, + u64 addr) { - long num = xsk_get_num_desc(xdp_sk(skb->sk)->skb) + 1; - - skb_shinfo(skb)->destructor_arg = (void *)num; + skb->dev = xs->dev; + skb->priority = READ_ONCE(xs->sk.sk_priority); + skb->mark = READ_ONCE(xs->sk.sk_mark); + skb->destructor = xsk_destruct_skb; + xsk_skb_destructor_set_addr(skb, addr); } static void xsk_consume_skb(struct sk_buff *skb) { struct xdp_sock *xs = xdp_sk(skb->sk); + u32 num_descs = xsk_get_num_desc(skb); + struct xsk_addrs *xsk_addr; + + if (unlikely(num_descs > 1)) { + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + kmem_cache_free(xsk_tx_generic_cache, xsk_addr); + } skb->destructor = sock_wfree; - xsk_cq_cancel_locked(xs->pool, xsk_get_num_desc(skb)); + xsk_cq_cancel_locked(xs->pool, num_descs); /* Free skb without triggering the perf drop trace */ consume_skb(skb); xs->skb = NULL; @@ -604,6 +672,45 @@ static void xsk_drop_skb(struct sk_buff *skb) xsk_consume_skb(skb); } +static int xsk_skb_metadata(struct sk_buff 
*skb, void *buffer, + struct xdp_desc *desc, struct xsk_buff_pool *pool, + u32 hr) +{ + struct xsk_tx_metadata *meta = NULL; + + if (unlikely(pool->tx_metadata_len == 0)) + return -EINVAL; + + meta = buffer - pool->tx_metadata_len; + if (unlikely(!xsk_buff_valid_tx_metadata(meta))) + return -EINVAL; + + if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { + if (unlikely(meta->request.csum_start + + meta->request.csum_offset + + sizeof(__sum16) > desc->len)) + return -EINVAL; + + skb->csum_start = hr + meta->request.csum_start; + skb->csum_offset = meta->request.csum_offset; + skb->ip_summed = CHECKSUM_PARTIAL; + + if (unlikely(pool->tx_sw_csum)) { + int err; + + err = skb_checksum_help(skb); + if (err) + return err; + } + } + + if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) + skb->skb_mstamp_ns = meta->request.launch_time; + xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); + + return 0; +} + static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, struct xdp_desc *desc) { @@ -615,6 +722,9 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, int err, i; u64 addr; + addr = desc->addr; + buffer = xsk_buff_raw_get_data(pool, addr); + if (!skb) { hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); @@ -623,13 +733,39 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, return ERR_PTR(err); skb_reserve(skb, hr); + + xsk_skb_init_misc(skb, xs, desc->addr); + if (desc->options & XDP_TX_METADATA) { + err = xsk_skb_metadata(skb, buffer, desc, pool, hr); + if (unlikely(err)) + return ERR_PTR(err); + } + } else { + struct xsk_addrs *xsk_addr; + + if (xsk_skb_destructor_is_addr(skb)) { + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, + GFP_KERNEL); + if (!xsk_addr) + return ERR_PTR(-ENOMEM); + + xsk_addr->num_descs = 1; + xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; + } else { + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + } + + /* in case of -EOVERFLOW that could happen below, + * xsk_consume_skb() will release this node as whole skb + * would be dropped, which implies freeing all list elements + */ + xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; } - addr = desc->addr; len = desc->len; ts = pool->unaligned ? 
len : pool->chunk_size; - buffer = xsk_buff_raw_get_data(pool, addr); offset = offset_in_page(buffer); addr = buffer - pool->addrs; @@ -660,16 +796,15 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { - struct xsk_tx_metadata *meta = NULL; struct net_device *dev = xs->dev; struct sk_buff *skb = xs->skb; - bool first_frag = false; int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { skb = xsk_build_skb_zerocopy(xs, desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); + skb = NULL; goto free_err; } } else { @@ -680,8 +815,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, len = desc->len; if (!skb) { - first_frag = true; - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); tr = dev->needed_tailroom; skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); @@ -694,11 +827,35 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, err = skb_store_bits(skb, 0, buffer, len); if (unlikely(err)) goto free_err; + + xsk_skb_init_misc(skb, xs, desc->addr); + if (desc->options & XDP_TX_METADATA) { + err = xsk_skb_metadata(skb, buffer, desc, + xs->pool, hr); + if (unlikely(err)) + goto free_err; + } } else { int nr_frags = skb_shinfo(skb)->nr_frags; + struct xsk_addrs *xsk_addr; struct page *page; u8 *vaddr; + if (xsk_skb_destructor_is_addr(skb)) { + xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, + GFP_KERNEL); + if (!xsk_addr) { + err = -ENOMEM; + goto free_err; + } + + xsk_addr->num_descs = 1; + xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb); + skb_shinfo(skb)->destructor_arg = (void *)xsk_addr; + } else { + xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg; + } + if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) { err = -EOVERFLOW; goto free_err; @@ -716,60 +873,22 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE); refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc); - } - - if (first_frag && desc->options & XDP_TX_METADATA) { - if (unlikely(xs->pool->tx_metadata_len == 0)) { - err = -EINVAL; - goto free_err; - } - - meta = buffer - xs->pool->tx_metadata_len; - if (unlikely(!xsk_buff_valid_tx_metadata(meta))) { - err = -EINVAL; - goto free_err; - } - if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { - if (unlikely(meta->request.csum_start + - meta->request.csum_offset + - sizeof(__sum16) > len)) { - err = -EINVAL; - goto free_err; - } - - skb->csum_start = hr + meta->request.csum_start; - skb->csum_offset = meta->request.csum_offset; - skb->ip_summed = CHECKSUM_PARTIAL; - - if (unlikely(xs->pool->tx_sw_csum)) { - err = skb_checksum_help(skb); - if (err) - goto free_err; - } - } - - if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) - skb->skb_mstamp_ns = meta->request.launch_time; + xsk_addr->addrs[xsk_addr->num_descs] = desc->addr; } } - skb->dev = dev; - skb->priority = READ_ONCE(xs->sk.sk_priority); - skb->mark = READ_ONCE(xs->sk.sk_mark); - skb->destructor = xsk_destruct_skb; - xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); - xsk_set_destructor_arg(skb); + xsk_inc_num_desc(skb); return skb; free_err: - if (first_frag && skb) + if (skb && !skb_shinfo(skb)->nr_frags) kfree_skb(skb); if (err == -EOVERFLOW) { /* Drop the packet */ - xsk_set_destructor_arg(xs->skb); + xsk_inc_num_desc(xs->skb); xsk_drop_skb(xs->skb); xskq_cons_release(xs->tx); } else { @@ -812,7 +931,7 @@ static int __xsk_generic_xmit(struct sock *sk) * if there is space in it. 
This avoids having to implement * any buffering in the Tx path. */ - err = xsk_cq_reserve_addr_locked(xs->pool, desc.addr); + err = xsk_cq_reserve_locked(xs->pool); if (err) { err = -EAGAIN; goto out; @@ -1153,7 +1272,7 @@ static bool xsk_validate_queues(struct xdp_sock *xs) return xs->fq_tmp && xs->cq_tmp; } -static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; struct sock *sk = sock->sk; @@ -1815,8 +1934,18 @@ static int __init xsk_init(void) if (err) goto out_pernet; + xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache", + sizeof(struct xsk_addrs), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!xsk_tx_generic_cache) { + err = -ENOMEM; + goto out_unreg_notif; + } + return 0; +out_unreg_notif: + unregister_netdevice_notifier(&xsk_netdev_notifier); out_pernet: unregister_pernet_subsys(&xsk_net_ops); out_sk: diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index aa9788f20d0d..51526034c42a 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -12,26 +12,22 @@ void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs) { - unsigned long flags; - if (!xs->tx) return; - spin_lock_irqsave(&pool->xsk_tx_list_lock, flags); + spin_lock(&pool->xsk_tx_list_lock); list_add_rcu(&xs->tx_list, &pool->xsk_tx_list); - spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags); + spin_unlock(&pool->xsk_tx_list_lock); } void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs) { - unsigned long flags; - if (!xs->tx) return; - spin_lock_irqsave(&pool->xsk_tx_list_lock, flags); + spin_lock(&pool->xsk_tx_list_lock); list_del_rcu(&xs->tx_list); - spin_unlock_irqrestore(&pool->xsk_tx_list_lock, flags); + spin_unlock(&pool->xsk_tx_list_lock); } void xp_destroy(struct xsk_buff_pool *pool) @@ -94,7 +90,8 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs, INIT_LIST_HEAD(&pool->xskb_list); INIT_LIST_HEAD(&pool->xsk_tx_list); spin_lock_init(&pool->xsk_tx_list_lock); - spin_lock_init(&pool->cq_lock); + spin_lock_init(&pool->cq_prod_lock); + spin_lock_init(&pool->cq_cached_prod_lock); refcount_set(&pool->users, 1); pool->fq = xs->fq_tmp; @@ -158,10 +155,6 @@ static void xp_disable_drv_zc(struct xsk_buff_pool *pool) } } -#define NETDEV_XDP_ACT_ZC (NETDEV_XDP_ACT_BASIC | \ - NETDEV_XDP_ACT_REDIRECT | \ - NETDEV_XDP_ACT_XSK_ZEROCOPY) - int xp_assign_dev(struct xsk_buff_pool *pool, struct net_device *netdev, u16 queue_id, u16 flags) { @@ -203,7 +196,7 @@ int xp_assign_dev(struct xsk_buff_pool *pool, /* For copy-mode, we are done. 
*/ return 0; - if ((netdev->xdp_features & NETDEV_XDP_ACT_ZC) != NETDEV_XDP_ACT_ZC) { + if ((netdev->xdp_features & NETDEV_XDP_ACT_XSK) != NETDEV_XDP_ACT_XSK) { err = -EOPNOTSUPP; goto err_unreg_pool; } diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 46d87e961ad6..1eb8d9f8b104 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -143,14 +143,24 @@ static inline bool xp_unused_options_set(u32 options) static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 addr = desc->addr - pool->tx_metadata_len; - u64 len = desc->len + pool->tx_metadata_len; - u64 offset = addr & (pool->chunk_size - 1); + u64 len = desc->len; + u64 addr, offset; - if (!desc->len) + if (!len) return false; - if (offset + len > pool->chunk_size) + /* Can overflow if desc->addr < pool->tx_metadata_len */ + if (check_sub_overflow(desc->addr, pool->tx_metadata_len, &addr)) + return false; + + offset = addr & (pool->chunk_size - 1); + + /* + * Can't overflow: @offset is guaranteed to be < ``U32_MAX`` + * (pool->chunk_size is ``u32``), @len is guaranteed + * to be <= ``U32_MAX``. + */ + if (offset + len + pool->tx_metadata_len > pool->chunk_size) return false; if (addr >= pool->addrs_cnt) @@ -158,27 +168,42 @@ static inline bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, if (xp_unused_options_set(desc->options)) return false; + return true; } static inline bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) { - u64 addr = xp_unaligned_add_offset_to_addr(desc->addr) - pool->tx_metadata_len; - u64 len = desc->len + pool->tx_metadata_len; + u64 len = desc->len; + u64 addr, end; - if (!desc->len) + if (!len) return false; + /* Can't overflow: @len is guaranteed to be <= ``U32_MAX`` */ + len += pool->tx_metadata_len; if (len > pool->chunk_size) return false; - if (addr >= pool->addrs_cnt || addr + len > pool->addrs_cnt || - xp_desc_crosses_non_contig_pg(pool, addr, len)) + /* Can overflow if desc->addr is close to 0 */ + if (check_sub_overflow(xp_unaligned_add_offset_to_addr(desc->addr), + pool->tx_metadata_len, &addr)) + return false; + + if (addr >= pool->addrs_cnt) + return false; + + /* Can overflow if pool->addrs_cnt is high enough */ + if (check_add_overflow(addr, len, &end) || end > pool->addrs_cnt) + return false; + + if (xp_desc_crosses_non_contig_pg(pool, addr, len)) return false; if (xp_unused_options_set(desc->options)) return false; + return true; } @@ -344,6 +369,11 @@ static inline u32 xskq_cons_present_entries(struct xsk_queue *q) /* Functions for producers */ +static inline u32 xskq_get_prod(struct xsk_queue *q) +{ + return READ_ONCE(q->ring->producer); +} + static inline u32 xskq_prod_nb_free(struct xsk_queue *q, u32 max) { u32 free_entries = q->nentries - (q->cached_prod - q->cached_cons); @@ -390,6 +420,13 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) return 0; } +static inline void xskq_prod_write_addr(struct xsk_queue *q, u32 idx, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + ring->desc[idx & q->ring_mask] = addr; +} + static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, u32 nb_entries) { diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig index f0157702718f..4a62817a88f8 100644 --- a/net/xfrm/Kconfig +++ b/net/xfrm/Kconfig @@ -110,14 +110,17 @@ config XFRM_IPCOMP select CRYPTO_DEFLATE config NET_KEY - tristate "PF_KEY sockets" + tristate "PF_KEY sockets (deprecated)" select XFRM_ALGO help PF_KEYv2 socket 
family, compatible to KAME ones. - They are required if you are going to use IPsec tools ported - from KAME. - Say Y unless you know what you are doing. + The PF_KEYv2 socket interface is deprecated and + scheduled for removal. All maintained IKE daemons + no longer need PF_KEY sockets. Please use the netlink + interface (XFRM_USER) to configure IPsec. + + If unsure, say N. config NET_KEY_MIGRATE bool "PF_KEY MIGRATE" diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index fc7a603b04f1..bf744ac9d5a7 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -555,14 +555,10 @@ static void espintcp_close(struct sock *sk, long timeout) static __poll_t espintcp_poll(struct file *file, struct socket *sock, poll_table *wait) { - __poll_t mask = datagram_poll(file, sock, wait); struct sock *sk = sock->sk; struct espintcp_ctx *ctx = espintcp_getctx(sk); - if (!skb_queue_empty(&ctx->ike_queue)) - mask |= EPOLLIN | EPOLLRDNORM; - - return mask; + return datagram_poll_queue(file, sock, wait, &ctx->ike_queue); } static void build_protos(struct proto *espintcp_prot, diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index d2819baea414..52ae0e034d29 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -155,7 +155,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur return skb; } - if (skb_is_gso(skb) && unlikely(xmit_xfrm_check_overflow(skb))) { + if (skb_is_gso(skb) && (unlikely(x->xso.dev != dev) || + unlikely(xmit_xfrm_check_overflow(skb)))) { struct sk_buff *segs; /* Packet got rerouted, fixup features and segment it. */ @@ -415,10 +416,12 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) struct net_device *dev = x->xso.dev; bool check_tunnel_size; - if (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED) + if (!x->type_offload || + (x->xso.type == XFRM_DEV_OFFLOAD_UNSPECIFIED && x->encap)) return false; - if ((dev == xfrm_dst_path(dst)->dev) && !xdst->child->xfrm) { + if ((!dev || dev == xfrm_dst_path(dst)->dev) && + !xdst->child->xfrm) { mtu = xfrm_state_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) goto ok; @@ -430,9 +433,12 @@ bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) return false; ok: + if (!dev) + return true; + check_tunnel_size = x->xso.type == XFRM_DEV_OFFLOAD_PACKET && x->props.mode == XFRM_MODE_TUNNEL; - switch (x->props.family) { + switch (skb_dst(skb)->ops->family) { case AF_INET: /* Check for IPv4 options */ if (ip_hdr(skb)->ihl != 5) diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index c9ddef869aa5..4ed346e682c7 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -505,6 +505,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) async = 1; dev_put(skb->dev); seq = XFRM_SKB_CB(skb)->seq.input.low; + spin_lock(&x->lock); goto resume; } /* GRO call */ @@ -541,9 +542,11 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); goto drop; } + + nexthdr = x->type_offload->input_tail(x, skb); } - goto lock; + goto process; } family = XFRM_SPI_SKB_CB(skb)->family; @@ -611,7 +614,12 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto drop; } -lock: +process: + seq_hi = htonl(xfrm_replay_seqhi(x, seq)); + + XFRM_SKB_CB(skb)->seq.input.low = seq; + XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; + spin_lock(&x->lock); if (unlikely(x->km.state != XFRM_STATE_VALID)) { @@ -638,21 +646,13 @@ lock: goto drop_unlock; } - spin_unlock(&x->lock); - 
if (xfrm_tunnel_check(skb, x, family)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMODEERROR); - goto drop; + goto drop_unlock; } - seq_hi = htonl(xfrm_replay_seqhi(x, seq)); - - XFRM_SKB_CB(skb)->seq.input.low = seq; - XFRM_SKB_CB(skb)->seq.input.hi = seq_hi; - - if (crypto_done) { - nexthdr = x->type_offload->input_tail(x, skb); - } else { + if (!crypto_done) { + spin_unlock(&x->lock); dev_hold(skb->dev); nexthdr = x->type->input(x, skb); @@ -660,9 +660,9 @@ lock: return 0; dev_put(skb->dev); + spin_lock(&x->lock); } resume: - spin_lock(&x->lock); if (nexthdr < 0) { if (nexthdr == -EBADMSG) { xfrm_audit_state_icvfail(x, skb, @@ -676,7 +676,7 @@ resume: /* only the first xfrm gets the encap type */ encap_type = 0; - if (xfrm_replay_recheck(x, skb, seq)) { + if (!crypto_done && xfrm_replay_recheck(x, skb, seq)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATESEQERROR); goto drop_unlock; } diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 9077730ff7d0..54222fcbd7fd 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -698,7 +698,7 @@ static void xfrm_get_inner_ipproto(struct sk_buff *skb, struct xfrm_state *x) return; if (x->outer_mode.encap == XFRM_MODE_TUNNEL) { - switch (x->outer_mode.family) { + switch (skb_dst(skb)->ops->family) { case AF_INET: xo->inner_ipproto = ip_hdr(skb)->protocol; break; @@ -772,8 +772,12 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) /* Exclusive direct xmit for tunnel mode, as * some filtering or matching rules may apply * in transport mode. + * Locally generated packets also require + * the normal XFRM path for L2 header setup, + * as the hardware needs the L2 header to match + * for encryption, so skip direct output as well. */ - if (x->props.mode == XFRM_MODE_TUNNEL) + if (x->props.mode == XFRM_MODE_TUNNEL && !skb->sk) return xfrm_dev_direct_output(sk, x, skb); return xfrm_output_resume(sk, skb, 0); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c5035a9bc3bb..62486f866975 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2594,7 +2594,7 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, static dscp_t xfrm_get_dscp(const struct flowi *fl, int family) { if (family == AF_INET) - return inet_dsfield_to_dscp(fl->u.ip4.flowi4_tos); + return fl->u.ip4.flowi4_dscp; return 0; } @@ -3462,7 +3462,7 @@ decode_session4(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reve } fl4->flowi4_proto = flkeys->basic.ip_proto; - fl4->flowi4_tos = flkeys->ip.tos & ~INET_ECN_MASK; + fl4->flowi4_dscp = inet_dsfield_to_dscp(flkeys->ip.tos); } #if IS_ENABLED(CONFIG_IPV6) @@ -3594,7 +3594,7 @@ static bool xfrm_icmp_flow_decode(struct sk_buff *skb, unsigned short family, fl1->flowi_oif = fl->flowi_oif; fl1->flowi_mark = fl->flowi_mark; - fl1->flowi_tos = fl->flowi_tos; + fl1->flowi_dscp = fl->flowi_dscp; nf_nat_decode_session(newskb, fl1, family); ret = false; @@ -3881,12 +3881,18 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) } skb_dst_force(skb); - if (!skb_dst(skb)) { + dst = skb_dst(skb); + if (!dst) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } - dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); + /* ignore return value from skb_dstref_steal, xfrm_lookup takes + * care of dropping the refcnt if needed. 
+ */ + skb_dstref_steal(skb); + + dst = xfrm_lookup(net, dst, &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { res = 0; dst = NULL; diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index 8e07dd614b0b..5e1fd6b1d503 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -45,21 +45,21 @@ static const struct snmp_mib xfrm_mib_list[] = { SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR), SNMP_MIB_ITEM("XfrmInIptfsError", LINUX_MIB_XFRMINIPTFSERROR), SNMP_MIB_ITEM("XfrmOutNoQueueSpace", LINUX_MIB_XFRMOUTNOQSPACE), - SNMP_MIB_SENTINEL }; static int xfrm_statistics_seq_show(struct seq_file *seq, void *v) { - unsigned long buff[LINUX_MIB_XFRMMAX]; + unsigned long buff[ARRAY_SIZE(xfrm_mib_list)]; + const int cnt = ARRAY_SIZE(xfrm_mib_list); struct net *net = seq->private; int i; - memset(buff, 0, sizeof(unsigned long) * LINUX_MIB_XFRMMAX); + memset(buff, 0, sizeof(buff)); xfrm_state_update_stats(net); - snmp_get_cpu_field_batch(buff, xfrm_mib_list, - net->mib.xfrm_statistics); - for (i = 0; xfrm_mib_list[i].name; i++) + snmp_get_cpu_field_batch_cnt(buff, xfrm_mib_list, cnt, + net->mib.xfrm_statistics); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name, buff[i]); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 77db3b5fe4ac..9e14e453b55c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -592,6 +592,7 @@ void xfrm_state_free(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_free); +static void xfrm_state_delete_tunnel(struct xfrm_state *x); static void xfrm_state_gc_destroy(struct xfrm_state *x) { if (x->mode_cbs && x->mode_cbs->destroy_state) @@ -607,6 +608,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) kfree(x->replay_esn); kfree(x->preplay_esn); xfrm_unset_type_offload(x); + xfrm_state_delete_tunnel(x); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); @@ -806,7 +808,6 @@ void __xfrm_state_destroy(struct xfrm_state *x) } EXPORT_SYMBOL(__xfrm_state_destroy); -static void xfrm_state_delete_tunnel(struct xfrm_state *x); int __xfrm_state_delete(struct xfrm_state *x) { struct net *net = xs_net(x); @@ -2073,6 +2074,7 @@ static struct xfrm_state *xfrm_state_clone_and_setup(struct xfrm_state *orig, return x; error: + x->km.state = XFRM_STATE_DEAD; xfrm_state_put(x); out: return NULL; @@ -2157,11 +2159,15 @@ struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, xfrm_state_insert(xc); } else { if (xfrm_state_add(xc) < 0) - goto error; + goto error_add; } return xc; +error_add: + if (xuo) + xfrm_dev_state_delete(xc); error: + xc->km.state = XFRM_STATE_DEAD; xfrm_state_put(xc); return NULL; } @@ -2191,14 +2197,18 @@ int xfrm_state_update(struct xfrm_state *x) } if (x1->km.state == XFRM_STATE_ACQ) { - if (x->dir && x1->dir != x->dir) + if (x->dir && x1->dir != x->dir) { + to_put = x1; goto out; + } __xfrm_state_insert(x); x = NULL; } else { - if (x1->dir != x->dir) + if (x1->dir != x->dir) { + to_put = x1; goto out; + } } err = 0; @@ -2583,6 +2593,8 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, for (h = 0; h < range; h++) { u32 spi = (low == high) ? 
low : get_random_u32_inclusive(low, high); + if (spi == 0) + goto next; newspi = htonl(spi); spin_lock_bh(&net->xfrm.xfrm_state_lock); @@ -2598,6 +2610,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, xfrm_state_put(x0); spin_unlock_bh(&net->xfrm.xfrm_state_lock); +next: if (signal_pending(current)) { err = -ERESTARTSYS; goto unlock; @@ -3295,21 +3308,25 @@ out_bydst: void xfrm_state_fini(struct net *net) { unsigned int sz; + int i; flush_work(&net->xfrm.state_hash_work); - xfrm_state_flush(net, IPSEC_PROTO_ANY, false); + xfrm_state_flush(net, 0, false); flush_work(&xfrm_state_gc_work); WARN_ON(!list_empty(&net->xfrm.state_all)); + for (i = 0; i <= net->xfrm.state_hmask; i++) { + WARN_ON(!hlist_empty(net->xfrm.state_byseq + i)); + WARN_ON(!hlist_empty(net->xfrm.state_byspi + i)); + WARN_ON(!hlist_empty(net->xfrm.state_bysrc + i)); + WARN_ON(!hlist_empty(net->xfrm.state_bydst + i)); + } + sz = (net->xfrm.state_hmask + 1) * sizeof(struct hlist_head); - WARN_ON(!hlist_empty(net->xfrm.state_byseq)); xfrm_hash_free(net->xfrm.state_byseq, sz); - WARN_ON(!hlist_empty(net->xfrm.state_byspi)); xfrm_hash_free(net->xfrm.state_byspi, sz); - WARN_ON(!hlist_empty(net->xfrm.state_bysrc)); xfrm_hash_free(net->xfrm.state_bysrc, sz); - WARN_ON(!hlist_empty(net->xfrm.state_bydst)); xfrm_hash_free(net->xfrm.state_bydst, sz); free_percpu(net->xfrm.state_cache_input); } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 684239018bec..403b5ecac2c5 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -593,7 +593,7 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, if (!p) return -ENOMEM; - strcpy(p->alg_name, algo->name); + strscpy(p->alg_name, algo->name); *algpp = p; return 0; } @@ -620,7 +620,7 @@ static int attach_crypt(struct xfrm_state *x, struct nlattr *rta, if (!p) return -ENOMEM; - strcpy(p->alg_name, algo->name); + strscpy(p->alg_name, algo->name); x->ealg = p; x->geniv = algo->uinfo.encr.geniv; return 0; @@ -649,7 +649,7 @@ static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props, if (!p) return -ENOMEM; - strcpy(p->alg_name, algo->name); + strscpy(p->alg_name, algo->name); p->alg_key_len = ualg->alg_key_len; p->alg_trunc_len = algo->uinfo.auth.icv_truncbits; memcpy(p->alg_key, ualg->alg_key, (ualg->alg_key_len + 7) / 8); @@ -684,7 +684,7 @@ static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props, if (!p) return -ENOMEM; - strcpy(p->alg_name, algo->name); + strscpy(p->alg_name, algo->name); if (!p->alg_trunc_len) p->alg_trunc_len = algo->uinfo.auth.icv_truncbits; @@ -714,7 +714,7 @@ static int attach_aead(struct xfrm_state *x, struct nlattr *rta, if (!p) return -ENOMEM; - strcpy(p->alg_name, algo->name); + strscpy(p->alg_name, algo->name); x->aead = p; x->geniv = algo->uinfo.aead.geniv; return 0; @@ -947,8 +947,11 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, if (attrs[XFRMA_SA_PCPU]) { x->pcpu_num = nla_get_u32(attrs[XFRMA_SA_PCPU]); - if (x->pcpu_num >= num_possible_cpus()) + if (x->pcpu_num >= num_possible_cpus()) { + err = -ERANGE; + NL_SET_ERR_MSG(extack, "pCPU number too big"); goto error; + } } err = __xfrm_init_state(x, extack); @@ -3035,6 +3038,9 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, } xfrm_state_free(x); + xfrm_dev_policy_delete(xp); + xfrm_dev_policy_free(xp); + security_xfrm_policy_free(xp->security); kfree(xp); return 0; |
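
The xsk_queue.h hunk above hardens xp_aligned_validate_desc() and xp_unaligned_validate_desc() by rejecting descriptors whose address arithmetic would wrap, using the kernel's check_sub_overflow()/check_add_overflow() helpers instead of computing first and range-checking afterwards. The following is a minimal user-space sketch of the same pattern, not the kernel code: struct pool, struct desc and aligned_desc_ok() are reduced stand-ins (assumptions, not the real xsk_buff_pool/xdp_desc definitions), and the GCC/Clang __builtin_sub_overflow() builtin stands in for the <linux/overflow.h> helper it backs.

	/* Illustrative sketch only; struct names and aligned_desc_ok() are invented here. */
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	struct pool { uint64_t addrs_cnt; uint32_t chunk_size; uint32_t tx_metadata_len; };
	struct desc { uint64_t addr; uint32_t len; };

	static bool aligned_desc_ok(const struct pool *pool, const struct desc *d)
	{
		uint64_t addr, offset;
		uint64_t len = d->len;

		if (!len)
			return false;

		/* Reject descriptors whose address would underflow past the metadata area. */
		if (__builtin_sub_overflow(d->addr, pool->tx_metadata_len, &addr))
			return false;

		offset = addr & (pool->chunk_size - 1);	/* chunk_size assumed power of two */

		/* offset < 2^32 and len <= 2^32, so this u64 sum cannot wrap. */
		if (offset + len + pool->tx_metadata_len > pool->chunk_size)
			return false;

		return addr < pool->addrs_cnt;
	}

	int main(void)
	{
		struct pool p = { .addrs_cnt = 1 << 20, .chunk_size = 2048, .tx_metadata_len = 16 };
		struct desc bad  = { .addr = 8, .len = 64 };		/* addr < tx_metadata_len: rejected */
		struct desc good = { .addr = 4096 + 16, .len = 64 };

		printf("bad: %d, good: %d\n", aligned_desc_ok(&p, &bad), aligned_desc_ok(&p, &good));
		return 0;
	}

The point of the pattern is that the subtraction is checked before the result is used, so a malicious desc->addr below tx_metadata_len can no longer produce a huge wrapped address that slips past the later range checks.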
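
The xfrm_proc.c hunk drops the SNMP_MIB_SENTINEL terminator from xfrm_mib_list[] and sizes both the snapshot buffer and the output loop with ARRAY_SIZE() of the table itself, so the buffer can no longer drift out of sync with a separately maintained LINUX_MIB_XFRMMAX constant. A stand-alone sketch of that sizing pattern follows; the local ARRAY_SIZE macro, struct mib_item and the toy table are assumptions for illustration, not the kernel definitions.

	#include <stdio.h>
	#include <string.h>

	#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

	struct mib_item { const char *name; };

	/* Bounded table: no NULL sentinel entry, the array length is the count. */
	static const struct mib_item mib_list[] = {
		{ "XfrmInError" },
		{ "XfrmOutError" },
		{ "XfrmFwdHdrError" },
	};

	int main(void)
	{
		unsigned long buff[ARRAY_SIZE(mib_list)];
		const int cnt = ARRAY_SIZE(mib_list);
		int i;

		/* Buffer and loop bound both derive from the table, so they cannot diverge. */
		memset(buff, 0, sizeof(buff));

		for (i = 0; i < cnt; i++)
			printf("%-24s\t%lu\n", mib_list[i].name, buff[i]);

		return 0;
	}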
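
The xfrm_user.c hunks swap strcpy() for strscpy() when filling the fixed-size alg_name buffers, so an over-long algorithm name is truncated instead of overflowing the destination. strscpy() is kernel-only; the rough user-space stand-in below (bounded_copy() is an invented name) only mimics the bounded, always-NUL-terminated copy and omits the -E2BIG truncation reporting of the real helper.

	#include <stdio.h>
	#include <string.h>

	/* Hypothetical helper approximating strscpy(); assumes size > 0. */
	static size_t bounded_copy(char *dst, const char *src, size_t size)
	{
		size_t len = strnlen(src, size - 1);

		memcpy(dst, src, len);
		dst[len] = '\0';
		return len;
	}

	int main(void)
	{
		char alg_name[16];

		bounded_copy(alg_name, "hmac(sha256-with-a-very-long-name)", sizeof(alg_name));
		printf("%s\n", alg_name);	/* truncated safely, always NUL-terminated */
		return 0;
	}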
