Diffstat (limited to 'net')
185 files changed, 1893 insertions, 1267 deletions
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 407b2335f091..790b54a7cbe3 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -504,28 +504,6 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } -/* - * vlan network devices have devices nesting below it, and are a special - * "super class" of normal network devices; split their locks off into a - * separate class since they always nest. - */ -static struct lock_class_key vlan_netdev_xmit_lock_key; -static struct lock_class_key vlan_netdev_addr_lock_key; - -static void vlan_dev_set_lockdep_one(struct net_device *dev, - struct netdev_queue *txq, - void *unused) -{ - lockdep_set_class(&txq->_xmit_lock, &vlan_netdev_xmit_lock_key); -} - -static void vlan_dev_set_lockdep_class(struct net_device *dev) -{ - lockdep_set_class(&dev->addr_list_lock, - &vlan_netdev_addr_lock_key); - netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL); -} - static __be16 vlan_parse_protocol(const struct sk_buff *skb) { struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); @@ -627,7 +605,7 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); - vlan_dev_set_lockdep_class(dev); + netdev_lockdep_set_classes(dev); vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c index 7825c129742a..87b959da00cd 100644 --- a/net/8021q/vlanproc.c +++ b/net/8021q/vlanproc.c @@ -163,48 +163,34 @@ void vlan_proc_rem_dev(struct net_device *vlandev) * The following few functions build the content of /proc/net/vlan/config */ -/* start read of /proc/net/vlan/config */ -static void *vlan_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(rcu) +static void *vlan_seq_from_index(struct seq_file *seq, loff_t *pos) { + unsigned long ifindex = *pos; struct net_device *dev; - struct net *net = seq_file_net(seq); - loff_t i = 1; - - rcu_read_lock(); - if (*pos == 0) - return SEQ_START_TOKEN; - for_each_netdev_rcu(net, dev) { + for_each_netdev_dump(seq_file_net(seq), dev, ifindex) { if (!is_vlan_dev(dev)) continue; - - if (i++ == *pos) - return dev; + *pos = dev->ifindex; + return dev; } + return NULL; +} + +static void *vlan_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(rcu) +{ + rcu_read_lock(); + if (*pos == 0) + return SEQ_START_TOKEN; - return NULL; + return vlan_seq_from_index(seq, pos); } static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct net_device *dev; - struct net *net = seq_file_net(seq); - ++*pos; - - dev = v; - if (v == SEQ_START_TOKEN) - dev = net_device_entry(&net->dev_base_head); - - for_each_netdev_continue_rcu(net, dev) { - if (!is_vlan_dev(dev)) - continue; - - return dev; - } - - return NULL; + return vlan_seq_from_index(seq, pos); } static void vlan_seq_stop(struct seq_file *seq, void *v) diff --git a/net/Makefile b/net/Makefile index b06b5539e7a6..65bb8c72a35e 100644 --- a/net/Makefile +++ b/net/Makefile @@ -17,7 +17,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ -obj-$(CONFIG_UNIX_SCM) += unix/ +obj-$(CONFIG_UNIX) += unix/ obj-y += ipv6/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 28a939d56090..4c7e85534324 100644 --- a/net/batman-adv/distributed-arp-table.c +++ 
b/net/batman-adv/distributed-arp-table.c @@ -684,7 +684,7 @@ static bool batadv_dat_forward_data(struct batadv_priv *bat_priv, cand = batadv_dat_select_candidates(bat_priv, ip, vid); if (!cand) - goto out; + return ret; batadv_dbg(BATADV_DBG_DAT, bat_priv, "DHT_SEND for %pI4\n", &ip); @@ -728,7 +728,6 @@ free_orig: batadv_orig_node_put(cand[i].orig_node); } -out: kfree(cand); return ret; } diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 5fc754b0b3f7..75119f1ffccc 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -691,29 +691,31 @@ int batadv_throw_uevent(struct batadv_priv *bat_priv, enum batadv_uev_type type, "%s%s", BATADV_UEV_TYPE_VAR, batadv_uev_type_str[type]); if (!uevent_env[0]) - goto out; + goto report_error; uevent_env[1] = kasprintf(GFP_ATOMIC, "%s%s", BATADV_UEV_ACTION_VAR, batadv_uev_action_str[action]); if (!uevent_env[1]) - goto out; + goto free_first_env; /* If the event is DEL, ignore the data field */ if (action != BATADV_UEV_DEL) { uevent_env[2] = kasprintf(GFP_ATOMIC, "%s%s", BATADV_UEV_DATA_VAR, data); if (!uevent_env[2]) - goto out; + goto free_second_env; } ret = kobject_uevent_env(bat_kobj, KOBJ_CHANGE, uevent_env); -out: - kfree(uevent_env[0]); - kfree(uevent_env[1]); kfree(uevent_env[2]); +free_second_env: + kfree(uevent_env[1]); +free_first_env: + kfree(uevent_env[0]); if (ret) +report_error: batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "Impossible to send uevent for (%s,%s,%s) event (err: %d)\n", batadv_uev_type_str[type], diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 870dcd7f1786..8ca854a75a32 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2024.0" +#define BATADV_SOURCE_VERSION "2024.1" #endif /* B.A.T.M.A.N. 
parameters */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 1f7ed9d4f6fd..0954757f0b8b 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -15,7 +15,6 @@ #include <linux/cache.h> #include <linux/err.h> #include <linux/errno.h> -#include <linux/export.h> #include <linux/genetlink.h> #include <linux/gfp.h> #include <linux/if_ether.h> diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c index 8906f7bdf4a9..02de71719aed 100644 --- a/net/bpf/bpf_dummy_struct_ops.c +++ b/net/bpf/bpf_dummy_struct_ops.c @@ -7,7 +7,7 @@ #include <linux/bpf.h> #include <linux/btf.h> -extern struct bpf_struct_ops bpf_bpf_dummy_ops; +static struct bpf_struct_ops bpf_bpf_dummy_ops; /* A common type for test_N with return value in bpf_dummy_ops */ typedef int (*dummy_ops_test_ret_fn)(struct bpf_dummy_ops_state *state, ...); @@ -22,6 +22,8 @@ struct bpf_dummy_ops_test_args { struct bpf_dummy_ops_state state; }; +static struct btf *bpf_dummy_ops_btf; + static struct bpf_dummy_ops_test_args * dummy_ops_init_args(const union bpf_attr *kattr, unsigned int nr) { @@ -90,9 +92,15 @@ int bpf_struct_ops_test_run(struct bpf_prog *prog, const union bpf_attr *kattr, void *image = NULL; unsigned int op_idx; int prog_ret; + s32 type_id; int err; - if (prog->aux->attach_btf_id != st_ops->type_id) + type_id = btf_find_by_name_kind(bpf_dummy_ops_btf, + bpf_bpf_dummy_ops.name, + BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + if (prog->aux->attach_btf_id != type_id) return -EOPNOTSUPP; func_proto = prog->aux->attach_func_proto; @@ -148,6 +156,7 @@ out: static int bpf_dummy_init(struct btf *btf) { + bpf_dummy_ops_btf = btf; return 0; } @@ -247,7 +256,7 @@ static struct bpf_dummy_ops __bpf_bpf_dummy_ops = { .test_sleepable = bpf_dummy_test_sleepable, }; -struct bpf_struct_ops bpf_bpf_dummy_ops = { +static struct bpf_struct_ops bpf_bpf_dummy_ops = { .verifier_ops = &bpf_dummy_verifier_ops, .init = bpf_dummy_init, .check_member = bpf_dummy_ops_check_member, @@ -256,4 +265,11 @@ struct bpf_struct_ops bpf_bpf_dummy_ops = { .unreg = bpf_dummy_unreg, .name = "bpf_dummy_ops", .cfi_stubs = &__bpf_bpf_dummy_ops, + .owner = THIS_MODULE, }; + +static int __init bpf_dummy_struct_ops_init(void) +{ + return register_bpf_struct_ops(&bpf_bpf_dummy_ops, bpf_dummy_ops); +} +late_initcall(bpf_dummy_struct_ops_init); diff --git a/net/bridge/br.c b/net/bridge/br.c index ac19b797dbec..2cab878e0a39 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -356,26 +356,21 @@ void br_opt_toggle(struct net_bridge *br, enum net_bridge_opts opt, bool on) clear_bit(opt, &br->options); } -static void __net_exit br_net_exit_batch(struct list_head *net_list) +static void __net_exit br_net_exit_batch_rtnl(struct list_head *net_list, + struct list_head *dev_to_kill) { struct net_device *dev; struct net *net; - LIST_HEAD(list); - - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) for_each_netdev(net, dev) if (netif_is_bridge_master(dev)) - br_dev_delete(dev, &list); - - unregister_netdevice_many(&list); - - rtnl_unlock(); + br_dev_delete(dev, dev_to_kill); } static struct pernet_operations br_net_ops = { - .exit_batch = br_net_exit_batch, + .exit_batch_rtnl = br_net_exit_batch_rtnl, }; static const struct stp_proto br_stp_proto = { diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 65cee0ad3c1b..717e9750614c 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -108,13 +108,6 @@ out: return NETDEV_TX_OK; } -static struct lock_class_key 
bridge_netdev_addr_lock_key; - -static void br_set_lockdep_class(struct net_device *dev) -{ - lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key); -} - static int br_dev_init(struct net_device *dev) { struct net_bridge *br = netdev_priv(dev); @@ -153,7 +146,7 @@ static int br_dev_init(struct net_device *dev) br_fdb_hash_fini(br); } - br_set_lockdep_class(dev); + netdev_lockdep_set_classes(dev); return err; } diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index c622de5eccd0..c77591e63841 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -35,10 +35,7 @@ static struct kmem_cache *br_fdb_cache __read_mostly; int __init br_fdb_init(void) { - br_fdb_cache = kmem_cache_create("bridge_fdb_cache", - sizeof(struct net_bridge_fdb_entry), - 0, - SLAB_HWCACHE_ALIGN, NULL); + br_fdb_cache = KMEM_CACHE(net_bridge_fdb_entry, SLAB_HWCACHE_ALIGN); if (!br_fdb_cache) return -ENOMEM; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 5ad4abfcb7ba..2cf4fc756263 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -455,7 +455,8 @@ static int br_fill_ifinfo(struct sk_buff *skb, u32 filter_mask, const struct net_device *dev, bool getlink) { - u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; + u8 operstate = netif_running(dev) ? READ_ONCE(dev->operstate) : + IF_OPER_DOWN; struct nlattr *af = NULL; struct net_bridge *br; struct ifinfomsg *hdr; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 15f44d026e75..9c2fffb827ab 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -841,7 +841,7 @@ void br_vlan_flush(struct net_bridge *br) vg = br_vlan_group(br); __vlan_flush(br, NULL, vg); RCU_INIT_POINTER(br->vlgrp, NULL); - synchronize_rcu(); + synchronize_net(); __vlan_group_free(vg); } @@ -1372,7 +1372,7 @@ void nbp_vlan_flush(struct net_bridge_port *port) vg = nbp_vlan_group(port); __vlan_flush(port->br, port, vg); RCU_INIT_POINTER(port->vlgrp, NULL); - synchronize_rcu(); + synchronize_net(); __vlan_group_free(vg); } diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index 7f304a19ac1b..104c0125e32e 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -39,6 +39,10 @@ config NF_CONNTRACK_BRIDGE To compile it as a module, choose M here. If unsure, say N. +# old sockopt interface and eval loop +config BRIDGE_NF_EBTABLES_LEGACY + tristate + menuconfig BRIDGE_NF_EBTABLES tristate "Ethernet Bridge tables (ebtables) support" depends on BRIDGE && NETFILTER && NETFILTER_XTABLES @@ -55,6 +59,7 @@ if BRIDGE_NF_EBTABLES # config BRIDGE_EBT_BROUTE tristate "ebt: broute table support" + select BRIDGE_NF_EBTABLES_LEGACY help The ebtables broute table is used to define rules that decide between bridging and routing frames, giving Linux the functionality of a @@ -65,6 +70,7 @@ config BRIDGE_EBT_BROUTE config BRIDGE_EBT_T_FILTER tristate "ebt: filter table support" + select BRIDGE_NF_EBTABLES_LEGACY help The ebtables filter table is used to define frame filtering rules at local input, forwarding and local output. See the man page for @@ -74,6 +80,7 @@ config BRIDGE_EBT_T_FILTER config BRIDGE_EBT_T_NAT tristate "ebt: nat table support" + select BRIDGE_NF_EBTABLES_LEGACY help The ebtables nat table is used to define rules that alter the MAC source address (MAC SNAT) or the MAC destination address (MAC DNAT). 
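For reference, the br_fdb_init() hunk above swaps the open-coded kmem_cache_create() call for the KMEM_CACHE() convenience macro. A rough sketch of what that macro boils down to (simplified from <linux/slab.h>; the exact definition varies between kernel versions):

#define KMEM_CACHE(__struct, __flags) \
	kmem_cache_create(#__struct, sizeof(struct __struct), \
			  __alignof__(struct __struct), (__flags), NULL)

/* so the new call */
br_fdb_cache = KMEM_CACHE(net_bridge_fdb_entry, SLAB_HWCACHE_ALIGN);
/* derives the cache name, object size and alignment from the struct type,
 * matching what the removed lines spelled out by hand (modulo the string
 * name and the explicit alignment, which the old call left at 0).
 */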
diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile index 1c9ce49ab651..b9a1303da977 100644 --- a/net/bridge/netfilter/Makefile +++ b/net/bridge/netfilter/Makefile @@ -9,7 +9,7 @@ obj-$(CONFIG_NFT_BRIDGE_REJECT) += nft_reject_bridge.o # connection tracking obj-$(CONFIG_NF_CONNTRACK_BRIDGE) += nf_conntrack_bridge.o -obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o +obj-$(CONFIG_BRIDGE_NF_EBTABLES_LEGACY) += ebtables.o # tables obj-$(CONFIG_BRIDGE_EBT_BROUTE) += ebtable_broute.o diff --git a/net/can/af_can.c b/net/can/af_can.c index 7343fd487dbe..707576eeeb58 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -865,6 +865,8 @@ static __init int can_init(void) /* check for correct padding to be able to use the structs similarly */ BUILD_BUG_ON(offsetof(struct can_frame, len) != offsetof(struct canfd_frame, len) || + offsetof(struct can_frame, len) != + offsetof(struct canxl_frame, flags) || offsetof(struct can_frame, data) != offsetof(struct canfd_frame, data)); diff --git a/net/can/bcm.c b/net/can/bcm.c index 9168114fc87f..27d5fcf0eac9 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -72,9 +72,11 @@ #define BCM_TIMER_SEC_MAX (400 * 24 * 60 * 60) /* use of last_frames[index].flags */ +#define RX_LOCAL 0x10 /* frame was created on the local host */ +#define RX_OWN 0x20 /* frame was sent via the socket it was received on */ #define RX_RECV 0x40 /* received data for this element */ #define RX_THR 0x80 /* element not been sent due to throttle feature */ -#define BCM_CAN_FLAGS_MASK 0x3F /* to clean private flags after usage */ +#define BCM_CAN_FLAGS_MASK 0x0F /* to clean private flags after usage */ /* get best masking value for can_rx_register() for a given single can_id */ #define REGMASK(id) ((id & CAN_EFF_FLAG) ? \ @@ -138,6 +140,16 @@ static LIST_HEAD(bcm_notifier_list); static DEFINE_SPINLOCK(bcm_notifier_lock); static struct bcm_sock *bcm_busy_notifier; +/* Return pointer to store the extra msg flags for bcm_recvmsg(). + * We use the space of one unsigned int beyond the 'struct sockaddr_can' + * in skb->cb. 
+ */ +static inline unsigned int *bcm_flags(struct sk_buff *skb) +{ + /* return pointer after struct sockaddr_can */ + return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]); +} + static inline struct bcm_sock *bcm_sk(const struct sock *sk) { return (struct bcm_sock *)sk; @@ -325,6 +337,7 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, struct sock *sk = op->sk; unsigned int datalen = head->nframes * op->cfsiz; int err; + unsigned int *pflags; skb = alloc_skb(sizeof(*head) + datalen, gfp_any()); if (!skb) @@ -332,6 +345,14 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, skb_put_data(skb, head, sizeof(*head)); + /* ensure space for sockaddr_can and msg flags */ + sock_skb_cb_check_size(sizeof(struct sockaddr_can) + + sizeof(unsigned int)); + + /* initialize msg flags */ + pflags = bcm_flags(skb); + *pflags = 0; + if (head->nframes) { /* CAN frames starting here */ firstframe = (struct canfd_frame *)skb_tail_pointer(skb); @@ -344,8 +365,14 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, * relevant for updates that are generated by the * BCM, where nframes is 1 */ - if (head->nframes == 1) + if (head->nframes == 1) { + if (firstframe->flags & RX_LOCAL) + *pflags |= MSG_DONTROUTE; + if (firstframe->flags & RX_OWN) + *pflags |= MSG_CONFIRM; + firstframe->flags &= BCM_CAN_FLAGS_MASK; + } } if (has_timestamp) { @@ -360,7 +387,6 @@ static void bcm_send_to_user(struct bcm_op *op, struct bcm_msg_head *head, * containing the interface index. */ - sock_skb_cb_check_size(sizeof(struct sockaddr_can)); addr = (struct sockaddr_can *)skb->cb; memset(addr, 0, sizeof(*addr)); addr->can_family = AF_CAN; @@ -444,7 +470,7 @@ static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data) op->frames_filtered = op->frames_abs = 0; /* this element is not throttled anymore */ - data->flags &= (BCM_CAN_FLAGS_MASK|RX_RECV); + data->flags &= ~RX_THR; memset(&head, 0, sizeof(head)); head.opcode = RX_CHANGED; @@ -465,13 +491,17 @@ static void bcm_rx_changed(struct bcm_op *op, struct canfd_frame *data) */ static void bcm_rx_update_and_send(struct bcm_op *op, struct canfd_frame *lastdata, - const struct canfd_frame *rxdata) + const struct canfd_frame *rxdata, + unsigned char traffic_flags) { memcpy(lastdata, rxdata, op->cfsiz); /* mark as used and throttled by default */ lastdata->flags |= (RX_RECV|RX_THR); + /* add own/local/remote traffic flags */ + lastdata->flags |= traffic_flags; + /* throttling mode inactive ? 
*/ if (!op->kt_ival2) { /* send RX_CHANGED to the user immediately */ @@ -508,7 +538,8 @@ rx_changed_settime: * received data stored in op->last_frames[] */ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, - const struct canfd_frame *rxdata) + const struct canfd_frame *rxdata, + unsigned char traffic_flags) { struct canfd_frame *cf = op->frames + op->cfsiz * index; struct canfd_frame *lcf = op->last_frames + op->cfsiz * index; @@ -521,7 +552,7 @@ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, if (!(lcf->flags & RX_RECV)) { /* received data for the first time => send update to user */ - bcm_rx_update_and_send(op, lcf, rxdata); + bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags); return; } @@ -529,7 +560,7 @@ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, for (i = 0; i < rxdata->len; i += 8) { if ((get_u64(cf, i) & get_u64(rxdata, i)) != (get_u64(cf, i) & get_u64(lcf, i))) { - bcm_rx_update_and_send(op, lcf, rxdata); + bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags); return; } } @@ -537,7 +568,7 @@ static void bcm_rx_cmp_to_index(struct bcm_op *op, unsigned int index, if (op->flags & RX_CHECK_DLC) { /* do a real check in CAN frame length */ if (rxdata->len != lcf->len) { - bcm_rx_update_and_send(op, lcf, rxdata); + bcm_rx_update_and_send(op, lcf, rxdata, traffic_flags); return; } } @@ -644,6 +675,7 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data) struct bcm_op *op = (struct bcm_op *)data; const struct canfd_frame *rxframe = (struct canfd_frame *)skb->data; unsigned int i; + unsigned char traffic_flags; if (op->can_id != rxframe->can_id) return; @@ -673,15 +705,24 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data) return; } + /* compute flags to distinguish between own/local/remote CAN traffic */ + traffic_flags = 0; + if (skb->sk) { + traffic_flags |= RX_LOCAL; + if (skb->sk == op->sk) + traffic_flags |= RX_OWN; + } + if (op->flags & RX_FILTER_ID) { /* the easiest case */ - bcm_rx_update_and_send(op, op->last_frames, rxframe); + bcm_rx_update_and_send(op, op->last_frames, rxframe, + traffic_flags); goto rx_starttimer; } if (op->nframes == 1) { /* simple compare with index 0 */ - bcm_rx_cmp_to_index(op, 0, rxframe); + bcm_rx_cmp_to_index(op, 0, rxframe, traffic_flags); goto rx_starttimer; } @@ -698,7 +739,8 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data) if ((get_u64(op->frames, 0) & get_u64(rxframe, 0)) == (get_u64(op->frames, 0) & get_u64(op->frames + op->cfsiz * i, 0))) { - bcm_rx_cmp_to_index(op, i, rxframe); + bcm_rx_cmp_to_index(op, i, rxframe, + traffic_flags); break; } } @@ -1675,6 +1717,9 @@ static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } + /* assign the flags that have been recorded in bcm_send_to_user() */ + msg->msg_flags |= *(bcm_flags(skb)); + skb_free_datagram(sk, skb); return size; diff --git a/net/can/isotp.c b/net/can/isotp.c index d1c6f206f429..25bac0fafc83 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -381,8 +381,9 @@ static int isotp_rcv_fc(struct isotp_sock *so, struct canfd_frame *cf, int ae) return 1; } - /* get communication parameters only from the first FC frame */ - if (so->tx.state == ISOTP_WAIT_FIRST_FC) { + /* get static/dynamic communication params from first/every FC frame */ + if (so->tx.state == ISOTP_WAIT_FIRST_FC || + so->opt.flags & CAN_ISOTP_DYN_FC_PARMS) { so->txfc.bs = cf->data[ae + 1]; so->txfc.stmin = cf->data[ae + 2]; diff --git 
a/net/can/raw.c b/net/can/raw.c index e6b822624ba2..cb8e6f788af8 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -91,6 +91,10 @@ struct raw_sock { int recv_own_msgs; int fd_frames; int xl_frames; + struct can_raw_vcid_options raw_vcid_opts; + canid_t tx_vcid_shifted; + canid_t rx_vcid_shifted; + canid_t rx_vcid_mask_shifted; int join_filters; int count; /* number of active filters */ struct can_filter dfilter; /* default/single filter */ @@ -134,10 +138,29 @@ static void raw_rcv(struct sk_buff *oskb, void *data) return; /* make sure to not pass oversized frames to the socket */ - if ((!ro->fd_frames && can_is_canfd_skb(oskb)) || - (!ro->xl_frames && can_is_canxl_skb(oskb))) + if (!ro->fd_frames && can_is_canfd_skb(oskb)) return; + if (can_is_canxl_skb(oskb)) { + struct canxl_frame *cxl = (struct canxl_frame *)oskb->data; + + /* make sure to not pass oversized frames to the socket */ + if (!ro->xl_frames) + return; + + /* filter CAN XL VCID content */ + if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_RX_FILTER) { + /* apply VCID filter if user enabled the filter */ + if ((cxl->prio & ro->rx_vcid_mask_shifted) != + (ro->rx_vcid_shifted & ro->rx_vcid_mask_shifted)) + return; + } else { + /* no filter => do not forward VCID tagged frames */ + if (cxl->prio & CANXL_VCID_MASK) + return; + } + } + /* eliminate multiple filter matches for the same skb */ if (this_cpu_ptr(ro->uniq)->skb == oskb && this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) { @@ -698,6 +721,19 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, ro->fd_frames = ro->xl_frames; break; + case CAN_RAW_XL_VCID_OPTS: + if (optlen != sizeof(ro->raw_vcid_opts)) + return -EINVAL; + + if (copy_from_sockptr(&ro->raw_vcid_opts, optval, optlen)) + return -EFAULT; + + /* prepare 32 bit values for handling in hot path */ + ro->tx_vcid_shifted = ro->raw_vcid_opts.tx_vcid << CANXL_VCID_OFFSET; + ro->rx_vcid_shifted = ro->raw_vcid_opts.rx_vcid << CANXL_VCID_OFFSET; + ro->rx_vcid_mask_shifted = ro->raw_vcid_opts.rx_vcid_mask << CANXL_VCID_OFFSET; + break; + case CAN_RAW_JOIN_FILTERS: if (optlen != sizeof(ro->join_filters)) return -EINVAL; @@ -786,6 +822,21 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, val = &ro->xl_frames; break; + case CAN_RAW_XL_VCID_OPTS: + /* user space buffer to small for VCID opts? 
*/ + if (len < sizeof(ro->raw_vcid_opts)) { + /* return -ERANGE and needed space in optlen */ + err = -ERANGE; + if (put_user(sizeof(ro->raw_vcid_opts), optlen)) + err = -EFAULT; + } else { + if (len > sizeof(ro->raw_vcid_opts)) + len = sizeof(ro->raw_vcid_opts); + if (copy_to_user(optval, &ro->raw_vcid_opts, len)) + err = -EFAULT; + } + break; + case CAN_RAW_JOIN_FILTERS: if (len > sizeof(int)) len = sizeof(int); @@ -803,23 +854,41 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, return 0; } -static bool raw_bad_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu) +static void raw_put_canxl_vcid(struct raw_sock *ro, struct sk_buff *skb) +{ + struct canxl_frame *cxl = (struct canxl_frame *)skb->data; + + /* sanitize non CAN XL bits */ + cxl->prio &= (CANXL_PRIO_MASK | CANXL_VCID_MASK); + + /* clear VCID in CAN XL frame if pass through is disabled */ + if (!(ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_PASS)) + cxl->prio &= CANXL_PRIO_MASK; + + /* set VCID in CAN XL frame if enabled */ + if (ro->raw_vcid_opts.flags & CAN_RAW_XL_VCID_TX_SET) { + cxl->prio &= CANXL_PRIO_MASK; + cxl->prio |= ro->tx_vcid_shifted; + } +} + +static unsigned int raw_check_txframe(struct raw_sock *ro, struct sk_buff *skb, int mtu) { /* Classical CAN -> no checks for flags and device capabilities */ if (can_is_can_skb(skb)) - return false; + return CAN_MTU; /* CAN FD -> needs to be enabled and a CAN FD or CAN XL device */ if (ro->fd_frames && can_is_canfd_skb(skb) && (mtu == CANFD_MTU || can_is_canxl_dev_mtu(mtu))) - return false; + return CANFD_MTU; /* CAN XL -> needs to be enabled and a CAN XL device */ if (ro->xl_frames && can_is_canxl_skb(skb) && can_is_canxl_dev_mtu(mtu)) - return false; + return CANXL_MTU; - return true; + return 0; } static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) @@ -829,6 +898,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct sockcm_cookie sockc; struct sk_buff *skb; struct net_device *dev; + unsigned int txmtu; int ifindex; int err = -EINVAL; @@ -869,9 +939,16 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) goto free_skb; err = -EINVAL; - if (raw_bad_txframe(ro, skb, dev->mtu)) + + /* check for valid CAN (CC/FD/XL) frame content */ + txmtu = raw_check_txframe(ro, skb, dev->mtu); + if (!txmtu) goto free_skb; + /* only CANXL: clear/forward/set VCID value */ + if (txmtu == CANXL_MTU) + raw_put_canxl_vcid(ro, skb); + sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); diff --git a/net/core/dev.c b/net/core/dev.c index 73a021973007..cc9c2eda65ac 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -153,6 +153,8 @@ #include <linux/prandom.h> #include <linux/once_lite.h> #include <net/netdev_rx_queue.h> +#include <net/page_pool/types.h> +#include <net/page_pool/helpers.h> #include "dev.h" #include "net-sysfs.h" @@ -166,28 +168,6 @@ static int call_netdevice_notifiers_extack(unsigned long val, struct net_device *dev, struct netlink_ext_ack *extack); -/* - * The @dev_base_head list is protected by @dev_base_lock and the rtnl - * semaphore. - * - * Pure readers hold dev_base_lock for reading, or rcu_read_lock() - * - * Writers must hold the rtnl semaphore while they loop through the - * dev_base_head list, and hold dev_base_lock for writing when they do the - * actual updates. This allows pure readers to access the list even - * while a writer is preparing to update it. 
- * - * To put it another way, dev_base_lock is held for writing only to - * protect against pure readers; the rtnl semaphore provides the - * protection against other writers. - * - * See, for example usages, register_netdevice() and - * unregister_netdevice(), which must be called with the rtnl - * semaphore held. - */ -DEFINE_RWLOCK(dev_base_lock); -EXPORT_SYMBOL(dev_base_lock); - static DEFINE_MUTEX(ifalias_mutex); /* protects napi_hash addition/deletion and napi_gen_id */ @@ -341,13 +321,22 @@ int netdev_name_node_alt_create(struct net_device *dev, const char *name) return 0; } -static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +static void netdev_name_node_alt_free(struct rcu_head *head) { - list_del(&name_node->list); + struct netdev_name_node *name_node = + container_of(head, struct netdev_name_node, rcu); + kfree(name_node->name); netdev_name_node_free(name_node); } +static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node) +{ + netdev_name_node_del(name_node); + list_del(&name_node->list); + call_rcu(&name_node->rcu, netdev_name_node_alt_free); +} + int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) { struct netdev_name_node *name_node; @@ -362,10 +351,7 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name) if (name_node == dev->name_node || name_node->dev != dev) return -EINVAL; - netdev_name_node_del(name_node); - synchronize_rcu(); __netdev_name_node_alt_destroy(name_node); - return 0; } @@ -373,8 +359,10 @@ static void netdev_name_node_alt_flush(struct net_device *dev) { struct netdev_name_node *name_node, *tmp; - list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) - __netdev_name_node_alt_destroy(name_node); + list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list) { + list_del(&name_node->list); + netdev_name_node_alt_free(&name_node->rcu); + } } /* Device list insertion */ @@ -385,12 +373,10 @@ static void list_netdevice(struct net_device *dev) ASSERT_RTNL(); - write_lock(&dev_base_lock); list_add_tail_rcu(&dev->dev_list, &net->dev_base_head); netdev_name_node_add(net, dev->name_node); hlist_add_head_rcu(&dev->index_hlist, dev_index_hash(net, dev->ifindex)); - write_unlock(&dev_base_lock); netdev_for_each_altname(dev, name_node) netdev_name_node_add(net, name_node); @@ -404,7 +390,7 @@ static void list_netdevice(struct net_device *dev) /* Device list removal * caller must respect a RCU grace period before freeing/reusing dev */ -static void unlist_netdevice(struct net_device *dev, bool lock) +static void unlist_netdevice(struct net_device *dev) { struct netdev_name_node *name_node; struct net *net = dev_net(dev); @@ -417,13 +403,9 @@ static void unlist_netdevice(struct net_device *dev, bool lock) netdev_name_node_del(name_node); /* Unlink dev from the device chain */ - if (lock) - write_lock(&dev_base_lock); list_del_rcu(&dev->dev_list); netdev_name_node_del(dev->name_node); hlist_del_rcu(&dev->index_hlist); - if (lock) - write_unlock(&dev_base_lock); dev_base_seq_inc(dev_net(dev)); } @@ -442,6 +424,12 @@ static RAW_NOTIFIER_HEAD(netdev_chain); DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); +/* Page_pool has a lockless array/stack to alloc/recycle pages. + * PP consumers must pay attention to run APIs in the appropriate context + * (e.g. NAPI context). 
+ */ +static DEFINE_PER_CPU_ALIGNED(struct page_pool *, system_page_pool); + #ifdef CONFIG_LOCKDEP /* * register_netdevice() inits txq->_xmit_lock and sets lockdep class @@ -738,9 +726,9 @@ EXPORT_SYMBOL_GPL(dev_fill_forward_path); * @net: the applicable net namespace * @name: name to find * - * Find an interface by name. Must be called under RTNL semaphore - * or @dev_base_lock. If the name is found a pointer to the device - * is returned. If the name is not found then %NULL is returned. The + * Find an interface by name. Must be called under RTNL semaphore. + * If the name is found a pointer to the device is returned. + * If the name is not found then %NULL is returned. The * reference counters are not incremented so the caller must be * careful with locks. */ @@ -821,8 +809,7 @@ EXPORT_SYMBOL(netdev_get_by_name); * Search for an interface by index. Returns %NULL if the device * is not found or a pointer to the device. The device has not * had its reference counter increased so the caller must be careful - * about locking. The caller must hold either the RTNL semaphore - * or @dev_base_lock. + * about locking. The caller must hold the RTNL semaphore. */ struct net_device *__dev_get_by_index(struct net *net, int ifindex) @@ -1212,13 +1199,13 @@ int dev_change_name(struct net_device *dev, const char *newname) dev->flags & IFF_UP ? " (while UP)" : ""); old_assign_type = dev->name_assign_type; - dev->name_assign_type = NET_NAME_RENAMED; + WRITE_ONCE(dev->name_assign_type, NET_NAME_RENAMED); rollback: ret = device_rename(&dev->dev, dev->name); if (ret) { memcpy(dev->name, oldname, IFNAMSIZ); - dev->name_assign_type = old_assign_type; + WRITE_ONCE(dev->name_assign_type, old_assign_type); up_write(&devnet_rename_sem); return ret; } @@ -1227,15 +1214,11 @@ rollback: netdev_adjacent_rename_links(dev, oldname); - write_lock(&dev_base_lock); netdev_name_node_del(dev->name_node); - write_unlock(&dev_base_lock); - synchronize_rcu(); + synchronize_net(); - write_lock(&dev_base_lock); netdev_name_node_add(net, dev->name_node); - write_unlock(&dev_base_lock); ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev); ret = notifier_to_errno(ret); @@ -1247,7 +1230,7 @@ rollback: down_write(&devnet_rename_sem); memcpy(dev->name, oldname, IFNAMSIZ); memcpy(oldname, newname, IFNAMSIZ); - dev->name_assign_type = old_assign_type; + WRITE_ONCE(dev->name_assign_type, old_assign_type); old_assign_type = NET_NAME_RENAMED; goto rollback; } else { @@ -4858,6 +4841,12 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq); xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len, skb_headlen(skb) + mac_len, true); + if (skb_is_nonlinear(skb)) { + skb_shinfo(skb)->xdp_frags_size = skb->data_len; + xdp_buff_set_frags_flag(xdp); + } else { + xdp_buff_clear_frags_flag(xdp); + } orig_data_end = xdp->data_end; orig_data = xdp->data; @@ -4887,6 +4876,14 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, skb->len += off; /* positive on grow, negative on shrink */ } + /* XDP frag metadata (e.g. nr_frags) are updated in eBPF helpers + * (e.g. bpf_xdp_adjust_tail), we need to update data_len here. 
+ */ + if (xdp_buff_has_frags(xdp)) + skb->data_len = skb_shinfo(skb)->xdp_frags_size; + else + skb->data_len = 0; + /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || @@ -4920,11 +4917,35 @@ u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp, return act; } -static u32 netif_receive_generic_xdp(struct sk_buff *skb, +static int +netif_skb_check_for_xdp(struct sk_buff **pskb, struct bpf_prog *prog) +{ + struct sk_buff *skb = *pskb; + int err, hroom, troom; + + if (!skb_cow_data_for_xdp(this_cpu_read(system_page_pool), pskb, prog)) + return 0; + + /* In case we have to go down the path and also linearize, + * then lets do the pskb_expand_head() work just once here. + */ + hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); + troom = skb->tail + skb->data_len - skb->end; + err = pskb_expand_head(skb, + hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, + troom > 0 ? troom + 128 : 0, GFP_ATOMIC); + if (err) + return err; + + return skb_linearize(skb); +} + +static u32 netif_receive_generic_xdp(struct sk_buff **pskb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { - u32 act = XDP_DROP; + struct sk_buff *skb = *pskb; + u32 mac_len, act = XDP_DROP; /* Reinjected packets coming from act_mirred or similar should * not get XDP generic processing. @@ -4932,41 +4953,36 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, if (skb_is_redirected(skb)) return XDP_PASS; - /* XDP packets must be linear and must have sufficient headroom - * of XDP_PACKET_HEADROOM bytes. This is the guarantee that also - * native XDP provides, thus we need to do it here as well. + /* XDP packets must have sufficient headroom of XDP_PACKET_HEADROOM + * bytes. This is the guarantee that also native XDP provides, + * thus we need to do it here as well. */ + mac_len = skb->data - skb_mac_header(skb); + __skb_push(skb, mac_len); + if (skb_cloned(skb) || skb_is_nonlinear(skb) || skb_headroom(skb) < XDP_PACKET_HEADROOM) { - int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb); - int troom = skb->tail + skb->data_len - skb->end; - - /* In case we have to go down the path and also linearize, - * then lets do the pskb_expand_head() work just once here. - */ - if (pskb_expand_head(skb, - hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0, - troom > 0 ? 
troom + 128 : 0, GFP_ATOMIC)) - goto do_drop; - if (skb_linearize(skb)) + if (netif_skb_check_for_xdp(pskb, xdp_prog)) goto do_drop; } - act = bpf_prog_run_generic_xdp(skb, xdp, xdp_prog); + __skb_pull(*pskb, mac_len); + + act = bpf_prog_run_generic_xdp(*pskb, xdp, xdp_prog); switch (act) { case XDP_REDIRECT: case XDP_TX: case XDP_PASS: break; default: - bpf_warn_invalid_xdp_action(skb->dev, xdp_prog, act); + bpf_warn_invalid_xdp_action((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_ABORTED: - trace_xdp_exception(skb->dev, xdp_prog, act); + trace_xdp_exception((*pskb)->dev, xdp_prog, act); fallthrough; case XDP_DROP: do_drop: - kfree_skb(skb); + kfree_skb(*pskb); break; } @@ -5004,24 +5020,24 @@ void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key); -int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) +int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff **pskb) { if (xdp_prog) { struct xdp_buff xdp; u32 act; int err; - act = netif_receive_generic_xdp(skb, &xdp, xdp_prog); + act = netif_receive_generic_xdp(pskb, &xdp, xdp_prog); if (act != XDP_PASS) { switch (act) { case XDP_REDIRECT: - err = xdp_do_generic_redirect(skb->dev, skb, + err = xdp_do_generic_redirect((*pskb)->dev, *pskb, &xdp, xdp_prog); if (err) goto out_redir; break; case XDP_TX: - generic_xdp_tx(skb, xdp_prog); + generic_xdp_tx(*pskb, xdp_prog); break; } return XDP_DROP; @@ -5029,7 +5045,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb) } return XDP_PASS; out_redir: - kfree_skb_reason(skb, SKB_DROP_REASON_XDP); + kfree_skb_reason(*pskb, SKB_DROP_REASON_XDP); return XDP_DROP; } EXPORT_SYMBOL_GPL(do_xdp_generic); @@ -5352,7 +5368,8 @@ another_round: int ret2; migrate_disable(); - ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb); + ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), + &skb); migrate_enable(); if (ret2 != XDP_PASS) { @@ -6177,8 +6194,13 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) clear_bit(NAPI_STATE_SCHED, &napi->state); } -static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll, - u16 budget) +enum { + NAPI_F_PREFER_BUSY_POLL = 1, + NAPI_F_END_ON_RESCHED = 2, +}; + +static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, + unsigned flags, u16 budget) { bool skip_schedule = false; unsigned long timeout; @@ -6198,7 +6220,7 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool local_bh_disable(); - if (prefer_busy_poll) { + if (flags & NAPI_F_PREFER_BUSY_POLL) { napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs); timeout = READ_ONCE(napi->dev->gro_flush_timeout); if (napi->defer_hard_irqs_count && timeout) { @@ -6222,23 +6244,23 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool local_bh_enable(); } -void napi_busy_loop(unsigned int napi_id, - bool (*loop_end)(void *, unsigned long), - void *loop_end_arg, bool prefer_busy_poll, u16 budget) +static void __napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, unsigned flags, u16 budget) { unsigned long start_time = loop_end ? 
busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); void *have_poll_lock = NULL; struct napi_struct *napi; + WARN_ON_ONCE(!rcu_read_lock_held()); + restart: napi_poll = NULL; - rcu_read_lock(); - napi = napi_by_id(napi_id); if (!napi) - goto out; + return; if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_disable(); @@ -6254,14 +6276,14 @@ restart: */ if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED | NAPIF_STATE_IN_BUSY_POLL)) { - if (prefer_busy_poll) + if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } if (cmpxchg(&napi->state, val, val | NAPIF_STATE_IN_BUSY_POLL | NAPIF_STATE_SCHED) != val) { - if (prefer_busy_poll) + if (flags & NAPI_F_PREFER_BUSY_POLL) set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state); goto count; } @@ -6281,12 +6303,15 @@ count: break; if (unlikely(need_resched())) { + if (flags & NAPI_F_END_ON_RESCHED) + break; if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); + busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); rcu_read_unlock(); cond_resched(); + rcu_read_lock(); if (loop_end(loop_end_arg, start_time)) return; goto restart; @@ -6294,10 +6319,31 @@ count: cpu_relax(); } if (napi_poll) - busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget); + busy_poll_stop(napi, have_poll_lock, flags, budget); if (!IS_ENABLED(CONFIG_PREEMPT_RT)) preempt_enable(); -out: +} + +void napi_busy_loop_rcu(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) +{ + unsigned flags = NAPI_F_END_ON_RESCHED; + + if (prefer_busy_poll) + flags |= NAPI_F_PREFER_BUSY_POLL; + + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); +} + +void napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg, bool prefer_busy_poll, u16 budget) +{ + unsigned flags = prefer_busy_poll ? NAPI_F_PREFER_BUSY_POLL : 0; + + rcu_read_lock(); + __napi_busy_loop(napi_id, loop_end, loop_end_arg, flags, budget); rcu_read_unlock(); } EXPORT_SYMBOL(napi_busy_loop); @@ -8914,7 +8960,7 @@ int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa, } EXPORT_SYMBOL(dev_set_mac_address); -static DECLARE_RWSEM(dev_addr_sem); +DECLARE_RWSEM(dev_addr_sem); int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa, struct netlink_ext_ack *extack) @@ -9690,11 +9736,11 @@ static void dev_index_release(struct net *net, int ifindex) /* Delayed registration/unregisteration */ LIST_HEAD(net_todo_list); DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq); +atomic_t dev_unreg_count = ATOMIC_INIT(0); static void net_set_todo(struct net_device *dev) { list_add_tail(&dev->todo_list, &net_todo_list); - atomic_inc(&dev_net(dev)->dev_unreg_count); } static netdev_features_t netdev_sync_upper_features(struct net_device *lower, @@ -10259,9 +10305,9 @@ int register_netdevice(struct net_device *dev) goto err_ifindex_release; ret = netdev_register_kobject(dev); - write_lock(&dev_base_lock); - dev->reg_state = ret ? NETREG_UNREGISTERED : NETREG_REGISTERED; - write_unlock(&dev_base_lock); + + WRITE_ONCE(dev->reg_state, ret ? NETREG_UNREGISTERED : NETREG_REGISTERED); + if (ret) goto err_uninit_notify; @@ -10337,7 +10383,7 @@ EXPORT_SYMBOL(register_netdevice); * that need to tie several hardware interfaces to a single NAPI * poll scheduler due to HW limitations. 
*/ -int init_dummy_netdev(struct net_device *dev) +void init_dummy_netdev(struct net_device *dev) { /* Clear everything. Note we don't initialize spinlocks * are they aren't supposed to be taken by any of the @@ -10365,8 +10411,6 @@ int init_dummy_netdev(struct net_device *dev) * because users of this 'device' dont need to change * its refcount. */ - - return 0; } EXPORT_SYMBOL_GPL(init_dummy_netdev); @@ -10521,6 +10565,7 @@ void netdev_run_todo(void) { struct net_device *dev, *tmp; struct list_head list; + int cnt; #ifdef CONFIG_LOCKDEP struct list_head unlink_list; @@ -10551,12 +10596,11 @@ void netdev_run_todo(void) continue; } - write_lock(&dev_base_lock); - dev->reg_state = NETREG_UNREGISTERED; - write_unlock(&dev_base_lock); + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERED); linkwatch_sync_dev(dev); } + cnt = 0; while (!list_empty(&list)) { dev = netdev_wait_allrefs_any(&list); list_del(&dev->todo_list); @@ -10574,12 +10618,13 @@ void netdev_run_todo(void) if (dev->needs_free_netdev) free_netdev(dev); - if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count)) - wake_up(&netdev_unregistering_wq); + cnt++; /* Free network device */ kobject_put(&dev->dev.kobj); } + if (cnt && atomic_sub_and_test(cnt, &dev_unreg_count)) + wake_up(&netdev_unregistering_wq); } /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has @@ -10970,7 +11015,7 @@ void free_netdev(struct net_device *dev) } BUG_ON(dev->reg_state != NETREG_UNREGISTERED); - dev->reg_state = NETREG_RELEASED; + WRITE_ONCE(dev->reg_state, NETREG_RELEASED); /* will free via device release */ put_device(&dev->dev); @@ -11026,6 +11071,7 @@ void unregister_netdevice_many_notify(struct list_head *head, { struct net_device *dev, *tmp; LIST_HEAD(close_head); + int cnt = 0; BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -11057,10 +11103,8 @@ void unregister_netdevice_many_notify(struct list_head *head, list_for_each_entry(dev, head, unreg_list) { /* And unlink it from device chain. 
*/ - write_lock(&dev_base_lock); - unlist_netdevice(dev, false); - dev->reg_state = NETREG_UNREGISTERING; - write_unlock(&dev_base_lock); + unlist_netdevice(dev); + WRITE_ONCE(dev->reg_state, NETREG_UNREGISTERING); } flush_all_backlogs(); @@ -11122,7 +11166,9 @@ void unregister_netdevice_many_notify(struct list_head *head, list_for_each_entry(dev, head, unreg_list) { netdev_put(dev, &dev->dev_registered_tracker); net_set_todo(dev); + cnt++; } + atomic_add(cnt, &dev_unreg_count); list_del(head); } @@ -11240,7 +11286,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, dev_close(dev); /* And unlink it from device chain */ - unlist_netdevice(dev, true); + unlist_netdevice(dev); synchronize_net(); @@ -11576,11 +11622,8 @@ static void __net_exit default_device_exit_net(struct net *net) snprintf(fb_name, IFNAMSIZ, "dev%%d"); netdev_for_each_altname_safe(dev, name_node, tmp) - if (netdev_name_in_use(&init_net, name_node->name)) { - netdev_name_node_del(name_node); - synchronize_rcu(); + if (netdev_name_in_use(&init_net, name_node->name)) __netdev_name_node_alt_destroy(name_node); - } err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { @@ -11687,6 +11730,27 @@ static void __init net_dev_struct_check(void) * */ +/* We allocate 256 pages for each CPU if PAGE_SHIFT is 12 */ +#define SYSTEM_PERCPU_PAGE_POOL_SIZE ((1 << 20) / PAGE_SIZE) + +static int net_page_pool_create(int cpuid) +{ +#if IS_ENABLED(CONFIG_PAGE_POOL) + struct page_pool_params page_pool_params = { + .pool_size = SYSTEM_PERCPU_PAGE_POOL_SIZE, + .nid = NUMA_NO_NODE, + }; + struct page_pool *pp_ptr; + + pp_ptr = page_pool_create_percpu(&page_pool_params, cpuid); + if (IS_ERR(pp_ptr)) + return -ENOMEM; + + per_cpu(system_page_pool, cpuid) = pp_ptr; +#endif + return 0; +} + /* * This is called single threaded during boot, so no need * to take the rtnl semaphore. 
@@ -11739,6 +11803,9 @@ static int __init net_dev_init(void) init_gro_hash(&sd->backlog); sd->backlog.poll = process_backlog; sd->backlog.weight = weight_p; + + if (net_page_pool_create(i)) + goto out; } dev_boot_phase = 0; @@ -11766,6 +11833,19 @@ static int __init net_dev_init(void) WARN_ON(rc < 0); rc = 0; out: + if (rc < 0) { + for_each_possible_cpu(i) { + struct page_pool *pp_ptr; + + pp_ptr = per_cpu(system_page_pool, i); + if (!pp_ptr) + continue; + + page_pool_destroy(pp_ptr); + per_cpu(system_page_pool, i) = NULL; + } + } + return rc; } diff --git a/net/core/dev.h b/net/core/dev.h index 7480b4c84298..45892267848d 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -3,6 +3,7 @@ #define _NET_CORE_DEV_H #include <linux/types.h> +#include <linux/rwsem.h> struct net; struct net_device; @@ -46,6 +47,8 @@ extern int weight_p; extern int dev_weight_rx_bias; extern int dev_weight_tx_bias; +extern struct rw_semaphore dev_addr_sem; + /* rtnl helpers */ extern struct list_head net_todo_list; void netdev_run_todo(void); @@ -56,6 +59,7 @@ struct netdev_name_node { struct list_head list; struct net_device *dev; const char *name; + struct rcu_head rcu; }; int netdev_get_name(struct net *net, char *name, int ifindex); diff --git a/net/core/dst.c b/net/core/dst.c index 6838d3212c37..95f533844f17 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -96,7 +96,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, } EXPORT_SYMBOL(dst_alloc); -struct dst_entry *dst_destroy(struct dst_entry * dst) +static void dst_destroy(struct dst_entry *dst) { struct dst_entry *child = NULL; @@ -126,15 +126,13 @@ struct dst_entry *dst_destroy(struct dst_entry * dst) dst = child; if (dst) dst_release_immediate(dst); - return NULL; } -EXPORT_SYMBOL(dst_destroy); static void dst_destroy_rcu(struct rcu_head *head) { struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); - dst = dst_destroy(dst); + dst_destroy(dst); } /* Operations to mark dst as DEAD and clean up the net device referenced diff --git a/net/core/filter.c b/net/core/filter.c index ef3e78b6a39c..358870408a51 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -88,7 +88,7 @@ #include "dev.h" static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id); +bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog); int copy_bpf_fprog_from_user(struct sock_fprog *dst, sockptr_t src, int len) { @@ -778,7 +778,7 @@ jmp_rest: BPF_EMIT_JMP; break; - /* ldxb 4 * ([14] & 0xf) is remaped into 6 insns. */ + /* ldxb 4 * ([14] & 0xf) is remapped into 6 insns. */ case BPF_LDX | BPF_MSH | BPF_B: { struct sock_filter tmp = { .code = BPF_LD | BPF_ABS | BPF_B, @@ -804,7 +804,7 @@ jmp_rest: *insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP); break; } - /* RET_K is remaped into 2 insns. RET_A case doesn't need an + /* RET_K is remapped into 2 insns. RET_A case doesn't need an * extra mov as BPF_REG_0 is already mapped into BPF_REG_A. */ case BPF_RET | BPF_A: @@ -2968,7 +2968,7 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, * * Then if B is non-zero AND there is no space allocate space and * compact A, B regions into page. If there is space shift ring to - * the rigth free'ing the next element in ring to place B, leaving + * the right free'ing the next element in ring to place B, leaving * A untouched except to reduce length. 
*/ if (start != offset) { @@ -7894,7 +7894,7 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -7987,7 +7987,7 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return NULL; } default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8006,7 +8006,7 @@ sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_skb_event_output_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8193,7 +8193,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8252,13 +8252,13 @@ xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #endif #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } #if IS_MODULE(CONFIG_NF_CONNTRACK) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES) /* The nf_conn___init type is used in the NF_CONNTRACK kfuncs. The * kfuncs are defined in two different modules, and we want to be able - * to use them interchangably with the same BTF type ID. Because modules + * to use them interchangeably with the same BTF type ID. Because modules * can't de-duplicate BTF IDs between each other, we need the type to be * referenced in the vmlinux BTF or the verifier will get confused about * the different types. So we add this dummy type reference which will @@ -8313,7 +8313,7 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_tcp_sock_proto; #endif /* CONFIG_INET */ default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8355,7 +8355,7 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_cgroup_classid_curr_proto; #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8399,7 +8399,7 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_lookup_tcp_proto; #endif default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8410,7 +8410,7 @@ flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_load_bytes: return &bpf_flow_dissector_load_bytes_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8437,7 +8437,7 @@ lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -8612,7 +8612,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!bpf_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; } @@ -8624,7 +8624,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if 
(!bpf_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_BPF)) return false; break; default: @@ -11268,7 +11268,7 @@ sk_reuseport_func_proto(enum bpf_func_id func_id, case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -11450,7 +11450,7 @@ sk_lookup_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_sk_release: return &bpf_sk_release_proto; default: - return bpf_sk_base_func_proto(func_id); + return bpf_sk_base_func_proto(func_id, prog); } } @@ -11784,7 +11784,7 @@ const struct bpf_func_proto bpf_sock_from_file_proto = { }; static const struct bpf_func_proto * -bpf_sk_base_func_proto(enum bpf_func_id func_id) +bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { const struct bpf_func_proto *func; @@ -11813,10 +11813,10 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } - if (!perfmon_capable()) + if (!bpf_token_capable(prog->aux->token, CAP_PERFMON)) return NULL; return func; @@ -11869,6 +11869,103 @@ __bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, return 0; } + +__bpf_kfunc int bpf_sk_assign_tcp_reqsk(struct sk_buff *skb, struct sock *sk, + struct bpf_tcp_req_attrs *attrs, int attrs__sz) +{ +#if IS_ENABLED(CONFIG_SYN_COOKIES) + const struct request_sock_ops *ops; + struct inet_request_sock *ireq; + struct tcp_request_sock *treq; + struct request_sock *req; + struct net *net; + __u16 min_mss; + u32 tsoff = 0; + + if (attrs__sz != sizeof(*attrs) || + attrs->reserved[0] || attrs->reserved[1] || attrs->reserved[2]) + return -EINVAL; + + if (!skb_at_tc_ingress(skb)) + return -EINVAL; + + net = dev_net(skb->dev); + if (net != sock_net(sk)) + return -ENETUNREACH; + + switch (skb->protocol) { + case htons(ETH_P_IP): + ops = &tcp_request_sock_ops; + min_mss = 536; + break; +#if IS_BUILTIN(CONFIG_IPV6) + case htons(ETH_P_IPV6): + ops = &tcp6_request_sock_ops; + min_mss = IPV6_MIN_MTU - 60; + break; +#endif + default: + return -EINVAL; + } + + if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_LISTEN || + sk_is_mptcp(sk)) + return -EINVAL; + + if (attrs->mss < min_mss) + return -EINVAL; + + if (attrs->wscale_ok) { + if (!READ_ONCE(net->ipv4.sysctl_tcp_window_scaling)) + return -EINVAL; + + if (attrs->snd_wscale > TCP_MAX_WSCALE || + attrs->rcv_wscale > TCP_MAX_WSCALE) + return -EINVAL; + } + + if (attrs->sack_ok && !READ_ONCE(net->ipv4.sysctl_tcp_sack)) + return -EINVAL; + + if (attrs->tstamp_ok) { + if (!READ_ONCE(net->ipv4.sysctl_tcp_timestamps)) + return -EINVAL; + + tsoff = attrs->rcv_tsecr - tcp_ns_to_ts(attrs->usec_ts_ok, tcp_clock_ns()); + } + + req = inet_reqsk_alloc(ops, sk, false); + if (!req) + return -ENOMEM; + + ireq = inet_rsk(req); + treq = tcp_rsk(req); + + req->rsk_listener = sk; + req->syncookie = 1; + req->mss = attrs->mss; + req->ts_recent = attrs->rcv_tsval; + + ireq->snd_wscale = attrs->snd_wscale; + ireq->rcv_wscale = attrs->rcv_wscale; + ireq->tstamp_ok = !!attrs->tstamp_ok; + ireq->sack_ok = !!attrs->sack_ok; + ireq->wscale_ok = !!attrs->wscale_ok; + ireq->ecn_ok = !!attrs->ecn_ok; + + treq->req_usec_ts = !!attrs->usec_ts_ok; + treq->ts_off = tsoff; + + skb_orphan(skb); + skb->sk = req_to_sk(req); + skb->destructor = sock_pfree; + + return 0; +#else + return -EOPNOTSUPP; +#endif +} + 
__bpf_kfunc_end_defs(); int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, @@ -11897,6 +11994,10 @@ BTF_SET8_START(bpf_kfunc_check_set_sock_addr) BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) BTF_SET8_END(bpf_kfunc_check_set_sock_addr) +BTF_SET8_START(bpf_kfunc_check_set_tcp_reqsk) +BTF_ID_FLAGS(func, bpf_sk_assign_tcp_reqsk, KF_TRUSTED_ARGS) +BTF_SET8_END(bpf_kfunc_check_set_tcp_reqsk) + static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { .owner = THIS_MODULE, .set = &bpf_kfunc_check_set_skb, @@ -11912,6 +12013,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = { .set = &bpf_kfunc_check_set_sock_addr, }; +static const struct btf_kfunc_id_set bpf_kfunc_set_tcp_reqsk = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_tcp_reqsk, +}; + static int __init bpf_kfunc_init(void) { int ret; @@ -11927,8 +12033,9 @@ static int __init bpf_kfunc_init(void) ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); - return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, - &bpf_kfunc_set_sock_addr); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + &bpf_kfunc_set_sock_addr); + return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_tcp_reqsk); } late_initcall(bpf_kfunc_init); diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 429571c258da..8ec35194bfcb 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -33,7 +33,7 @@ static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); static LIST_HEAD(lweventlist); static DEFINE_SPINLOCK(lweventlist_lock); -static unsigned char default_operstate(const struct net_device *dev) +static unsigned int default_operstate(const struct net_device *dev) { if (netif_testing(dev)) return IF_OPER_TESTING; @@ -62,16 +62,13 @@ static unsigned char default_operstate(const struct net_device *dev) return IF_OPER_UP; } - static void rfc2863_policy(struct net_device *dev) { - unsigned char operstate = default_operstate(dev); + unsigned int operstate = default_operstate(dev); - if (operstate == dev->operstate) + if (operstate == READ_ONCE(dev->operstate)) return; - write_lock(&dev_base_lock); - switch(dev->link_mode) { case IF_LINK_MODE_TESTING: if (operstate == IF_OPER_UP) @@ -87,9 +84,7 @@ static void rfc2863_policy(struct net_device *dev) break; } - dev->operstate = operstate; - - write_unlock(&dev_base_lock); + WRITE_ONCE(dev->operstate, operstate); } diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 09f7ed1a04e8..2e4e96d30ee1 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -6,49 +6,18 @@ #include "dev.h" -#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1) - -#define get_bucket(x) ((x) >> BUCKET_SPACE) -#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) -#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) - -static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) +static void *dev_seq_from_index(struct seq_file *seq, loff_t *pos) { - struct net *net = seq_file_net(seq); + unsigned long ifindex = *pos; struct net_device *dev; - struct hlist_head *h; - unsigned int count = 0, offset = get_offset(*pos); - h = &net->dev_index_head[get_bucket(*pos)]; - hlist_for_each_entry_rcu(dev, h, index_hlist) { - if (++count == offset) - return dev; + for_each_netdev_dump(seq_file_net(seq), 
dev, ifindex) { + *pos = dev->ifindex; + return dev; } - - return NULL; -} - -static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos) -{ - struct net_device *dev; - unsigned int bucket; - - do { - dev = dev_from_same_bucket(seq, pos); - if (dev) - return dev; - - bucket = get_bucket(*pos) + 1; - *pos = set_bucket_offset(bucket, 1); - } while (bucket < NETDEV_HASHENTRIES); - return NULL; } -/* - * This is invoked by the /proc filesystem handler to display a device - * in detail. - */ static void *dev_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { @@ -56,16 +25,13 @@ static void *dev_seq_start(struct seq_file *seq, loff_t *pos) if (!*pos) return SEQ_START_TOKEN; - if (get_bucket(*pos) >= NETDEV_HASHENTRIES) - return NULL; - - return dev_from_bucket(seq, pos); + return dev_seq_from_index(seq, pos); } static void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return dev_from_bucket(seq, pos); + return dev_seq_from_index(seq, pos); } static void dev_seq_stop(struct seq_file *seq, void *v) diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index a09d507c5b03..946caefdd959 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -34,10 +34,10 @@ static const char fmt_dec[] = "%d\n"; static const char fmt_ulong[] = "%lu\n"; static const char fmt_u64[] = "%llu\n"; -/* Caller holds RTNL or dev_base_lock */ +/* Caller holds RTNL or RCU */ static inline int dev_isalive(const struct net_device *dev) { - return dev->reg_state <= NETREG_REGISTERED; + return READ_ONCE(dev->reg_state) <= NETREG_REGISTERED; } /* use same locking rules as GIF* ioctl's */ @@ -48,10 +48,10 @@ static ssize_t netdev_show(const struct device *dev, struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - read_lock(&dev_base_lock); + rcu_read_lock(); if (dev_isalive(ndev)) ret = (*format)(ndev, buf); - read_unlock(&dev_base_lock); + rcu_read_unlock(); return ret; } @@ -60,7 +60,7 @@ static ssize_t netdev_show(const struct device *dev, #define NETDEVICE_SHOW(field, format_string) \ static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ - return sysfs_emit(buf, format_string, dev->field); \ + return sysfs_emit(buf, format_string, READ_ONCE(dev->field)); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -125,7 +125,7 @@ static DEVICE_ATTR_RO(iflink); static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { - return sysfs_emit(buf, fmt_dec, dev->name_assign_type); + return sysfs_emit(buf, fmt_dec, READ_ONCE(dev->name_assign_type)); } static ssize_t name_assign_type_show(struct device *dev, @@ -135,24 +135,28 @@ static ssize_t name_assign_type_show(struct device *dev, struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - if (ndev->name_assign_type != NET_NAME_UNKNOWN) + if (READ_ONCE(ndev->name_assign_type) != NET_NAME_UNKNOWN) ret = netdev_show(dev, attr, buf, format_name_assign_type); return ret; } static DEVICE_ATTR_RO(name_assign_type); -/* use same locking rules as GIFHWADDR ioctl's */ +/* use same locking rules as GIFHWADDR ioctl's (dev_get_mac_address()) */ static ssize_t address_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); ssize_t ret = -EINVAL; - read_lock(&dev_base_lock); + down_read(&dev_addr_sem); + + rcu_read_lock(); if (dev_isalive(ndev)) ret = sysfs_format_mac(buf, ndev->dev_addr, ndev->addr_len); - read_unlock(&dev_base_lock); + rcu_read_unlock(); + + 
up_read(&dev_addr_sem); return ret; } static DEVICE_ATTR_RO(address); @@ -161,10 +165,13 @@ static ssize_t broadcast_show(struct device *dev, struct device_attribute *attr, char *buf) { struct net_device *ndev = to_net_dev(dev); + int ret = -EINVAL; + rcu_read_lock(); if (dev_isalive(ndev)) - return sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); - return -EINVAL; + ret = sysfs_format_mac(buf, ndev->broadcast, ndev->addr_len); + rcu_read_unlock(); + return ret; } static DEVICE_ATTR_RO(broadcast); @@ -318,11 +325,9 @@ static ssize_t operstate_show(struct device *dev, const struct net_device *netdev = to_net_dev(dev); unsigned char operstate; - read_lock(&dev_base_lock); - operstate = netdev->operstate; + operstate = READ_ONCE(netdev->operstate); if (!netif_running(netdev)) operstate = IF_OPER_DOWN; - read_unlock(&dev_base_lock); if (operstate >= ARRAY_SIZE(operstates)) return -EINVAL; /* should not happen */ @@ -680,14 +685,14 @@ static ssize_t netstat_show(const struct device *d, WARN_ON(offset > sizeof(struct rtnl_link_stats64) || offset % sizeof(u64) != 0); - read_lock(&dev_base_lock); + rcu_read_lock(); if (dev_isalive(dev)) { struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); } - read_unlock(&dev_base_lock); + rcu_read_unlock(); return ret; } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 72799533426b..f0540c557515 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -318,8 +318,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) { /* Must be called with pernet_ops_rwsem held */ const struct pernet_operations *ops, *saved_ops; - int error = 0; LIST_HEAD(net_exit_list); + LIST_HEAD(dev_kill_list); + int error = 0; refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128, "net refcnt"); @@ -358,6 +359,15 @@ out_undo: synchronize_rcu(); ops = saved_ops; + rtnl_lock(); + list_for_each_entry_continue_reverse(ops, &pernet_list, list) { + if (ops->exit_batch_rtnl) + ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); + } + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); + + ops = saved_ops; list_for_each_entry_continue_reverse(ops, &pernet_list, list) ops_exit_list(ops, &net_exit_list); @@ -573,6 +583,7 @@ static void cleanup_net(struct work_struct *work) struct net *net, *tmp, *last; struct llist_node *net_kill_list; LIST_HEAD(net_exit_list); + LIST_HEAD(dev_kill_list); /* Atomically snapshot the list of namespaces to cleanup */ net_kill_list = llist_del_all(&cleanup_list); @@ -611,7 +622,15 @@ static void cleanup_net(struct work_struct *work) * the rcu_barrier() below isn't sufficient alone. * Also the pre_exit() and exit() methods need this barrier. 
*/ - synchronize_rcu(); + synchronize_rcu_expedited(); + + rtnl_lock(); + list_for_each_entry_reverse(ops, &pernet_list, list) { + if (ops->exit_batch_rtnl) + ops->exit_batch_rtnl(&net_exit_list, &dev_kill_list); + } + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); /* Run all of the network namespace exit methods */ list_for_each_entry_reverse(ops, &pernet_list, list) @@ -1193,7 +1212,17 @@ static void free_exit_list(struct pernet_operations *ops, struct list_head *net_ { ops_pre_exit_list(ops, net_exit_list); synchronize_rcu(); + + if (ops->exit_batch_rtnl) { + LIST_HEAD(dev_kill_list); + + rtnl_lock(); + ops->exit_batch_rtnl(net_exit_list, &dev_kill_list); + unregister_netdevice_many(&dev_kill_list); + rtnl_unlock(); + } ops_exit_list(ops, net_exit_list); + ops_free_list(ops, net_exit_list); } diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 4933762e5a6b..89c835fcf094 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -171,13 +171,16 @@ static void page_pool_producer_unlock(struct page_pool *pool, } static int page_pool_init(struct page_pool *pool, - const struct page_pool_params *params) + const struct page_pool_params *params, + int cpuid) { unsigned int ring_qsize = 1024; /* Default */ memcpy(&pool->p, ¶ms->fast, sizeof(pool->p)); memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); + pool->cpuid = cpuid; + /* Validate only known flags were used */ if (pool->p.flags & ~(PP_FLAG_ALL)) return -EINVAL; @@ -253,10 +256,12 @@ static void page_pool_uninit(struct page_pool *pool) } /** - * page_pool_create() - create a page pool. + * page_pool_create_percpu() - create a page pool for a given cpu. * @params: parameters, see struct page_pool_params + * @cpuid: cpu identifier */ -struct page_pool *page_pool_create(const struct page_pool_params *params) +struct page_pool * +page_pool_create_percpu(const struct page_pool_params *params, int cpuid) { struct page_pool *pool; int err; @@ -265,7 +270,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) if (!pool) return ERR_PTR(-ENOMEM); - err = page_pool_init(pool, params); + err = page_pool_init(pool, params, cpuid); if (err < 0) goto err_free; @@ -282,6 +287,16 @@ err_free: kfree(pool); return ERR_PTR(err); } +EXPORT_SYMBOL(page_pool_create_percpu); + +/** + * page_pool_create() - create a page pool + * @params: parameters, see struct page_pool_params + */ +struct page_pool *page_pool_create(const struct page_pool_params *params) +{ + return page_pool_create_percpu(params, -1); +} EXPORT_SYMBOL(page_pool_create); static void page_pool_return_page(struct page_pool *pool, struct page *page); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9c4f427f3a50..c54dbe05c4c5 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -483,24 +483,15 @@ EXPORT_SYMBOL_GPL(__rtnl_link_unregister); */ static void rtnl_lock_unregistering_all(void) { - struct net *net; - bool unregistering; DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { - unregistering = false; rtnl_lock(); /* We held write locked pernet_ops_rwsem, and parallel * setup_net() and cleanup_net() are not possible. 
*/ - for_each_net(net) { - if (atomic_read(&net->dev_unreg_count) > 0) { - unregistering = true; - break; - } - } - if (!unregistering) + if (!atomic_read(&dev_unreg_count)) break; __rtnl_unlock(); @@ -851,9 +842,22 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, } EXPORT_SYMBOL_GPL(rtnl_put_cacheinfo); +void netdev_set_operstate(struct net_device *dev, int newstate) +{ + unsigned int old = READ_ONCE(dev->operstate); + + do { + if (old == newstate) + return; + } while (!try_cmpxchg(&dev->operstate, &old, newstate)); + + netdev_state_change(dev); +} +EXPORT_SYMBOL(netdev_set_operstate); + static void set_operstate(struct net_device *dev, unsigned char transition) { - unsigned char operstate = dev->operstate; + unsigned char operstate = READ_ONCE(dev->operstate); switch (transition) { case IF_OPER_UP: @@ -875,12 +879,7 @@ static void set_operstate(struct net_device *dev, unsigned char transition) break; } - if (dev->operstate != operstate) { - write_lock(&dev_base_lock); - dev->operstate = operstate; - write_unlock(&dev_base_lock); - netdev_state_change(dev); - } + netdev_set_operstate(dev, operstate); } static unsigned int rtnl_dev_get_flags(const struct net_device *dev) @@ -2200,25 +2199,22 @@ static int rtnl_valid_dump_ifinfo_req(const struct nlmsghdr *nlh, static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) { + const struct rtnl_link_ops *kind_ops = NULL; struct netlink_ext_ack *extack = cb->extack; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); - struct net *tgt_net = net; - int h, s_h; - int idx = 0, s_idx; - struct net_device *dev; - struct hlist_head *head; + unsigned int flags = NLM_F_MULTI; struct nlattr *tb[IFLA_MAX+1]; + struct { + unsigned long ifindex; + } *ctx = (void *)cb->ctx; + struct net *tgt_net = net; u32 ext_filter_mask = 0; - const struct rtnl_link_ops *kind_ops = NULL; - unsigned int flags = NLM_F_MULTI; + struct net_device *dev; int master_idx = 0; int netnsid = -1; int err, i; - s_h = cb->args[0]; - s_idx = cb->args[1]; - err = rtnl_valid_dump_ifinfo_req(nlh, cb->strict_check, tb, extack); if (err < 0) { if (cb->strict_check) @@ -2262,36 +2258,21 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) flags |= NLM_F_DUMP_FILTERED; walk_entries: - for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { - idx = 0; - head = &tgt_net->dev_index_head[h]; - hlist_for_each_entry(dev, head, index_hlist) { - if (link_dump_filtered(dev, master_idx, kind_ops)) - goto cont; - if (idx < s_idx) - goto cont; - err = rtnl_fill_ifinfo(skb, dev, net, - RTM_NEWLINK, - NETLINK_CB(cb->skb).portid, - nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0, NULL, 0, - netnsid, GFP_KERNEL); - - if (err < 0) { - if (likely(skb->len)) - goto out; - - goto out_err; - } -cont: - idx++; + err = 0; + for_each_netdev_dump(tgt_net, dev, ctx->ifindex) { + if (link_dump_filtered(dev, master_idx, kind_ops)) + continue; + err = rtnl_fill_ifinfo(skb, dev, net, RTM_NEWLINK, + NETLINK_CB(cb->skb).portid, + nlh->nlmsg_seq, 0, flags, + ext_filter_mask, 0, NULL, 0, + netnsid, GFP_KERNEL); + if (err < 0) { + if (likely(skb->len)) + err = skb->len; + break; } } -out: - err = skb->len; -out_err: - cb->args[1] = idx; - cb->args[0] = h; cb->seq = tgt_net->dev_base_seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); if (netnsid >= 0) @@ -2983,11 +2964,9 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_LINKMODE]) { unsigned char value = nla_get_u8(tb[IFLA_LINKMODE]); - write_lock(&dev_base_lock); if 
(dev->link_mode ^ value) status |= DO_SETLINK_NOTIFY; - dev->link_mode = value; - write_unlock(&dev_base_lock); + WRITE_ONCE(dev->link_mode, value); } if (tb[IFLA_VFINFO_LIST]) { diff --git a/net/core/scm.c b/net/core/scm.c index d0e0852a24d5..9cd4b0a01cd6 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -36,6 +36,7 @@ #include <net/compat.h> #include <net/scm.h> #include <net/cls_cgroup.h> +#include <net/af_unix.h> /* @@ -85,6 +86,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) return -ENOMEM; *fplp = fpl; fpl->count = 0; + fpl->count_unix = 0; fpl->max = SCM_MAX_FD; fpl->user = NULL; } @@ -109,6 +111,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fput(file); return -EINVAL; } + if (unix_get_socket(file)) + fpl->count_unix++; + *fpp++ = file; fpl->count++; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index edbbef563d4d..0d9a489e6ae1 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -895,6 +895,98 @@ static bool is_pp_page(struct page *page) return (page->pp_magic & ~0x3UL) == PP_SIGNATURE; } +int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb, + unsigned int headroom) +{ +#if IS_ENABLED(CONFIG_PAGE_POOL) + u32 size, truesize, len, max_head_size, off; + struct sk_buff *skb = *pskb, *nskb; + int err, i, head_off; + void *data; + + /* XDP does not support fraglist so we need to linearize + * the skb. + */ + if (skb_has_frag_list(skb)) + return -EOPNOTSUPP; + + max_head_size = SKB_WITH_OVERHEAD(PAGE_SIZE - headroom); + if (skb->len > max_head_size + MAX_SKB_FRAGS * PAGE_SIZE) + return -ENOMEM; + + size = min_t(u32, skb->len, max_head_size); + truesize = SKB_HEAD_ALIGN(size) + headroom; + data = page_pool_dev_alloc_va(pool, &truesize); + if (!data) + return -ENOMEM; + + nskb = napi_build_skb(data, truesize); + if (!nskb) { + page_pool_free_va(pool, data, true); + return -ENOMEM; + } + + skb_reserve(nskb, headroom); + skb_copy_header(nskb, skb); + skb_mark_for_recycle(nskb); + + err = skb_copy_bits(skb, 0, nskb->data, size); + if (err) { + consume_skb(nskb); + return err; + } + skb_put(nskb, size); + + head_off = skb_headroom(nskb) - skb_headroom(skb); + skb_headers_offset_update(nskb, head_off); + + off = size; + len = skb->len - off; + for (i = 0; i < MAX_SKB_FRAGS && off < skb->len; i++) { + struct page *page; + u32 page_off; + + size = min_t(u32, len, PAGE_SIZE); + truesize = size; + + page = page_pool_dev_alloc(pool, &page_off, &truesize); + if (!data) { + consume_skb(nskb); + return -ENOMEM; + } + + skb_add_rx_frag(nskb, i, page, page_off, size, truesize); + err = skb_copy_bits(skb, off, page_address(page) + page_off, + size); + if (err) { + consume_skb(nskb); + return err; + } + + len -= size; + off += size; + } + + consume_skb(skb); + *pskb = nskb; + + return 0; +#else + return -EOPNOTSUPP; +#endif +} +EXPORT_SYMBOL(skb_pp_cow_data); + +int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb, + struct bpf_prog *prog) +{ + if (!prog->aux->xdp_has_frags) + return -EINVAL; + + return skb_pp_cow_data(pool, pskb, XDP_PACKET_HEADROOM); +} +EXPORT_SYMBOL(skb_cow_data_for_xdp); + #if IS_ENABLED(CONFIG_PAGE_POOL) bool napi_pp_put_page(struct page *page, bool napi_safe) { @@ -923,9 +1015,10 @@ bool napi_pp_put_page(struct page *page, bool napi_safe) */ if (napi_safe || in_softirq()) { const struct napi_struct *napi = READ_ONCE(pp->p.napi); + unsigned int cpuid = smp_processor_id(); - allow_direct = napi && - READ_ONCE(napi->list_owner) == smp_processor_id(); + allow_direct = napi && 
READ_ONCE(napi->list_owner) == cpuid; + allow_direct |= (pp->cpuid == cpuid); } /* Driver set this to memory recycling info. Reset it on recycle. diff --git a/net/core/sock.c b/net/core/sock.c index 0a7f46c37f0c..88bf810394a5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2583,8 +2583,18 @@ EXPORT_SYMBOL(sock_efree); #ifdef CONFIG_INET void sock_pfree(struct sk_buff *skb) { - if (sk_is_refcounted(skb->sk)) - sock_gen_put(skb->sk); + struct sock *sk = skb->sk; + + if (!sk_is_refcounted(sk)) + return; + + if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { + inet_reqsk(sk)->rsk_listener = NULL; + reqsk_free(inet_reqsk(sk)); + return; + } + + sock_gen_put(sk); } EXPORT_SYMBOL(sock_pfree); #endif /* CONFIG_INET */ diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index b1e29e18d1d6..654122838025 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -16,9 +16,10 @@ #include <linux/inet_diag.h> #include <linux/sock_diag.h> -static const struct sock_diag_handler *sock_diag_handlers[AF_MAX]; -static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); -static DEFINE_MUTEX(sock_diag_table_mutex); +static const struct sock_diag_handler __rcu *sock_diag_handlers[AF_MAX]; + +static struct sock_diag_inet_compat __rcu *inet_rcv_compat; + static struct workqueue_struct *broadcast_wq; DEFINE_COOKIE(sock_cookie); @@ -122,6 +123,24 @@ static size_t sock_diag_nlmsg_size(void) + nla_total_size_64bit(sizeof(struct tcp_info))); /* INET_DIAG_INFO */ } +static const struct sock_diag_handler *sock_diag_lock_handler(int family) +{ + const struct sock_diag_handler *handler; + + rcu_read_lock(); + handler = rcu_dereference(sock_diag_handlers[family]); + if (handler && !try_module_get(handler->owner)) + handler = NULL; + rcu_read_unlock(); + + return handler; +} + +static void sock_diag_unlock_handler(const struct sock_diag_handler *handler) +{ + module_put(handler->owner); +} + static void sock_diag_broadcast_destroy_work(struct work_struct *work) { struct broadcast_sk *bsk = @@ -138,12 +157,12 @@ static void sock_diag_broadcast_destroy_work(struct work_struct *work) if (!skb) goto out; - mutex_lock(&sock_diag_table_mutex); - hndl = sock_diag_handlers[sk->sk_family]; - if (hndl && hndl->get_info) - err = hndl->get_info(skb, sk); - mutex_unlock(&sock_diag_table_mutex); - + hndl = sock_diag_lock_handler(sk->sk_family); + if (hndl) { + if (hndl->get_info) + err = hndl->get_info(skb, sk); + sock_diag_unlock_handler(hndl); + } if (!err) nlmsg_multicast(sock_net(sk)->diag_nlsk, skb, 0, group, GFP_KERNEL); @@ -166,51 +185,45 @@ void sock_diag_broadcast_destroy(struct sock *sk) queue_work(broadcast_wq, &bsk->work); } -void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr) { - mutex_lock(&sock_diag_table_mutex); - inet_rcv_compat = fn; - mutex_unlock(&sock_diag_table_mutex); + xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat, + ptr); } EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat); -void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)) +void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr) { - mutex_lock(&sock_diag_table_mutex); - inet_rcv_compat = NULL; - mutex_unlock(&sock_diag_table_mutex); + const struct sock_diag_inet_compat *old; + + old = xchg((__force const struct sock_diag_inet_compat **)&inet_rcv_compat, + NULL); + WARN_ON_ONCE(old != ptr); } 
EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat); int sock_diag_register(const struct sock_diag_handler *hndl) { - int err = 0; + int family = hndl->family; - if (hndl->family >= AF_MAX) + if (family >= AF_MAX) return -EINVAL; - mutex_lock(&sock_diag_table_mutex); - if (sock_diag_handlers[hndl->family]) - err = -EBUSY; - else - sock_diag_handlers[hndl->family] = hndl; - mutex_unlock(&sock_diag_table_mutex); - - return err; + return !cmpxchg((const struct sock_diag_handler **) + &sock_diag_handlers[family], + NULL, hndl) ? 0 : -EBUSY; } EXPORT_SYMBOL_GPL(sock_diag_register); -void sock_diag_unregister(const struct sock_diag_handler *hnld) +void sock_diag_unregister(const struct sock_diag_handler *hndl) { - int family = hnld->family; + int family = hndl->family; if (family >= AF_MAX) return; - mutex_lock(&sock_diag_table_mutex); - BUG_ON(sock_diag_handlers[family] != hnld); - sock_diag_handlers[family] = NULL; - mutex_unlock(&sock_diag_table_mutex); + xchg((const struct sock_diag_handler **)&sock_diag_handlers[family], + NULL); } EXPORT_SYMBOL_GPL(sock_diag_unregister); @@ -227,20 +240,20 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) return -EINVAL; req->sdiag_family = array_index_nospec(req->sdiag_family, AF_MAX); - if (sock_diag_handlers[req->sdiag_family] == NULL) + if (!rcu_access_pointer(sock_diag_handlers[req->sdiag_family])) sock_load_diag_module(req->sdiag_family, 0); - mutex_lock(&sock_diag_table_mutex); - hndl = sock_diag_handlers[req->sdiag_family]; + hndl = sock_diag_lock_handler(req->sdiag_family); if (hndl == NULL) - err = -ENOENT; - else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY) + return -ENOENT; + + if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY) err = hndl->dump(skb, nlh); else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy) err = hndl->destroy(skb, nlh); else err = -EOPNOTSUPP; - mutex_unlock(&sock_diag_table_mutex); + sock_diag_unlock_handler(hndl); return err; } @@ -248,20 +261,27 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { + const struct sock_diag_inet_compat *ptr; int ret; switch (nlh->nlmsg_type) { case TCPDIAG_GETSOCK: case DCCPDIAG_GETSOCK: - if (inet_rcv_compat == NULL) + + if (!rcu_access_pointer(inet_rcv_compat)) sock_load_diag_module(AF_INET, 0); - mutex_lock(&sock_diag_table_mutex); - if (inet_rcv_compat != NULL) - ret = inet_rcv_compat(skb, nlh); - else - ret = -EOPNOTSUPP; - mutex_unlock(&sock_diag_table_mutex); + rcu_read_lock(); + ptr = rcu_dereference(inet_rcv_compat); + if (ptr && !try_module_get(ptr->owner)) + ptr = NULL; + rcu_read_unlock(); + + ret = -EOPNOTSUPP; + if (ptr) { + ret = ptr->fn(skb, nlh); + module_put(ptr->owner); + } return ret; case SOCK_DIAG_BY_FAMILY: @@ -272,13 +292,9 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, } } -static DEFINE_MUTEX(sock_diag_mutex); - static void sock_diag_rcv(struct sk_buff *skb) { - mutex_lock(&sock_diag_mutex); netlink_rcv_skb(skb, &sock_diag_rcv_msg); - mutex_unlock(&sock_diag_mutex); } static int sock_diag_bind(struct net *net, int group) @@ -286,12 +302,12 @@ static int sock_diag_bind(struct net *net, int group) switch (group) { case SKNLGRP_INET_TCP_DESTROY: case SKNLGRP_INET_UDP_DESTROY: - if (!sock_diag_handlers[AF_INET]) + if (!rcu_access_pointer(sock_diag_handlers[AF_INET])) sock_load_diag_module(AF_INET, 0); break; case SKNLGRP_INET6_TCP_DESTROY: case SKNLGRP_INET6_UDP_DESTROY: - if 
(!sock_diag_handlers[AF_INET6]) + if (!rcu_access_pointer(sock_diag_handlers[AF_INET6])) sock_load_diag_module(AF_INET6, 0); break; } diff --git a/net/core/xdp.c b/net/core/xdp.c index 4869c1c2d8f3..27b585f3fa81 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -75,7 +75,7 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu) xa = container_of(rcu, struct xdp_mem_allocator, rcu); /* Allow this ID to be reused */ - ida_simple_remove(&mem_id_pool, xa->mem.id); + ida_free(&mem_id_pool, xa->mem.id); kfree(xa); } @@ -242,7 +242,7 @@ static int __mem_id_cyclic_get(gfp_t gfp) int id; again: - id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp); + id = ida_alloc_range(&mem_id_pool, mem_id_next, MEM_ID_MAX - 1, gfp); if (id < 0) { if (id == -ENOSPC) { /* Cyclic allocator, reset next id */ @@ -317,7 +317,7 @@ static struct xdp_mem_allocator *__xdp_reg_mem_model(struct xdp_mem_info *mem, /* Insert allocator into ID lookup table */ ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node); if (IS_ERR(ptr)) { - ida_simple_remove(&mem_id_pool, mem->id); + ida_free(&mem_id_pool, mem->id); mem->id = 0; errno = PTR_ERR(ptr); goto err; diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index c4bbac99740d..1cba001bb4c8 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -376,15 +376,11 @@ EXPORT_SYMBOL_GPL(dccp_ackvec_parsed_cleanup); int __init dccp_ackvec_init(void) { - dccp_ackvec_slab = kmem_cache_create("dccp_ackvec", - sizeof(struct dccp_ackvec), 0, - SLAB_HWCACHE_ALIGN, NULL); + dccp_ackvec_slab = KMEM_CACHE(dccp_ackvec, SLAB_HWCACHE_ALIGN); if (dccp_ackvec_slab == NULL) goto out_err; - dccp_ackvec_record_slab = kmem_cache_create("dccp_ackvec_record", - sizeof(struct dccp_ackvec_record), - 0, SLAB_HWCACHE_ALIGN, NULL); + dccp_ackvec_record_slab = KMEM_CACHE(dccp_ackvec_record, SLAB_HWCACHE_ALIGN); if (dccp_ackvec_record_slab == NULL) goto out_destroy_slab; diff --git a/net/dccp/diag.c b/net/dccp/diag.c index 8a82c5a2c5a8..f5019d95c3ae 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -58,6 +58,7 @@ static int dccp_diag_dump_one(struct netlink_callback *cb, } static const struct inet_diag_handler dccp_diag_handler = { + .owner = THIS_MODULE, .dump = dccp_diag_dump, .dump_one = dccp_diag_dump_one, .idiag_get_info = dccp_diag_get_info, diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 2717e9d7b612..1aba1d05c27a 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -75,7 +75,7 @@ sja1105_tagger_private(struct dsa_switch *ds) } /* Similar to is_link_local_ether_addr(hdr->h_dest) but also covers PTP */ -static inline bool sja1105_is_link_local(const struct sk_buff *skb) +static bool sja1105_is_link_local(const struct sk_buff *skb) { const struct ethhdr *hdr = eth_hdr(skb); u64 dmac = ether_addr_to_u64(hdr->h_dest); @@ -121,7 +121,7 @@ static void sja1105_meta_unpack(const struct sk_buff *skb, packing(buf + 7, &meta->switch_id, 7, 0, 1, UNPACK, 0); } -static inline bool sja1105_is_meta_frame(const struct sk_buff *skb) +static bool sja1105_is_meta_frame(const struct sk_buff *skb) { const struct ethhdr *hdr = eth_hdr(skb); u64 smac = ether_addr_to_u64(hdr->h_source); diff --git a/net/dsa/user.c b/net/dsa/user.c index b15e71cc342c..5d666dfb317d 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -210,7 +210,7 @@ static int dsa_user_sync_uc(struct net_device *dev, return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, - &ctx); + &ctx); } static int dsa_user_unsync_uc(struct net_device *dev, @@ -230,7 +230,7 @@ static int 
dsa_user_unsync_uc(struct net_device *dev, return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, - &ctx); + &ctx); } static int dsa_user_sync_mc(struct net_device *dev, @@ -250,7 +250,7 @@ static int dsa_user_sync_mc(struct net_device *dev, return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, - &ctx); + &ctx); } static int dsa_user_unsync_mc(struct net_device *dev, @@ -270,7 +270,7 @@ static int dsa_user_unsync_mc(struct net_device *dev, return 0; return dsa_user_vlan_for_each(dev, dsa_user_host_vlan_rx_filtering, - &ctx); + &ctx); } void dsa_user_sync_ha(struct net_device *dev) @@ -875,8 +875,8 @@ static int dsa_user_port_obj_del(struct net_device *dev, const void *ctx, return err; } -static inline netdev_tx_t dsa_user_netpoll_send_skb(struct net_device *dev, - struct sk_buff *skb) +static netdev_tx_t dsa_user_netpoll_send_skb(struct net_device *dev, + struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER struct dsa_user_priv *p = netdev_priv(dev); @@ -1222,7 +1222,7 @@ static int dsa_user_set_wol(struct net_device *dev, struct ethtool_wolinfo *w) return ret; } -static int dsa_user_set_eee(struct net_device *dev, struct ethtool_eee *e) +static int dsa_user_set_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; @@ -1242,7 +1242,7 @@ static int dsa_user_set_eee(struct net_device *dev, struct ethtool_eee *e) return phylink_ethtool_set_eee(dp->pl, e); } -static int dsa_user_get_eee(struct net_device *dev, struct ethtool_eee *e) +static int dsa_user_get_eee(struct net_device *dev, struct ethtool_keee *e) { struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_switch *ds = dp->ds; diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 6b2a360dcdf0..ce486cec346c 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -712,3 +712,8 @@ ethtool_forced_speed_maps_init(struct ethtool_forced_speed_map *maps, u32 size) } } EXPORT_SYMBOL_GPL(ethtool_forced_speed_maps_init); + +bool ethtool_eee_use_linkmodes(const struct ethtool_keee *eee) +{ + return !linkmode_empty(eee->supported); +} diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 28b8aaaf9bcb..0f2b5f7eacee 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -55,5 +55,6 @@ int ethtool_get_module_eeprom_call(struct net_device *dev, struct ethtool_eeprom *ee, u8 *data); bool __ethtool_dev_mm_supported(struct net_device *dev); +bool ethtool_eee_use_linkmodes(const struct ethtool_keee *eee); #endif /* _ETHTOOL_COMMON_H */ diff --git a/net/ethtool/eee.c b/net/ethtool/eee.c index 2853394d06a8..db6faa18fe41 100644 --- a/net/ethtool/eee.c +++ b/net/ethtool/eee.c @@ -5,7 +5,7 @@ #include "bitset.h" #define EEE_MODES_COUNT \ - (sizeof_field(struct ethtool_eee, supported) * BITS_PER_BYTE) + (sizeof_field(struct ethtool_keee, supported_u32) * BITS_PER_BYTE) struct eee_req_info { struct ethnl_req_info base; @@ -13,7 +13,7 @@ struct eee_req_info { struct eee_reply_data { struct ethnl_reply_data base; - struct ethtool_eee eee; + struct ethtool_keee eee; }; #define EEE_REPDATA(__reply_base) \ @@ -30,6 +30,7 @@ static int eee_prepare_data(const struct ethnl_req_info *req_base, { struct eee_reply_data *data = EEE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; + struct ethtool_keee *eee = &data->eee; int ret; if (!dev->ethtool_ops->get_eee) @@ -37,9 +38,18 @@ static int eee_prepare_data(const struct ethnl_req_info *req_base, ret = ethnl_ops_begin(dev); if (ret < 0) return ret; - 
ret = dev->ethtool_ops->get_eee(dev, &data->eee); + ret = dev->ethtool_ops->get_eee(dev, eee); ethnl_ops_complete(dev); + if (!ret && !ethtool_eee_use_linkmodes(eee)) { + ethtool_convert_legacy_u32_to_link_mode(eee->supported, + eee->supported_u32); + ethtool_convert_legacy_u32_to_link_mode(eee->advertised, + eee->advertised_u32); + ethtool_convert_legacy_u32_to_link_mode(eee->lp_advertised, + eee->lp_advertised_u32); + } + return ret; } @@ -48,24 +58,26 @@ static int eee_reply_size(const struct ethnl_req_info *req_base, { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct eee_reply_data *data = EEE_REPDATA(reply_base); - const struct ethtool_eee *eee = &data->eee; + const struct ethtool_keee *eee = &data->eee; int len = 0; int ret; - BUILD_BUG_ON(sizeof(eee->advertised) * BITS_PER_BYTE != + BUILD_BUG_ON(sizeof(eee->advertised_u32) * BITS_PER_BYTE != EEE_MODES_COUNT); - BUILD_BUG_ON(sizeof(eee->lp_advertised) * BITS_PER_BYTE != + BUILD_BUG_ON(sizeof(eee->lp_advertised_u32) * BITS_PER_BYTE != EEE_MODES_COUNT); /* MODES_OURS */ - ret = ethnl_bitset32_size(&eee->advertised, &eee->supported, - EEE_MODES_COUNT, link_mode_names, compact); + ret = ethnl_bitset_size(eee->advertised, eee->supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); if (ret < 0) return ret; len += ret; /* MODES_PEERS */ - ret = ethnl_bitset32_size(&eee->lp_advertised, NULL, - EEE_MODES_COUNT, link_mode_names, compact); + ret = ethnl_bitset_size(eee->lp_advertised, NULL, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); if (ret < 0) return ret; len += ret; @@ -84,24 +96,26 @@ static int eee_fill_reply(struct sk_buff *skb, { bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; const struct eee_reply_data *data = EEE_REPDATA(reply_base); - const struct ethtool_eee *eee = &data->eee; + const struct ethtool_keee *eee = &data->eee; int ret; - ret = ethnl_put_bitset32(skb, ETHTOOL_A_EEE_MODES_OURS, - &eee->advertised, &eee->supported, - EEE_MODES_COUNT, link_mode_names, compact); + ret = ethnl_put_bitset(skb, ETHTOOL_A_EEE_MODES_OURS, + eee->advertised, eee->supported, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); if (ret < 0) return ret; - ret = ethnl_put_bitset32(skb, ETHTOOL_A_EEE_MODES_PEER, - &eee->lp_advertised, NULL, EEE_MODES_COUNT, - link_mode_names, compact); + ret = ethnl_put_bitset(skb, ETHTOOL_A_EEE_MODES_PEER, + eee->lp_advertised, NULL, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); if (ret < 0) return ret; - if (nla_put_u8(skb, ETHTOOL_A_EEE_ACTIVE, !!eee->eee_active) || - nla_put_u8(skb, ETHTOOL_A_EEE_ENABLED, !!eee->eee_enabled) || + if (nla_put_u8(skb, ETHTOOL_A_EEE_ACTIVE, eee->eee_active) || + nla_put_u8(skb, ETHTOOL_A_EEE_ENABLED, eee->eee_enabled) || nla_put_u8(skb, ETHTOOL_A_EEE_TX_LPI_ENABLED, - !!eee->tx_lpi_enabled) || + eee->tx_lpi_enabled) || nla_put_u32(skb, ETHTOOL_A_EEE_TX_LPI_TIMER, eee->tx_lpi_timer)) return -EMSGSIZE; @@ -132,7 +146,7 @@ ethnl_set_eee(struct ethnl_req_info *req_info, struct genl_info *info) { struct net_device *dev = req_info->dev; struct nlattr **tb = info->attrs; - struct ethtool_eee eee = {}; + struct ethtool_keee eee = {}; bool mod = false; int ret; @@ -140,14 +154,21 @@ ethnl_set_eee(struct ethnl_req_info *req_info, struct genl_info *info) if (ret < 0) return ret; - ret = ethnl_update_bitset32(&eee.advertised, EEE_MODES_COUNT, - tb[ETHTOOL_A_EEE_MODES_OURS], - link_mode_names, info->extack, &mod); + if (ethtool_eee_use_linkmodes(&eee)) { + ret = 
ethnl_update_bitset(eee.advertised, + __ETHTOOL_LINK_MODE_MASK_NBITS, + tb[ETHTOOL_A_EEE_MODES_OURS], + link_mode_names, info->extack, &mod); + } else { + ret = ethnl_update_bitset32(&eee.advertised_u32, EEE_MODES_COUNT, + tb[ETHTOOL_A_EEE_MODES_OURS], + link_mode_names, info->extack, &mod); + } if (ret < 0) return ret; - ethnl_update_bool32(&eee.eee_enabled, tb[ETHTOOL_A_EEE_ENABLED], &mod); - ethnl_update_bool32(&eee.tx_lpi_enabled, - tb[ETHTOOL_A_EEE_TX_LPI_ENABLED], &mod); + ethnl_update_bool(&eee.eee_enabled, tb[ETHTOOL_A_EEE_ENABLED], &mod); + ethnl_update_bool(&eee.tx_lpi_enabled, tb[ETHTOOL_A_EEE_TX_LPI_ENABLED], + &mod); ethnl_update_u32(&eee.tx_lpi_timer, tb[ETHTOOL_A_EEE_TX_LPI_TIMER], &mod); if (!mod) diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 7519b0818b91..1763e8b697e1 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1508,22 +1508,71 @@ static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) return 0; } +static void eee_to_keee(struct ethtool_keee *keee, + const struct ethtool_eee *eee) +{ + memset(keee, 0, sizeof(*keee)); + + keee->supported_u32 = eee->supported; + keee->advertised_u32 = eee->advertised; + keee->lp_advertised_u32 = eee->lp_advertised; + keee->eee_active = eee->eee_active; + keee->eee_enabled = eee->eee_enabled; + keee->tx_lpi_enabled = eee->tx_lpi_enabled; + keee->tx_lpi_timer = eee->tx_lpi_timer; + + ethtool_convert_legacy_u32_to_link_mode(keee->supported, + eee->supported); + ethtool_convert_legacy_u32_to_link_mode(keee->advertised, + eee->advertised); + ethtool_convert_legacy_u32_to_link_mode(keee->lp_advertised, + eee->lp_advertised); +} + +static void keee_to_eee(struct ethtool_eee *eee, + const struct ethtool_keee *keee) +{ + memset(eee, 0, sizeof(*eee)); + + eee->eee_active = keee->eee_active; + eee->eee_enabled = keee->eee_enabled; + eee->tx_lpi_enabled = keee->tx_lpi_enabled; + eee->tx_lpi_timer = keee->tx_lpi_timer; + + if (ethtool_eee_use_linkmodes(keee)) { + bool overflow; + + overflow = !ethtool_convert_link_mode_to_legacy_u32(&eee->supported, + keee->supported); + ethtool_convert_link_mode_to_legacy_u32(&eee->advertised, + keee->advertised); + ethtool_convert_link_mode_to_legacy_u32(&eee->lp_advertised, + keee->lp_advertised); + if (overflow) + pr_warn("Ethtool ioctl interface doesn't support passing EEE linkmodes beyond bit 32\n"); + } else { + eee->supported = keee->supported_u32; + eee->advertised = keee->advertised_u32; + eee->lp_advertised = keee->lp_advertised_u32; + } +} + static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) { - struct ethtool_eee edata; + struct ethtool_keee keee; + struct ethtool_eee eee; int rc; if (!dev->ethtool_ops->get_eee) return -EOPNOTSUPP; - memset(&edata, 0, sizeof(struct ethtool_eee)); - edata.cmd = ETHTOOL_GEEE; - rc = dev->ethtool_ops->get_eee(dev, &edata); - + memset(&keee, 0, sizeof(keee)); + rc = dev->ethtool_ops->get_eee(dev, &keee); if (rc) return rc; - if (copy_to_user(useraddr, &edata, sizeof(edata))) + keee_to_eee(&eee, &keee); + if (copy_to_user(useraddr, &eee, sizeof(eee))) return -EFAULT; return 0; @@ -1531,16 +1580,18 @@ static int ethtool_get_eee(struct net_device *dev, char __user *useraddr) static int ethtool_set_eee(struct net_device *dev, char __user *useraddr) { - struct ethtool_eee edata; + struct ethtool_keee keee; + struct ethtool_eee eee; int ret; if (!dev->ethtool_ops->set_eee) return -EOPNOTSUPP; - if (copy_from_user(&edata, useraddr, sizeof(edata))) + if (copy_from_user(&eee, useraddr, sizeof(eee))) return 
-EFAULT; - ret = dev->ethtool_ops->set_eee(dev, &edata); + eee_to_keee(&keee, &eee); + ret = dev->ethtool_ops->set_eee(dev, &keee); if (!ret) ethtool_notify(dev, ETHTOOL_MSG_EEE_NTF, NULL); return ret; diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index fe3553f60bf3..bd04f28d5cf4 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -477,11 +477,7 @@ out: return ret; } -/* Default ->dumpit() handler for GET requests. Device iteration copied from - * rtnl_dump_ifinfo(); we have to be more careful about device hashtable - * persistence as we cannot guarantee to hold RTNL lock through the whole - * function as rtnetnlink does. - */ +/* Default ->dumpit() handler for GET requests. */ static int ethnl_default_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { @@ -490,14 +486,14 @@ static int ethnl_default_dumpit(struct sk_buff *skb, struct net_device *dev; int ret = 0; - rtnl_lock(); + rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->pos_ifindex) { dev_hold(dev); - rtnl_unlock(); + rcu_read_unlock(); ret = ethnl_default_dump_one(skb, dev, ctx, genl_info_dump(cb)); - rtnl_lock(); + rcu_read_lock(); dev_put(dev); if (ret < 0 && ret != -EOPNOTSUPP) { @@ -507,7 +503,7 @@ static int ethnl_default_dumpit(struct sk_buff *skb, } ret = 0; } - rtnl_unlock(); + rcu_read_unlock(); return ret; } diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index 9d71b66183da..5ef6d437db72 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -28,29 +28,19 @@ static bool is_slave_up(struct net_device *dev) return dev && is_admin_up(dev) && netif_oper_up(dev); } -static void __hsr_set_operstate(struct net_device *dev, int transition) -{ - write_lock(&dev_base_lock); - if (dev->operstate != transition) { - dev->operstate = transition; - write_unlock(&dev_base_lock); - netdev_state_change(dev); - } else { - write_unlock(&dev_base_lock); - } -} - static void hsr_set_operstate(struct hsr_port *master, bool has_carrier) { - if (!is_admin_up(master->dev)) { - __hsr_set_operstate(master->dev, IF_OPER_DOWN); + struct net_device *dev = master->dev; + + if (!is_admin_up(dev)) { + netdev_set_operstate(dev, IF_OPER_DOWN); return; } if (has_carrier) - __hsr_set_operstate(master->dev, IF_OPER_UP); + netdev_set_operstate(dev, IF_OPER_UP); else - __hsr_set_operstate(master->dev, IF_OPER_LOWERLAYERDOWN); + netdev_set_operstate(dev, IF_OPER_LOWERLAYERDOWN); } static bool hsr_check_carrier(struct hsr_port *master) @@ -78,14 +68,14 @@ static void hsr_check_announce(struct net_device *hsr_dev, hsr = netdev_priv(hsr_dev); - if (hsr_dev->operstate == IF_OPER_UP && old_operstate != IF_OPER_UP) { + if (READ_ONCE(hsr_dev->operstate) == IF_OPER_UP && old_operstate != IF_OPER_UP) { /* Went up */ hsr->announce_count = 0; mod_timer(&hsr->announce_timer, jiffies + msecs_to_jiffies(HSR_ANNOUNCE_INTERVAL)); } - if (hsr_dev->operstate != IF_OPER_UP && old_operstate == IF_OPER_UP) + if (READ_ONCE(hsr_dev->operstate) != IF_OPER_UP && old_operstate == IF_OPER_UP) /* Went down */ del_timer(&hsr->announce_timer); } @@ -100,7 +90,7 @@ void hsr_check_carrier_and_operstate(struct hsr_priv *hsr) /* netif_stacked_transfer_operstate() cannot be used here since * it doesn't set IF_OPER_LOWERLAYERDOWN (?) 
*/ - old_operstate = master->dev->operstate; + old_operstate = READ_ONCE(master->dev->operstate); has_carrier = hsr_check_carrier(master); hsr_set_operstate(master, has_carrier); hsr_check_announce(master->dev, old_operstate); diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index 2c087b7f17c5..e643f52663f9 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -280,5 +280,6 @@ static void __exit lowpan_cleanup_module(void) module_init(lowpan_init_module); module_exit(lowpan_cleanup_module); +MODULE_DESCRIPTION("IPv6 over Low power Wireless Personal Area Network IEEE 802.15.4 core"); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("lowpan"); diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 00302e8b9615..990a83455dcf 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -1137,4 +1137,5 @@ module_init(af_ieee802154_init); module_exit(af_ieee802154_remove); MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IEEE 802.15.4 socket interface"); MODULE_ALIAS_NETPROTO(PF_IEEE802154); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a5a820ee2026..ad278009e469 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1326,7 +1326,7 @@ int inet_sk_rebuild_header(struct sock *sk) fl4 = &inet->cork.fl.u.ip4; rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, - sk->sk_protocol, RT_CONN_FLAGS(sk), + sk->sk_protocol, ip_sock_rt_tos(sk), sk->sk_bound_dev_if); if (!IS_ERR(rt)) { err = 0; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index ae8b15e6896f..834edc18463a 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -12,7 +12,7 @@ #include <net/bpf_sk_storage.h> /* "extern" is to avoid sparse warning. It is only used in bpf_struct_ops.c. 
*/ -extern struct bpf_struct_ops bpf_tcp_congestion_ops; +static struct bpf_struct_ops bpf_tcp_congestion_ops; static u32 unsupported_ops[] = { offsetof(struct tcp_congestion_ops, get_info), @@ -20,6 +20,7 @@ static u32 unsupported_ops[] = { static const struct btf_type *tcp_sock_type; static u32 tcp_sock_id, sock_id; +static const struct btf_type *tcp_congestion_ops_type; static int bpf_tcp_ca_init(struct btf *btf) { @@ -36,6 +37,11 @@ static int bpf_tcp_ca_init(struct btf *btf) tcp_sock_id = type_id; tcp_sock_type = btf_type_by_id(btf, tcp_sock_id); + type_id = btf_find_by_name_kind(btf, "tcp_congestion_ops", BTF_KIND_STRUCT); + if (type_id < 0) + return -EINVAL; + tcp_congestion_ops_type = btf_type_by_id(btf, type_id); + return 0; } @@ -149,7 +155,7 @@ static u32 prog_ops_moff(const struct bpf_prog *prog) u32 midx; midx = prog->expected_attach_type; - t = bpf_tcp_congestion_ops.type; + t = tcp_congestion_ops_type; m = &btf_type_member(t)[midx]; return __btf_member_bit_offset(t, m) / 8; @@ -191,7 +197,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id, case BPF_FUNC_ktime_get_coarse_ns: return &bpf_ktime_get_coarse_ns_proto; default: - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } } @@ -339,7 +345,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { .release = __bpf_tcp_ca_release, }; -struct bpf_struct_ops bpf_tcp_congestion_ops = { +static struct bpf_struct_ops bpf_tcp_congestion_ops = { .verifier_ops = &bpf_tcp_ca_verifier_ops, .reg = bpf_tcp_ca_reg, .unreg = bpf_tcp_ca_unreg, @@ -350,10 +356,16 @@ struct bpf_struct_ops bpf_tcp_congestion_ops = { .validate = bpf_tcp_ca_validate, .name = "tcp_congestion_ops", .cfi_stubs = &__bpf_ops_tcp_congestion_ops, + .owner = THIS_MODULE, }; static int __init bpf_tcp_ca_kfunc_init(void) { - return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set); + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &bpf_tcp_ca_kfunc_set); + ret = ret ?: register_bpf_struct_ops(&bpf_tcp_congestion_ops, tcp_congestion_ops); + + return ret; } late_initcall(bpf_tcp_ca_kfunc_init); diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 2cc50cbfc2a3..cc6d0bd7b0a9 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -119,7 +119,7 @@ void ip4_datagram_release_cb(struct sock *sk) rt = ip_route_output_ports(sock_net(sk), &fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, - RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); + ip_sock_rt_tos(sk), sk->sk_bound_dev_if); dst = !IS_ERR(rt) ? 
&rt->dst : NULL; sk_dst_set(sk, dst); diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 3ff35f811765..0fc7ab5832d1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -501,7 +501,7 @@ static void tnode_free(struct key_vector *tn) if (tnode_free_size >= READ_ONCE(sysctl_fib_sync_mem)) { tnode_free_size = 0; - synchronize_rcu(); + synchronize_net(); } } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 459af1f89739..747ed7344cbe 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1467,7 +1467,7 @@ static struct dst_entry *inet_csk_rebuild_route(struct sock *sk, struct flowi *f rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, - RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); + ip_sock_rt_tos(sk), sk->sk_bound_dev_if); if (IS_ERR(rt)) rt = NULL; if (rt) diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 8e6b6aa0579e..7adace541fe2 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -32,7 +32,7 @@ #include <linux/inet_diag.h> #include <linux/sock_diag.h> -static const struct inet_diag_handler **inet_diag_table; +static const struct inet_diag_handler __rcu **inet_diag_table; struct inet_diag_entry { const __be32 *saddr; @@ -48,28 +48,28 @@ struct inet_diag_entry { #endif }; -static DEFINE_MUTEX(inet_diag_table_mutex); - static const struct inet_diag_handler *inet_diag_lock_handler(int proto) { - if (proto < 0 || proto >= IPPROTO_MAX) { - mutex_lock(&inet_diag_table_mutex); - return ERR_PTR(-ENOENT); - } + const struct inet_diag_handler *handler; - if (!inet_diag_table[proto]) + if (proto < 0 || proto >= IPPROTO_MAX) + return NULL; + + if (!READ_ONCE(inet_diag_table[proto])) sock_load_diag_module(AF_INET, proto); - mutex_lock(&inet_diag_table_mutex); - if (!inet_diag_table[proto]) - return ERR_PTR(-ENOENT); + rcu_read_lock(); + handler = rcu_dereference(inet_diag_table[proto]); + if (handler && !try_module_get(handler->owner)) + handler = NULL; + rcu_read_unlock(); - return inet_diag_table[proto]; + return handler; } static void inet_diag_unlock_handler(const struct inet_diag_handler *handler) { - mutex_unlock(&inet_diag_table_mutex); + module_put(handler->owner); } void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) @@ -104,9 +104,12 @@ static size_t inet_sk_attr_size(struct sock *sk, const struct inet_diag_handler *handler; size_t aux = 0; - handler = inet_diag_table[req->sdiag_protocol]; + rcu_read_lock(); + handler = rcu_dereference(inet_diag_table[req->sdiag_protocol]); + DEBUG_NET_WARN_ON_ONCE(!handler); if (handler && handler->idiag_get_aux_size) aux = handler->idiag_get_aux_size(sk, net_admin); + rcu_read_unlock(); return nla_total_size(sizeof(struct tcp_info)) + nla_total_size(sizeof(struct inet_diag_msg)) @@ -244,10 +247,16 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, struct nlmsghdr *nlh; struct nlattr *attr; void *info = NULL; + int protocol; cb_data = cb->data; - handler = inet_diag_table[inet_diag_get_protocol(req, cb_data)]; - BUG_ON(!handler); + protocol = inet_diag_get_protocol(req, cb_data); + + /* inet_diag_lock_handler() made sure inet_diag_table[] is stable. 
*/ + handler = rcu_dereference_protected(inet_diag_table[protocol], 1); + DEBUG_NET_WARN_ON_ONCE(!handler); + if (!handler) + return -ENXIO; nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); @@ -605,9 +614,10 @@ static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, protocol = inet_diag_get_protocol(req, &dump_data); handler = inet_diag_lock_handler(protocol); - if (IS_ERR(handler)) { - err = PTR_ERR(handler); - } else if (cmd == SOCK_DIAG_BY_FAMILY) { + if (!handler) + return -ENOENT; + + if (cmd == SOCK_DIAG_BY_FAMILY) { struct netlink_callback cb = { .nlh = nlh, .skb = in_skb, @@ -1035,6 +1045,10 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, num = 0; ilb = &hashinfo->lhash2[i]; + if (hlist_nulls_empty(&ilb->nulls_head)) { + s_num = 0; + continue; + } spin_lock(&ilb->lock); sk_nulls_for_each(sk, node, &ilb->nulls_head) { struct inet_sock *inet = inet_sk(sk); @@ -1099,6 +1113,10 @@ resume_bind_walk: accum = 0; ibb = &hashinfo->bhash2[i]; + if (hlist_empty(&ibb->chain)) { + s_num = 0; + continue; + } spin_lock_bh(&ibb->lock); inet_bind_bucket_for_each(tb2, &ibb->chain) { if (!net_eq(ib2_net(tb2), net)) @@ -1259,12 +1277,12 @@ static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, again: prev_min_dump_alloc = cb->min_dump_alloc; handler = inet_diag_lock_handler(protocol); - if (!IS_ERR(handler)) + if (handler) { handler->dump(skb, cb, r); - else - err = PTR_ERR(handler); - inet_diag_unlock_handler(handler); - + inet_diag_unlock_handler(handler); + } else { + err = -ENOENT; + } /* The skb is not large enough to fit one sk info and * inet_sk_diag_fill() has requested for a larger skb. */ @@ -1457,10 +1475,9 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) } handler = inet_diag_lock_handler(sk->sk_protocol); - if (IS_ERR(handler)) { - inet_diag_unlock_handler(handler); + if (!handler) { nlmsg_cancel(skb, nlh); - return PTR_ERR(handler); + return -ENOENT; } attr = handler->idiag_info_size @@ -1479,6 +1496,7 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk) } static const struct sock_diag_handler inet_diag_handler = { + .owner = THIS_MODULE, .family = AF_INET, .dump = inet_diag_handler_cmd, .get_info = inet_diag_handler_get_info, @@ -1486,6 +1504,7 @@ static const struct sock_diag_handler inet_diag_handler = { }; static const struct sock_diag_handler inet6_diag_handler = { + .owner = THIS_MODULE, .family = AF_INET6, .dump = inet_diag_handler_cmd, .get_info = inet_diag_handler_get_info, @@ -1495,20 +1514,12 @@ static const struct sock_diag_handler inet6_diag_handler = { int inet_diag_register(const struct inet_diag_handler *h) { const __u16 type = h->idiag_type; - int err = -EINVAL; if (type >= IPPROTO_MAX) - goto out; + return -EINVAL; - mutex_lock(&inet_diag_table_mutex); - err = -EEXIST; - if (!inet_diag_table[type]) { - inet_diag_table[type] = h; - err = 0; - } - mutex_unlock(&inet_diag_table_mutex); -out: - return err; + return !cmpxchg((const struct inet_diag_handler **)&inet_diag_table[type], + NULL, h) ? 
0 : -EEXIST; } EXPORT_SYMBOL_GPL(inet_diag_register); @@ -1519,12 +1530,16 @@ void inet_diag_unregister(const struct inet_diag_handler *h) if (type >= IPPROTO_MAX) return; - mutex_lock(&inet_diag_table_mutex); - inet_diag_table[type] = NULL; - mutex_unlock(&inet_diag_table_mutex); + xchg((const struct inet_diag_handler **)&inet_diag_table[type], + NULL); } EXPORT_SYMBOL_GPL(inet_diag_unregister); +static const struct sock_diag_inet_compat inet_diag_compat = { + .owner = THIS_MODULE, + .fn = inet_diag_rcv_msg_compat, +}; + static int __init inet_diag_init(void) { const int inet_diag_table_size = (IPPROTO_MAX * @@ -1543,7 +1558,7 @@ static int __init inet_diag_init(void) if (err) goto out_free_inet; - sock_diag_register_inet_compat(inet_diag_rcv_msg_compat); + sock_diag_register_inet_compat(&inet_diag_compat); out: return err; @@ -1558,7 +1573,7 @@ static void __exit inet_diag_exit(void) { sock_diag_unregister(&inet6_diag_handler); sock_diag_unregister(&inet_diag_handler); - sock_diag_unregister_inet_compat(inet_diag_rcv_msg_compat); + sock_diag_unregister_inet_compat(&inet_diag_compat); kfree(inet_diag_table); } diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index e9fed83e9b3c..5bd759963451 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -81,10 +81,7 @@ void __init inet_initpeers(void) inet_peer_threshold = clamp_val(nr_entries, 4096, 65536 + 128); - peer_cachep = kmem_cache_create("inet_peer_cache", - sizeof(struct inet_peer), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, - NULL); + peer_cachep = KMEM_CACHE(inet_peer, SLAB_HWCACHE_ALIGN | SLAB_PANIC); } /* Called with rcu_read_lock() or base->lock held */ diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 6b9cf5a24c19..7b16c211b904 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1025,14 +1025,16 @@ static int __net_init ipgre_init_net(struct net *net) return ip_tunnel_init_net(net, ipgre_net_id, &ipgre_link_ops, NULL); } -static void __net_exit ipgre_exit_batch_net(struct list_head *list_net) +static void __net_exit ipgre_exit_batch_rtnl(struct list_head *list_net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops); + ip_tunnel_delete_nets(list_net, ipgre_net_id, &ipgre_link_ops, + dev_to_kill); } static struct pernet_operations ipgre_net_ops = { .init = ipgre_init_net, - .exit_batch = ipgre_exit_batch_net, + .exit_batch_rtnl = ipgre_exit_batch_rtnl, .id = &ipgre_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1697,14 +1699,16 @@ static int __net_init ipgre_tap_init_net(struct net *net) return ip_tunnel_init_net(net, gre_tap_net_id, &ipgre_tap_ops, "gretap0"); } -static void __net_exit ipgre_tap_exit_batch_net(struct list_head *list_net) +static void __net_exit ipgre_tap_exit_batch_rtnl(struct list_head *list_net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops); + ip_tunnel_delete_nets(list_net, gre_tap_net_id, &ipgre_tap_ops, + dev_to_kill); } static struct pernet_operations ipgre_tap_net_ops = { .init = ipgre_tap_init_net, - .exit_batch = ipgre_tap_exit_batch_net, + .exit_batch_rtnl = ipgre_tap_exit_batch_rtnl, .id = &gre_tap_net_id, .size = sizeof(struct ip_tunnel_net), }; @@ -1715,14 +1719,16 @@ static int __net_init erspan_init_net(struct net *net) &erspan_link_ops, "erspan0"); } -static void __net_exit erspan_exit_batch_net(struct list_head *net_list) +static void __net_exit erspan_exit_batch_rtnl(struct list_head *net_list, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(net_list, 
erspan_net_id, &erspan_link_ops); + ip_tunnel_delete_nets(net_list, erspan_net_id, &erspan_link_ops, + dev_to_kill); } static struct pernet_operations erspan_net_ops = { .init = erspan_init_net, - .exit_batch = erspan_exit_batch_net, + .exit_batch_rtnl = erspan_exit_batch_rtnl, .id = &erspan_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 67d846622365..1fe794967211 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -493,7 +493,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, inet->inet_dport, inet->inet_sport, sk->sk_protocol, - RT_CONN_FLAGS_TOS(sk, tos), + RT_TOS(tos), sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index a4513ffb66cb..756f8b923883 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -102,10 +102,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; - if (t->parms.link == link) + if (READ_ONCE(t->parms.link) == link) return t; - else - cand = t; + cand = t; } hlist_for_each_entry_rcu(t, head, hash_node) { @@ -117,9 +116,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; - if (t->parms.link == link) + if (READ_ONCE(t->parms.link) == link) return t; - else if (!cand) + if (!cand) cand = t; } @@ -137,9 +136,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, if (!ip_tunnel_key_match(&t->parms, flags, key)) continue; - if (t->parms.link == link) + if (READ_ONCE(t->parms.link) == link) return t; - else if (!cand) + if (!cand) cand = t; } @@ -150,9 +149,9 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, !(t->dev->flags & IFF_UP)) continue; - if (t->parms.link == link) + if (READ_ONCE(t->parms.link) == link) return t; - else if (!cand) + if (!cand) cand = t; } @@ -221,7 +220,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, hlist_for_each_entry_rcu(t, head, hash_node) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && - link == t->parms.link && + link == READ_ONCE(t->parms.link) && type == t->dev->type && ip_tunnel_key_match(&t->parms, flags, key)) break; @@ -747,7 +746,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, ip_tunnel_init_flow(&fl4, protocol, dst, tnl_params->saddr, tunnel->parms.o_key, RT_TOS(tos), - dev_net(dev), tunnel->parms.link, + dev_net(dev), READ_ONCE(tunnel->parms.link), tunnel->fwmark, skb_get_hash(skb), 0); if (ip_tunnel_encap(skb, &tunnel->encap, &protocol, &fl4) < 0) @@ -867,7 +866,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, if (t->parms.link != p->link || t->fwmark != fwmark) { int mtu; - t->parms.link = p->link; + WRITE_ONCE(t->parms.link, p->link); t->fwmark = fwmark; mtu = ip_tunnel_bind_dev(dev); if (set_mtu) @@ -1057,9 +1056,9 @@ EXPORT_SYMBOL(ip_tunnel_get_link_net); int ip_tunnel_get_iflink(const struct net_device *dev) { - struct ip_tunnel *tunnel = netdev_priv(dev); + const struct ip_tunnel *tunnel = netdev_priv(dev); - return tunnel->parms.link; + return READ_ONCE(tunnel->parms.link); } EXPORT_SYMBOL(ip_tunnel_get_iflink); @@ -1130,19 +1129,17 @@ static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn, } void ip_tunnel_delete_nets(struct list_head *net_list, unsigned int id, - struct rtnl_link_ops *ops) + struct rtnl_link_ops *ops, + struct list_head *dev_to_kill) { struct ip_tunnel_net *itn; 
struct net *net; - LIST_HEAD(list); - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) { itn = net_generic(net, id); - ip_tunnel_destroy(net, itn, &list, ops); + ip_tunnel_destroy(net, itn, dev_to_kill, ops); } - unregister_netdevice_many(&list); - rtnl_unlock(); } EXPORT_SYMBOL_GPL(ip_tunnel_delete_nets); @@ -1271,6 +1268,7 @@ int ip_tunnel_init(struct net_device *dev) if (tunnel->collect_md) netif_keep_dst(dev); + netdev_lockdep_set_classes(dev); return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_init); diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index d1d6bb28ed6e..ee587adb169f 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -510,14 +510,16 @@ static int __net_init vti_init_net(struct net *net) return 0; } -static void __net_exit vti_exit_batch_net(struct list_head *list_net) +static void __net_exit vti_exit_batch_rtnl(struct list_head *list_net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops); + ip_tunnel_delete_nets(list_net, vti_net_id, &vti_link_ops, + dev_to_kill); } static struct pernet_operations vti_net_ops = { .init = vti_init_net, - .exit_batch = vti_exit_batch_net, + .exit_batch_rtnl = vti_exit_batch_rtnl, .id = &vti_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 03afa3871efc..f2696eaadbe6 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -592,14 +592,16 @@ static int __net_init ipip_init_net(struct net *net) return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0"); } -static void __net_exit ipip_exit_batch_net(struct list_head *list_net) +static void __net_exit ipip_exit_batch_rtnl(struct list_head *list_net, + struct list_head *dev_to_kill) { - ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops); + ip_tunnel_delete_nets(list_net, ipip_net_id, &ipip_link_ops, + dev_to_kill); } static struct pernet_operations ipip_net_ops = { .init = ipip_init_net, - .exit_batch = ipip_exit_batch_net, + .exit_batch_rtnl = ipip_exit_batch_rtnl, .id = &ipip_net_id, .size = sizeof(struct ip_tunnel_net), }; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index f71a7e9a7de6..783523087281 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -10,6 +10,10 @@ config NF_DEFRAG_IPV4 tristate default n +# old sockopt interface and eval loop +config IP_NF_IPTABLES_LEGACY + tristate + config NF_SOCKET_IPV4 tristate "IPv4 socket lookup support" help @@ -152,7 +156,7 @@ config IP_NF_MATCH_ECN config IP_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' depends on NETFILTER_ADVANCED - depends on IP_NF_MANGLE || IP_NF_RAW + depends on IP_NF_MANGLE || IP_NF_RAW || NFT_COMPAT help This option allows you to match packets whose replies would go out via the interface the packet came in. @@ -173,6 +177,7 @@ config IP_NF_MATCH_TTL config IP_NF_FILTER tristate "Packet filtering" default m if NETFILTER_ADVANCED=n + select IP_NF_IPTABLES_LEGACY help Packet filtering defines a table `filter', which has a series of rules for simple packet filtering at local input, forwarding and @@ -182,7 +187,7 @@ config IP_NF_FILTER config IP_NF_TARGET_REJECT tristate "REJECT target support" - depends on IP_NF_FILTER + depends on IP_NF_FILTER || NFT_COMPAT select NF_REJECT_IPV4 default m if NETFILTER_ADVANCED=n help @@ -212,6 +217,7 @@ config IP_NF_NAT default m if NETFILTER_ADVANCED=n select NF_NAT select NETFILTER_XT_NAT + select IP6_NF_IPTABLES_LEGACY help This enables the `nat' table in iptables. 
This allows masquerading, port forwarding and other forms of full Network Address Port @@ -252,6 +258,7 @@ endif # IP_NF_NAT config IP_NF_MANGLE tristate "Packet mangling" default m if NETFILTER_ADVANCED=n + select IP_NF_IPTABLES_LEGACY help This option adds a `mangle' table to iptables: see the man page for iptables(8). This table is used for various packet alterations @@ -261,7 +268,7 @@ config IP_NF_MANGLE config IP_NF_TARGET_ECN tristate "ECN target support" - depends on IP_NF_MANGLE + depends on IP_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds a `ECN' target, which can be used in the iptables mangle @@ -286,6 +293,7 @@ config IP_NF_TARGET_TTL # raw + specific targets config IP_NF_RAW tristate 'raw table support (required for NOTRACK/TRACE)' + select IP_NF_IPTABLES_LEGACY help This option adds a `raw' table to iptables. This table is the very first in the netfilter framework and hooks in at the PREROUTING @@ -299,6 +307,7 @@ config IP_NF_SECURITY tristate "Security table" depends on SECURITY depends on NETFILTER_ADVANCED + select IP_NF_IPTABLES_LEGACY help This option adds a `security' table to iptables, for use with Mandatory Access Control (MAC) policy. @@ -309,36 +318,34 @@ endif # IP_NF_IPTABLES # ARP tables config IP_NF_ARPTABLES - tristate "ARP tables support" - select NETFILTER_XTABLES - select NETFILTER_FAMILY_ARP - depends on NETFILTER_ADVANCED - help - arptables is a general, extensible packet identification framework. - The ARP packet filtering and mangling (manipulation)subsystems - use this: say Y or M here if you want to use either of those. - - To compile it as a module, choose M here. If unsure, say N. + tristate -if IP_NF_ARPTABLES +config NFT_COMPAT_ARP + tristate + depends on NF_TABLES_ARP && NFT_COMPAT + default m if NFT_COMPAT=m + default y if NFT_COMPAT=y config IP_NF_ARPFILTER - tristate "ARP packet filtering" + tristate "arptables-legacy packet filtering support" + select IP_NF_ARPTABLES help ARP packet filtering defines a table `filter', which has a series of rules for simple ARP packet filtering at local input and - local output. On a bridge, you can also specify filtering rules - for forwarded ARP packets. See the man page for arptables(8). + local output. This is only needed for arptables-legacy(8). + Neither arptables-nft nor nftables need this to work. To compile it as a module, choose M here. If unsure, say N. config IP_NF_ARP_MANGLE tristate "ARP payload mangling" + depends on IP_NF_ARPTABLES || NFT_COMPAT_ARP help Allows altering the ARP packet payload: source and destination hardware and network addresses. -endif # IP_NF_ARPTABLES + This option is needed by both arptables-legacy and arptables-nft. + It is not used by nftables. 
endmenu diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 5a26f9de1ab9..85502d4dfbb4 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -25,7 +25,7 @@ obj-$(CONFIG_NFT_FIB_IPV4) += nft_fib_ipv4.o obj-$(CONFIG_NFT_DUP_IPV4) += nft_dup_ipv4.o # generic IP tables -obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o +obj-$(CONFIG_IP_NF_IPTABLES_LEGACY) += ip_tables.o # the three instances of ip_tables obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index bbff68b5b5d4..70509da4f080 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -3631,17 +3631,24 @@ unlock: } EXPORT_SYMBOL(register_nexthop_notifier); -int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) +int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; - rtnl_lock(); err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, nb); - if (err) - goto unlock; - nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL); -unlock: + if (!err) + nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL); + return err; +} +EXPORT_SYMBOL(__unregister_nexthop_notifier); + +int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) +{ + int err; + + rtnl_lock(); + err = __unregister_nexthop_notifier(net, nb); rtnl_unlock(); return err; } @@ -3737,16 +3744,20 @@ out: } EXPORT_SYMBOL(nexthop_res_grp_activity_update); -static void __net_exit nexthop_net_exit_batch(struct list_head *net_list) +static void __net_exit nexthop_net_exit_batch_rtnl(struct list_head *net_list, + struct list_head *dev_to_kill) { struct net *net; - rtnl_lock(); - list_for_each_entry(net, net_list, exit_list) { + ASSERT_RTNL(); + list_for_each_entry(net, net_list, exit_list) flush_all_nexthops(net); - kfree(net->nexthop.devhash); - } - rtnl_unlock(); +} + +static void __net_exit nexthop_net_exit(struct net *net) +{ + kfree(net->nexthop.devhash); + net->nexthop.devhash = NULL; } static int __net_init nexthop_net_init(struct net *net) @@ -3764,7 +3775,8 @@ static int __net_init nexthop_net_init(struct net *net) static struct pernet_operations nexthop_net_ops = { .init = nexthop_net_init, - .exit_batch = nexthop_net_exit_batch, + .exit = nexthop_net_exit, + .exit_batch_rtnl = nexthop_net_exit_batch_rtnl, }; static int __init nexthop_init(void) diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index fe2140c8375c..cc793bd8de25 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -213,6 +213,7 @@ static int raw_diag_destroy(struct sk_buff *in_skb, #endif static const struct inet_diag_handler raw_diag_handler = { + .owner = THIS_MODULE, .dump = raw_diag_dump, .dump_one = raw_diag_dump_one, .idiag_get_info = raw_diag_get_info, diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 61f1c96cfe63..be88bf586ff9 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -51,15 +51,6 @@ static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport, count, &syncookie_secret[c]); } -/* Convert one nsec 64bit timestamp to ts (ms or usec resolution) */ -static u64 tcp_ns_to_ts(bool usec_ts, u64 val) -{ - if (usec_ts) - return div_u64(val, NSEC_PER_USEC); - - return div_u64(val, NSEC_PER_MSEC); -} - /* * when syncookies are in effect and tcp timestamps are enabled we encode * tcp options in the lower bits of the timestamp value that will be @@ -304,6 +295,24 @@ static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb, return 0; } +#if IS_ENABLED(CONFIG_BPF) +struct 
request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb) +{ + struct request_sock *req = inet_reqsk(skb->sk); + + skb->sk = NULL; + skb->destructor = NULL; + + if (cookie_tcp_reqsk_init(sk, skb, req)) { + reqsk_free(req); + req = NULL; + } + + return req; +} +EXPORT_SYMBOL_GPL(cookie_bpf_check); +#endif + struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk, struct sk_buff *skb, struct tcp_options_received *tcp_opt, @@ -404,9 +413,13 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) !th->ack || th->rst) goto out; - req = cookie_tcp_check(net, sk, skb); - if (IS_ERR(req)) - goto out; + if (cookie_bpf_ok(skb)) { + req = cookie_bpf_check(sk, skb); + } else { + req = cookie_tcp_check(net, sk, skb); + if (IS_ERR(req)) + goto out; + } if (!req) goto out_drop; @@ -454,7 +467,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); - ireq->rcv_wscale = rcv_wscale; + if (!req->syncookie) + ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst); ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 4cbe4b44425a..f428ecf9120f 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -222,6 +222,7 @@ static int tcp_diag_destroy(struct sk_buff *in_skb, #endif static const struct inet_diag_handler tcp_diag_handler = { + .owner = THIS_MODULE, .dump = tcp_diag_dump, .dump_one = tcp_diag_dump_one, .idiag_get_info = tcp_diag_get_info, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index df7b13f0e5e0..b1c4462a0798 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6623,7 +6623,6 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) const struct tcphdr *th = tcp_hdr(skb); struct request_sock *req; int queued = 0; - bool acceptable; SKB_DR(reason); switch (sk->sk_state) { @@ -6649,12 +6648,10 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) */ rcu_read_lock(); local_bh_disable(); - acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0; + icsk->icsk_af_ops->conn_request(sk, skb); local_bh_enable(); rcu_read_unlock(); - if (!acceptable) - return 1; consume_skb(skb); return 0; } @@ -6699,17 +6696,21 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) return 0; /* step 5: check the ACK field */ - acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | - FLAG_UPDATE_TS_RECENT | - FLAG_NO_CHALLENGE_ACK) > 0; + reason = tcp_ack(sk, skb, FLAG_SLOWPATH | + FLAG_UPDATE_TS_RECENT | + FLAG_NO_CHALLENGE_ACK); - if (!acceptable) { + if ((int)reason <= 0) { if (sk->sk_state == TCP_SYN_RECV) return 1; /* send one RST */ - tcp_send_challenge_ack(sk); - SKB_DR_SET(reason, TCP_OLD_ACK); - goto discard; + /* accept old ack during closing */ + if ((int)reason < 0) { + tcp_send_challenge_ack(sk); + reason = -reason; + goto discard; + } } + SKB_DR_SET(reason, NOT_SPECIFIED); switch (sk->sk_state) { case TCP_SYN_RECV: tp->delivered++; /* SYN-ACK delivery isn't tracked in tcp_ack */ diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index dc41a22ee80e..38cb3a28e4ed 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -237,6 +237,7 @@ static int udplite_diag_destroy(struct sk_buff *in_skb, #endif static const struct inet_diag_handler udp_diag_handler = { + .owner = THIS_MODULE, .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, @@ -260,6 +261,7 @@ static int 
udplite_diag_dump_one(struct netlink_callback *cb, } static const struct inet_diag_handler udplite_diag_handler = { + .owner = THIS_MODULE, .dump = udplite_diag_dump, .dump_one = udplite_diag_dump_one, .idiag_get_info = udp_diag_get_info, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 733ace18806c..d3f4b7b9cf1f 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -195,6 +195,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_min_advance = REGEN_MIN_ADVANCE, .regen_max_retry = REGEN_MAX_RETRY, .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, @@ -257,6 +258,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .use_tempaddr = 0, .temp_valid_lft = TEMP_VALID_LIFETIME, .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_min_advance = REGEN_MIN_ADVANCE, .regen_max_retry = REGEN_MAX_RETRY, .max_desync_factor = MAX_DESYNC_FACTOR, .max_addresses = IPV6_MAX_ADDRESSES, @@ -1255,6 +1257,7 @@ static void cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, bool del_rt, bool del_peer) { + struct fib6_table *table; struct fib6_info *f6i; f6i = addrconf_get_prefix_route(del_peer ? &ifp->peer_addr : &ifp->addr, @@ -1264,8 +1267,15 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, if (del_rt) ip6_del_rt(dev_net(ifp->idev->dev), f6i, false); else { - if (!(f6i->fib6_flags & RTF_EXPIRES)) + if (!(f6i->fib6_flags & RTF_EXPIRES)) { + table = f6i->fib6_table; + spin_lock_bh(&table->tb6_lock); + fib6_set_expires(f6i, expires); + fib6_add_gc_list(f6i); + + spin_unlock_bh(&table->tb6_lock); + } fib6_info_release(f6i); } } @@ -1331,12 +1341,20 @@ out: in6_ifa_put(ifp); } +static unsigned long ipv6_get_regen_advance(struct inet6_dev *idev) +{ + return idev->cnf.regen_min_advance + idev->cnf.regen_max_retry * + idev->cnf.dad_transmits * + max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; +} + static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, bool block) { struct inet6_dev *idev = ifp->idev; unsigned long tmp_tstamp, age; unsigned long regen_advance; unsigned long now = jiffies; + u32 if_public_preferred_lft; s32 cnf_temp_preferred_lft; struct inet6_ifaddr *ift; struct ifa6_config cfg; @@ -1372,9 +1390,7 @@ retry: age = (now - ifp->tstamp) / HZ; - regen_advance = idev->cnf.regen_max_retry * - idev->cnf.dad_transmits * - max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; + regen_advance = ipv6_get_regen_advance(idev); /* recalculate max_desync_factor each time and update * idev->desync_factor if it's larger @@ -1394,11 +1410,13 @@ retry: } } + if_public_preferred_lft = ifp->prefered_lft; + memset(&cfg, 0, sizeof(cfg)); cfg.valid_lft = min_t(__u32, ifp->valid_lft, idev->cnf.temp_valid_lft + age); cfg.preferred_lft = cnf_temp_preferred_lft + age - idev->desync_factor; - cfg.preferred_lft = min_t(__u32, ifp->prefered_lft, cfg.preferred_lft); + cfg.preferred_lft = min_t(__u32, if_public_preferred_lft, cfg.preferred_lft); cfg.preferred_lft = min_t(__u32, cfg.valid_lft, cfg.preferred_lft); cfg.plen = ifp->prefix_len; @@ -1407,19 +1425,41 @@ retry: write_unlock_bh(&idev->lock); - /* A temporary address is created only if this calculated Preferred - * Lifetime is greater than REGEN_ADVANCE time units. In particular, - * an implementation must not create a temporary address with a zero - * Preferred Lifetime. 
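The clamping rule that the new RFC 4941 comment below describes can be read as a small pure function: raise the temporary address's preferred lifetime to just above regen_advance + age, but refuse to create the address if that would exceed valid_lft or the public address's preferred lifetime. A sketch of that decision with plain u32 arguments standing in for the cfg/ifp fields (function and parameter names are illustrative, not part of the patch):

/* Returns 0 and possibly raises *preferred_lft; returns -1 when no
 * temporary address should be created, matching the hunk below.
 */
static int clamp_temp_preferred_lft(u32 *preferred_lft, u32 valid_lft,
				    u32 public_preferred_lft,
				    u32 regen_advance, u32 age)
{
	if (*preferred_lft > regen_advance + age)
		return 0;			/* already long enough */

	*preferred_lft = regen_advance + age + 1;
	if (*preferred_lft > valid_lft ||
	    *preferred_lft > public_preferred_lft)
		return -1;			/* would outlive its bounds */

	return 0;
}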
+ /* From RFC 4941: + * + * A temporary address is created only if this calculated Preferred + * Lifetime is greater than REGEN_ADVANCE time units. In + * particular, an implementation must not create a temporary address + * with a zero Preferred Lifetime. + * + * ... + * + * When creating a temporary address, the lifetime values MUST be + * derived from the corresponding prefix as follows: + * + * ... + * + * * Its Preferred Lifetime is the lower of the Preferred Lifetime + * of the public address or TEMP_PREFERRED_LIFETIME - + * DESYNC_FACTOR. + * + * To comply with the RFC's requirements, clamp the preferred lifetime + * to a minimum of regen_advance, unless that would exceed valid_lft or + * ifp->prefered_lft. + * * Use age calculation as in addrconf_verify to avoid unnecessary * temporary addresses being generated. */ age = (now - tmp_tstamp + ADDRCONF_TIMER_FUZZ_MINUS) / HZ; if (cfg.preferred_lft <= regen_advance + age) { - in6_ifa_put(ifp); - in6_dev_put(idev); - ret = -1; - goto out; + cfg.preferred_lft = regen_advance + age + 1; + if (cfg.preferred_lft > cfg.valid_lft || + cfg.preferred_lft > if_public_preferred_lft) { + in6_ifa_put(ifp); + in6_dev_put(idev); + ret = -1; + goto out; + } } cfg.ifa_flags = IFA_F_TEMPORARY; @@ -2706,6 +2746,7 @@ EXPORT_SYMBOL_GPL(addrconf_prefix_rcv_add_addr); void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) { struct prefix_info *pinfo; + struct fib6_table *table; __u32 valid_lft; __u32 prefered_lft; int addr_type, err; @@ -2782,11 +2823,20 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao) if (valid_lft == 0) { ip6_del_rt(net, rt, false); rt = NULL; - } else if (addrconf_finite_timeout(rt_expires)) { - /* not infinity */ - fib6_set_expires(rt, jiffies + rt_expires); } else { - fib6_clean_expires(rt); + table = rt->fib6_table; + spin_lock_bh(&table->tb6_lock); + + if (addrconf_finite_timeout(rt_expires)) { + /* not infinity */ + fib6_set_expires(rt, jiffies + rt_expires); + fib6_add_gc_list(rt); + } else { + fib6_clean_expires(rt); + fib6_remove_gc_list(rt); + } + + spin_unlock_bh(&table->tb6_lock); } } else if (valid_lft) { clock_t expires = 0; @@ -4577,9 +4627,7 @@ restart: !ifp->regen_count && ifp->ifpub) { /* This is a non-regenerated temporary addr. */ - unsigned long regen_advance = ifp->idev->cnf.regen_max_retry * - ifp->idev->cnf.dad_transmits * - max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), HZ/100) / HZ; + unsigned long regen_advance = ipv6_get_regen_advance(ifp->idev); if (age + regen_advance >= ifp->prefered_lft) { struct inet6_ifaddr *ifpub = ifp->ifpub; @@ -4741,6 +4789,7 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, u32 flags, bool modify_peer) { + struct fib6_table *table; struct fib6_info *f6i; u32 prio; @@ -4761,10 +4810,18 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, ifp->rt_priority, ifp->idev->dev, expires, flags, GFP_KERNEL); } else { - if (!expires) + table = f6i->fib6_table; + spin_lock_bh(&table->tb6_lock); + + if (!(flags & RTF_EXPIRES)) { fib6_clean_expires(f6i); - else + fib6_remove_gc_list(f6i); + } else { fib6_set_expires(f6i, expires); + fib6_add_gc_list(f6i); + } + + spin_unlock_bh(&table->tb6_lock); fib6_info_release(f6i); } @@ -5998,7 +6055,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, (dev->ifindex != dev_get_iflink(dev) && nla_put_u32(skb, IFLA_LINK, dev_get_iflink(dev))) || nla_put_u8(skb, IFLA_OPERSTATE, - netif_running(dev) ? 
dev->operstate : IF_OPER_DOWN)) + netif_running(dev) ? READ_ONCE(dev->operstate) : IF_OPER_DOWN)) goto nla_put_failure; protoinfo = nla_nest_start_noflag(skb, IFLA_PROTINFO); if (!protoinfo) @@ -6790,6 +6847,13 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec, }, { + .procname = "regen_min_advance", + .data = &ipv6_devconf.regen_min_advance, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "regen_max_retry", .data = &ipv6_devconf.regen_max_retry, .maxlen = sizeof(int), @@ -7349,7 +7413,8 @@ int __init addrconf_init(void) if (err < 0) goto out_addrlabel; - addrconf_wq = create_workqueue("ipv6_addrconf"); + /* All works using addrconf_wq need to lock rtnl. */ + addrconf_wq = create_singlethread_workqueue("ipv6_addrconf"); if (!addrconf_wq) { err = -ENOMEM; goto out_nowq; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 4fc2cae0d116..805bbf26b3ef 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -160,6 +160,8 @@ struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); + INIT_HLIST_NODE(&f6i->gc_link); + return f6i; } @@ -246,6 +248,7 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); + INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; @@ -751,8 +754,6 @@ static struct fib6_node *fib6_add_1(struct net *net, int bit; __be32 dir = 0; - RT6_TRACE("fib6_add_1\n"); - /* insert node in tree */ fn = root; @@ -1057,6 +1058,9 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, lockdep_is_held(&table->tb6_lock)); } } + + fib6_clean_expires(rt); + fib6_remove_gc_list(rt); } /* @@ -1117,10 +1121,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, rt->fib6_nsiblings = 0; if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; - if (!(rt->fib6_flags & RTF_EXPIRES)) + if (!(rt->fib6_flags & RTF_EXPIRES)) { fib6_clean_expires(iter); - else + fib6_remove_gc_list(iter); + } else { fib6_set_expires(iter, rt->expires); + fib6_add_gc_list(iter); + } if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, @@ -1479,6 +1486,10 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt, if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); + + if (rt->fib6_flags & RTF_EXPIRES) + fib6_add_gc_list(rt); + fib6_start_gc(info->nl_net, rt); } @@ -1803,7 +1814,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_fn_leaf; - RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); + pr_debug("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); @@ -1866,7 +1877,8 @@ static struct fib6_node *fib6_repair_tree(struct net *net, FOR_WALKERS(net, w) { if (!child) { if (w->node == fn) { - RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); + pr_debug("W %p adjusted by delnode 1, s=%d/%d\n", + w, w->state, nstate); w->node = pn; w->state = nstate; } @@ -1874,10 +1886,12 @@ static struct fib6_node *fib6_repair_tree(struct net *net, if (w->node == fn) { w->node = child; if (children&2) { - RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + pr_debug("W %p adjusted by delnode 2, s=%d\n", + w, w->state); w->state = w->state >= FWS_R ? 
FWS_U : FWS_INIT; } else { - RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + pr_debug("W %p adjusted by delnode 2, s=%d\n", + w, w->state); w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } @@ -1905,8 +1919,6 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct net *net = info->nl_net; bool notify_del = false; - RT6_TRACE("fib6_del_route\n"); - /* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if exists. @@ -1955,7 +1967,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { - RT6_TRACE("walker %p adjusted by delroute\n", w); + pr_debug("walker %p adjusted by delroute\n", w); w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) @@ -2281,9 +2293,8 @@ static void fib6_flush_trees(struct net *net) * Garbage collection */ -static int fib6_age(struct fib6_info *rt, void *arg) +static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { - struct fib6_gc_args *gc_args = arg; unsigned long now = jiffies; /* @@ -2293,7 +2304,7 @@ static int fib6_age(struct fib6_info *rt, void *arg) if (rt->fib6_flags & RTF_EXPIRES && rt->expires) { if (time_after(now, rt->expires)) { - RT6_TRACE("expiring %p\n", rt); + pr_debug("expiring %p\n", rt); return -1; } gc_args->more++; @@ -2308,6 +2319,42 @@ static int fib6_age(struct fib6_info *rt, void *arg) return 0; } +static void fib6_gc_table(struct net *net, + struct fib6_table *tb6, + struct fib6_gc_args *gc_args) +{ + struct fib6_info *rt; + struct hlist_node *n; + struct nl_info info = { + .nl_net = net, + .skip_notify = false, + }; + + hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) + if (fib6_age(rt, gc_args) == -1) + fib6_del(rt, &info); +} + +static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) +{ + struct fib6_table *table; + struct hlist_head *head; + unsigned int h; + + rcu_read_lock(); + for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { + head = &net->ipv6.fib_table_hash[h]; + hlist_for_each_entry_rcu(table, head, tb6_hlist) { + spin_lock_bh(&table->tb6_lock); + + fib6_gc_table(net, table, gc_args); + + spin_unlock_bh(&table->tb6_lock); + } + } + rcu_read_unlock(); +} + void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; @@ -2323,7 +2370,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; - fib6_clean_all(net, fib6_age, &gc_args); + fib6_gc_all(net, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; @@ -2383,6 +2430,7 @@ static int __net_init fib6_net_init(struct net *net) net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); + INIT_HLIST_HEAD(&net->ipv6.fib6_main_tbl->tb6_gc_hlist); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), @@ -2395,6 +2443,7 @@ static int __net_init fib6_net_init(struct net *net) net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); + INIT_HLIST_HEAD(&net->ipv6.fib6_local_tbl->tb6_gc_hlist); #endif fib6_tables_init(net); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 070d87abf7c0..5e97e0aa8e07 
100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1511,6 +1511,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) ip6gre_tnl_init_features(dev); netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; cleanup_dst_cache_init: @@ -1632,21 +1633,19 @@ err_alloc_dev: return err; } -static void __net_exit ip6gre_exit_batch_net(struct list_head *net_list) +static void __net_exit ip6gre_exit_batch_rtnl(struct list_head *net_list, + struct list_head *dev_to_kill) { struct net *net; - LIST_HEAD(list); - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) - ip6gre_destroy_tunnels(net, &list); - unregister_netdevice_many(&list); - rtnl_unlock(); + ip6gre_destroy_tunnels(net, dev_to_kill); } static struct pernet_operations ip6gre_net_ops = { .init = ip6gre_init_net, - .exit_batch = ip6gre_exit_batch_net, + .exit_batch_rtnl = ip6gre_exit_batch_rtnl, .id = &ip6gre_net_id, .size = sizeof(struct ip6gre_net), }; @@ -1903,6 +1902,7 @@ static int ip6erspan_tap_init(struct net_device *dev) ip6erspan_tnl_link_config(tunnel, 1); netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; cleanup_dst_cache_init: diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 9bbabf750a21..44406c28445d 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1898,6 +1898,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev) dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; destroy_dst: @@ -2282,21 +2283,19 @@ err_alloc_dev: return err; } -static void __net_exit ip6_tnl_exit_batch_net(struct list_head *net_list) +static void __net_exit ip6_tnl_exit_batch_rtnl(struct list_head *net_list, + struct list_head *dev_to_kill) { struct net *net; - LIST_HEAD(list); - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) - ip6_tnl_destroy_tunnels(net, &list); - unregister_netdevice_many(&list); - rtnl_unlock(); + ip6_tnl_destroy_tunnels(net, dev_to_kill); } static struct pernet_operations ip6_tnl_net_ops = { .init = ip6_tnl_init_net, - .exit_batch = ip6_tnl_exit_batch_net, + .exit_batch_rtnl = ip6_tnl_exit_batch_rtnl, .id = &ip6_tnl_net_id, .size = sizeof(struct ip6_tnl_net), }; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index e550240c85e1..7f4f976aa24a 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -935,6 +935,7 @@ static inline int vti6_dev_init_gen(struct net_device *dev) if (!dev->tstats) return -ENOMEM; netdev_hold(dev, &t->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; } @@ -1174,24 +1175,22 @@ err_alloc_dev: return err; } -static void __net_exit vti6_exit_batch_net(struct list_head *net_list) +static void __net_exit vti6_exit_batch_rtnl(struct list_head *net_list, + struct list_head *dev_to_kill) { struct vti6_net *ip6n; struct net *net; - LIST_HEAD(list); - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) { ip6n = net_generic(net, vti6_net_id); - vti6_destroy_tunnels(ip6n, &list); + vti6_destroy_tunnels(ip6n, dev_to_kill); } - unregister_netdevice_many(&list); - rtnl_unlock(); } static struct pernet_operations vti6_net_ops = { .init = vti6_init_net, - .exit_batch = vti6_exit_batch_net, + .exit_batch_rtnl = vti6_exit_batch_rtnl, .id = &vti6_net_id, .size = sizeof(struct vti6_net), }; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index bc6e0a0bad3c..76ee1615ff2a 100644 --- a/net/ipv6/mcast.c +++ 
b/net/ipv6/mcast.c @@ -2719,7 +2719,6 @@ void ipv6_mc_down(struct inet6_dev *idev) /* Should stop work after group drop. or we will * start work again in mld_ifc_event() */ - synchronize_net(); mld_query_stop_work(idev); mld_report_stop_work(idev); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index a19999b30bc0..73cb31afe935 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1237,6 +1237,7 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) struct ndisc_options ndopts; struct fib6_info *rt = NULL; struct inet6_dev *in6_dev; + struct fib6_table *table; u32 defrtr_usr_metric; unsigned int pref = 0; __u32 old_if_flags; @@ -1382,7 +1383,8 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) neigh_release(neigh); rt = rt6_add_dflt_router(net, &ipv6_hdr(skb)->saddr, - skb->dev, pref, defrtr_usr_metric); + skb->dev, pref, defrtr_usr_metric, + lifetime); if (!rt) { ND_PRINTK(0, err, "RA: %s failed to add default route\n", @@ -1409,8 +1411,15 @@ static enum skb_drop_reason ndisc_router_discovery(struct sk_buff *skb) inet6_rt_notify(RTM_NEWROUTE, rt, &nlinfo, NLM_F_REPLACE); } - if (rt) + if (rt) { + table = rt->fib6_table; + spin_lock_bh(&table->tb6_lock); + fib6_set_expires(rt, jiffies + (HZ * lifetime)); + fib6_add_gc_list(rt); + + spin_unlock_bh(&table->tb6_lock); + } if (in6_dev->cnf.accept_ra_min_hop_limit < 256 && ra_msg->icmph.icmp6_hop_limit) { if (in6_dev->cnf.accept_ra_min_hop_limit <= ra_msg->icmph.icmp6_hop_limit) { diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 0ba62f4868f9..f3c8e2d918e1 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -6,6 +6,10 @@ menu "IPv6: Netfilter Configuration" depends on INET && IPV6 && NETFILTER +# old sockopt interface and eval loop +config IP6_NF_IPTABLES_LEGACY + tristate + config NF_SOCKET_IPV6 tristate "IPv6 socket lookup support" help @@ -147,7 +151,7 @@ config IP6_NF_MATCH_MH config IP6_NF_MATCH_RPFILTER tristate '"rpfilter" reverse path filter match support' depends on NETFILTER_ADVANCED - depends on IP6_NF_MANGLE || IP6_NF_RAW + depends on IP6_NF_MANGLE || IP6_NF_RAW || NFT_COMPAT help This option allows you to match packets whose replies would go out via the interface the packet came in. @@ -186,6 +190,8 @@ config IP6_NF_TARGET_HL config IP6_NF_FILTER tristate "Packet filtering" default m if NETFILTER_ADVANCED=n + select IP6_NF_IPTABLES_LEGACY + tristate help Packet filtering defines a table `filter', which has a series of rules for simple packet filtering at local input, forwarding and @@ -195,7 +201,7 @@ config IP6_NF_FILTER config IP6_NF_TARGET_REJECT tristate "REJECT target support" - depends on IP6_NF_FILTER + depends on IP6_NF_FILTER || NFT_COMPAT select NF_REJECT_IPV6 default m if NETFILTER_ADVANCED=n help @@ -221,6 +227,7 @@ config IP6_NF_TARGET_SYNPROXY config IP6_NF_MANGLE tristate "Packet mangling" default m if NETFILTER_ADVANCED=n + select IP6_NF_IPTABLES_LEGACY help This option adds a `mangle' table to iptables: see the man page for iptables(8). This table is used for various packet alterations @@ -230,6 +237,7 @@ config IP6_NF_MANGLE config IP6_NF_RAW tristate 'raw table support (required for TRACE)' + select IP6_NF_IPTABLES_LEGACY help This option adds a `raw' table to ip6tables. 
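The ndisc, route and addrconf hunks in this series converge on one idiom: whenever a fib6_info's expiry changes, it is also added to (or removed from) the per-table GC hlist, and both updates happen under tb6_lock so fib6_gc_table() walks a consistent list. Schematically, with rt and lifetime as in the ndisc hunk above and lifetime_is_finite standing in for the driver-specific check (not compilable on its own):

	table = rt->fib6_table;
	spin_lock_bh(&table->tb6_lock);
	if (lifetime_is_finite) {
		fib6_set_expires(rt, jiffies + HZ * lifetime);
		fib6_add_gc_list(rt);		/* let fib6_run_gc() see it */
	} else {
		fib6_clean_expires(rt);
		fib6_remove_gc_list(rt);	/* permanent: keep GC list short */
	}
	spin_unlock_bh(&table->tb6_lock);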
This table is the very first in the netfilter framework and hooks in at the PREROUTING @@ -243,6 +251,7 @@ config IP6_NF_SECURITY tristate "Security table" depends on SECURITY depends on NETFILTER_ADVANCED + select IP6_NF_IPTABLES_LEGACY help This option adds a `security' table to iptables, for use with Mandatory Access Control (MAC) policy. @@ -254,6 +263,7 @@ config IP6_NF_NAT depends on NF_CONNTRACK depends on NETFILTER_ADVANCED select NF_NAT + select IP6_NF_IPTABLES_LEGACY select NETFILTER_XT_NAT help This enables the `nat' table in ip6tables. This allows masquerading, @@ -262,25 +272,23 @@ config IP6_NF_NAT To compile it as a module, choose M here. If unsure, say N. -if IP6_NF_NAT - config IP6_NF_TARGET_MASQUERADE tristate "MASQUERADE target support" select NETFILTER_XT_TARGET_MASQUERADE + depends on IP6_NF_NAT help This is a backwards-compat option for the user's convenience (e.g. when running oldconfig). It selects NETFILTER_XT_TARGET_MASQUERADE. config IP6_NF_TARGET_NPT tristate "NPT (Network Prefix translation) target support" + depends on IP6_NF_NAT || NFT_COMPAT help This option adds the `SNPT' and `DNPT' target, which perform stateless IPv6-to-IPv6 Network Prefix Translation per RFC 6296. To compile it as a module, choose M here. If unsure, say N. -endif # IP6_NF_NAT - endif # IP6_NF_IPTABLES endmenu diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index b8d6dc9aeeb6..66ce6fa5b2f5 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -4,7 +4,7 @@ # # Link order matters here. -obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o +obj-$(CONFIG_IP6_NF_IPTABLES_LEGACY) += ip6_tables.o obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ea1dec8448fc..707d65bc9c0e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -931,6 +931,7 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, struct net *net = dev_net(dev); struct route_info *rinfo = (struct route_info *) opt; struct in6_addr prefix_buf, *prefix; + struct fib6_table *table; unsigned int pref; unsigned long lifetime; struct fib6_info *rt; @@ -989,10 +990,18 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref); if (rt) { - if (!addrconf_finite_timeout(lifetime)) + table = rt->fib6_table; + spin_lock_bh(&table->tb6_lock); + + if (!addrconf_finite_timeout(lifetime)) { fib6_clean_expires(rt); - else + fib6_remove_gc_list(rt); + } else { fib6_set_expires(rt, jiffies + HZ * lifetime); + fib6_add_gc_list(rt); + } + + spin_unlock_bh(&table->tb6_lock); fib6_info_release(rt); } @@ -2085,12 +2094,12 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, */ if (!(rt->rt6i_flags & RTF_EXPIRES)) { if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) { - RT6_TRACE("aging clone %p\n", rt); + pr_debug("aging clone %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } } else if (time_after(jiffies, rt->dst.expires)) { - RT6_TRACE("purging expired route %p\n", rt); + pr_debug("purging expired route %p\n", rt); rt6_remove_exception(bucket, rt6_ex); return; } @@ -2101,8 +2110,8 @@ static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket, neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway); if (!(neigh && (neigh->flags & NTF_ROUTER))) { - RT6_TRACE("purging route %p via non-router but gateway\n", - rt); + pr_debug("purging route %p 
via non-router but gateway\n", + rt); rt6_remove_exception(bucket, rt6_ex); return; } @@ -3765,8 +3774,6 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, if (cfg->fc_flags & RTF_EXPIRES) fib6_set_expires(rt, jiffies + clock_t_to_jiffies(cfg->fc_expires)); - else - fib6_clean_expires(rt); if (cfg->fc_protocol == RTPROT_UNSPEC) cfg->fc_protocol = RTPROT_BOOT; @@ -4355,7 +4362,8 @@ struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref, - u32 defrtr_usr_metric) + u32 defrtr_usr_metric, + int lifetime) { struct fib6_config cfg = { .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT, @@ -4368,6 +4376,7 @@ struct fib6_info *rt6_add_dflt_router(struct net *net, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, + .fc_expires = jiffies_to_clock_t(lifetime * HZ), }; cfg.fc_gateway = *gwaddr; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 5e9f625b76e3..ed3a44aa1e9d 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1460,6 +1460,7 @@ static int ipip6_tunnel_init(struct net_device *dev) return err; } netdev_hold(dev, &tunnel->dev_tracker, GFP_KERNEL); + netdev_lockdep_set_classes(dev); return 0; } @@ -1875,22 +1876,19 @@ err_alloc_dev: return err; } -static void __net_exit sit_exit_batch_net(struct list_head *net_list) +static void __net_exit sit_exit_batch_rtnl(struct list_head *net_list, + struct list_head *dev_to_kill) { - LIST_HEAD(list); struct net *net; - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) - sit_destroy_tunnels(net, &list); - - unregister_netdevice_many(&list); - rtnl_unlock(); + sit_destroy_tunnels(net, dev_to_kill); } static struct pernet_operations sit_net_ops = { .init = sit_init_net, - .exit_batch = sit_exit_batch_net, + .exit_batch_rtnl = sit_exit_batch_rtnl, .id = &sit_net_id, .size = sizeof(struct sit_net), }; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index c8d2ca27220c..6b9c69278819 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -182,9 +182,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) !th->ack || th->rst) goto out; - req = cookie_tcp_check(net, sk, skb); - if (IS_ERR(req)) - goto out; + if (cookie_bpf_ok(skb)) { + req = cookie_bpf_check(sk, skb); + } else { + req = cookie_tcp_check(net, sk, skb); + if (IS_ERR(req)) + goto out; + } if (!req) goto out_drop; @@ -247,7 +251,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); - ireq->rcv_wscale = rcv_wscale; + if (!req->syncookie) + ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, dst); ret = tcp_get_cookie_sock(sk, skb, req, dst); diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 9a2a9ed3ba47..970af3983d11 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -478,7 +478,7 @@ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, - sk->sk_protocol, RT_CONN_FLAGS(sk), + sk->sk_protocol, ip_sock_rt_tos(sk), sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 5409c2ea3f57..bd8ff5950c8d 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -225,6 +225,7 @@ static void mptcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, } static const struct inet_diag_handler mptcp_diag_handler = { + 
.owner = THIS_MODULE, .dump = mptcp_diag_dump, .dump_one = mptcp_diag_dump_one, .idiag_get_info = mptcp_diag_get_info, diff --git a/net/mptcp/options.c b/net/mptcp/options.c index e3e96a49f922..23e317ffc901 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -689,8 +689,8 @@ static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff * opts->suboptions |= OPTION_MPTCP_ADD_ADDR; if (!echo) { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDRTX); - opts->ahmac = add_addr_generate_hmac(msk->local_key, - msk->remote_key, + opts->ahmac = add_addr_generate_hmac(READ_ONCE(msk->local_key), + READ_ONCE(msk->remote_key), &opts->addr); } else { MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADDTX); @@ -792,7 +792,7 @@ static bool mptcp_established_options_fastclose(struct sock *sk, *size = TCPOLEN_MPTCP_FASTCLOSE; opts->suboptions |= OPTION_MPTCP_FASTCLOSE; - opts->rcvr_key = msk->remote_key; + opts->rcvr_key = READ_ONCE(msk->remote_key); pr_debug("FASTCLOSE key=%llu", opts->rcvr_key); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSETX); @@ -1031,7 +1031,7 @@ u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq) static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una) { msk->bytes_acked += new_snd_una - msk->snd_una; - msk->snd_una = new_snd_una; + WRITE_ONCE(msk->snd_una, new_snd_una); } static void ack_update_msk(struct mptcp_sock *msk, @@ -1058,10 +1058,10 @@ static void ack_update_msk(struct mptcp_sock *msk, new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd; if (after64(new_wnd_end, msk->wnd_end)) - msk->wnd_end = new_wnd_end; + WRITE_ONCE(msk->wnd_end, new_wnd_end); /* this assumes mptcp_incoming_options() is invoked after tcp_ack() */ - if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt))) + if (after64(msk->wnd_end, snd_nxt)) __mptcp_check_push(sk, ssk); if (after64(new_snd_una, old_snd_una)) { @@ -1072,7 +1072,7 @@ static void ack_update_msk(struct mptcp_sock *msk, trace_ack_update_msk(mp_opt->data_ack, old_snd_una, new_snd_una, - new_wnd_end, msk->wnd_end); + new_wnd_end, READ_ONCE(msk->wnd_end)); } bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit) @@ -1100,8 +1100,8 @@ static bool add_addr_hmac_valid(struct mptcp_sock *msk, if (mp_opt->echo) return true; - hmac = add_addr_generate_hmac(msk->remote_key, - msk->local_key, + hmac = add_addr_generate_hmac(READ_ONCE(msk->remote_key), + READ_ONCE(msk->local_key), &mp_opt->addr); pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", @@ -1148,7 +1148,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) if (unlikely(mp_opt.suboptions != OPTION_MPTCP_DSS)) { if ((mp_opt.suboptions & OPTION_MPTCP_FASTCLOSE) && - msk->local_key == mp_opt.rcvr_key) { + READ_ONCE(msk->local_key) == mp_opt.rcvr_key) { WRITE_ONCE(msk->rcv_fastclose, true); mptcp_schedule_work((struct sock *)msk); MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPFASTCLOSERX); diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 4ae19113b8eb..53e0b08b1123 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -77,7 +77,7 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int { struct mptcp_pm_data *pm = &msk->pm; - pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side); + pr_debug("msk=%p, token=%u side=%d", msk, READ_ONCE(msk->token), server_side); WRITE_ONCE(pm->server_side, server_side); mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 287a60381eae..d9ad45959219 100644 --- 
a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1997,7 +1997,7 @@ static int mptcp_event_put_token_and_ssk(struct sk_buff *skb, const struct mptcp_subflow_context *sf; u8 sk_err; - if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token)) + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) return -EMSGSIZE; if (mptcp_event_add_subflow(skb, ssk)) @@ -2055,7 +2055,7 @@ static int mptcp_event_created(struct sk_buff *skb, const struct mptcp_sock *msk, const struct sock *ssk) { - int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token); + int err = nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)); if (err) return err; @@ -2083,7 +2083,7 @@ void mptcp_event_addr_removed(const struct mptcp_sock *msk, uint8_t id) if (!nlh) goto nla_put_failure; - if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token)) + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, id)) @@ -2118,7 +2118,7 @@ void mptcp_event_addr_announced(const struct sock *ssk, if (!nlh) goto nla_put_failure; - if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token)) + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token))) goto nla_put_failure; if (nla_put_u8(skb, MPTCP_ATTR_REM_ID, info->id)) @@ -2234,7 +2234,7 @@ void mptcp_event(enum mptcp_event_type type, const struct mptcp_sock *msk, goto nla_put_failure; break; case MPTCP_EVENT_CLOSED: - if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, msk->token) < 0) + if (nla_put_u32(skb, MPTCP_ATTR_TOKEN, READ_ONCE(msk->token)) < 0) goto nla_put_failure; break; case MPTCP_EVENT_ANNOUNCED: diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8ef2927ebca2..c7af62c057bc 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -410,6 +410,7 @@ static void mptcp_close_wake_up(struct sock *sk) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); } +/* called under the msk socket lock */ static bool mptcp_pending_data_fin_ack(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -441,16 +442,17 @@ static void mptcp_check_data_fin_ack(struct sock *sk) } } +/* can be called with no lock acquired */ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) { struct mptcp_sock *msk = mptcp_sk(sk); if (READ_ONCE(msk->rcv_data_fin) && - ((1 << sk->sk_state) & + ((1 << inet_sk_state_load(sk)) & (TCPF_ESTABLISHED | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2))) { u64 rcv_data_fin_seq = READ_ONCE(msk->rcv_data_fin_seq); - if (msk->ack_seq == rcv_data_fin_seq) { + if (READ_ONCE(msk->ack_seq) == rcv_data_fin_seq) { if (seq) *seq = rcv_data_fin_seq; @@ -748,7 +750,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk) __skb_queue_tail(&sk->sk_receive_queue, skb); } msk->bytes_received += end_seq - msk->ack_seq; - msk->ack_seq = end_seq; + WRITE_ONCE(msk->ack_seq, end_seq); moved = true; } return moved; @@ -985,6 +987,7 @@ static void dfrag_clear(struct sock *sk, struct mptcp_data_frag *dfrag) put_page(dfrag->page); } +/* called under both the msk socket lock and the data lock */ static void __mptcp_clean_una(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1033,13 +1036,15 @@ static void __mptcp_clean_una(struct sock *sk) msk->recovery = false; out: - if (snd_una == READ_ONCE(msk->snd_nxt) && - snd_una == READ_ONCE(msk->write_seq)) { + if (snd_una == msk->snd_nxt && snd_una == msk->write_seq) { if (mptcp_rtx_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) mptcp_stop_rtx_timer(sk); } else { mptcp_reset_rtx_timer(sk); } + + if (mptcp_pending_data_fin_ack(sk)) + mptcp_schedule_work(sk); } static void 
__mptcp_clean_una_wakeup(struct sock *sk) @@ -1499,7 +1504,7 @@ static void mptcp_update_post_push(struct mptcp_sock *msk, */ if (likely(after64(snd_nxt_new, msk->snd_nxt))) { msk->bytes_sent += snd_nxt_new - msk->snd_nxt; - msk->snd_nxt = snd_nxt_new; + WRITE_ONCE(msk->snd_nxt, snd_nxt_new); } } @@ -2114,7 +2119,7 @@ static unsigned int mptcp_inq_hint(const struct sock *sk) skb = skb_peek(&msk->receive_queue); if (skb) { - u64 hint_val = msk->ack_seq - MPTCP_SKB_CB(skb)->map_seq; + u64 hint_val = READ_ONCE(msk->ack_seq) - MPTCP_SKB_CB(skb)->map_seq; if (hint_val >= INT_MAX) return INT_MAX; @@ -2758,7 +2763,7 @@ static void __mptcp_init_sock(struct sock *sk) __skb_queue_head_init(&msk->receive_queue); msk->out_of_order_queue = RB_ROOT; msk->first_pending = NULL; - msk->rmem_fwd_alloc = 0; + WRITE_ONCE(msk->rmem_fwd_alloc, 0); WRITE_ONCE(msk->rmem_released, 0); msk->timer_ival = TCP_RTO_MIN; msk->scaling_ratio = TCP_DEFAULT_SCALING_RATIO; @@ -2974,7 +2979,7 @@ static void __mptcp_destroy_sock(struct sock *sk) sk->sk_prot->destroy(sk); - WARN_ON_ONCE(msk->rmem_fwd_alloc); + WARN_ON_ONCE(READ_ONCE(msk->rmem_fwd_alloc)); WARN_ON_ONCE(msk->rmem_released); sk_stream_kill_queues(sk); xfrm_sk_free_policy(sk); @@ -3149,16 +3154,16 @@ static int mptcp_disconnect(struct sock *sk, int flags) WRITE_ONCE(msk->flags, 0); msk->cb_flags = 0; msk->recovery = false; - msk->can_ack = false; - msk->fully_established = false; - msk->rcv_data_fin = false; - msk->snd_data_fin_enable = false; - msk->rcv_fastclose = false; - msk->use_64bit_ack = false; - msk->bytes_consumed = 0; + WRITE_ONCE(msk->can_ack, false); + WRITE_ONCE(msk->fully_established, false); + WRITE_ONCE(msk->rcv_data_fin, false); + WRITE_ONCE(msk->snd_data_fin_enable, false); + WRITE_ONCE(msk->rcv_fastclose, false); + WRITE_ONCE(msk->use_64bit_ack, false); WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk))); mptcp_pm_data_reset(msk); mptcp_ca_reset(sk); + msk->bytes_consumed = 0; msk->bytes_acked = 0; msk->bytes_received = 0; msk->bytes_sent = 0; @@ -3200,17 +3205,17 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, __mptcp_init_sock(nsk); msk = mptcp_sk(nsk); - msk->local_key = subflow_req->local_key; - msk->token = subflow_req->token; + WRITE_ONCE(msk->local_key, subflow_req->local_key); + WRITE_ONCE(msk->token, subflow_req->token); msk->in_accept_queue = 1; WRITE_ONCE(msk->fully_established, false); if (mp_opt->suboptions & OPTION_MPTCP_CSUMREQD) WRITE_ONCE(msk->csum_enabled, true); - msk->write_seq = subflow_req->idsn + 1; - msk->snd_nxt = msk->write_seq; - msk->snd_una = msk->write_seq; - msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd; + WRITE_ONCE(msk->write_seq, subflow_req->idsn + 1); + WRITE_ONCE(msk->snd_nxt, msk->write_seq); + WRITE_ONCE(msk->snd_una, msk->write_seq); + WRITE_ONCE(msk->wnd_end, msk->snd_nxt + req->rsk_rcv_wnd); msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq; mptcp_init_sched(msk, mptcp_sk(sk)->sched); @@ -3313,9 +3318,6 @@ void __mptcp_data_acked(struct sock *sk) __mptcp_clean_una(sk); else __set_bit(MPTCP_CLEAN_UNA, &mptcp_sk(sk)->cb_flags); - - if (mptcp_pending_data_fin_ack(sk)) - mptcp_schedule_work(sk); } void __mptcp_check_push(struct sock *sk, struct sock *ssk) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index ed50f2015dc3..c5ec056040eb 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -260,8 +260,10 @@ struct mptcp_data_frag { struct mptcp_sock { /* inet_connection_sock must be the first member */ struct inet_connection_sock sk; - u64 local_key; - u64 
remote_key; + u64 local_key; /* protected by the first subflow socket lock + * lockless access read + */ + u64 remote_key; /* same as above */ u64 write_seq; u64 bytes_sent; u64 snd_nxt; @@ -400,7 +402,7 @@ static inline struct mptcp_data_frag *mptcp_rtx_head(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); - if (msk->snd_una == READ_ONCE(msk->snd_nxt)) + if (msk->snd_una == msk->snd_nxt) return NULL; return list_first_entry_or_null(&msk->rtx_queue, struct mptcp_data_frag, list); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index c40f1428e602..da37e4541a5d 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -942,7 +942,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) mptcp_data_unlock(sk); slow = lock_sock_fast(sk); - info->mptcpi_csum_enabled = msk->csum_enabled; + info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled); info->mptcpi_token = msk->token; info->mptcpi_write_seq = msk->write_seq; info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index c34ecadee120..02dab0669cfc 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -75,7 +75,8 @@ static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_ get_random_bytes(&subflow_req->local_nonce, sizeof(u32)); - subflow_generate_hmac(msk->local_key, msk->remote_key, + subflow_generate_hmac(READ_ONCE(msk->local_key), + READ_ONCE(msk->remote_key), subflow_req->local_nonce, subflow_req->remote_nonce, hmac); @@ -714,7 +715,8 @@ static bool subflow_hmac_valid(const struct request_sock *req, if (!msk) return false; - subflow_generate_hmac(msk->remote_key, msk->local_key, + subflow_generate_hmac(READ_ONCE(msk->remote_key), + READ_ONCE(msk->local_key), subflow_req->remote_nonce, subflow_req->local_nonce, hmac); @@ -1548,8 +1550,8 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc, mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id, &flags, &ifindex); subflow->remote_key_valid = 1; - subflow->remote_key = msk->remote_key; - subflow->local_key = msk->local_key; + subflow->remote_key = READ_ONCE(msk->remote_key); + subflow->local_key = READ_ONCE(msk->local_key); subflow->token = msk->token; mptcp_info2sockaddr(loc, &addr, ssk->sk_family); diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 441d1f134110..df2dc21304ef 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -818,7 +818,7 @@ config NETFILTER_XT_TARGET_AUDIT config NETFILTER_XT_TARGET_CHECKSUM tristate "CHECKSUM target support" - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds a `CHECKSUM' target, which can be used in the iptables mangle @@ -869,7 +869,7 @@ config NETFILTER_XT_TARGET_CONNSECMARK config NETFILTER_XT_TARGET_CT tristate '"CT" target support' depends on NF_CONNTRACK - depends on IP_NF_RAW || IP6_NF_RAW + depends on IP_NF_RAW || IP6_NF_RAW || NFT_COMPAT depends on NETFILTER_ADVANCED help This options adds a `CT' target, which allows to specify initial @@ -880,7 +880,7 @@ config NETFILTER_XT_TARGET_CT config NETFILTER_XT_TARGET_DSCP tristate '"DSCP" and "TOS" target support' - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds a `DSCP' target, which allows you to manipulate @@ -896,7 +896,7 @@ config NETFILTER_XT_TARGET_DSCP config NETFILTER_XT_TARGET_HL tristate '"HL" hoplimit 
target support' - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds the "HL" (for IPv6) and "TTL" (for IPv4) @@ -1080,7 +1080,7 @@ config NETFILTER_XT_TARGET_TPROXY depends on NETFILTER_ADVANCED depends on IPV6 || IPV6=n depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n - depends on IP_NF_MANGLE + depends on IP_NF_MANGLE || NFT_COMPAT select NF_DEFRAG_IPV4 select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n select NF_TPROXY_IPV4 @@ -1147,7 +1147,7 @@ config NETFILTER_XT_TARGET_TCPMSS config NETFILTER_XT_TARGET_TCPOPTSTRIP tristate '"TCPOPTSTRIP" target support' - depends on IP_NF_MANGLE || IP6_NF_MANGLE + depends on IP_NF_MANGLE || IP6_NF_MANGLE || NFT_COMPAT depends on NETFILTER_ADVANCED help This option adds a "TCPOPTSTRIP" target, which allows you to strip diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index a743db073887..98d7dbe3d787 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1511,9 +1511,7 @@ int __init ip_vs_conn_init(void) return -ENOMEM; /* Allocate ip_vs_conn slab cache */ - ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", - sizeof(struct ip_vs_conn), 0, - SLAB_HWCACHE_ALIGN, NULL); + ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN); if (!ip_vs_conn_cachep) { kvfree(ip_vs_conn_tab); return -ENOMEM; diff --git a/net/netfilter/nf_bpf_link.c b/net/netfilter/nf_bpf_link.c index 0e4beae421f8..5257d5e7eb09 100644 --- a/net/netfilter/nf_bpf_link.c +++ b/net/netfilter/nf_bpf_link.c @@ -314,7 +314,7 @@ static bool nf_is_valid_access(int off, int size, enum bpf_access_type type, static const struct bpf_func_proto * bpf_nf_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { - return bpf_base_func_proto(func_id); + return bpf_base_func_proto(func_id, prog); } const struct bpf_verifier_ops netfilter_verifier_ops = { diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 5d8ed6c90b7e..8715617b02fe 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -605,15 +605,11 @@ static int __init nf_conncount_modinit(void) for (i = 0; i < CONNCOUNT_SLOTS; ++i) spin_lock_init(&nf_conncount_locks[i]); - conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple", - sizeof(struct nf_conncount_tuple), - 0, 0, NULL); + conncount_conn_cachep = KMEM_CACHE(nf_conncount_tuple, 0); if (!conncount_conn_cachep) return -ENOMEM; - conncount_rb_cachep = kmem_cache_create("nf_conncount_rb", - sizeof(struct nf_conncount_rb), - 0, 0, NULL); + conncount_rb_cachep = KMEM_CACHE(nf_conncount_rb, 0); if (!conncount_rb_cachep) { kmem_cache_destroy(conncount_conn_cachep); return -ENOMEM; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 2e5f3864d353..90e6bd2c3000 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2530,7 +2530,7 @@ void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list) * netfilter framework. Roll on, two-stage module * delete... 
*/ - synchronize_net(); + synchronize_rcu_expedited(); i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f8e3f70c35bd..cb6f49a3d809 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1194,8 +1194,10 @@ static void nf_tables_table_disable(struct net *net, struct nft_table *table) #define __NFT_TABLE_F_INTERNAL (NFT_TABLE_F_MASK + 1) #define __NFT_TABLE_F_WAS_DORMANT (__NFT_TABLE_F_INTERNAL << 0) #define __NFT_TABLE_F_WAS_AWAKEN (__NFT_TABLE_F_INTERNAL << 1) +#define __NFT_TABLE_F_WAS_ORPHAN (__NFT_TABLE_F_INTERNAL << 2) #define __NFT_TABLE_F_UPDATE (__NFT_TABLE_F_WAS_DORMANT | \ - __NFT_TABLE_F_WAS_AWAKEN) + __NFT_TABLE_F_WAS_AWAKEN | \ + __NFT_TABLE_F_WAS_ORPHAN) static int nf_tables_updtable(struct nft_ctx *ctx) { @@ -1215,8 +1217,11 @@ static int nf_tables_updtable(struct nft_ctx *ctx) if ((nft_table_has_owner(ctx->table) && !(flags & NFT_TABLE_F_OWNER)) || - (!nft_table_has_owner(ctx->table) && - flags & NFT_TABLE_F_OWNER)) + (flags & NFT_TABLE_F_OWNER && + !nft_table_is_orphan(ctx->table))) + return -EOPNOTSUPP; + + if ((flags ^ ctx->table->flags) & NFT_TABLE_F_PERSIST) return -EOPNOTSUPP; /* No dormant off/on/off/on games in single transaction */ @@ -1245,6 +1250,13 @@ static int nf_tables_updtable(struct nft_ctx *ctx) } } + if ((flags & NFT_TABLE_F_OWNER) && + !nft_table_has_owner(ctx->table)) { + ctx->table->nlpid = ctx->portid; + ctx->table->flags |= NFT_TABLE_F_OWNER | + __NFT_TABLE_F_WAS_ORPHAN; + } + nft_trans_table_update(trans) = true; nft_trans_commit_list_add_tail(ctx->net, trans); @@ -4235,23 +4247,18 @@ static bool nft_set_ops_candidate(const struct nft_set_type *type, u32 flags) * given, in that case the amount of memory per element is used. 
*/ static const struct nft_set_ops * -nft_select_set_ops(const struct nft_ctx *ctx, - const struct nlattr * const nla[], +nft_select_set_ops(const struct nft_ctx *ctx, u32 flags, const struct nft_set_desc *desc) { struct nftables_pernet *nft_net = nft_pernet(ctx->net); const struct nft_set_ops *ops, *bops; struct nft_set_estimate est, best; const struct nft_set_type *type; - u32 flags = 0; int i; lockdep_assert_held(&nft_net->commit_mutex); lockdep_nfnl_nft_mutex_not_held(); - if (nla[NFTA_SET_FLAGS] != NULL) - flags = ntohl(nla_get_be32(nla[NFTA_SET_FLAGS])); - bops = NULL; best.size = ~0; best.lookup = ~0; @@ -5137,7 +5144,7 @@ static int nf_tables_newset(struct sk_buff *skb, const struct nfnl_info *info, if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) return -ENOENT; - ops = nft_select_set_ops(&ctx, nla, &desc); + ops = nft_select_set_ops(&ctx, flags, &desc); if (IS_ERR(ops)) return PTR_ERR(ops); @@ -10425,6 +10432,10 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) } else if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_AWAKEN) { trans->ctx.table->flags &= ~NFT_TABLE_F_DORMANT; } + if (trans->ctx.table->flags & __NFT_TABLE_F_WAS_ORPHAN) { + trans->ctx.table->flags &= ~NFT_TABLE_F_OWNER; + trans->ctx.table->nlpid = 0; + } trans->ctx.table->flags &= ~__NFT_TABLE_F_UPDATE; nft_trans_destroy(trans); } else { @@ -11351,6 +11362,10 @@ again: list_for_each_entry(table, &nft_net->tables, list) { if (nft_table_has_owner(table) && n->portid == table->nlpid) { + if (table->flags & NFT_TABLE_F_PERSIST) { + table->flags &= ~NFT_TABLE_F_OWNER; + continue; + } __nft_release_hook(net, table); list_del_rcu(&table->list); to_delete[deleted++] = table; diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 27511c90a26f..7b844581ebee 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -610,7 +610,7 @@ int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset) struct netlbl_lsm_catmap *iter; u32 idx; u32 bit; - NETLBL_CATMAP_MAPTYPE bitmap; + u64 bitmap; iter = _netlbl_catmap_getnode(&catmap, offset, _CM_F_WALK, 0); if (iter == NULL) @@ -666,8 +666,8 @@ int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap, u32 offset) struct netlbl_lsm_catmap *prev = NULL; u32 idx; u32 bit; - NETLBL_CATMAP_MAPTYPE bitmask; - NETLBL_CATMAP_MAPTYPE bitmap; + u64 bitmask; + u64 bitmap; iter = _netlbl_catmap_getnode(&catmap, offset, _CM_F_WALK, 0); if (iter == NULL) @@ -857,7 +857,7 @@ int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap, offset -= iter->startbit; idx = offset / NETLBL_CATMAP_MAPSIZE; - iter->bitmap[idx] |= (NETLBL_CATMAP_MAPTYPE)bitmap + iter->bitmap[idx] |= (u64)bitmap << (offset % NETLBL_CATMAP_MAPSIZE); return 0; diff --git a/net/netlink/diag.c b/net/netlink/diag.c index 1eeff9422856..e12c90d5f6ad 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -241,6 +241,7 @@ static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) } static const struct sock_diag_handler netlink_diag_handler = { + .owner = THIS_MODULE, .family = AF_NETLINK, .dump = netlink_diag_handler_dump, }; diff --git a/net/nfc/hci/llc.c b/net/nfc/hci/llc.c index 2140f6724644..ba91284f4086 100644 --- a/net/nfc/hci/llc.c +++ b/net/nfc/hci/llc.c @@ -30,15 +30,19 @@ exit: return r; } +static void nfc_llc_del_engine(struct nfc_llc_engine *llc_engine) +{ + list_del(&llc_engine->entry); + kfree_const(llc_engine->name); + kfree(llc_engine); +} + void nfc_llc_exit(void) { struct nfc_llc_engine *llc_engine, *n; - 
list_for_each_entry_safe(llc_engine, n, &llc_engines, entry) { - list_del(&llc_engine->entry); - kfree(llc_engine->name); - kfree(llc_engine); - } + list_for_each_entry_safe(llc_engine, n, &llc_engines, entry) + nfc_llc_del_engine(llc_engine); } int nfc_llc_register(const char *name, const struct nfc_llc_ops *ops) @@ -49,7 +53,7 @@ int nfc_llc_register(const char *name, const struct nfc_llc_ops *ops) if (llc_engine == NULL) return -ENOMEM; - llc_engine->name = kstrdup(name, GFP_KERNEL); + llc_engine->name = kstrdup_const(name, GFP_KERNEL); if (llc_engine->name == NULL) { kfree(llc_engine); return -ENOMEM; @@ -82,9 +86,7 @@ void nfc_llc_unregister(const char *name) if (llc_engine == NULL) return; - list_del(&llc_engine->entry); - kfree(llc_engine->name); - kfree(llc_engine); + nfc_llc_del_engine(llc_engine); } struct nfc_llc *nfc_llc_allocate(const char *name, struct nfc_hci_dev *hdev, diff --git a/net/packet/diag.c b/net/packet/diag.c index 9a7980e3309d..b3bd2f6c2bf7 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -245,6 +245,7 @@ static int packet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) } static const struct sock_diag_handler packet_diag_handler = { + .owner = THIS_MODULE, .family = AF_PACKET, .dump = packet_diag_handler_dump, }; diff --git a/net/rds/connection.c b/net/rds/connection.c index b4cc699c5fad..c749c5525b40 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -829,9 +829,7 @@ int rds_conn_init(void) if (ret) return ret; - rds_conn_slab = kmem_cache_create("rds_connection", - sizeof(struct rds_connection), - 0, 0, NULL); + rds_conn_slab = KMEM_CACHE(rds_connection, 0); if (!rds_conn_slab) { rds_loop_net_exit(); return -ENOMEM; diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 470c70deffe2..8180d0c12fce 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -737,16 +737,6 @@ config NET_ACT_SAMPLE To compile this code as a module, choose M here: the module will be called act_sample. -config NET_ACT_IPT - tristate "IPtables targets" - depends on NET_CLS_ACT && NETFILTER && NETFILTER_XTABLES - help - Say Y here to be able to invoke iptables targets after successful - classification. - - To compile this code as a module, choose M here: the - module will be called act_ipt. 
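The kmem_cache_create() conversions above (ip_vs_conn, nf_conncount_tuple, nf_conncount_rb and rds_connection) all rely on the KMEM_CACHE() helper from <linux/slab.h>, which derives the cache name, object size and alignment from the struct type so the three can no longer drift apart. A minimal sketch of the equivalence; struct foo_entry and foo_cache_init() are placeholders, not symbols from the patches above.

#include <linux/slab.h>
#include <linux/errno.h>

struct foo_entry {			/* placeholder object type */
	u64 key;
	int refcnt;
};

static struct kmem_cache *foo_cachep;

static int foo_cache_init(void)
{
	/* KMEM_CACHE(foo_entry, 0) expands to roughly:
	 *   kmem_cache_create("foo_entry", sizeof(struct foo_entry),
	 *                     __alignof__(struct foo_entry), 0, NULL);
	 * so the open-coded calls removed above collapse to one line.
	 */
	foo_cachep = KMEM_CACHE(foo_entry, 0);
	return foo_cachep ? 0 : -ENOMEM;
}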
- config NET_ACT_NAT tristate "Stateless NAT" depends on NET_CLS_ACT diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 3e30d7260493..9ee622fb1160 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1363,7 +1363,7 @@ struct tc_action_ops *tc_action_load_ops(struct nlattr *nla, u32 flags, if (rtnl_held) rtnl_unlock(); - request_module("act_%s", act_name); + request_module(NET_ACT_ALIAS_PREFIX "%s", act_name); if (rtnl_held) rtnl_lock(); diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 6cfee6658103..0e3cf11ae5fc 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -401,6 +401,7 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .init = tcf_bpf_init, .size = sizeof(struct tcf_bpf), }; +MODULE_ALIAS_NET_ACT("bpf"); static __net_init int bpf_init_net(struct net *net) { diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index f8762756657d..0fce631e7c91 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -242,6 +242,7 @@ static struct tc_action_ops act_connmark_ops = { .cleanup = tcf_connmark_cleanup, .size = sizeof(struct tcf_connmark_info), }; +MODULE_ALIAS_NET_ACT("connmark"); static __net_init int connmark_init_net(struct net *net) { diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 7f8b1f2f2ed9..5cc8e407e791 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -709,6 +709,7 @@ static struct tc_action_ops act_csum_ops = { .offload_act_setup = tcf_csum_offload_act_setup, .size = sizeof(struct tcf_csum), }; +MODULE_ALIAS_NET_ACT("csum"); static __net_init int csum_init_net(struct net *net) { diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 6124d8b128d1..baac083fd8f1 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -1600,6 +1600,7 @@ static struct tc_action_ops act_ct_ops = { .offload_act_setup = tcf_ct_offload_act_setup, .size = sizeof(struct tcf_ct), }; +MODULE_ALIAS_NET_ACT("ct"); static __net_init int ct_init_net(struct net *net) { diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index e620f9a84afe..5dd41a012110 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -363,6 +363,7 @@ static struct tc_action_ops act_ctinfo_ops = { .cleanup= tcf_ctinfo_cleanup, .size = sizeof(struct tcf_ctinfo), }; +MODULE_ALIAS_NET_ACT("ctinfo"); static __net_init int ctinfo_init_net(struct net *net) { diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index 4af3b7ec249f..e949280eb800 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -296,6 +296,7 @@ static struct tc_action_ops act_gact_ops = { .offload_act_setup = tcf_gact_offload_act_setup, .size = sizeof(struct tcf_gact), }; +MODULE_ALIAS_NET_ACT("gact"); static __net_init int gact_init_net(struct net *net) { diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index c681cd011afd..1dd74125398a 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -645,6 +645,7 @@ static struct tc_action_ops act_gate_ops = { .offload_act_setup = tcf_gate_offload_act_setup, .size = sizeof(struct tcf_gate), }; +MODULE_ALIAS_NET_ACT("gate"); static __net_init int gate_init_net(struct net *net) { diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 0e867d13beb5..107c6d83dc5c 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -889,6 +889,7 @@ static struct tc_action_ops act_ife_ops = { .init = tcf_ife_init, .size = sizeof(struct tcf_ife_info), }; +MODULE_ALIAS_NET_ACT("ife"); static __net_init int ife_init_net(struct net *net) { diff --git a/net/sched/act_mirred.c 
b/net/sched/act_mirred.c index 0a1a9e40f237..6f4bb1c8ce7b 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -641,6 +641,7 @@ static struct tc_action_ops act_mirred_ops = { .size = sizeof(struct tcf_mirred), .get_dev = tcf_mirred_get_dev, }; +MODULE_ALIAS_NET_ACT("mirred"); static __net_init int mirred_init_net(struct net *net) { diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index 34b8edb6cc77..44a37a71ae92 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -452,6 +452,7 @@ static struct tc_action_ops act_mpls_ops = { .offload_act_setup = tcf_mpls_offload_act_setup, .size = sizeof(struct tcf_mpls), }; +MODULE_ALIAS_NET_ACT("mpls"); static __net_init int mpls_init_net(struct net *net) { diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index a180e724634e..d541f553805f 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -324,6 +324,7 @@ static struct tc_action_ops act_nat_ops = { .cleanup = tcf_nat_cleanup, .size = sizeof(struct tcf_nat), }; +MODULE_ALIAS_NET_ACT("nat"); static __net_init int nat_init_net(struct net *net) { diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 2ef22969f274..df5a02d5f919 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -620,6 +620,7 @@ static struct tc_action_ops act_pedit_ops = { .offload_act_setup = tcf_pedit_offload_act_setup, .size = sizeof(struct tcf_pedit), }; +MODULE_ALIAS_NET_ACT("pedit"); static __net_init int pedit_init_net(struct net *net) { diff --git a/net/sched/act_police.c b/net/sched/act_police.c index e119b4a3db9f..8555125ed34d 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -502,6 +502,7 @@ static struct tc_action_ops act_police_ops = { .offload_act_setup = tcf_police_offload_act_setup, .size = sizeof(struct tcf_police), }; +MODULE_ALIAS_NET_ACT("police"); static __net_init int police_init_net(struct net *net) { diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index c5c61efe6db4..a69b53d54039 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -316,6 +316,7 @@ static struct tc_action_ops act_sample_ops = { .offload_act_setup = tcf_sample_offload_act_setup, .size = sizeof(struct tcf_sample), }; +MODULE_ALIAS_NET_ACT("sample"); static __net_init int sample_init_net(struct net *net) { diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 0a3e92888295..f3abe0545989 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -209,6 +209,7 @@ static struct tc_action_ops act_simp_ops = { .init = tcf_simp_init, .size = sizeof(struct tcf_defact), }; +MODULE_ALIAS_NET_ACT("simple"); static __net_init int simp_init_net(struct net *net) { diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 754f78b35bb8..1f1d9ce3e968 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -426,6 +426,7 @@ static struct tc_action_ops act_skbedit_ops = { .offload_act_setup = tcf_skbedit_offload_act_setup, .size = sizeof(struct tcf_skbedit), }; +MODULE_ALIAS_NET_ACT("skbedit"); static __net_init int skbedit_init_net(struct net *net) { diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index bcb673ab0008..39945b139c48 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -287,6 +287,7 @@ static struct tc_action_ops act_skbmod_ops = { .cleanup = tcf_skbmod_cleanup, .size = sizeof(struct tcf_skbmod), }; +MODULE_ALIAS_NET_ACT("skbmod"); static __net_init int skbmod_init_net(struct net *net) { diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 
300b08aa8283..1536f8b16f1b 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -842,6 +842,7 @@ static struct tc_action_ops act_tunnel_key_ops = { .offload_act_setup = tcf_tunnel_key_offload_act_setup, .size = sizeof(struct tcf_tunnel_key), }; +MODULE_ALIAS_NET_ACT("tunnel_key"); static __net_init int tunnel_key_init_net(struct net *net) { diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 836183011a7c..22f4b1e8ade9 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -427,6 +427,7 @@ static struct tc_action_ops act_vlan_ops = { .offload_act_setup = tcf_vlan_offload_act_setup, .size = sizeof(struct tcf_vlan), }; +MODULE_ALIAS_NET_ACT("vlan"); static __net_init int vlan_init_net(struct net *net) { diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index ff3d396a65aa..ca5676b2668e 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -257,7 +257,7 @@ tcf_proto_lookup_ops(const char *kind, bool rtnl_held, #ifdef CONFIG_MODULES if (rtnl_held) rtnl_unlock(); - request_module("cls_%s", kind); + request_module(NET_CLS_ALIAS_PREFIX "%s", kind); if (rtnl_held) rtnl_lock(); ops = __tcf_proto_lookup_ops(kind); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index a1f56931330c..ecfaa4f9a04e 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -328,6 +328,7 @@ static struct tcf_proto_ops cls_basic_ops __read_mostly = { .bind_class = basic_bind_class, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_CLS("basic"); static int __init init_basic(void) { diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 382c7a71f81f..5e83e890f6a4 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -693,6 +693,7 @@ static struct tcf_proto_ops cls_bpf_ops __read_mostly = { .dump = cls_bpf_dump, .bind_class = cls_bpf_bind_class, }; +MODULE_ALIAS_NET_CLS("bpf"); static int __init cls_bpf_init_mod(void) { diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 7ee8dbf49ed0..424252982d6a 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -209,6 +209,7 @@ static struct tcf_proto_ops cls_cgroup_ops __read_mostly = { .dump = cls_cgroup_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_CLS("cgroup"); static int __init init_cgroup_cls(void) { diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 6ab317b48d6c..5502998aace7 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -702,6 +702,7 @@ static struct tcf_proto_ops cls_flow_ops __read_mostly = { .walk = flow_walk, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_CLS("flow"); static int __init cls_flow_init(void) { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index efb9d2811b73..bfedc3d4423d 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -3656,6 +3656,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { .owner = THIS_MODULE, .flags = TCF_PROTO_OPS_DOIT_UNLOCKED, }; +MODULE_ALIAS_NET_CLS("flower"); static int __init cls_fl_init(void) { diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index afc534ee0a18..cdddc8695228 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -433,6 +433,7 @@ static struct tcf_proto_ops cls_fw_ops __read_mostly = { .bind_class = fw_bind_class, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_CLS("fw"); static int __init init_fw(void) { diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index c4ed11df6254..9f1e62ca508d 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -398,6 +398,7 @@ static struct tcf_proto_ops cls_mall_ops __read_mostly 
= { .bind_class = mall_bind_class, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_CLS("matchall"); static int __init cls_mall_init(void) { diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 12a505db4183..b9c58c040c30 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -671,6 +671,7 @@ static struct tcf_proto_ops cls_route4_ops __read_mostly = { .bind_class = route4_bind_class, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_CLS("route"); static int __init init_route4(void) { diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 289e1755c26b..9412d88a99bc 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -1453,6 +1453,7 @@ static struct tcf_proto_ops cls_u32_ops __read_mostly = { .bind_class = u32_bind_class, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_CLS("u32"); static int __init init_u32(void) { diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 36b025cc4fd2..9d928f6a473a 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -228,7 +228,7 @@ int qdisc_set_default(const char *name) if (!ops) { /* Not found, drop lock and try to load module */ write_unlock(&qdisc_mod_lock); - request_module("sch_%s", name); + request_module(NET_SCH_ALIAS_PREFIX "%s", name); write_lock(&qdisc_mod_lock); ops = qdisc_lookup_default(name); @@ -1275,7 +1275,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, * go away in the mean time. */ rtnl_unlock(); - request_module("sch_%s", name); + request_module(NET_SCH_ALIAS_PREFIX "%s", name); rtnl_lock(); ops = qdisc_lookup_ops(kind); if (ops != NULL) { diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 9cff99558694..edee926ccde8 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -3103,6 +3103,7 @@ static struct Qdisc_ops cake_qdisc_ops __read_mostly = { .dump_stats = cake_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("cake"); static int __init cake_module_init(void) { diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index beece8e82c23..69001eff0315 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -546,6 +546,7 @@ static struct Qdisc_ops cbs_qdisc_ops __read_mostly = { .dump = cbs_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("cbs"); static struct notifier_block cbs_device_notifier = { .notifier_call = cbs_dev_notifier, diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index ae1da08e268f..ea108030c6b4 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -498,6 +498,7 @@ static struct Qdisc_ops choke_qdisc_ops __read_mostly = { .dump_stats = choke_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("choke"); static int __init choke_module_init(void) { diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index d7a4874543de..ecb3f164bb25 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* * Codel - The Controlled-Delay Active Queue Management algorithm * @@ -7,37 +8,6 @@ * Implemented on linux by : * Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net> * Copyright (C) 2012,2015 Eric Dumazet <edumazet@google.com> - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. The names of the authors may not be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * Alternatively, provided that this notice is retained in full, this - * software may be distributed under the terms of the GNU General - * Public License ("GPL") version 2, in which case the provisions of the - * GPL apply INSTEAD OF those given above. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH - * DAMAGE. - * */ #include <linux/module.h> @@ -287,6 +257,7 @@ static struct Qdisc_ops codel_qdisc_ops __read_mostly = { .dump_stats = codel_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("codel"); static int __init codel_module_init(void) { diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 097740a9afea..c69b999fae17 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -481,6 +481,7 @@ static struct Qdisc_ops drr_qdisc_ops __read_mostly = { .destroy = drr_destroy_qdisc, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("drr"); static int __init drr_init(void) { diff --git a/net/sched/sch_etf.c b/net/sched/sch_etf.c index 4808159a5466..2e4bef713b6a 100644 --- a/net/sched/sch_etf.c +++ b/net/sched/sch_etf.c @@ -500,6 +500,7 @@ static struct Qdisc_ops etf_qdisc_ops __read_mostly = { .dump = etf_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("etf"); static int __init etf_module_init(void) { diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index f7c88495946b..835b4460b448 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -812,6 +812,7 @@ static struct Qdisc_ops ets_qdisc_ops __read_mostly = { .dump = ets_qdisc_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("ets"); static int __init ets_init(void) { diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 3a31c47fea9b..cdf23ff16f40 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -1264,6 +1264,7 @@ static struct Qdisc_ops fq_qdisc_ops __read_mostly = { .dump_stats = fq_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("fq"); static int __init fq_module_init(void) { diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 8c4fee063436..79f9d6de6c85 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -717,6 +717,7 @@ static struct Qdisc_ops fq_codel_qdisc_ops __read_mostly = { .dump_stats = fq_codel_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("fq_codel"); static int __init fq_codel_module_init(void) { diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index 8c61eb3dc943..79ba9dc70254 100644 --- a/net/sched/sch_gred.c +++ 
b/net/sched/sch_gred.c @@ -930,6 +930,7 @@ static struct Qdisc_ops gred_qdisc_ops __read_mostly = { .dump = gred_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("gred"); static int __init gred_module_init(void) { diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 16c45da4036a..4e626df742d7 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1679,6 +1679,7 @@ static struct Qdisc_ops hfsc_qdisc_ops __read_mostly = { .priv_size = sizeof(struct hfsc_sched), .owner = THIS_MODULE }; +MODULE_ALIAS_NET_SCH("hfsc"); static int __init hfsc_init(void) diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index d26cd436cbe3..3f906df1435b 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -702,6 +702,7 @@ static struct Qdisc_ops hhf_qdisc_ops __read_mostly = { .dump_stats = hhf_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("hhf"); static int __init hhf_module_init(void) { diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 7349233eaa9b..93e6fb56f3b5 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -2166,6 +2166,7 @@ static struct Qdisc_ops htb_qdisc_ops __read_mostly = { .dump = htb_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("htb"); static int __init htb_module_init(void) { diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c index 5fa9eaa79bfc..c2ef9dcf91d2 100644 --- a/net/sched/sch_ingress.c +++ b/net/sched/sch_ingress.c @@ -168,6 +168,7 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = { .ingress_block_get = ingress_ingress_block_get, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("ingress"); struct clsact_sched_data { struct tcf_block *ingress_block; @@ -344,6 +345,7 @@ static struct Qdisc_ops clsact_qdisc_ops __read_mostly = { .egress_block_get = clsact_egress_block_get, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("clsact"); static int __init ingress_module_init(void) { @@ -368,6 +370,5 @@ static void __exit ingress_module_exit(void) module_init(ingress_module_init); module_exit(ingress_module_exit); -MODULE_ALIAS("sch_clsact"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Ingress and clsact based ingress and egress qdiscs"); diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 43e53ee00a56..225353fbb3f1 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -774,6 +774,7 @@ static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = { .dump = mqprio_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("mqprio"); static int __init mqprio_module_init(void) { diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index d66d5f0ec080..79e93a19d5fa 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -395,6 +395,7 @@ static struct Qdisc_ops multiq_qdisc_ops __read_mostly = { .dump = multiq_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("multiq"); static int __init multiq_module_init(void) { diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index fa678eb88528..edc72962ae63 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -1293,6 +1293,7 @@ static struct Qdisc_ops netem_qdisc_ops __read_mostly = { .dump = netem_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("netem"); static int __init netem_module_init(void) diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 2da6250ec346..1764059b0635 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -556,6 +556,7 @@ static struct Qdisc_ops pie_qdisc_ops __read_mostly = { .dump_stats = pie_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("pie"); 
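The MODULE_ALIAS_NET_SCH() annotations added to the qdiscs above, paired with the request_module(NET_SCH_ALIAS_PREFIX "%s", ...) calls in sch_api.c, mean that a qdisc name coming from userspace can only autoload a module which explicitly declared the prefixed alias; the act_* and cls_* hunks earlier apply the same scheme via MODULE_ALIAS_NET_ACT() and MODULE_ALIAS_NET_CLS(). A rough sketch of the two halves follows; the prefix string and macro body here are assumptions, the authoritative definitions live in the pkt_sched/pkt_cls/act_api headers.

#include <linux/module.h>
#include <linux/kmod.h>

/* Assumed definitions -- check include/net/pkt_sched.h for the real ones. */
#define NET_SCH_ALIAS_PREFIX	 "net-sch-"
#define MODULE_ALIAS_NET_SCH(id) MODULE_ALIAS(NET_SCH_ALIAS_PREFIX id)

/* A qdisc module opts in to name-based autoloading: */
MODULE_ALIAS_NET_SCH("example");	/* registers alias "net-sch-example" */

/* The qdisc core then requests the prefixed alias instead of "sch_%s", so
 * a module that merely happens to be named sch_<name> is no longer
 * loadable through the tc netlink path.
 */
static void example_autoload_qdisc(const char *kind)
{
	request_module(NET_SCH_ALIAS_PREFIX "%s", kind);
}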
static int __init pie_module_init(void) { diff --git a/net/sched/sch_plug.c b/net/sched/sch_plug.c index 992f0c8d7988..cefb65201e17 100644 --- a/net/sched/sch_plug.c +++ b/net/sched/sch_plug.c @@ -213,6 +213,7 @@ static struct Qdisc_ops plug_qdisc_ops __read_mostly = { .reset = qdisc_reset_queue, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("plug"); static int __init plug_module_init(void) { diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 8ecdd3ef6f8e..cc30f7a32f1a 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -418,6 +418,7 @@ static struct Qdisc_ops prio_qdisc_ops __read_mostly = { .dump = prio_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("prio"); static int __init prio_module_init(void) { diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 48a604c320c7..d584c0c25899 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -1521,6 +1521,7 @@ static struct Qdisc_ops qfq_qdisc_ops __read_mostly = { .destroy = qfq_destroy_qdisc, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("qfq"); static int __init qfq_init(void) { diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 607b6c8b3a9b..b5f096588fae 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -548,6 +548,7 @@ static struct Qdisc_ops red_qdisc_ops __read_mostly = { .dump_stats = red_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("red"); static int __init red_module_init(void) { diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 1871a1c0224d..b717e15a3a17 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -709,6 +709,7 @@ static struct Qdisc_ops sfb_qdisc_ops __read_mostly = { .dump_stats = sfb_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("sfb"); static int __init sfb_module_init(void) { diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index eb77558fa367..e66f4afb920d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -925,6 +925,7 @@ static struct Qdisc_ops sfq_qdisc_ops __read_mostly = { .dump = sfq_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("sfq"); static int __init sfq_module_init(void) { diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c index 28beb11762d8..b4dd626c309c 100644 --- a/net/sched/sch_skbprio.c +++ b/net/sched/sch_skbprio.c @@ -292,6 +292,7 @@ static struct Qdisc_ops skbprio_qdisc_ops __read_mostly = { .destroy = skbprio_destroy, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("skbprio"); static int __init skbprio_module_init(void) { diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 31a8252bd09c..c5de70efdc86 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -40,6 +40,8 @@ static struct static_key_false taprio_have_working_mqprio; #define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) #define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) +#define TAPRIO_SUPPORTED_FLAGS \ + (TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) #define TAPRIO_FLAGS_INVALID U32_MAX struct sched_entry { @@ -408,19 +410,6 @@ static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) return entry; } -static bool taprio_flags_valid(u32 flags) -{ - /* Make sure no other flag bits are set. 
*/ - if (flags & ~(TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | - TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) - return false; - /* txtime-assist and full offload are mutually exclusive */ - if ((flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && - (flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) - return false; - return true; -} - /* This returns the tstamp value set by TCP in terms of the set clock. */ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) { @@ -1031,7 +1020,8 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = NLA_POLICY_FULL_RANGE_SIGNED(NLA_S64, &taprio_cycle_time_range), [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, - [TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 }, + [TCA_TAPRIO_ATTR_FLAGS] = + NLA_POLICY_MASK(NLA_U32, TAPRIO_SUPPORTED_FLAGS), [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 }, [TCA_TAPRIO_ATTR_TC_ENTRY] = { .type = NLA_NESTED }, }; @@ -1815,33 +1805,6 @@ static int taprio_mqprio_cmp(const struct net_device *dev, return 0; } -/* The semantics of the 'flags' argument in relation to 'change()' - * requests, are interpreted following two rules (which are applied in - * this order): (1) an omitted 'flags' argument is interpreted as - * zero; (2) the 'flags' of a "running" taprio instance cannot be - * changed. - */ -static int taprio_new_flags(const struct nlattr *attr, u32 old, - struct netlink_ext_ack *extack) -{ - u32 new = 0; - - if (attr) - new = nla_get_u32(attr); - - if (old != TAPRIO_FLAGS_INVALID && old != new) { - NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); - return -EOPNOTSUPP; - } - - if (!taprio_flags_valid(new)) { - NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid"); - return -EINVAL; - } - - return new; -} - static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { @@ -1852,6 +1815,7 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; unsigned long flags; + u32 taprio_flags; ktime_t start; int i, err; @@ -1863,12 +1827,28 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); - err = taprio_new_flags(tb[TCA_TAPRIO_ATTR_FLAGS], - q->flags, extack); - if (err < 0) - return err; + /* The semantics of the 'flags' argument in relation to 'change()' + * requests, are interpreted following two rules (which are applied in + * this order): (1) an omitted 'flags' argument is interpreted as + * zero; (2) the 'flags' of a "running" taprio instance cannot be + * changed. + */ + taprio_flags = tb[TCA_TAPRIO_ATTR_FLAGS] ? 
nla_get_u32(tb[TCA_TAPRIO_ATTR_FLAGS]) : 0; - q->flags = err; + /* txtime-assist and full offload are mutually exclusive */ + if ((taprio_flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && + (taprio_flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) { + NL_SET_ERR_MSG_ATTR(extack, tb[TCA_TAPRIO_ATTR_FLAGS], + "TXTIME_ASSIST and FULL_OFFLOAD are mutually exclusive"); + return -EINVAL; + } + + if (q->flags != TAPRIO_FLAGS_INVALID && q->flags != taprio_flags) { + NL_SET_ERR_MSG_MOD(extack, + "Changing 'flags' of a running schedule is not supported"); + return -EOPNOTSUPP; + } + q->flags = taprio_flags; err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) @@ -2548,6 +2528,7 @@ static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .dump_stats = taprio_dump_stats, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("taprio"); static struct notifier_block taprio_device_notifier = { .notifier_call = taprio_dev_notifier, diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index dd6b1a723bf7..f1d09183ae63 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -608,6 +608,7 @@ static struct Qdisc_ops tbf_qdisc_ops __read_mostly = { .dump = tbf_dump, .owner = THIS_MODULE, }; +MODULE_ALIAS_NET_SCH("tbf"); static int __init tbf_module_init(void) { diff --git a/net/sctp/diag.c b/net/sctp/diag.c index eb05131ff1dd..23359e522273 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -507,6 +507,7 @@ done: } static const struct inet_diag_handler sctp_diag_handler = { + .owner = THIS_MODULE, .dump = sctp_diag_dump, .dump_one = sctp_diag_dump_one, .idiag_get_info = sctp_diag_get_info, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 94c6dd53cd62..e849f368ed91 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1495,17 +1495,11 @@ static __init int sctp_init(void) /* Allocate bind_bucket and chunk caches. */ status = -ENOBUFS; - sctp_bucket_cachep = kmem_cache_create("sctp_bind_bucket", - sizeof(struct sctp_bind_bucket), - 0, SLAB_HWCACHE_ALIGN, - NULL); + sctp_bucket_cachep = KMEM_CACHE(sctp_bind_bucket, SLAB_HWCACHE_ALIGN); if (!sctp_bucket_cachep) goto out; - sctp_chunk_cachep = kmem_cache_create("sctp_chunk", - sizeof(struct sctp_chunk), - 0, SLAB_HWCACHE_ALIGN, - NULL); + sctp_chunk_cachep = KMEM_CACHE(sctp_chunk, SLAB_HWCACHE_ALIGN); if (!sctp_chunk_cachep) goto err_chunk_cachep; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 0f53a5c6fd9d..4b52b3b159c0 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1046,7 +1046,7 @@ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, int rc = SMC_CLC_DECL_NOSMCDDEV; struct smcd_dev *smcd; int i = 1, entry = 1; - bool is_virtual; + bool is_emulated; u16 chid; if (smcd_indicated(ini->smc_type_v1)) @@ -1058,12 +1058,12 @@ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, chid = smc_ism_get_chid(smcd); if (!smc_find_ism_v2_is_unique_chid(chid, ini, i)) continue; - is_virtual = __smc_ism_is_virtual(chid); + is_emulated = __smc_ism_is_emulated(chid); if (!smc_pnet_is_pnetid_set(smcd->pnetid) || smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) { - if (is_virtual && entry == SMCD_CLC_MAX_V2_GID_ENTRIES) + if (is_emulated && entry == SMCD_CLC_MAX_V2_GID_ENTRIES) /* It's the last GID-CHID entry left in CLC - * Proposal SMC-Dv2 extension, but a virtual + * Proposal SMC-Dv2 extension, but an Emulated- * ISM device will take two entries. So give * up it and try the next potential ISM device. 
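The taprio change above moves flag validation into the netlink policy itself: NLA_POLICY_MASK() rejects unknown bits before ->change() runs, leaving only the txtime-assist/full-offload mutual exclusion and the "no change while running" rule open-coded. A small sketch of a policy entry built the same way; the EX_* attribute and flag names are invented for illustration.

#include <net/netlink.h>
#include <linux/bits.h>

enum {
	EX_ATTR_UNSPEC,
	EX_ATTR_FLAGS,
	__EX_ATTR_MAX,
};
#define EX_ATTR_MAX		(__EX_ATTR_MAX - 1)

#define EX_FLAG_FOO		BIT(0)
#define EX_FLAG_BAR		BIT(1)
#define EX_SUPPORTED_FLAGS	(EX_FLAG_FOO | EX_FLAG_BAR)

static const struct nla_policy ex_policy[EX_ATTR_MAX + 1] = {
	/* Bits outside EX_SUPPORTED_FLAGS now fail validation with a
	 * generic extack error before the qdisc ever sees the attribute.
	 */
	[EX_ATTR_FLAGS] = NLA_POLICY_MASK(NLA_U32, EX_SUPPORTED_FLAGS),
};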
*/ @@ -1073,7 +1073,7 @@ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, ini->is_smcd = true; rc = 0; i++; - entry = is_virtual ? entry + 2 : entry + 1; + entry = is_emulated ? entry + 2 : entry + 1; if (entry > SMCD_CLC_MAX_V2_GID_ENTRIES) break; } @@ -1414,10 +1414,10 @@ static int smc_connect_ism(struct smc_sock *smc, if (rc) return rc; - if (__smc_ism_is_virtual(ini->ism_chid[ini->ism_selected])) + if (__smc_ism_is_emulated(ini->ism_chid[ini->ism_selected])) ini->ism_peer_gid[ini->ism_selected].gid_ext = ntohll(aclc->d1.gid_ext); - /* for non-virtual ISM devices, peer gid_ext remains 0. */ + /* for non-Emulated-ISM devices, peer gid_ext remains 0. */ } ini->ism_peer_gid[ini->ism_selected].gid = ntohll(aclc->d0.gid); @@ -2118,10 +2118,10 @@ static void smc_check_ism_v2_match(struct smc_init_info *ini, if (smc_ism_get_chid(smcd) == proposed_chid && !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) { ini->ism_peer_gid[*matches].gid = proposed_gid->gid; - if (__smc_ism_is_virtual(proposed_chid)) + if (__smc_ism_is_emulated(proposed_chid)) ini->ism_peer_gid[*matches].gid_ext = proposed_gid->gid_ext; - /* non-virtual ISM's peer gid_ext remains 0. */ + /* non-Emulated-ISM's peer gid_ext remains 0. */ ini->ism_dev[*matches] = smcd; (*matches)++; break; @@ -2171,10 +2171,10 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc, smcd_gid.gid = ntohll(smcd_v2_ext->gidchid[i].gid); smcd_gid.gid_ext = 0; chid = ntohs(smcd_v2_ext->gidchid[i].chid); - if (__smc_ism_is_virtual(chid)) { + if (__smc_ism_is_emulated(chid)) { if ((i + 1) == smc_v2_ext->hdr.ism_gid_cnt || chid != ntohs(smcd_v2_ext->gidchid[i + 1].chid)) - /* each virtual ISM device takes two GID-CHID + /* each Emulated-ISM device takes two GID-CHID * entries and CHID of the second entry repeats * that of the first entry. * diff --git a/net/smc/smc.h b/net/smc/smc.h index df64efd2dee8..18c8b7870198 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -56,11 +56,11 @@ enum smc_state { /* possible states of an SMC socket */ }; enum smc_supplemental_features { - SMC_SPF_VIRT_ISM_DEV = 0, + SMC_SPF_EMULATED_ISM_DEV = 0, }; #define SMC_FEATURE_MASK \ - (BIT(SMC_SPF_VIRT_ISM_DEV)) + (BIT(SMC_SPF_EMULATED_ISM_DEV)) struct smc_link_group; diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 9a13709bea1c..e55026c7529c 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -952,8 +952,8 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) gidchids[entry].chid = htons(smc_ism_get_chid(ini->ism_dev[i])); gidchids[entry].gid = htonll(smcd_gid.gid); - if (smc_ism_is_virtual(smcd)) { - /* a virtual ISM device takes two + if (smc_ism_is_emulated(smcd)) { + /* an Emulated-ISM device takes two * entries. CHID of the second entry * repeats that of the first entry. */ @@ -1055,7 +1055,7 @@ smcd_clc_prep_confirm_accept(struct smc_connection *conn, clc->d1.chid = htons(chid); if (eid && eid[0]) memcpy(clc->d1.eid, eid, SMC_MAX_EID_LEN); - if (__smc_ism_is_virtual(chid)) + if (__smc_ism_is_emulated(chid)) clc->d1.gid_ext = htonll(smcd_gid.gid_ext); len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2; if (first_contact) { diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h index a9f9bdd26dcd..7cc7070b9772 100644 --- a/net/smc/smc_clc.h +++ b/net/smc/smc_clc.h @@ -175,7 +175,7 @@ struct smc_clc_msg_proposal { /* clc proposal message sent by Linux */ #define SMCD_CLC_MAX_V2_GID_ENTRIES 8 /* max # of CHID-GID entries in CLC * proposal SMC-Dv2 extension. 
* each ISM device takes one entry and - * each virtual ISM takes two entries. + * each Emulated-ISM takes two entries */ struct smc_clc_msg_proposal_area { diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index e4c858411207..9b84d5897aa5 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1535,7 +1535,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid, list_for_each_entry_safe(lgr, l, &dev->lgr_list, list) { if ((!peer_gid->gid || (lgr->peer_gid.gid == peer_gid->gid && - !smc_ism_is_virtual(dev) ? 1 : + !smc_ism_is_emulated(dev) ? 1 : lgr->peer_gid.gid_ext == peer_gid->gid_ext)) && (vlan == VLAN_VID_MASK || lgr->vlan_id == vlan)) { if (peer_gid->gid) /* peer triggered termination */ @@ -1881,7 +1881,7 @@ static bool smcd_lgr_match(struct smc_link_group *lgr, lgr->smcd != smcismdev) return false; - if (smc_ism_is_virtual(smcismdev) && + if (smc_ism_is_emulated(smcismdev) && lgr->peer_gid.gid_ext != peer_gid->gid_ext) return false; diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 5a33908015f3..6fdb2d96777a 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -255,6 +255,7 @@ static int smc_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) } static const struct sock_diag_handler smc_diag_handler = { + .owner = THIS_MODULE, .family = AF_SMC, .dump = smc_diag_handler_dump, }; diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index ffff40c30a06..165cd013404b 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -15,7 +15,7 @@ #include "smc.h" -#define SMC_VIRTUAL_ISM_CHID_MASK 0xFF00 +#define SMC_EMULATED_ISM_CHID_MASK 0xFF00 #define SMC_ISM_IDENT_MASK 0x00FFFF struct smcd_dev_list { /* List of SMCD devices */ @@ -66,10 +66,10 @@ static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, return rc < 0 ? rc : 0; } -static inline bool __smc_ism_is_virtual(u16 chid) +static inline bool __smc_ism_is_emulated(u16 chid) { /* CHIDs in range of 0xFF00 to 0xFFFF are reserved - * for virtual ISM device. + * for Emulated-ISM device. * * loopback-ism: 0xFFFF * virtio-ism: 0xFF00 ~ 0xFFFE @@ -77,11 +77,11 @@ static inline bool __smc_ism_is_virtual(u16 chid) return ((chid & 0xFF00) == 0xFF00); } -static inline bool smc_ism_is_virtual(struct smcd_dev *smcd) +static inline bool smc_ism_is_emulated(struct smcd_dev *smcd) { u16 chid = smcd->ops->get_chid(smcd); - return __smc_ism_is_virtual(chid); + return __smc_ism_is_emulated(chid); } #endif diff --git a/net/tipc/Kconfig b/net/tipc/Kconfig index be1c4003d67d..bb0d71eb02a6 100644 --- a/net/tipc/Kconfig +++ b/net/tipc/Kconfig @@ -32,16 +32,17 @@ config TIPC_MEDIA_UDP bool "IP/UDP media type support" depends on TIPC select NET_UDP_TUNNEL + default y help Saying Y here will enable support for running TIPC over IP/UDP - bool - default y + config TIPC_CRYPTO bool "TIPC encryption support" depends on TIPC select CRYPTO select CRYPTO_AES select CRYPTO_GCM + default y help Saying Y here will enable support for TIPC encryption. All TIPC messages will be encrypted/decrypted by using the currently most @@ -49,8 +50,6 @@ config TIPC_CRYPTO entering the TIPC stack. Key setting from user-space is performed via netlink by a user program (e.g. the iproute2 'tipc' tool). 
- bool - default y config TIPC_DIAG tristate "TIPC: socket monitoring interface" diff --git a/net/tipc/Makefile b/net/tipc/Makefile index ee49a9f1dd4f..18e1636aa036 100644 --- a/net/tipc/Makefile +++ b/net/tipc/Makefile @@ -18,5 +18,5 @@ tipc-$(CONFIG_TIPC_MEDIA_IB) += ib_media.o tipc-$(CONFIG_SYSCTL) += sysctl.o tipc-$(CONFIG_TIPC_CRYPTO) += crypto.o - -obj-$(CONFIG_TIPC_DIAG) += diag.o +obj-$(CONFIG_TIPC_DIAG) += tipc_diag.o +tipc_diag-y += diag.o diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 878415c43527..5a526ebafeb4 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -1079,30 +1079,27 @@ int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info) rtnl_lock(); b = tipc_bearer_find(net, name); if (!b) { - rtnl_unlock(); NL_SET_ERR_MSG(info->extack, "Bearer not found"); - return -EINVAL; + err = -EINVAL; + goto out; } #ifdef CONFIG_TIPC_MEDIA_UDP if (attrs[TIPC_NLA_BEARER_UDP_OPTS]) { if (b->media->type_id != TIPC_MEDIA_TYPE_UDP) { - rtnl_unlock(); NL_SET_ERR_MSG(info->extack, "UDP option is unsupported"); - return -EINVAL; + err = -EINVAL; + goto out; } err = tipc_udp_nl_bearer_add(b, attrs[TIPC_NLA_BEARER_UDP_OPTS]); - if (err) { - rtnl_unlock(); - return err; - } } #endif +out: rtnl_unlock(); - return 0; + return err; } int __tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) diff --git a/net/tipc/diag.c b/net/tipc/diag.c index 18733451c9e0..54dde8c4e4d4 100644 --- a/net/tipc/diag.c +++ b/net/tipc/diag.c @@ -95,6 +95,7 @@ static int tipc_sock_diag_handler_dump(struct sk_buff *skb, } static const struct sock_diag_handler tipc_sock_diag_handler = { + .owner = THIS_MODULE, .family = AF_TIPC, .dump = tipc_sock_diag_handler_dump, }; diff --git a/net/tipc/node.c b/net/tipc/node.c index 3105abe97bb9..c1e890a82434 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -86,8 +86,6 @@ struct tipc_bclink_entry { * @lock: rwlock governing access to structure * @net: the applicable net namespace * @hash: links to adjacent nodes in unsorted hash chain - * @inputq: pointer to input queue containing messages for msg event - * @namedq: pointer to name table input queue with name table messages * @active_links: bearer ids of active links, used as index into links[] array * @links: array containing references to all links to node * @bc_entry: broadcast link entry diff --git a/net/tipc/socket.c b/net/tipc/socket.c index bb1118d02f95..7e4135db5816 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -80,7 +80,6 @@ struct sockaddr_pair { * @phdr: preformatted message header used when sending messages * @cong_links: list of congested links * @publications: list of publications for port - * @blocking_link: address of the congested link we are currently sleeping on * @pub_count: total # of publications port has made during its lifetime * @conn_timeout: the time we can wait for an unresponded setup request * @probe_unacked: probe has not received ack yet diff --git a/net/unix/Kconfig b/net/unix/Kconfig index 28b232f281ab..8b5d04210d7c 100644 --- a/net/unix/Kconfig +++ b/net/unix/Kconfig @@ -16,11 +16,6 @@ config UNIX Say Y unless you know what you are doing. 
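The tipc_nl_bearer_add() rework above folds three unlock-and-return sequences into a single exit label, so the RTNL is always released in one place and only the error code varies. A generic sketch of that single-exit shape with placeholder checks; example_bearer_add() and example_apply_opts() are not TIPC symbols.

#include <linux/errno.h>
#include <linux/rtnetlink.h>

static int example_apply_opts(void)
{
	return 0;			/* placeholder for option parsing */
}

static int example_bearer_add(const char *name)
{
	int err = 0;

	rtnl_lock();
	if (!name) {			/* placeholder "bearer not found" */
		err = -EINVAL;
		goto out;
	}
	err = example_apply_opts();	/* failures just fall through */
out:
	rtnl_unlock();
	return err;
}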
-config UNIX_SCM - bool - depends on UNIX - default y - config AF_UNIX_OOB bool depends on UNIX diff --git a/net/unix/Makefile b/net/unix/Makefile index 20491825b4d0..4ddd125c4642 100644 --- a/net/unix/Makefile +++ b/net/unix/Makefile @@ -11,5 +11,3 @@ unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o obj-$(CONFIG_UNIX_DIAG) += unix_diag.o unix_diag-y := diag.o - -obj-$(CONFIG_UNIX_SCM) += scm.o diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 30b178ebba60..4892e9428c9f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -118,8 +118,6 @@ #include <linux/btf_ids.h> #include <linux/bpf-cgroup.h> -#include "scm.h" - static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; @@ -993,11 +991,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_write_space = unix_write_space; sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; - u = unix_sk(sk); + u = unix_sk(sk); + u->inflight = 0; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); - atomic_long_set(&u->inflight, 0); INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ @@ -1788,6 +1786,52 @@ out: return err; } +/* The "user->unix_inflight" variable is protected by the garbage + * collection lock, and we just read it locklessly here. If you go + * over the limit, there might be a tiny race in actually noticing + * it across threads. Tough. + */ +static inline bool too_many_unix_fds(struct task_struct *p) +{ + struct user_struct *user = current_user(); + + if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) + return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); + return false; +} + +static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + + if (too_many_unix_fds(current)) + return -ETOOMANYREFS; + + /* Need to duplicate file references for the sake of garbage + * collection. Otherwise a socket in the fps might become a + * candidate for GC while the skb is not yet queued. + */ + UNIXCB(skb).fp = scm_fp_dup(scm->fp); + if (!UNIXCB(skb).fp) + return -ENOMEM; + + for (i = scm->fp->count - 1; i >= 0; i--) + unix_inflight(scm->fp->user, scm->fp->fp[i]); + + return 0; +} + +static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + + scm->fp = UNIXCB(skb).fp; + UNIXCB(skb).fp = NULL; + + for (i = scm->fp->count - 1; i >= 0; i--) + unix_notinflight(scm->fp->user, scm->fp->fp[i]); +} + static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); @@ -1835,6 +1879,21 @@ static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) spin_unlock(&unix_gc_lock); } +static void unix_destruct_scm(struct sk_buff *skb) +{ + struct scm_cookie scm; + + memset(&scm, 0, sizeof(scm)); + scm.pid = UNIXCB(skb).pid; + if (UNIXCB(skb).fp) + unix_detach_fds(&scm, skb); + + /* Alas, it calls VFS */ + /* So fscking what? 
fput() had been SMP-safe since the last Summer */ + scm_destroy(&scm); + sock_wfree(skb); +} + static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) { int err = 0; @@ -1921,11 +1980,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, long timeo; int err; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto out; @@ -2197,11 +2257,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, bool fds_sent = false; int data_len; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) diff --git a/net/unix/diag.c b/net/unix/diag.c index be19827eca36..ae39538c5042 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -322,6 +322,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) } static const struct sock_diag_handler unix_diag_handler = { + .owner = THIS_MODULE, .family = AF_UNIX, .dump = unix_diag_handler_dump, }; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 2ff7ddbaa782..51acf795f096 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -81,12 +81,80 @@ #include <net/scm.h> #include <net/tcp_states.h> -#include "scm.h" +struct unix_sock *unix_get_socket(struct file *filp) +{ + struct inode *inode = file_inode(filp); + + /* Socket ? */ + if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { + struct socket *sock = SOCKET_I(inode); + const struct proto_ops *ops; + struct sock *sk = sock->sk; + + ops = READ_ONCE(sock->ops); -/* Internal data structures and random procedures: */ + /* PF_UNIX ? */ + if (sk && ops && ops->family == PF_UNIX) + return unix_sk(sk); + } + + return NULL; +} +DEFINE_SPINLOCK(unix_gc_lock); +unsigned int unix_tot_inflight; static LIST_HEAD(gc_candidates); -static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); +static LIST_HEAD(gc_inflight_list); + +/* Keep the number of times in flight count for the file + * descriptor if it is for an AF_UNIX socket. 
+ */ +void unix_inflight(struct user_struct *user, struct file *filp) +{ + struct unix_sock *u = unix_get_socket(filp); + + spin_lock(&unix_gc_lock); + + if (u) { + if (!u->inflight) { + WARN_ON_ONCE(!list_empty(&u->link)); + list_add_tail(&u->link, &gc_inflight_list); + } else { + WARN_ON_ONCE(list_empty(&u->link)); + } + u->inflight++; + + /* Paired with READ_ONCE() in wait_for_unix_gc() */ + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); + } + + WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); + + spin_unlock(&unix_gc_lock); +} + +void unix_notinflight(struct user_struct *user, struct file *filp) +{ + struct unix_sock *u = unix_get_socket(filp); + + spin_lock(&unix_gc_lock); + + if (u) { + WARN_ON_ONCE(!u->inflight); + WARN_ON_ONCE(list_empty(&u->link)); + + u->inflight--; + if (!u->inflight) + list_del_init(&u->link); + + /* Paired with READ_ONCE() in wait_for_unix_gc() */ + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); + } + + WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); + + spin_unlock(&unix_gc_lock); +} static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) @@ -105,20 +173,15 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), while (nfd--) { /* Get the socket the fd matches if it indeed does so */ - struct sock *sk = unix_get_socket(*fp++); - - if (sk) { - struct unix_sock *u = unix_sk(sk); + struct unix_sock *u = unix_get_socket(*fp++); - /* Ignore non-candidates, they could - * have been added to the queues after - * starting the garbage collection - */ - if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { - hit = true; + /* Ignore non-candidates, they could have been added + * to the queues after starting the garbage collection + */ + if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { + hit = true; - func(u); - } + func(u); } } if (hit && hitlist != NULL) { @@ -151,7 +214,7 @@ static void scan_children(struct sock *x, void (*func)(struct unix_sock *), /* An embryo cannot be in-flight, so it's safe * to use the list link. */ - BUG_ON(!list_empty(&u->link)); + WARN_ON_ONCE(!list_empty(&u->link)); list_add_tail(&u->link, &embryos); } spin_unlock(&x->sk_receive_queue.lock); @@ -166,17 +229,18 @@ static void scan_children(struct sock *x, void (*func)(struct unix_sock *), static void dec_inflight(struct unix_sock *usk) { - atomic_long_dec(&usk->inflight); + usk->inflight--; } static void inc_inflight(struct unix_sock *usk) { - atomic_long_inc(&usk->inflight); + usk->inflight++; } static void inc_inflight_move_tail(struct unix_sock *u) { - atomic_long_inc(&u->inflight); + u->inflight++; + /* If this still might be part of a cycle, move it to the end * of the list, so that it's checked even if it was already * passed over @@ -186,40 +250,16 @@ static void inc_inflight_move_tail(struct unix_sock *u) } static bool gc_in_progress; -#define UNIX_INFLIGHT_TRIGGER_GC 16000 - -void wait_for_unix_gc(void) -{ - /* If number of inflight sockets is insane, - * force a garbage collect right now. - * Paired with the WRITE_ONCE() in unix_inflight(), - * unix_notinflight() and gc_in_progress(). 
- */ - if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && - !READ_ONCE(gc_in_progress)) - unix_gc(); - wait_event(unix_gc_wait, gc_in_progress == false); -} -/* The external entry point: unix_gc() */ -void unix_gc(void) +static void __unix_gc(struct work_struct *work) { - struct sk_buff *next_skb, *skb; - struct unix_sock *u; - struct unix_sock *next; struct sk_buff_head hitlist; - struct list_head cursor; + struct unix_sock *u, *next; LIST_HEAD(not_cycle_list); + struct list_head cursor; spin_lock(&unix_gc_lock); - /* Avoid a recursive GC. */ - if (gc_in_progress) - goto out; - - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ - WRITE_ONCE(gc_in_progress, true); - /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. @@ -237,14 +277,12 @@ void unix_gc(void) */ list_for_each_entry_safe(u, next, &gc_inflight_list, link) { long total_refs; - long inflight_refs; total_refs = file_count(u->sk.sk_socket->file); - inflight_refs = atomic_long_read(&u->inflight); - BUG_ON(inflight_refs < 1); - BUG_ON(total_refs < inflight_refs); - if (total_refs == inflight_refs) { + WARN_ON_ONCE(!u->inflight); + WARN_ON_ONCE(total_refs < u->inflight); + if (total_refs == u->inflight) { list_move_tail(&u->link, &gc_candidates); __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); @@ -271,7 +309,7 @@ void unix_gc(void) /* Move cursor to after the current position. */ list_move(&cursor, &u->link); - if (atomic_long_read(&u->inflight) > 0) { + if (u->inflight) { list_move_tail(&u->link, ¬_cycle_list); __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); scan_children(&u->sk, inc_inflight_move_tail, NULL); @@ -298,19 +336,6 @@ void unix_gc(void) spin_unlock(&unix_gc_lock); - /* We need io_uring to clean its registered files, ignore all io_uring - * originated skbs. It's fine as io_uring doesn't keep references to - * other io_uring instances and so killing all other files in the cycle - * will put all io_uring references forcing it to go through normal - * release.path eventually putting registered files. - */ - skb_queue_walk_safe(&hitlist, skb, next_skb) { - if (skb->destructor == io_uring_destruct_scm) { - __skb_unlink(skb, &hitlist); - skb_queue_tail(&skb->sk->sk_receive_queue, skb); - } - } - /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); @@ -328,20 +353,45 @@ void unix_gc(void) spin_lock(&unix_gc_lock); - /* There could be io_uring registered files, just push them back to - * the inflight list - */ - list_for_each_entry_safe(u, next, &gc_candidates, link) - list_move_tail(&u->link, &gc_inflight_list); - /* All candidates should have been detached by now. */ - BUG_ON(!list_empty(&gc_candidates)); + WARN_ON_ONCE(!list_empty(&gc_candidates)); /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, false); - wake_up(&unix_gc_wait); - - out: spin_unlock(&unix_gc_lock); } + +static DECLARE_WORK(unix_gc_work, __unix_gc); + +void unix_gc(void) +{ + WRITE_ONCE(gc_in_progress, true); + queue_work(system_unbound_wq, &unix_gc_work); +} + +#define UNIX_INFLIGHT_TRIGGER_GC 16000 +#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) + +void wait_for_unix_gc(struct scm_fp_list *fpl) +{ + /* If number of inflight sockets is insane, + * force a garbage collect right now. + * + * Paired with the WRITE_ONCE() in unix_inflight(), + * unix_notinflight(), and __unix_gc(). 
+ */ + if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && + !READ_ONCE(gc_in_progress)) + unix_gc(); + + /* Penalise users who want to send AF_UNIX sockets + * but whose sockets have not been received yet. + */ + if (!fpl || !fpl->count_unix || + READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) + return; + + if (READ_ONCE(gc_in_progress)) + flush_work(&unix_gc_work); +} diff --git a/net/unix/scm.c b/net/unix/scm.c deleted file mode 100644 index 822ce0d0d791..000000000000 --- a/net/unix/scm.c +++ /dev/null @@ -1,159 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/socket.h> -#include <linux/net.h> -#include <linux/fs.h> -#include <net/af_unix.h> -#include <net/scm.h> -#include <linux/init.h> -#include <linux/io_uring.h> - -#include "scm.h" - -unsigned int unix_tot_inflight; -EXPORT_SYMBOL(unix_tot_inflight); - -LIST_HEAD(gc_inflight_list); -EXPORT_SYMBOL(gc_inflight_list); - -DEFINE_SPINLOCK(unix_gc_lock); -EXPORT_SYMBOL(unix_gc_lock); - -struct sock *unix_get_socket(struct file *filp) -{ - struct sock *u_sock = NULL; - struct inode *inode = file_inode(filp); - - /* Socket ? */ - if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { - struct socket *sock = SOCKET_I(inode); - const struct proto_ops *ops = READ_ONCE(sock->ops); - struct sock *s = sock->sk; - - /* PF_UNIX ? */ - if (s && ops && ops->family == PF_UNIX) - u_sock = s; - } - - return u_sock; -} -EXPORT_SYMBOL(unix_get_socket); - -/* Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. - */ -void unix_inflight(struct user_struct *user, struct file *fp) -{ - struct sock *s = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (s) { - struct unix_sock *u = unix_sk(s); - - if (atomic_long_inc_return(&u->inflight) == 1) { - BUG_ON(!list_empty(&u->link)); - list_add_tail(&u->link, &gc_inflight_list); - } else { - BUG_ON(list_empty(&u->link)); - } - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); - } - WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); - spin_unlock(&unix_gc_lock); -} - -void unix_notinflight(struct user_struct *user, struct file *fp) -{ - struct sock *s = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (s) { - struct unix_sock *u = unix_sk(s); - - BUG_ON(!atomic_long_read(&u->inflight)); - BUG_ON(list_empty(&u->link)); - - if (atomic_long_dec_and_test(&u->inflight)) - list_del_init(&u->link); - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); - } - WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); - spin_unlock(&unix_gc_lock); -} - -/* - * The "user->unix_inflight" variable is protected by the garbage - * collection lock, and we just read it locklessly here. If you go - * over the limit, there might be a tiny race in actually noticing - * it across threads. Tough. - */ -static inline bool too_many_unix_fds(struct task_struct *p) -{ - struct user_struct *user = current_user(); - - if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) - return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); - return false; -} - -int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) -{ - int i; - - if (too_many_unix_fds(current)) - return -ETOOMANYREFS; - - /* - * Need to duplicate file references for the sake of garbage - * collection. 
Otherwise a socket in the fps might become a - * candidate for GC while the skb is not yet queued. - */ - UNIXCB(skb).fp = scm_fp_dup(scm->fp); - if (!UNIXCB(skb).fp) - return -ENOMEM; - - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->user, scm->fp->fp[i]); - return 0; -} -EXPORT_SYMBOL(unix_attach_fds); - -void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) -{ - int i; - - scm->fp = UNIXCB(skb).fp; - UNIXCB(skb).fp = NULL; - - for (i = scm->fp->count-1; i >= 0; i--) - unix_notinflight(scm->fp->user, scm->fp->fp[i]); -} -EXPORT_SYMBOL(unix_detach_fds); - -void unix_destruct_scm(struct sk_buff *skb) -{ - struct scm_cookie scm; - - memset(&scm, 0, sizeof(scm)); - scm.pid = UNIXCB(skb).pid; - if (UNIXCB(skb).fp) - unix_detach_fds(&scm, skb); - - /* Alas, it calls VFS */ - /* So fscking what? fput() had been SMP-safe since the last Summer */ - scm_destroy(&scm); - sock_wfree(skb); -} -EXPORT_SYMBOL(unix_destruct_scm); - -void io_uring_destruct_scm(struct sk_buff *skb) -{ - unix_destruct_scm(skb); -} -EXPORT_SYMBOL(io_uring_destruct_scm); diff --git a/net/unix/scm.h b/net/unix/scm.h deleted file mode 100644 index 5a255a477f16..000000000000 --- a/net/unix/scm.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef NET_UNIX_SCM_H -#define NET_UNIX_SCM_H - -extern struct list_head gc_inflight_list; -extern spinlock_t unix_gc_lock; - -int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb); -void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb); - -#endif diff --git a/net/vmw_vsock/diag.c b/net/vmw_vsock/diag.c index 2e29994f92ff..ab87ef66c1e8 100644 --- a/net/vmw_vsock/diag.c +++ b/net/vmw_vsock/diag.c @@ -157,6 +157,7 @@ static int vsock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) } static const struct sock_diag_handler vsock_diag_handler = { + .owner = THIS_MODULE, .family = AF_VSOCK, .dump = vsock_diag_handler_dump, }; diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index 9f8955367275..09dcea0cbbed 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -194,6 +194,7 @@ static int xsk_diag_handler_dump(struct sk_buff *nlskb, struct nlmsghdr *hdr) } static const struct sock_diag_handler xsk_diag_handler = { + .owner = THIS_MODULE, .family = AF_XDP, .dump = xsk_diag_handler_dump, }; diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 21d50d75c260..dafefef3cf51 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -957,12 +957,12 @@ static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { .get_link_net = xfrmi_get_link_net, }; -static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) +static void __net_exit xfrmi_exit_batch_rtnl(struct list_head *net_exit_list, + struct list_head *dev_to_kill) { struct net *net; - LIST_HEAD(list); - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry(net, net_exit_list, exit_list) { struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); struct xfrm_if __rcu **xip; @@ -973,18 +973,16 @@ static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) for (xip = &xfrmn->xfrmi[i]; (xi = rtnl_dereference(*xip)) != NULL; xip = &xi->next) - unregister_netdevice_queue(xi->dev, &list); + unregister_netdevice_queue(xi->dev, dev_to_kill); } xi = rtnl_dereference(xfrmn->collect_md_xfrmi); if (xi) - unregister_netdevice_queue(xi->dev, &list); + unregister_netdevice_queue(xi->dev, dev_to_kill); } - unregister_netdevice_many(&list); - rtnl_unlock(); } static struct pernet_operations xfrmi_net_ops = { - .exit_batch = 
xfrmi_exit_batch_net, + .exit_batch_rtnl = xfrmi_exit_batch_rtnl, .id = &xfrmi_net_id, .size = sizeof(struct xfrmi_net), }; diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index fee9b5cf37a7..5f9bf8e5c933 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -52,6 +52,7 @@ static int xfrm_statistics_seq_show(struct seq_file *seq, void *v) memset(buff, 0, sizeof(unsigned long) * LINUX_MIB_XFRMMAX); + xfrm_state_update_stats(net); snmp_get_cpu_field_batch(buff, xfrm_mib_list, net->mib.xfrm_statistics); for (i = 0; xfrm_mib_list[i].name; i++) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index bda5327bf34d..0c306473a79d 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -570,7 +570,7 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) int err = 0; spin_lock(&x->lock); - xfrm_dev_state_update_curlft(x); + xfrm_dev_state_update_stats(x); if (x->km.state == XFRM_STATE_DEAD) goto out; @@ -1935,7 +1935,7 @@ EXPORT_SYMBOL(xfrm_state_update); int xfrm_state_check_expire(struct xfrm_state *x) { - xfrm_dev_state_update_curlft(x); + xfrm_dev_state_update_stats(x); if (!READ_ONCE(x->curlft.use_time)) WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds()); @@ -1957,6 +1957,19 @@ int xfrm_state_check_expire(struct xfrm_state *x) } EXPORT_SYMBOL(xfrm_state_check_expire); +void xfrm_state_update_stats(struct net *net) +{ + struct xfrm_state *x; + int i; + + spin_lock_bh(&net->xfrm.xfrm_state_lock); + for (i = 0; i <= net->xfrm.state_hmask; i++) { + hlist_for_each_entry(x, net->xfrm.state_bydst + i, bydst) + xfrm_dev_state_update_stats(x); + } + spin_unlock_bh(&net->xfrm.xfrm_state_lock); +} + struct xfrm_state * xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index f037be190bae..a5232dcfea46 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -902,7 +902,7 @@ static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) memcpy(&p->sel, &x->sel, sizeof(p->sel)); memcpy(&p->lft, &x->lft, sizeof(p->lft)); if (x->xso.dev) - xfrm_dev_state_update_curlft(x); + xfrm_dev_state_update_stats(x); memcpy(&p->curlft, &x->curlft, sizeof(p->curlft)); put_unaligned(x->stats.replay_window, &p->stats.replay_window); put_unaligned(x->stats.replay, &p->stats.replay); |
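
The unix_inflight()/unix_notinflight() hunks above count AF_UNIX sockets whose file descriptors are sitting queued inside SCM_RIGHTS control messages (unix_tot_inflight globally, user->unix_inflight per sender). As a minimal standalone userspace sketch, not part of the patch and with all names purely illustrative, this is the operation that creates such an in-flight reference:

/* Build with: cc scm_inflight.c -o scm_inflight   (file name is illustrative) */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>

int main(void)
{
	int pair[2], payload[2];
	struct msghdr msg = { 0 };
	struct iovec iov;
	char data = 'x';
	union {
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct cmsghdr *cmsg;

	/* Carrier socket pair, plus a second pair whose fd we will send. */
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, pair) ||
	    socketpair(AF_UNIX, SOCK_DGRAM, 0, payload)) {
		perror("socketpair");
		return 1;
	}

	iov.iov_base = &data;
	iov.iov_len = sizeof(data);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = u.buf;
	msg.msg_controllen = sizeof(u.buf);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &payload[0], sizeof(int));

	/* While this message sits unread in pair[1]'s receive queue,
	 * payload[0]'s socket is "in flight": the kernel bumps
	 * unix_tot_inflight and the sender's user->unix_inflight.
	 */
	if (sendmsg(pair[0], &msg, 0) < 0) {
		perror("sendmsg");
		return 1;
	}

	/* Closing our copy leaves the queued message as the only reference
	 * to that socket; reference cycles built this way are what the
	 * mark-and-sweep in __unix_gc() exists to reclaim.
	 */
	close(payload[0]);
	puts("fd queued in flight; exiting without receiving it");
	return 0;
}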
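
The reworked wait_for_unix_gc() above no longer puts every sender to sleep on a waitqueue: unix_gc() just queues __unix_gc() on system_unbound_wq, and only a sender whose own user->unix_inflight exceeds SCM_MAX_FD * 8 is made to flush_work() the collector. A rough userspace analogue of that throttling policy, assuming pthreads and illustrative names (kick_gc, wait_for_gc), might look like the following sketch; it is a model of the design, not kernel code:

/* Build with: cc -pthread gc_throttle.c -o gc_throttle   (names illustrative) */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define TRIGGER_GC    16000      /* global threshold, as in the patch          */
#define SANE_PER_USER (253 * 8)  /* per-sender budget; SCM_MAX_FD is 253       */

static atomic_long total_inflight;
static atomic_bool gc_in_progress;

static pthread_t gc_thread;
static pthread_mutex_t gc_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t gc_done = PTHREAD_COND_INITIALIZER;

static void *gc_worker(void *arg)
{
	(void)arg;
	usleep(1000);	/* stand-in for the real sweep over in-flight sockets */

	pthread_mutex_lock(&gc_lock);
	atomic_store(&gc_in_progress, false);
	pthread_cond_broadcast(&gc_done);
	pthread_mutex_unlock(&gc_lock);
	return NULL;
}

static void kick_gc(void)
{
	atomic_store(&gc_in_progress, true);
	if (pthread_create(&gc_thread, NULL, gc_worker, NULL)) {
		/* Could not start the worker: nothing would clear the flag. */
		atomic_store(&gc_in_progress, false);
		return;
	}
	pthread_detach(gc_thread);
}

/* Called on the send path.  @my_inflight models this sender's own count of
 * fds it has passed but that were never received (user->unix_inflight).
 */
static void wait_for_gc(long my_inflight)
{
	if (atomic_load(&total_inflight) > TRIGGER_GC &&
	    !atomic_load(&gc_in_progress))
		kick_gc();

	/* Well-behaved senders never block here. */
	if (my_inflight < SANE_PER_USER)
		return;

	/* Offenders pay the cost: wait for the in-flight collection,
	 * the moral equivalent of flush_work(&unix_gc_work).
	 */
	pthread_mutex_lock(&gc_lock);
	while (atomic_load(&gc_in_progress))
		pthread_cond_wait(&gc_done, &gc_lock);
	pthread_mutex_unlock(&gc_lock);
}

int main(void)
{
	atomic_store(&total_inflight, TRIGGER_GC + 1);
	wait_for_gc(SANE_PER_USER + 1);	/* heavy sender: kicks GC, then waits */
	puts("collection finished before the abusive send proceeded");
	return 0;
}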