diff options
Diffstat (limited to 'net/openvswitch/datapath.c')
| -rw-r--r-- | net/openvswitch/datapath.c | 783 |
1 files changed, 585 insertions, 198 deletions
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 6679e96ab1dc..d5b6e2002bc1 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2014 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -28,7 +15,6 @@ #include <linux/delay.h> #include <linux/time.h> #include <linux/etherdevice.h> -#include <linux/genetlink.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/mutex.h> @@ -48,14 +34,18 @@ #include <linux/rculist.h> #include <linux/dmi.h> #include <net/genetlink.h> +#include <net/gso.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/pkt_cls.h> #include "datapath.h" +#include "drop.h" #include "flow.h" #include "flow_table.h" #include "flow_netlink.h" #include "meter.h" +#include "openvswitch_trace.h" #include "vport-internal_dev.h" #include "vport-netdev.h" @@ -143,6 +133,10 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, const struct dp_upcall_info *, uint32_t cutlen); +static void ovs_dp_masks_rebalance(struct work_struct *work); + +static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *); + /* Must be called with rcu_read_lock or ovs_mutex. */ const char *ovs_dp_name(const struct datapath *dp) { @@ -176,6 +170,7 @@ static void destroy_dp_rcu(struct rcu_head *rcu) free_percpu(dp->stats_percpu); kfree(dp->ports); ovs_meters_exit(dp); + kfree(rcu_dereference_raw(dp->upcall_portids)); kfree(dp); } @@ -192,7 +187,8 @@ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) struct hlist_head *head; head = vport_hash_bucket(dp, port_no); - hlist_for_each_entry_rcu(vport, head, dp_hash_node) { + hlist_for_each_entry_rcu(vport, head, dp_hash_node, + lockdep_ovsl_is_held()) { if (vport->port_no == port_no) return vport; } @@ -214,6 +210,26 @@ static struct vport *new_vport(const struct vport_parms *parms) return vport; } +static void ovs_vport_update_upcall_stats(struct sk_buff *skb, + const struct dp_upcall_info *upcall_info, + bool upcall_result) +{ + struct vport *p = OVS_CB(skb)->input_vport; + struct vport_upcall_stats_percpu *stats; + + if (upcall_info->cmd != OVS_PACKET_CMD_MISS && + upcall_info->cmd != OVS_PACKET_CMD_ACTION) + return; + + stats = this_cpu_ptr(p->upcall_stats); + u64_stats_update_begin(&stats->syncp); + if (upcall_result) + u64_stats_inc(&stats->n_success); + else + u64_stats_inc(&stats->n_fail); + u64_stats_update_end(&stats->syncp); +} + void ovs_dp_detach_port(struct vport *p) { ASSERT_OVSL(); @@ -228,38 +244,76 @@ void ovs_dp_detach_port(struct vport *p) /* Must be called with rcu_read_lock. */ void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) { + struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(ovs_pcpu_storage); const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; struct sw_flow_actions *sf_acts; struct dp_stats_percpu *stats; + bool ovs_pcpu_locked = false; u64 *stats_counter; u32 n_mask_hit; + u32 n_cache_hit; + int error; stats = this_cpu_ptr(dp->stats_percpu); /* Look up flow. */ - flow = ovs_flow_tbl_lookup_stats(&dp->table, key, &n_mask_hit); + flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), + &n_mask_hit, &n_cache_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; - int error; memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.portid = ovs_vport_find_upcall_portid(p, skb); + + if (OVS_CB(skb)->upcall_pid) + upcall.portid = OVS_CB(skb)->upcall_pid; + else if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + upcall.portid = + ovs_dp_get_upcall_portid(dp, smp_processor_id()); + else + upcall.portid = ovs_vport_find_upcall_portid(p, skb); + upcall.mru = OVS_CB(skb)->mru; error = ovs_dp_upcall(dp, skb, key, &upcall, 0); - if (unlikely(error)) - kfree_skb(skb); - else + switch (error) { + case 0: + case -EAGAIN: + case -ERESTARTSYS: + case -EINTR: consume_skb(skb); + break; + default: + kfree_skb(skb); + break; + } stats_counter = &stats->n_missed; goto out; } ovs_flow_stats_update(flow, key->tp.flags, skb); sf_acts = rcu_dereference(flow->sf_acts); - ovs_execute_actions(dp, skb, sf_acts, key); + /* This path can be invoked recursively: Use the current task to + * identify recursive invocation - the lock must be acquired only once. + * Even with disabled bottom halves this can be preempted on PREEMPT_RT. + * Limit the locking to RT to avoid assigning `owner' if it can be + * avoided. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && ovs_pcpu->owner != current) { + local_lock_nested_bh(&ovs_pcpu_storage->bh_lock); + ovs_pcpu->owner = current; + ovs_pcpu_locked = true; + } + + error = ovs_execute_actions(dp, skb, sf_acts, key); + if (unlikely(error)) + net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n", + ovs_dp_name(dp), error); + if (ovs_pcpu_locked) { + ovs_pcpu->owner = NULL; + local_unlock_nested_bh(&ovs_pcpu_storage->bh_lock); + } stats_counter = &stats->n_hit; @@ -268,6 +322,7 @@ out: u64_stats_update_begin(&stats->syncp); (*stats_counter)++; stats->n_mask_hit += n_mask_hit; + stats->n_cache_hit += n_cache_hit; u64_stats_update_end(&stats->syncp); } @@ -279,6 +334,9 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, struct dp_stats_percpu *stats; int err; + if (trace_ovs_dp_upcall_enabled()) + trace_ovs_dp_upcall(dp, skb, key, upcall_info); + if (upcall_info->portid == 0) { err = -ENOTCONN; goto err; @@ -288,6 +346,8 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); else err = queue_gso_packets(dp, skb, key, upcall_info, cutlen); + + ovs_vport_update_upcall_stats(skb, upcall_info, !err); if (err) goto err; @@ -306,14 +366,14 @@ err: static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, const struct sw_flow_key *key, const struct dp_upcall_info *upcall_info, - uint32_t cutlen) + uint32_t cutlen) { unsigned int gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; int err; - BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET); + BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_GSO_CB_OFFSET); segs = __skb_gso_segment(skb, NETIF_F_SG, false); if (IS_ERR(segs)) return PTR_ERR(segs); @@ -330,8 +390,7 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, } /* Queue all of the segments. */ - skb = segs; - do { + skb_list_walk_safe(segs, skb, nskb) { if (gso_type & SKB_GSO_UDP && skb != segs) key = &later_key; @@ -339,17 +398,15 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, if (err) break; - } while ((skb = skb->next)); + } /* Free all of the segments. */ - skb = segs; - do { - nskb = skb->next; + skb_list_walk_safe(segs, skb, nskb) { if (err) kfree_skb(skb); else consume_skb(skb); - } while ((skb = nskb)); + } return err; } @@ -359,7 +416,8 @@ static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */ - + nla_total_size(sizeof(unsigned int)); /* OVS_PACKET_ATTR_LEN */ + + nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */ + + nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */ /* OVS_PACKET_ATTR_USERDATA */ if (upcall_info->userdata) @@ -402,6 +460,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, size_t len; unsigned int hlen; int err, dp_ifindex; + u64 hash; dp_ifindex = get_dpifindex(dp); if (!dp_ifindex) @@ -448,10 +507,15 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family, 0, upcall_info->cmd); + if (!upcall) { + err = -EINVAL; + goto out; + } upcall->dp_ifindex = dp_ifindex; err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb); - BUG_ON(err); + if (err) + goto out; if (upcall_info->userdata) __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, @@ -459,15 +523,26 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, nla_data(upcall_info->userdata)); if (upcall_info->egress_tun_info) { - nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_EGRESS_TUN_KEY); + nla = nla_nest_start_noflag(user_skb, + OVS_PACKET_ATTR_EGRESS_TUN_KEY); + if (!nla) { + err = -EMSGSIZE; + goto out; + } err = ovs_nla_put_tunnel_info(user_skb, upcall_info->egress_tun_info); - BUG_ON(err); + if (err) + goto out; + nla_nest_end(user_skb, nla); } if (upcall_info->actions_len) { - nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_ACTIONS); + nla = nla_nest_start_noflag(user_skb, OVS_PACKET_ATTR_ACTIONS); + if (!nla) { + err = -EMSGSIZE; + goto out; + } err = ovs_nla_put_actions(upcall_info->actions, upcall_info->actions_len, user_skb); @@ -478,23 +553,30 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, } /* Add OVS_PACKET_ATTR_MRU */ - if (upcall_info->mru) { - if (nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, - upcall_info->mru)) { - err = -ENOBUFS; - goto out; - } - pad_packet(dp, user_skb); + if (upcall_info->mru && + nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, upcall_info->mru)) { + err = -ENOBUFS; + goto out; } /* Add OVS_PACKET_ATTR_LEN when packet is truncated */ - if (cutlen > 0) { - if (nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, - skb->len)) { - err = -ENOBUFS; - goto out; - } - pad_packet(dp, user_skb); + if (cutlen > 0 && + nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, skb->len)) { + err = -ENOBUFS; + goto out; + } + + /* Add OVS_PACKET_ATTR_HASH */ + hash = skb_get_hash_raw(skb); + if (skb->sw_hash) + hash |= OVS_PACKET_HASH_SW_BIT; + + if (skb->l4_hash) + hash |= OVS_PACKET_HASH_L4_BIT; + + if (nla_put(user_skb, OVS_PACKET_ATTR_HASH, sizeof (u64), &hash)) { + err = -ENOBUFS; + goto out; } /* Only reserve room for attribute header, packet data is added @@ -519,14 +601,15 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, out: if (err) skb_tx_error(skb); - kfree_skb(user_skb); - kfree_skb(nskb); + consume_skb(user_skb); + consume_skb(nskb); + return err; } static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) { - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct sw_flow_actions *acts; @@ -536,6 +619,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct vport *input_vport; u16 mru = 0; + u64 hash; int len; int err; bool log = !a[OVS_PACKET_ATTR_PROBE]; @@ -561,6 +645,17 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) } OVS_CB(packet)->mru = mru; + if (a[OVS_PACKET_ATTR_HASH]) { + hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]); + + __skb_set_hash(packet, hash & 0xFFFFFFFFULL, + !!(hash & OVS_PACKET_HASH_SW_BIT), + !!(hash & OVS_PACKET_HASH_L4_BIT)); + } + + OVS_CB(packet)->upcall_pid = + nla_get_u32_default(a[OVS_PACKET_ATTR_UPCALL_PID], 0); + /* Build an sw_flow for sending this packet. */ flow = ovs_flow_alloc(); err = PTR_ERR(flow); @@ -599,7 +694,13 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) sf_acts = rcu_dereference(flow->sf_acts); local_bh_disable(); + local_lock_nested_bh(&ovs_pcpu_storage->bh_lock); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_write(ovs_pcpu_storage->owner, current); err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_write(ovs_pcpu_storage->owner, NULL); + local_unlock_nested_bh(&ovs_pcpu_storage->bh_lock); local_bh_enable(); rcu_read_unlock(); @@ -622,12 +723,14 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, + [OVS_PACKET_ATTR_HASH] = { .type = NLA_U64 }, + [OVS_PACKET_ATTR_UPCALL_PID] = { .type = NLA_U32 }, }; -static const struct genl_ops dp_packet_genl_ops[] = { +static const struct genl_small_ops dp_packet_genl_ops[] = { { .cmd = OVS_PACKET_CMD_EXECUTE, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = packet_policy, .doit = ovs_packet_cmd_execute } }; @@ -637,10 +740,12 @@ static struct genl_family dp_packet_genl_family __ro_after_init = { .name = OVS_PACKET_FAMILY, .version = OVS_PACKET_VERSION, .maxattr = OVS_PACKET_ATTR_MAX, + .policy = packet_policy, .netnsok = true, .parallel_ops = true, - .ops = dp_packet_genl_ops, - .n_ops = ARRAY_SIZE(dp_packet_genl_ops), + .small_ops = dp_packet_genl_ops, + .n_small_ops = ARRAY_SIZE(dp_packet_genl_ops), + .resv_start_op = OVS_PACKET_CMD_EXECUTE + 1, .module = THIS_MODULE, }; @@ -664,14 +769,15 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, percpu_stats = per_cpu_ptr(dp->stats_percpu, i); do { - start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); + start = u64_stats_fetch_begin(&percpu_stats->syncp); local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); + } while (u64_stats_fetch_retry(&percpu_stats->syncp, start)); stats->n_hit += local_stats.n_hit; stats->n_missed += local_stats.n_missed; stats->n_lost += local_stats.n_lost; mega_stats->n_mask_hit += local_stats.n_mask_hit; + mega_stats->n_cache_hit += local_stats.n_cache_hit; } } @@ -697,9 +803,13 @@ static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, { size_t len = NLMSG_ALIGN(sizeof(struct ovs_header)); - /* OVS_FLOW_ATTR_UFID */ + /* OVS_FLOW_ATTR_UFID, or unmasked flow key as fallback + * see ovs_nla_put_identifier() + */ if (sfid && ovs_identifier_is_ufid(sfid)) len += nla_total_size(sfid->ufid_len); + else + len += nla_total_size(ovs_key_attr_size()); /* OVS_FLOW_ATTR_KEY */ if (!sfid || should_fill_key(sfid, ufid_flags)) @@ -764,7 +874,7 @@ static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow, * This can only fail for dump operations because the skb is always * properly sized for single flows. */ - start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS); + start = nla_nest_start_noflag(skb, OVS_FLOW_ATTR_ACTIONS); if (start) { const struct sw_flow_actions *sf_acts; @@ -875,7 +985,10 @@ static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, info->snd_portid, info->snd_seq, 0, cmd, ufid_flags); - BUG_ON(retval < 0); + if (WARN_ON_ONCE(retval < 0)) { + kfree_skb(skb); + skb = ERR_PTR(retval); + } return skb; } @@ -883,11 +996,12 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct sw_flow *flow = NULL, *new_flow; struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; + struct sw_flow_key *key; struct sw_flow_actions *acts; struct sw_flow_match match; u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); @@ -915,30 +1029,32 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) } /* Extract key. */ - ovs_match_init(&match, &new_flow->key, false, &mask); + key = kzalloc(sizeof(*key), GFP_KERNEL); + if (!key) { + error = -ENOMEM; + goto err_kfree_flow; + } + + ovs_match_init(&match, key, false, &mask); error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], a[OVS_FLOW_ATTR_MASK], log); if (error) - goto err_kfree_flow; + goto err_kfree_key; + + ovs_flow_mask_key(&new_flow->key, key, true, &mask); /* Extract flow identifier. */ error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], - &new_flow->key, log); + key, log); if (error) - goto err_kfree_flow; - - /* unmasked key is needed to match when ufid is not used. */ - if (ovs_identifier_is_key(&new_flow->id)) - match.key = new_flow->id.unmasked_key; - - ovs_flow_mask_key(&new_flow->key, &new_flow->key, true, &mask); + goto err_kfree_key; /* Validate actions. */ error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key, &acts, log); if (error) { OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); - goto err_kfree_flow; + goto err_kfree_key; } reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false, @@ -959,7 +1075,7 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (ovs_identifier_is_ufid(&new_flow->id)) flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); if (!flow) - flow = ovs_flow_tbl_lookup(&dp->table, &new_flow->key); + flow = ovs_flow_tbl_lookup(&dp->table, key); if (likely(!flow)) { rcu_assign_pointer(new_flow->sf_acts, acts); @@ -1029,6 +1145,8 @@ static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) if (reply) ovs_notify(&dp_flow_genl_family, reply, info); + + kfree(key); return 0; err_unlock_ovs: @@ -1036,6 +1154,8 @@ err_unlock_ovs: kfree_skb(reply); err_kfree_acts: ovs_nla_free_flow_actions(acts); +err_kfree_key: + kfree(key); err_kfree_flow: ovs_flow_free(new_flow, false); error: @@ -1043,11 +1163,12 @@ error: } /* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */ -static struct sw_flow_actions *get_flow_actions(struct net *net, - const struct nlattr *a, - const struct sw_flow_key *key, - const struct sw_flow_mask *mask, - bool log) +static noinline_for_stack +struct sw_flow_actions *get_flow_actions(struct net *net, + const struct nlattr *a, + const struct sw_flow_key *key, + const struct sw_flow_mask *mask, + bool log) { struct sw_flow_actions *acts; struct sw_flow_key masked_key; @@ -1077,12 +1198,13 @@ static struct sw_flow_actions *get_flow_actions(struct net *net, * we should not to return match object with dangling reference * to mask. * */ -static int ovs_nla_init_match_and_action(struct net *net, - struct sw_flow_match *match, - struct sw_flow_key *key, - struct nlattr **a, - struct sw_flow_actions **acts, - bool log) +static noinline_for_stack int +ovs_nla_init_match_and_action(struct net *net, + struct sw_flow_match *match, + struct sw_flow_key *key, + struct nlattr **a, + struct sw_flow_actions **acts, + bool log) { struct sw_flow_mask mask; int error = 0; @@ -1121,7 +1243,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) { struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct sw_flow_key key; struct sw_flow *flow; struct sk_buff *reply = NULL; @@ -1222,7 +1344,7 @@ error: static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; @@ -1281,7 +1403,7 @@ unlock: static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; @@ -1330,7 +1452,7 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts, &flow->id, info, false, ufid_flags); if (likely(reply)) { - if (likely(!IS_ERR(reply))) { + if (!IS_ERR(reply)) { rcu_read_lock(); /*To keep RCU checker happy. */ err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, reply, info->snd_portid, @@ -1338,14 +1460,19 @@ static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) OVS_FLOW_CMD_DEL, ufid_flags); rcu_read_unlock(); - BUG_ON(err < 0); + if (WARN_ON_ONCE(err < 0)) { + kfree_skb(reply); + goto out_free; + } ovs_notify(&dp_flow_genl_family, reply, info); } else { - netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, PTR_ERR(reply)); + netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, + PTR_ERR(reply)); } } +out_free: ovs_flow_free(flow, true); return 0; unlock: @@ -1362,8 +1489,8 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) u32 ufid_flags; int err; - err = genlmsg_parse(cb->nlh, &dp_flow_genl_family, a, - OVS_FLOW_ATTR_MAX, flow_policy, NULL); + err = genlmsg_parse_deprecated(cb->nlh, &dp_flow_genl_family, a, + OVS_FLOW_ATTR_MAX, flow_policy, NULL); if (err) return err; ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); @@ -1409,26 +1536,26 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { [OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 }, }; -static const struct genl_ops dp_flow_genl_ops[] = { +static const struct genl_small_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_NEW, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = flow_policy, .doit = ovs_flow_cmd_new }, { .cmd = OVS_FLOW_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = flow_policy, .doit = ovs_flow_cmd_del }, { .cmd = OVS_FLOW_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ - .policy = flow_policy, .doit = ovs_flow_cmd_get, .dumpit = ovs_flow_cmd_dump }, { .cmd = OVS_FLOW_CMD_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = flow_policy, .doit = ovs_flow_cmd_set, }, }; @@ -1438,10 +1565,12 @@ static struct genl_family dp_flow_genl_family __ro_after_init = { .name = OVS_FLOW_FAMILY, .version = OVS_FLOW_VERSION, .maxattr = OVS_FLOW_ATTR_MAX, + .policy = flow_policy, .netnsok = true, .parallel_ops = true, - .ops = dp_flow_genl_ops, - .n_ops = ARRAY_SIZE(dp_flow_genl_ops), + .small_ops = dp_flow_genl_ops, + .n_small_ops = ARRAY_SIZE(dp_flow_genl_ops), + .resv_start_op = OVS_FLOW_CMD_SET + 1, .mcgrps = &ovs_dp_flow_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, @@ -1455,6 +1584,8 @@ static size_t ovs_dp_cmd_msg_size(void) msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats)); msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats)); msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ + msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_MASKS_CACHE_SIZE */ + msgsize += nla_total_size(sizeof(u32) * nr_cpu_ids); /* OVS_DP_ATTR_PER_CPU_PIDS */ return msgsize; } @@ -1466,10 +1597,11 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, struct ovs_header *ovs_header; struct ovs_dp_stats dp_stats; struct ovs_dp_megaflow_stats dp_megaflow_stats; - int err; + struct dp_nlsk_pids *pids = ovsl_dereference(dp->upcall_portids); + int err, pids_len; ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family, - flags, cmd); + flags, cmd); if (!ovs_header) goto error; @@ -1492,6 +1624,16 @@ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) goto nla_put_failure; + if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE, + ovs_flow_tbl_masks_cache_size(&dp->table))) + goto nla_put_failure; + + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && pids) { + pids_len = min(pids->n_pids, nr_cpu_ids) * sizeof(u32); + if (nla_put(skb, OVS_DP_ATTR_PER_CPU_PIDS, pids_len, &pids->pids)) + goto nla_put_failure; + } + genlmsg_end(skb, ovs_header); return 0; @@ -1524,22 +1666,147 @@ static struct datapath *lookup_datapath(struct net *net, return dp ? dp : ERR_PTR(-ENODEV); } -static void ovs_dp_reset_user_features(struct sk_buff *skb, struct genl_info *info) +static void ovs_dp_reset_user_features(struct sk_buff *skb, + struct genl_info *info) { struct datapath *dp; - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), + info->attrs); if (IS_ERR(dp)) return; - WARN(dp->user_features, "Dropping previously announced user features\n"); + pr_warn("%s: Dropping previously announced user features\n", + ovs_dp_name(dp)); dp->user_features = 0; } -static void ovs_dp_change(struct datapath *dp, struct nlattr *a[]) +static int ovs_dp_set_upcall_portids(struct datapath *dp, + const struct nlattr *ids) +{ + struct dp_nlsk_pids *old, *dp_nlsk_pids; + + if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) + return -EINVAL; + + old = ovsl_dereference(dp->upcall_portids); + + dp_nlsk_pids = kmalloc(sizeof(*dp_nlsk_pids) + nla_len(ids), + GFP_KERNEL); + if (!dp_nlsk_pids) + return -ENOMEM; + + dp_nlsk_pids->n_pids = nla_len(ids) / sizeof(u32); + nla_memcpy(dp_nlsk_pids->pids, ids, nla_len(ids)); + + rcu_assign_pointer(dp->upcall_portids, dp_nlsk_pids); + + kfree_rcu(old, rcu); + + return 0; +} + +u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id) +{ + struct dp_nlsk_pids *dp_nlsk_pids; + + dp_nlsk_pids = rcu_dereference(dp->upcall_portids); + + if (dp_nlsk_pids) { + if (cpu_id < dp_nlsk_pids->n_pids) { + return dp_nlsk_pids->pids[cpu_id]; + } else if (dp_nlsk_pids->n_pids > 0 && + cpu_id >= dp_nlsk_pids->n_pids) { + /* If the number of netlink PIDs is mismatched with + * the number of CPUs as seen by the kernel, log this + * and send the upcall to an arbitrary socket (0) in + * order to not drop packets + */ + pr_info_ratelimited("cpu_id mismatch with handler threads"); + return dp_nlsk_pids->pids[cpu_id % + dp_nlsk_pids->n_pids]; + } else { + return 0; + } + } else { + return 0; + } +} + +static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) +{ + u32 user_features = 0, old_features = dp->user_features; + int err; + + if (a[OVS_DP_ATTR_USER_FEATURES]) { + user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); + + if (user_features & ~(OVS_DP_F_VPORT_PIDS | + OVS_DP_F_UNALIGNED | + OVS_DP_F_TC_RECIRC_SHARING | + OVS_DP_F_DISPATCH_UPCALL_PER_CPU)) + return -EOPNOTSUPP; + +#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + if (user_features & OVS_DP_F_TC_RECIRC_SHARING) + return -EOPNOTSUPP; +#endif + } + + if (a[OVS_DP_ATTR_MASKS_CACHE_SIZE]) { + int err; + u32 cache_size; + + cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]); + err = ovs_flow_tbl_masks_cache_resize(&dp->table, cache_size); + if (err) + return err; + } + + dp->user_features = user_features; + + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && + a[OVS_DP_ATTR_PER_CPU_PIDS]) { + /* Upcall Netlink Port IDs have been updated */ + err = ovs_dp_set_upcall_portids(dp, + a[OVS_DP_ATTR_PER_CPU_PIDS]); + if (err) + return err; + } + + if ((dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) && + !(old_features & OVS_DP_F_TC_RECIRC_SHARING)) + tc_skb_ext_tc_enable(); + else if (!(dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) && + (old_features & OVS_DP_F_TC_RECIRC_SHARING)) + tc_skb_ext_tc_disable(); + + return 0; +} + +static int ovs_dp_stats_init(struct datapath *dp) +{ + dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); + if (!dp->stats_percpu) + return -ENOMEM; + + return 0; +} + +static int ovs_dp_vport_init(struct datapath *dp) { - if (a[OVS_DP_ATTR_USER_FEATURES]) - dp->user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); + int i; + + dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, + sizeof(struct hlist_head), + GFP_KERNEL); + if (!dp->ports) + return -ENOMEM; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(&dp->ports[i]); + + return 0; } static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) @@ -1550,7 +1817,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct vport *vport; struct ovs_net *ovs_net; - int err, i; + int err; err = -EINVAL; if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) @@ -1563,35 +1830,26 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) err = -ENOMEM; dp = kzalloc(sizeof(*dp), GFP_KERNEL); if (dp == NULL) - goto err_free_reply; + goto err_destroy_reply; ovs_dp_set_net(dp, sock_net(skb->sk)); /* Allocate table. */ err = ovs_flow_tbl_init(&dp->table); if (err) - goto err_free_dp; + goto err_destroy_dp; - dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); - if (!dp->stats_percpu) { - err = -ENOMEM; + err = ovs_dp_stats_init(dp); + if (err) goto err_destroy_table; - } - dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, - sizeof(struct hlist_head), - GFP_KERNEL); - if (!dp->ports) { - err = -ENOMEM; - goto err_destroy_percpu; - } - - for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dp->ports[i]); + err = ovs_dp_vport_init(dp); + if (err) + goto err_destroy_stats; err = ovs_meters_init(dp); if (err) - goto err_destroy_ports_array; + goto err_destroy_ports; /* Set up our datapath device. */ parms.name = nla_data(a[OVS_DP_ATTR_NAME]); @@ -1600,12 +1858,15 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.dp = dp; parms.port_no = OVSP_LOCAL; parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; - - ovs_dp_change(dp, a); + parms.desired_ifindex = nla_get_s32_default(a[OVS_DP_ATTR_IFINDEX], 0); /* So far only local changes have been made, now need the lock. */ ovs_lock(); + err = ovs_dp_change(dp, a); + if (err) + goto err_unlock_and_destroy_meters; + vport = new_vport(&parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); @@ -1621,7 +1882,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_dp_reset_user_features(skb, info); } - goto err_destroy_meters; + goto err_destroy_portids; } err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, @@ -1636,18 +1897,20 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) ovs_notify(&dp_datapath_genl_family, reply, info); return 0; -err_destroy_meters: +err_destroy_portids: + kfree(rcu_dereference_raw(dp->upcall_portids)); +err_unlock_and_destroy_meters: ovs_unlock(); ovs_meters_exit(dp); -err_destroy_ports_array: +err_destroy_ports: kfree(dp->ports); -err_destroy_percpu: +err_destroy_stats: free_percpu(dp->stats_percpu); err_destroy_table: ovs_flow_tbl_destroy(&dp->table); -err_free_dp: +err_destroy_dp: kfree(dp); -err_free_reply: +err_destroy_reply: kfree_skb(reply); err: return err; @@ -1656,8 +1919,12 @@ err: /* Called with ovs_mutex. */ static void __dp_destroy(struct datapath *dp) { + struct flow_table *table = &dp->table; int i; + if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) + tc_skb_ext_tc_disable(); + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; struct hlist_node *n; @@ -1674,7 +1941,14 @@ static void __dp_destroy(struct datapath *dp) */ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); - /* RCU destroy the flow table */ + /* Flush sw_flow in the tables. RCU cb only releases resource + * such as dp, ports and tables. That may avoid some issues + * such as RCU usage warning. + */ + table_instance_flow_flush(table, ovsl_dereference(table->ti), + ovsl_dereference(table->ufid_ti)); + + /* RCU destroy the ports, meters and flow tables. */ call_rcu(&dp->rcu, destroy_dp_rcu); } @@ -1689,7 +1963,8 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), + info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) goto err_unlock_free; @@ -1722,12 +1997,15 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), + info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) goto err_unlock_free; - ovs_dp_change(dp, info->attrs); + err = ovs_dp_change(dp, info->attrs); + if (err) + goto err_unlock_free; err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, info->snd_seq, 0, OVS_DP_CMD_SET); @@ -1755,7 +2033,8 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), + info->attrs); if (IS_ERR(dp)) { err = PTR_ERR(dp); goto err_unlock_free; @@ -1800,28 +2079,31 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, + [OVS_DP_ATTR_MASKS_CACHE_SIZE] = NLA_POLICY_RANGE(NLA_U32, 0, + PCPU_MIN_UNIT_SIZE / sizeof(struct mask_cache_entry)), + [OVS_DP_ATTR_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 0), }; -static const struct genl_ops dp_datapath_genl_ops[] = { +static const struct genl_small_ops dp_datapath_genl_ops[] = { { .cmd = OVS_DP_CMD_NEW, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = datapath_policy, .doit = ovs_dp_cmd_new }, { .cmd = OVS_DP_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = datapath_policy, .doit = ovs_dp_cmd_del }, { .cmd = OVS_DP_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ - .policy = datapath_policy, .doit = ovs_dp_cmd_get, .dumpit = ovs_dp_cmd_dump }, { .cmd = OVS_DP_CMD_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = datapath_policy, .doit = ovs_dp_cmd_set, }, }; @@ -1831,10 +2113,12 @@ static struct genl_family dp_datapath_genl_family __ro_after_init = { .name = OVS_DATAPATH_FAMILY, .version = OVS_DATAPATH_VERSION, .maxattr = OVS_DP_ATTR_MAX, + .policy = datapath_policy, .netnsok = true, .parallel_ops = true, - .ops = dp_datapath_genl_ops, - .n_ops = ARRAY_SIZE(dp_datapath_genl_ops), + .small_ops = dp_datapath_genl_ops, + .n_small_ops = ARRAY_SIZE(dp_datapath_genl_ops), + .resv_start_op = OVS_DP_CMD_SET + 1, .mcgrps = &ovs_dp_datapath_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, @@ -1843,10 +2127,11 @@ static struct genl_family dp_datapath_genl_family __ro_after_init = { /* Called with ovs_mutex or RCU read lock. */ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, struct net *net, u32 portid, u32 seq, - u32 flags, u8 cmd) + u32 flags, u8 cmd, gfp_t gfp) { struct ovs_header *ovs_header; struct ovs_vport_stats vport_stats; + struct net *net_vport; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, @@ -1863,12 +2148,15 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex)) goto nla_put_failure; - if (!net_eq(net, dev_net(vport->dev))) { - int id = peernet2id_alloc(net, dev_net(vport->dev)); + rcu_read_lock(); + net_vport = dev_net_rcu(vport->dev); + if (!net_eq(net, net_vport)) { + int id = peernet2id_alloc(net, net_vport, GFP_ATOMIC); if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) - goto nla_put_failure; + goto nla_put_failure_unlock; } + rcu_read_unlock(); ovs_vport_get_stats(vport, &vport_stats); if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, @@ -1876,6 +2164,9 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, OVS_VPORT_ATTR_PAD)) goto nla_put_failure; + if (ovs_vport_get_upcall_stats(vport, skb)) + goto nla_put_failure; + if (ovs_vport_get_upcall_portids(vport, skb)) goto nla_put_failure; @@ -1886,6 +2177,8 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, genlmsg_end(skb, ovs_header); return 0; +nla_put_failure_unlock: + rcu_read_unlock(); nla_put_failure: err = -EMSGSIZE; error: @@ -1905,11 +2198,12 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, struct sk_buff *skb; int retval; - skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); - retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd); + retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd, + GFP_KERNEL); BUG_ON(retval < 0); return skb; @@ -1952,16 +2246,16 @@ static struct vport *lookup_vport(struct net *net, } -/* Called with ovs_mutex */ -static void update_headroom(struct datapath *dp) +static unsigned int ovs_get_max_headroom(struct datapath *dp) { - unsigned dev_headroom, max_headroom = 0; + unsigned int dev_headroom, max_headroom = 0; struct net_device *dev; struct vport *vport; int i; for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { - hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, + lockdep_ovsl_is_held()) { dev = vport->dev; dev_headroom = netdev_get_fwd_headroom(dev); if (dev_headroom > max_headroom) @@ -1969,31 +2263,45 @@ static void update_headroom(struct datapath *dp) } } - dp->max_headroom = max_headroom; - for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) - hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) - netdev_set_rx_headroom(vport->dev, max_headroom); + return max_headroom; +} + +/* Called with ovs_mutex */ +static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom) +{ + struct vport *vport; + int i; + + dp->max_headroom = new_headroom; + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, + lockdep_ovsl_is_held()) + netdev_set_rx_headroom(vport->dev, new_headroom); + } } static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct vport_parms parms; struct sk_buff *reply; struct vport *vport; struct datapath *dp; + unsigned int new_headroom; u32 port_no; int err; if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || !a[OVS_VPORT_ATTR_UPCALL_PID]) return -EINVAL; - if (a[OVS_VPORT_ATTR_IFINDEX]) + + parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); + + if (a[OVS_VPORT_ATTR_IFINDEX] && parms.type != OVS_VPORT_TYPE_INTERNAL) return -EOPNOTSUPP; - port_no = a[OVS_VPORT_ATTR_PORT_NO] - ? nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]) : 0; + port_no = nla_get_u32_default(a[OVS_VPORT_ATTR_PORT_NO], 0); if (port_no >= DP_MAX_PORTS) return -EFBIG; @@ -2026,11 +2334,12 @@ restart: } parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]); - parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); parms.options = a[OVS_VPORT_ATTR_OPTIONS]; parms.dp = dp; parms.port_no = port_no; parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; + parms.desired_ifindex = nla_get_s32_default(a[OVS_VPORT_ATTR_IFINDEX], + 0); vport = new_vport(&parms); err = PTR_ERR(vport); @@ -2042,10 +2351,12 @@ restart: err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_NEW); + OVS_VPORT_CMD_NEW, GFP_KERNEL); - if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom) - update_headroom(dp); + new_headroom = netdev_get_fwd_headroom(vport->dev); + + if (new_headroom > dp->max_headroom) + ovs_update_headroom(dp, new_headroom); else netdev_set_rx_headroom(vport->dev, dp->max_headroom); @@ -2073,7 +2384,7 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); - vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); + vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a); err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; @@ -2101,7 +2412,7 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_SET); + OVS_VPORT_CMD_SET, GFP_KERNEL); BUG_ON(err < 0); ovs_unlock(); @@ -2116,11 +2427,12 @@ exit_unlock_free: static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) { - bool must_update_headroom = false; + bool update_headroom = false; struct nlattr **a = info->attrs; struct sk_buff *reply; struct datapath *dp; struct vport *vport; + unsigned int new_headroom; int err; reply = ovs_vport_cmd_alloc_info(); @@ -2128,7 +2440,7 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) return -ENOMEM; ovs_lock(); - vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); + vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a); err = PTR_ERR(vport); if (IS_ERR(vport)) goto exit_unlock_free; @@ -2140,18 +2452,23 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_DEL); + OVS_VPORT_CMD_DEL, GFP_KERNEL); BUG_ON(err < 0); /* the vport deletion may trigger dp headroom update */ dp = vport->dp; if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom) - must_update_headroom = true; + update_headroom = true; + netdev_reset_rx_headroom(vport->dev); ovs_dp_detach_port(vport); - if (must_update_headroom) - update_headroom(dp); + if (update_headroom) { + new_headroom = ovs_get_max_headroom(dp); + + if (new_headroom < dp->max_headroom) + ovs_update_headroom(dp, new_headroom); + } ovs_unlock(); ovs_notify(&dp_vport_genl_family, reply, info); @@ -2166,7 +2483,7 @@ exit_unlock_free: static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct sk_buff *reply; struct vport *vport; int err; @@ -2182,7 +2499,7 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) goto exit_unlock_free; err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), info->snd_portid, info->snd_seq, 0, - OVS_VPORT_CMD_GET); + OVS_VPORT_CMD_GET, GFP_ATOMIC); BUG_ON(err < 0); rcu_read_unlock(); @@ -2218,7 +2535,8 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - OVS_VPORT_CMD_GET) < 0) + OVS_VPORT_CMD_GET, + GFP_ATOMIC) < 0) goto out; j++; @@ -2234,37 +2552,55 @@ out: return skb->len; } +static void ovs_dp_masks_rebalance(struct work_struct *work) +{ + struct ovs_net *ovs_net = container_of(work, struct ovs_net, + masks_rebalance.work); + struct datapath *dp; + + ovs_lock(); + + list_for_each_entry(dp, &ovs_net->dps, list_node) + ovs_flow_masks_rebalance(&dp->table); + + ovs_unlock(); + + schedule_delayed_work(&ovs_net->masks_rebalance, + msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); +} + static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_UNSPEC }, [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, - [OVS_VPORT_ATTR_IFINDEX] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 0), [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, + [OVS_VPORT_ATTR_UPCALL_STATS] = { .type = NLA_NESTED }, }; -static const struct genl_ops dp_vport_genl_ops[] = { +static const struct genl_small_ops dp_vport_genl_ops[] = { { .cmd = OVS_VPORT_CMD_NEW, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = vport_policy, .doit = ovs_vport_cmd_new }, { .cmd = OVS_VPORT_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = vport_policy, .doit = ovs_vport_cmd_del }, { .cmd = OVS_VPORT_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ - .policy = vport_policy, .doit = ovs_vport_cmd_get, .dumpit = ovs_vport_cmd_dump }, { .cmd = OVS_VPORT_CMD_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = vport_policy, .doit = ovs_vport_cmd_set, }, }; @@ -2274,10 +2610,12 @@ struct genl_family dp_vport_genl_family __ro_after_init = { .name = OVS_VPORT_FAMILY, .version = OVS_VPORT_VERSION, .maxattr = OVS_VPORT_ATTR_MAX, + .policy = vport_policy, .netnsok = true, .parallel_ops = true, - .ops = dp_vport_genl_ops, - .n_ops = ARRAY_SIZE(dp_vport_genl_ops), + .small_ops = dp_vport_genl_ops, + .n_small_ops = ARRAY_SIZE(dp_vport_genl_ops), + .resv_start_op = OVS_VPORT_CMD_SET + 1, .mcgrps = &ovs_dp_vport_multicast_group, .n_mcgrps = 1, .module = THIS_MODULE, @@ -2324,10 +2662,19 @@ error: static int __net_init ovs_init_net(struct net *net) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + int err; INIT_LIST_HEAD(&ovs_net->dps); INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); - return ovs_ct_init(net); + INIT_DELAYED_WORK(&ovs_net->masks_rebalance, ovs_dp_masks_rebalance); + + err = ovs_ct_init(net); + if (err) + return err; + + schedule_delayed_work(&ovs_net->masks_rebalance, + msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); + return 0; } static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, @@ -2361,8 +2708,10 @@ static void __net_exit ovs_exit_net(struct net *dnet) struct net *net; LIST_HEAD(head); - ovs_ct_exit(dnet); ovs_lock(); + + ovs_ct_exit(dnet); + list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); @@ -2379,6 +2728,7 @@ static void __net_exit ovs_exit_net(struct net *dnet) ovs_unlock(); + cancel_delayed_work_sync(&ovs_net->masks_rebalance); cancel_work_sync(&ovs_net->dp_notify_work); } @@ -2389,21 +2739,55 @@ static struct pernet_operations ovs_net_ops = { .size = sizeof(struct ovs_net), }; +static const char * const ovs_drop_reasons[] = { +#define S(x) [(x) & ~SKB_DROP_REASON_SUBSYS_MASK] = (#x), + OVS_DROP_REASONS(S) +#undef S +}; + +static struct drop_reason_list drop_reason_list_ovs = { + .reasons = ovs_drop_reasons, + .n_reasons = ARRAY_SIZE(ovs_drop_reasons), +}; + +static int __init ovs_alloc_percpu_storage(void) +{ + unsigned int cpu; + + ovs_pcpu_storage = alloc_percpu(*ovs_pcpu_storage); + if (!ovs_pcpu_storage) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct ovs_pcpu_storage *ovs_pcpu; + + ovs_pcpu = per_cpu_ptr(ovs_pcpu_storage, cpu); + local_lock_init(&ovs_pcpu->bh_lock); + } + return 0; +} + +static void ovs_free_percpu_storage(void) +{ + free_percpu(ovs_pcpu_storage); +} + static int __init dp_init(void) { int err; - BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); + BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > + sizeof_field(struct sk_buff, cb)); pr_info("Open vSwitch switching datapath\n"); - err = action_fifos_init(); + err = ovs_alloc_percpu_storage(); if (err) goto error; err = ovs_internal_dev_rtnl_link_register(); if (err) - goto error_action_fifos_exit; + goto error; err = ovs_flow_init(); if (err) @@ -2429,6 +2813,9 @@ static int __init dp_init(void) if (err < 0) goto error_unreg_netdev; + drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_OPENVSWITCH, + &drop_reason_list_ovs); + return 0; error_unreg_netdev: @@ -2443,9 +2830,8 @@ error_flow_exit: ovs_flow_exit(); error_unreg_rtnl_link: ovs_internal_dev_rtnl_link_unregister(); -error_action_fifos_exit: - action_fifos_exit(); error: + ovs_free_percpu_storage(); return err; } @@ -2455,11 +2841,12 @@ static void dp_cleanup(void) ovs_netdev_exit(); unregister_netdevice_notifier(&ovs_dp_device_notifier); unregister_pernet_device(&ovs_net_ops); + drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_OPENVSWITCH); rcu_barrier(); ovs_vport_exit(); ovs_flow_exit(); ovs_internal_dev_rtnl_link_unregister(); - action_fifos_exit(); + ovs_free_percpu_storage(); } module_init(dp_init); |
