diff options
Diffstat (limited to 'net/openvswitch')
28 files changed, 13493 insertions, 3425 deletions
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 27ee56b688a3..e6aaee92dba4 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -1,10 +1,24 @@ +# SPDX-License-Identifier: GPL-2.0-only # # Open vSwitch # config OPENVSWITCH tristate "Open vSwitch" - ---help--- + depends on INET + depends on !NF_CONNTRACK || \ + (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \ + (!NF_NAT || NF_NAT) && \ + (!NETFILTER_CONNCOUNT || NETFILTER_CONNCOUNT))) + depends on PSAMPLE || !PSAMPLE + select MPLS + select NET_CRC32C + select NET_MPLS_GSO + select DST_CACHE + select NET_NSH + select NF_CONNTRACK_OVS if NF_CONNTRACK + select NF_NAT_OVS if NF_NAT + help Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features expected in a traditional hardware switch, it enables fine-grained @@ -28,15 +42,36 @@ config OPENVSWITCH If unsure, say N. config OPENVSWITCH_GRE - bool "Open vSwitch GRE tunneling support" - depends on INET + tristate "Open vSwitch GRE tunneling support" depends on OPENVSWITCH - depends on NET_IPGRE_DEMUX && !(OPENVSWITCH=y && NET_IPGRE_DEMUX=m) - default y - ---help--- + depends on NET_IPGRE + default OPENVSWITCH + help If you say Y here, then the Open vSwitch will be able create GRE vport. Say N to exclude this support and reduce the binary size. If unsure, say Y. + +config OPENVSWITCH_VXLAN + tristate "Open vSwitch VXLAN tunneling support" + depends on OPENVSWITCH + depends on VXLAN + default OPENVSWITCH + help + If you say Y here, then the Open vSwitch will be able create vxlan vport. + + Say N to exclude this support and reduce the binary size. + + If unsure, say Y. + +config OPENVSWITCH_GENEVE + tristate "Open vSwitch Geneve tunneling support" + depends on OPENVSWITCH + depends on GENEVE + default OPENVSWITCH + help + If you say Y here, then the Open vSwitch will be able create geneve vport. + + Say N to exclude this support and reduce the binary size. diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile index 01bddb2991e3..28982630bef3 100644 --- a/net/openvswitch/Makefile +++ b/net/openvswitch/Makefile @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0 # # Makefile for Open vSwitch. # @@ -9,7 +10,20 @@ openvswitch-y := \ datapath.o \ dp_notify.o \ flow.o \ + flow_netlink.o \ + flow_table.o \ + meter.o \ + openvswitch_trace.o \ vport.o \ - vport-gre.o \ vport-internal_dev.o \ vport-netdev.o + +ifneq ($(CONFIG_NF_CONNTRACK),) +openvswitch-y += conntrack.o +endif + +obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o +obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o +obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o + +CFLAGS_openvswitch_trace.o = -I$(src) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 22c5f399f1cf..792ca44a461d 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2007-2012 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * Copyright (c) 2007-2017 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -22,148 +9,348 @@ #include <linux/in.h> #include <linux/ip.h> #include <linux/openvswitch.h> +#include <linux/sctp.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/in6.h> #include <linux/if_arp.h> #include <linux/if_vlan.h> + +#include <net/dst.h> +#include <net/gso.h> #include <net/ip.h> #include <net/ipv6.h> +#include <net/ip6_fib.h> #include <net/checksum.h> #include <net/dsfield.h> +#include <net/mpls.h> + +#if IS_ENABLED(CONFIG_PSAMPLE) +#include <net/psample.h> +#endif + +#include <net/sctp/checksum.h> #include "datapath.h" +#include "drop.h" +#include "flow.h" +#include "conntrack.h" #include "vport.h" +#include "flow_netlink.h" +#include "openvswitch_trace.h" + +struct ovs_pcpu_storage __percpu *ovs_pcpu_storage; + +/* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys' + * space. Return NULL if out of key spaces. + */ +static struct sw_flow_key *clone_key(const struct sw_flow_key *key_) +{ + struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(ovs_pcpu_storage); + struct action_flow_keys *keys = &ovs_pcpu->flow_keys; + int level = ovs_pcpu->exec_level; + struct sw_flow_key *key = NULL; + + if (level <= OVS_DEFERRED_ACTION_THRESHOLD) { + key = &keys->key[level - 1]; + *key = *key_; + } + + return key; +} + +static void action_fifo_init(struct action_fifo *fifo) +{ + fifo->head = 0; + fifo->tail = 0; +} + +static bool action_fifo_is_empty(const struct action_fifo *fifo) +{ + return (fifo->head == fifo->tail); +} + +static struct deferred_action *action_fifo_get(struct action_fifo *fifo) +{ + if (action_fifo_is_empty(fifo)) + return NULL; + + return &fifo->fifo[fifo->tail++]; +} + +static struct deferred_action *action_fifo_put(struct action_fifo *fifo) +{ + if (fifo->head >= DEFERRED_ACTION_FIFO_SIZE - 1) + return NULL; + + return &fifo->fifo[fifo->head++]; +} + +/* Return true if fifo is not full */ +static struct deferred_action *add_deferred_actions(struct sk_buff *skb, + const struct sw_flow_key *key, + const struct nlattr *actions, + const int actions_len) +{ + struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage->action_fifos); + struct deferred_action *da; + + da = action_fifo_put(fifo); + if (da) { + da->skb = skb; + da->actions = actions; + da->actions_len = actions_len; + da->pkt_key = *key; + } + + return da; +} + +static void invalidate_flow_key(struct sw_flow_key *key) +{ + key->mac_proto |= SW_FLOW_KEY_INVALID; +} + +static bool is_flow_key_valid(const struct sw_flow_key *key) +{ + return !(key->mac_proto & SW_FLOW_KEY_INVALID); +} + +static int clone_execute(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + u32 recirc_id, + const struct nlattr *actions, int len, + bool last, bool clone_flow_key); static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, - const struct nlattr *attr, int len, bool keep_skb); + struct sw_flow_key *key, + const struct nlattr *attr, int len); -static int make_writable(struct sk_buff *skb, int write_len) +static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, + __be32 mpls_lse, __be16 mpls_ethertype, __u16 mac_len) { - if (!skb_cloned(skb) || skb_clone_writable(skb, write_len)) - return 0; + int err; + + err = skb_mpls_push(skb, mpls_lse, mpls_ethertype, mac_len, !!mac_len); + if (err) + return err; + + if (!mac_len) + key->mac_proto = MAC_PROTO_NONE; - return pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + invalidate_flow_key(key); + return 0; } -/* remove VLAN header from packet and update csum accordingly. */ -static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci) +static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, + const __be16 ethertype) { - struct vlan_hdr *vhdr; int err; - err = make_writable(skb, VLAN_ETH_HLEN); - if (unlikely(err)) + err = skb_mpls_pop(skb, ethertype, skb->mac_len, + ovs_key_mac_proto(key) == MAC_PROTO_ETHERNET); + if (err) return err; - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_sub(skb->csum, csum_partial(skb->data - + (2 * ETH_ALEN), VLAN_HLEN, 0)); + if (ethertype == htons(ETH_P_TEB)) + key->mac_proto = MAC_PROTO_ETHERNET; - vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN); - *current_tci = vhdr->h_vlan_TCI; + invalidate_flow_key(key); + return 0; +} - memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN); - __skb_pull(skb, VLAN_HLEN); +static int set_mpls(struct sk_buff *skb, struct sw_flow_key *flow_key, + const __be32 *mpls_lse, const __be32 *mask) +{ + struct mpls_shim_hdr *stack; + __be32 lse; + int err; - vlan_set_encap_proto(skb, vhdr); - skb->mac_header += VLAN_HLEN; - skb_reset_mac_len(skb); + if (!pskb_may_pull(skb, skb_network_offset(skb) + MPLS_HLEN)) + return -ENOMEM; + + stack = mpls_hdr(skb); + lse = OVS_MASKED(stack->label_stack_entry, *mpls_lse, *mask); + err = skb_mpls_update_lse(skb, lse); + if (err) + return err; + flow_key->mpls.lse[0] = lse; return 0; } -static int pop_vlan(struct sk_buff *skb) +static int pop_vlan(struct sk_buff *skb, struct sw_flow_key *key) { - __be16 tci; int err; - if (likely(vlan_tx_tag_present(skb))) { - skb->vlan_tci = 0; + err = skb_vlan_pop(skb); + if (skb_vlan_tag_present(skb)) { + invalidate_flow_key(key); } else { - if (unlikely(skb->protocol != htons(ETH_P_8021Q) || - skb->len < VLAN_ETH_HLEN)) - return 0; + key->eth.vlan.tci = 0; + key->eth.vlan.tpid = 0; + } + return err; +} - err = __pop_vlan_tci(skb, &tci); - if (err) - return err; +static int push_vlan(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_action_push_vlan *vlan) +{ + int err; + + if (skb_vlan_tag_present(skb)) { + invalidate_flow_key(key); + } else { + key->eth.vlan.tci = vlan->vlan_tci; + key->eth.vlan.tpid = vlan->vlan_tpid; } - /* move next vlan tag to hw accel tag */ - if (likely(skb->protocol != htons(ETH_P_8021Q) || - skb->len < VLAN_ETH_HLEN)) - return 0; + err = skb_vlan_push(skb, vlan->vlan_tpid, + ntohs(vlan->vlan_tci) & ~VLAN_CFI_MASK); + skb_reset_mac_len(skb); + return err; +} + +/* 'src' is already properly masked. */ +static void ether_addr_copy_masked(u8 *dst_, const u8 *src_, const u8 *mask_) +{ + u16 *dst = (u16 *)dst_; + const u16 *src = (const u16 *)src_; + const u16 *mask = (const u16 *)mask_; + + OVS_SET_MASKED(dst[0], src[0], mask[0]); + OVS_SET_MASKED(dst[1], src[1], mask[1]); + OVS_SET_MASKED(dst[2], src[2], mask[2]); +} + +static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_ethernet *key, + const struct ovs_key_ethernet *mask) +{ + int err; - err = __pop_vlan_tci(skb, &tci); + err = skb_ensure_writable(skb, ETH_HLEN); if (unlikely(err)) return err; - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), ntohs(tci)); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + + ether_addr_copy_masked(eth_hdr(skb)->h_source, key->eth_src, + mask->eth_src); + ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst, + mask->eth_dst); + + skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + + ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source); + ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest); return 0; } -static int push_vlan(struct sk_buff *skb, const struct ovs_action_push_vlan *vlan) +/* pop_eth does not support VLAN packets as this action is never called + * for them. + */ +static int pop_eth(struct sk_buff *skb, struct sw_flow_key *key) { - if (unlikely(vlan_tx_tag_present(skb))) { - u16 current_tag; + int err; - /* push down current VLAN tag */ - current_tag = vlan_tx_tag_get(skb); + err = skb_eth_pop(skb); + if (err) + return err; - if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag)) - return -ENOMEM; + /* safe right before invalidate_flow_key */ + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); + return 0; +} - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(skb->data - + (2 * ETH_ALEN), VLAN_HLEN, 0)); +static int push_eth(struct sk_buff *skb, struct sw_flow_key *key, + const struct ovs_action_push_eth *ethh) +{ + int err; - } - __vlan_hwaccel_put_tag(skb, vlan->vlan_tpid, ntohs(vlan->vlan_tci) & ~VLAN_TAG_PRESENT); + err = skb_eth_push(skb, ethh->addresses.eth_dst, + ethh->addresses.eth_src); + if (err) + return err; + + /* safe right before invalidate_flow_key */ + key->mac_proto = MAC_PROTO_ETHERNET; + invalidate_flow_key(key); return 0; } -static int set_eth_addr(struct sk_buff *skb, - const struct ovs_key_ethernet *eth_key) +static noinline_for_stack int push_nsh(struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *a) { + u8 buffer[NSH_HDR_MAX_LEN]; + struct nshhdr *nh = (struct nshhdr *)buffer; int err; - err = make_writable(skb, ETH_HLEN); - if (unlikely(err)) + + err = nsh_hdr_from_nlattr(a, nh, NSH_HDR_MAX_LEN); + if (err) return err; - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + err = nsh_push(skb, nh); + if (err) + return err; + + /* safe right before invalidate_flow_key */ + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); + return 0; +} - memcpy(eth_hdr(skb)->h_source, eth_key->eth_src, ETH_ALEN); - memcpy(eth_hdr(skb)->h_dest, eth_key->eth_dst, ETH_ALEN); +static int pop_nsh(struct sk_buff *skb, struct sw_flow_key *key) +{ + int err; - ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2); + err = nsh_pop(skb); + if (err) + return err; + /* safe right before invalidate_flow_key */ + if (skb->protocol == htons(ETH_P_TEB)) + key->mac_proto = MAC_PROTO_ETHERNET; + else + key->mac_proto = MAC_PROTO_NONE; + invalidate_flow_key(key); return 0; } -static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, - __be32 *addr, __be32 new_addr) +static void update_ip_l4_checksum(struct sk_buff *skb, struct iphdr *nh, + __be32 addr, __be32 new_addr) { int transport_len = skb->len - skb_transport_offset(skb); + if (nh->frag_off & htons(IP_OFFSET)) + return; + if (nh->protocol == IPPROTO_TCP) { if (likely(transport_len >= sizeof(struct tcphdr))) inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb, - *addr, new_addr, 1); + addr, new_addr, true); } else if (nh->protocol == IPPROTO_UDP) { if (likely(transport_len >= sizeof(struct udphdr))) { struct udphdr *uh = udp_hdr(skb); if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace4(&uh->check, skb, - *addr, new_addr, 1); + addr, new_addr, true); if (!uh->check) uh->check = CSUM_MANGLED_0; } } } +} +static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, + __be32 *addr, __be32 new_addr) +{ + update_ip_l4_checksum(skb, nh, *addr, new_addr); csum_replace4(&nh->check, *addr, new_addr); - skb->rxhash = 0; + skb_clear_hash(skb); + ovs_ct_clear(skb, NULL); *addr = new_addr; } @@ -172,24 +359,37 @@ static void update_ipv6_checksum(struct sk_buff *skb, u8 l4_proto, { int transport_len = skb->len - skb_transport_offset(skb); - if (l4_proto == IPPROTO_TCP) { + if (l4_proto == NEXTHDR_TCP) { if (likely(transport_len >= sizeof(struct tcphdr))) inet_proto_csum_replace16(&tcp_hdr(skb)->check, skb, - addr, new_addr, 1); - } else if (l4_proto == IPPROTO_UDP) { + addr, new_addr, true); + } else if (l4_proto == NEXTHDR_UDP) { if (likely(transport_len >= sizeof(struct udphdr))) { struct udphdr *uh = udp_hdr(skb); if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { inet_proto_csum_replace16(&uh->check, skb, - addr, new_addr, 1); + addr, new_addr, true); if (!uh->check) uh->check = CSUM_MANGLED_0; } } + } else if (l4_proto == NEXTHDR_ICMP) { + if (likely(transport_len >= sizeof(struct icmp6hdr))) + inet_proto_csum_replace16(&icmp6_hdr(skb)->icmp6_cksum, + skb, addr, new_addr, true); } } +static void mask_ipv6_addr(const __be32 old[4], const __be32 addr[4], + const __be32 mask[4], __be32 masked[4]) +{ + masked[0] = OVS_MASKED(old[0], addr[0], mask[0]); + masked[1] = OVS_MASKED(old[1], addr[1], mask[1]); + masked[2] = OVS_MASKED(old[2], addr[2], mask[2]); + masked[3] = OVS_MASKED(old[3], addr[3], mask[3]); +} + static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, __be32 addr[4], const __be32 new_addr[4], bool recalculate_csum) @@ -197,282 +397,860 @@ static void set_ipv6_addr(struct sk_buff *skb, u8 l4_proto, if (recalculate_csum) update_ipv6_checksum(skb, l4_proto, addr, new_addr); - skb->rxhash = 0; + skb_clear_hash(skb); + ovs_ct_clear(skb, NULL); memcpy(addr, new_addr, sizeof(__be32[4])); } -static void set_ipv6_tc(struct ipv6hdr *nh, u8 tc) +static void set_ipv6_dsfield(struct sk_buff *skb, struct ipv6hdr *nh, u8 ipv6_tclass, u8 mask) { - nh->priority = tc >> 4; - nh->flow_lbl[0] = (nh->flow_lbl[0] & 0x0F) | ((tc & 0x0F) << 4); + u8 old_ipv6_tclass = ipv6_get_dsfield(nh); + + ipv6_tclass = OVS_MASKED(old_ipv6_tclass, ipv6_tclass, mask); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)(old_ipv6_tclass << 12), + (__force __wsum)(ipv6_tclass << 12)); + + ipv6_change_dsfield(nh, ~mask, ipv6_tclass); } -static void set_ipv6_fl(struct ipv6hdr *nh, u32 fl) +static void set_ipv6_fl(struct sk_buff *skb, struct ipv6hdr *nh, u32 fl, u32 mask) { - nh->flow_lbl[0] = (nh->flow_lbl[0] & 0xF0) | (fl & 0x000F0000) >> 16; - nh->flow_lbl[1] = (fl & 0x0000FF00) >> 8; - nh->flow_lbl[2] = fl & 0x000000FF; + u32 ofl; + + ofl = nh->flow_lbl[0] << 16 | nh->flow_lbl[1] << 8 | nh->flow_lbl[2]; + fl = OVS_MASKED(ofl, fl, mask); + + /* Bits 21-24 are always unmasked, so this retains their values. */ + nh->flow_lbl[0] = (u8)(fl >> 16); + nh->flow_lbl[1] = (u8)(fl >> 8); + nh->flow_lbl[2] = (u8)fl; + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)htonl(ofl), (__force __wsum)htonl(fl)); +} + +static void set_ipv6_ttl(struct sk_buff *skb, struct ipv6hdr *nh, u8 new_ttl, u8 mask) +{ + new_ttl = OVS_MASKED(nh->hop_limit, new_ttl, mask); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + csum_replace(&skb->csum, (__force __wsum)(nh->hop_limit << 8), + (__force __wsum)(new_ttl << 8)); + nh->hop_limit = new_ttl; } -static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl) +static void set_ip_ttl(struct sk_buff *skb, struct iphdr *nh, u8 new_ttl, + u8 mask) { + new_ttl = OVS_MASKED(nh->ttl, new_ttl, mask); + csum_replace2(&nh->check, htons(nh->ttl << 8), htons(new_ttl << 8)); nh->ttl = new_ttl; } -static int set_ipv4(struct sk_buff *skb, const struct ovs_key_ipv4 *ipv4_key) +static int set_ipv4(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_ipv4 *key, + const struct ovs_key_ipv4 *mask) { struct iphdr *nh; + __be32 new_addr; int err; - err = make_writable(skb, skb_network_offset(skb) + - sizeof(struct iphdr)); + err = skb_ensure_writable(skb, skb_network_offset(skb) + + sizeof(struct iphdr)); if (unlikely(err)) return err; nh = ip_hdr(skb); - if (ipv4_key->ipv4_src != nh->saddr) - set_ip_addr(skb, nh, &nh->saddr, ipv4_key->ipv4_src); - - if (ipv4_key->ipv4_dst != nh->daddr) - set_ip_addr(skb, nh, &nh->daddr, ipv4_key->ipv4_dst); + /* Setting an IP addresses is typically only a side effect of + * matching on them in the current userspace implementation, so it + * makes sense to check if the value actually changed. + */ + if (mask->ipv4_src) { + new_addr = OVS_MASKED(nh->saddr, key->ipv4_src, mask->ipv4_src); - if (ipv4_key->ipv4_tos != nh->tos) - ipv4_change_dsfield(nh, 0, ipv4_key->ipv4_tos); + if (unlikely(new_addr != nh->saddr)) { + set_ip_addr(skb, nh, &nh->saddr, new_addr); + flow_key->ipv4.addr.src = new_addr; + } + } + if (mask->ipv4_dst) { + new_addr = OVS_MASKED(nh->daddr, key->ipv4_dst, mask->ipv4_dst); - if (ipv4_key->ipv4_ttl != nh->ttl) - set_ip_ttl(skb, nh, ipv4_key->ipv4_ttl); + if (unlikely(new_addr != nh->daddr)) { + set_ip_addr(skb, nh, &nh->daddr, new_addr); + flow_key->ipv4.addr.dst = new_addr; + } + } + if (mask->ipv4_tos) { + ipv4_change_dsfield(nh, ~mask->ipv4_tos, key->ipv4_tos); + flow_key->ip.tos = nh->tos; + } + if (mask->ipv4_ttl) { + set_ip_ttl(skb, nh, key->ipv4_ttl, mask->ipv4_ttl); + flow_key->ip.ttl = nh->ttl; + } return 0; } -static int set_ipv6(struct sk_buff *skb, const struct ovs_key_ipv6 *ipv6_key) +static bool is_ipv6_mask_nonzero(const __be32 addr[4]) +{ + return !!(addr[0] | addr[1] | addr[2] | addr[3]); +} + +static int set_ipv6(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_ipv6 *key, + const struct ovs_key_ipv6 *mask) { struct ipv6hdr *nh; int err; - __be32 *saddr; - __be32 *daddr; - err = make_writable(skb, skb_network_offset(skb) + - sizeof(struct ipv6hdr)); + err = skb_ensure_writable(skb, skb_network_offset(skb) + + sizeof(struct ipv6hdr)); if (unlikely(err)) return err; nh = ipv6_hdr(skb); - saddr = (__be32 *)&nh->saddr; - daddr = (__be32 *)&nh->daddr; - - if (memcmp(ipv6_key->ipv6_src, saddr, sizeof(ipv6_key->ipv6_src))) - set_ipv6_addr(skb, ipv6_key->ipv6_proto, saddr, - ipv6_key->ipv6_src, true); - if (memcmp(ipv6_key->ipv6_dst, daddr, sizeof(ipv6_key->ipv6_dst))) { + /* Setting an IP addresses is typically only a side effect of + * matching on them in the current userspace implementation, so it + * makes sense to check if the value actually changed. + */ + if (is_ipv6_mask_nonzero(mask->ipv6_src)) { + __be32 *saddr = (__be32 *)&nh->saddr; + __be32 masked[4]; + + mask_ipv6_addr(saddr, key->ipv6_src, mask->ipv6_src, masked); + + if (unlikely(memcmp(saddr, masked, sizeof(masked)))) { + set_ipv6_addr(skb, flow_key->ip.proto, saddr, masked, + true); + memcpy(&flow_key->ipv6.addr.src, masked, + sizeof(flow_key->ipv6.addr.src)); + } + } + if (is_ipv6_mask_nonzero(mask->ipv6_dst)) { unsigned int offset = 0; int flags = IP6_FH_F_SKIP_RH; bool recalc_csum = true; - - if (ipv6_ext_hdr(nh->nexthdr)) - recalc_csum = ipv6_find_hdr(skb, &offset, - NEXTHDR_ROUTING, NULL, - &flags) != NEXTHDR_ROUTING; - - set_ipv6_addr(skb, ipv6_key->ipv6_proto, daddr, - ipv6_key->ipv6_dst, recalc_csum); + __be32 *daddr = (__be32 *)&nh->daddr; + __be32 masked[4]; + + mask_ipv6_addr(daddr, key->ipv6_dst, mask->ipv6_dst, masked); + + if (unlikely(memcmp(daddr, masked, sizeof(masked)))) { + if (ipv6_ext_hdr(nh->nexthdr)) + recalc_csum = (ipv6_find_hdr(skb, &offset, + NEXTHDR_ROUTING, + NULL, &flags) + != NEXTHDR_ROUTING); + + set_ipv6_addr(skb, flow_key->ip.proto, daddr, masked, + recalc_csum); + memcpy(&flow_key->ipv6.addr.dst, masked, + sizeof(flow_key->ipv6.addr.dst)); + } + } + if (mask->ipv6_tclass) { + set_ipv6_dsfield(skb, nh, key->ipv6_tclass, mask->ipv6_tclass); + flow_key->ip.tos = ipv6_get_dsfield(nh); + } + if (mask->ipv6_label) { + set_ipv6_fl(skb, nh, ntohl(key->ipv6_label), + ntohl(mask->ipv6_label)); + flow_key->ipv6.label = + *(__be32 *)nh & htonl(IPV6_FLOWINFO_FLOWLABEL); + } + if (mask->ipv6_hlimit) { + set_ipv6_ttl(skb, nh, key->ipv6_hlimit, mask->ipv6_hlimit); + flow_key->ip.ttl = nh->hop_limit; } - - set_ipv6_tc(nh, ipv6_key->ipv6_tclass); - set_ipv6_fl(nh, ntohl(ipv6_key->ipv6_label)); - nh->hop_limit = ipv6_key->ipv6_hlimit; - return 0; } -/* Must follow make_writable() since that can move the skb data. */ +/* Must follow skb_ensure_writable() since that can move the skb data. */ static void set_tp_port(struct sk_buff *skb, __be16 *port, - __be16 new_port, __sum16 *check) + __be16 new_port, __sum16 *check) { - inet_proto_csum_replace2(check, skb, *port, new_port, 0); + ovs_ct_clear(skb, NULL); + inet_proto_csum_replace2(check, skb, *port, new_port, false); *port = new_port; - skb->rxhash = 0; } -static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port) +static int set_udp(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_udp *key, + const struct ovs_key_udp *mask) { - struct udphdr *uh = udp_hdr(skb); + struct udphdr *uh; + __be16 src, dst; + int err; + + err = skb_ensure_writable(skb, skb_transport_offset(skb) + + sizeof(struct udphdr)); + if (unlikely(err)) + return err; + + uh = udp_hdr(skb); + /* Either of the masks is non-zero, so do not bother checking them. */ + src = OVS_MASKED(uh->source, key->udp_src, mask->udp_src); + dst = OVS_MASKED(uh->dest, key->udp_dst, mask->udp_dst); if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) { - set_tp_port(skb, port, new_port, &uh->check); + if (likely(src != uh->source)) { + set_tp_port(skb, &uh->source, src, &uh->check); + flow_key->tp.src = src; + } + if (likely(dst != uh->dest)) { + set_tp_port(skb, &uh->dest, dst, &uh->check); + flow_key->tp.dst = dst; + } - if (!uh->check) + if (unlikely(!uh->check)) uh->check = CSUM_MANGLED_0; } else { - *port = new_port; - skb->rxhash = 0; + uh->source = src; + uh->dest = dst; + flow_key->tp.src = src; + flow_key->tp.dst = dst; + ovs_ct_clear(skb, NULL); } + + skb_clear_hash(skb); + + return 0; } -static int set_udp(struct sk_buff *skb, const struct ovs_key_udp *udp_port_key) +static int set_tcp(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_tcp *key, + const struct ovs_key_tcp *mask) { - struct udphdr *uh; + struct tcphdr *th; + __be16 src, dst; int err; - err = make_writable(skb, skb_transport_offset(skb) + - sizeof(struct udphdr)); + err = skb_ensure_writable(skb, skb_transport_offset(skb) + + sizeof(struct tcphdr)); if (unlikely(err)) return err; - uh = udp_hdr(skb); - if (udp_port_key->udp_src != uh->source) - set_udp_port(skb, &uh->source, udp_port_key->udp_src); - - if (udp_port_key->udp_dst != uh->dest) - set_udp_port(skb, &uh->dest, udp_port_key->udp_dst); + th = tcp_hdr(skb); + src = OVS_MASKED(th->source, key->tcp_src, mask->tcp_src); + if (likely(src != th->source)) { + set_tp_port(skb, &th->source, src, &th->check); + flow_key->tp.src = src; + } + dst = OVS_MASKED(th->dest, key->tcp_dst, mask->tcp_dst); + if (likely(dst != th->dest)) { + set_tp_port(skb, &th->dest, dst, &th->check); + flow_key->tp.dst = dst; + } + skb_clear_hash(skb); return 0; } -static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key) +static int set_sctp(struct sk_buff *skb, struct sw_flow_key *flow_key, + const struct ovs_key_sctp *key, + const struct ovs_key_sctp *mask) { - struct tcphdr *th; + unsigned int sctphoff = skb_transport_offset(skb); + struct sctphdr *sh; + __le32 old_correct_csum, new_csum, old_csum; int err; - err = make_writable(skb, skb_transport_offset(skb) + - sizeof(struct tcphdr)); + err = skb_ensure_writable(skb, sctphoff + sizeof(struct sctphdr)); if (unlikely(err)) return err; - th = tcp_hdr(skb); - if (tcp_port_key->tcp_src != th->source) - set_tp_port(skb, &th->source, tcp_port_key->tcp_src, &th->check); + sh = sctp_hdr(skb); + old_csum = sh->checksum; + old_correct_csum = sctp_compute_cksum(skb, sctphoff); - if (tcp_port_key->tcp_dst != th->dest) - set_tp_port(skb, &th->dest, tcp_port_key->tcp_dst, &th->check); + sh->source = OVS_MASKED(sh->source, key->sctp_src, mask->sctp_src); + sh->dest = OVS_MASKED(sh->dest, key->sctp_dst, mask->sctp_dst); + + new_csum = sctp_compute_cksum(skb, sctphoff); + + /* Carry any checksum errors through. */ + sh->checksum = old_csum ^ old_correct_csum ^ new_csum; + + skb_clear_hash(skb); + ovs_ct_clear(skb, NULL); + + flow_key->tp.src = sh->source; + flow_key->tp.dst = sh->dest; return 0; } -static int do_output(struct datapath *dp, struct sk_buff *skb, int out_port) +static int ovs_vport_output(struct net *net, struct sock *sk, + struct sk_buff *skb) { - struct vport *vport; + struct ovs_frag_data *data = this_cpu_ptr(&ovs_pcpu_storage->frag_data); + struct vport *vport = data->vport; - if (unlikely(!skb)) + if (skb_cow_head(skb, data->l2_len) < 0) { + kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM); return -ENOMEM; + } - vport = ovs_vport_rcu(dp, out_port); - if (unlikely(!vport)) { - kfree_skb(skb); - return -ENODEV; + __skb_dst_copy(skb, data->dst); + *OVS_CB(skb) = data->cb; + skb->inner_protocol = data->inner_protocol; + if (data->vlan_tci & VLAN_CFI_MASK) + __vlan_hwaccel_put_tag(skb, data->vlan_proto, data->vlan_tci & ~VLAN_CFI_MASK); + else + __vlan_hwaccel_clear_tag(skb); + + /* Reconstruct the MAC header. */ + skb_push(skb, data->l2_len); + memcpy(skb->data, &data->l2_data, data->l2_len); + skb_postpush_rcsum(skb, skb->data, data->l2_len); + skb_reset_mac_header(skb); + + if (eth_p_mpls(skb->protocol)) { + skb->inner_network_header = skb->network_header; + skb_set_network_header(skb, data->network_offset); + skb_reset_mac_len(skb); } - ovs_vport_send(vport, skb); + ovs_vport_send(vport, skb, data->mac_proto); return 0; } +static unsigned int +ovs_dst_get_mtu(const struct dst_entry *dst) +{ + return dst->dev->mtu; +} + +static struct dst_ops ovs_dst_ops = { + .family = AF_UNSPEC, + .mtu = ovs_dst_get_mtu, +}; + +/* prepare_frag() is called once per (larger-than-MTU) frame; its inverse is + * ovs_vport_output(), which is called once per fragmented packet. + */ +static void prepare_frag(struct vport *vport, struct sk_buff *skb, + u16 orig_network_offset, u8 mac_proto) +{ + unsigned int hlen = skb_network_offset(skb); + struct ovs_frag_data *data; + + data = this_cpu_ptr(&ovs_pcpu_storage->frag_data); + data->dst = skb->_skb_refdst; + data->vport = vport; + data->cb = *OVS_CB(skb); + data->inner_protocol = skb->inner_protocol; + data->network_offset = orig_network_offset; + if (skb_vlan_tag_present(skb)) + data->vlan_tci = skb_vlan_tag_get(skb) | VLAN_CFI_MASK; + else + data->vlan_tci = 0; + data->vlan_proto = skb->vlan_proto; + data->mac_proto = mac_proto; + data->l2_len = hlen; + memcpy(&data->l2_data, skb->data, hlen); + + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + skb_pull(skb, hlen); +} + +static void ovs_fragment(struct net *net, struct vport *vport, + struct sk_buff *skb, u16 mru, + struct sw_flow_key *key) +{ + enum ovs_drop_reason reason; + u16 orig_network_offset = 0; + + if (eth_p_mpls(skb->protocol)) { + orig_network_offset = skb_network_offset(skb); + skb->network_header = skb->inner_network_header; + } + + if (skb_network_offset(skb) > MAX_L2_LEN) { + OVS_NLERR(1, "L2 header too long to fragment"); + reason = OVS_DROP_FRAG_L2_TOO_LONG; + goto err; + } + + if (key->eth.type == htons(ETH_P_IP)) { + struct rtable ovs_rt = { 0 }; + unsigned long orig_dst; + + prepare_frag(vport, skb, orig_network_offset, + ovs_key_mac_proto(key)); + dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, + DST_OBSOLETE_NONE, DST_NOCOUNT); + ovs_rt.dst.dev = vport->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &ovs_rt.dst); + IPCB(skb)->frag_max_size = mru; + + ip_do_fragment(net, skb->sk, skb, ovs_vport_output); + refdst_drop(orig_dst); + } else if (key->eth.type == htons(ETH_P_IPV6)) { + unsigned long orig_dst; + struct rt6_info ovs_rt; + + prepare_frag(vport, skb, orig_network_offset, + ovs_key_mac_proto(key)); + memset(&ovs_rt, 0, sizeof(ovs_rt)); + dst_init(&ovs_rt.dst, &ovs_dst_ops, NULL, + DST_OBSOLETE_NONE, DST_NOCOUNT); + ovs_rt.dst.dev = vport->dev; + + orig_dst = skb->_skb_refdst; + skb_dst_set_noref(skb, &ovs_rt.dst); + IP6CB(skb)->frag_max_size = mru; + + ipv6_stub->ipv6_fragment(net, skb->sk, skb, ovs_vport_output); + refdst_drop(orig_dst); + } else { + WARN_ONCE(1, "Failed fragment ->%s: eth=%04x, MRU=%d, MTU=%d.", + ovs_vport_name(vport), ntohs(key->eth.type), mru, + vport->dev->mtu); + reason = OVS_DROP_FRAG_INVALID_PROTO; + goto err; + } + + return; +err: + ovs_kfree_skb_reason(skb, reason); +} + +static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port, + struct sw_flow_key *key) +{ + struct vport *vport = ovs_vport_rcu(dp, out_port); + + if (likely(vport && + netif_running(vport->dev) && + netif_carrier_ok(vport->dev))) { + u16 mru = OVS_CB(skb)->mru; + u32 cutlen = OVS_CB(skb)->cutlen; + + if (unlikely(cutlen > 0)) { + if (skb->len - cutlen > ovs_mac_header_len(key)) + pskb_trim(skb, skb->len - cutlen); + else + pskb_trim(skb, ovs_mac_header_len(key)); + } + + if (likely(!mru || + (skb->len <= mru + vport->dev->hard_header_len))) { + ovs_vport_send(vport, skb, ovs_key_mac_proto(key)); + } else if (mru <= vport->dev->mtu) { + struct net *net = read_pnet(&dp->net); + + ovs_fragment(net, vport, skb, mru, key); + } else { + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); + } + } else { + kfree_skb_reason(skb, SKB_DROP_REASON_DEV_READY); + } +} + static int output_userspace(struct datapath *dp, struct sk_buff *skb, - const struct nlattr *attr) + struct sw_flow_key *key, const struct nlattr *attr, + const struct nlattr *actions, int actions_len, + uint32_t cutlen) { struct dp_upcall_info upcall; const struct nlattr *a; int rem; + memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_ACTION; - upcall.key = &OVS_CB(skb)->flow->key; - upcall.userdata = NULL; - upcall.portid = 0; + upcall.mru = OVS_CB(skb)->mru; - for (a = nla_data(attr), rem = nla_len(attr); rem > 0; - a = nla_next(a, &rem)) { + nla_for_each_nested(a, attr, rem) { switch (nla_type(a)) { case OVS_USERSPACE_ATTR_USERDATA: upcall.userdata = a; break; case OVS_USERSPACE_ATTR_PID: - upcall.portid = nla_get_u32(a); + if (OVS_CB(skb)->upcall_pid) + upcall.portid = OVS_CB(skb)->upcall_pid; + else if (dp->user_features & + OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + upcall.portid = + ovs_dp_get_upcall_portid(dp, + smp_processor_id()); + else + upcall.portid = nla_get_u32(a); + break; + + case OVS_USERSPACE_ATTR_EGRESS_TUN_PORT: { + /* Get out tunnel info. */ + struct vport *vport; + + vport = ovs_vport_rcu(dp, nla_get_u32(a)); + if (vport) { + int err; + + err = dev_fill_metadata_dst(vport->dev, skb); + if (!err) + upcall.egress_tun_info = skb_tunnel_info(skb); + } + break; } + + case OVS_USERSPACE_ATTR_ACTIONS: { + /* Include actions. */ + upcall.actions = actions; + upcall.actions_len = actions_len; + break; + } + + } /* End of switch. */ } - return ovs_dp_upcall(dp, skb, &upcall); + return ovs_dp_upcall(dp, skb, key, &upcall, cutlen); +} + +static int dec_ttl_exception_handler(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr) +{ + /* The first attribute is always 'OVS_DEC_TTL_ATTR_ACTION'. */ + struct nlattr *actions = nla_data(attr); + + if (nla_len(actions)) + return clone_execute(dp, skb, key, 0, nla_data(actions), + nla_len(actions), true, false); + + ovs_kfree_skb_reason(skb, OVS_DROP_IP_TTL); + return 0; } +/* When 'last' is true, sample() should always consume the 'skb'. + * Otherwise, sample() should keep 'skb' intact regardless what + * actions are executed within sample(). + */ static int sample(struct datapath *dp, struct sk_buff *skb, - const struct nlattr *attr) + struct sw_flow_key *key, const struct nlattr *attr, + bool last) { - const struct nlattr *acts_list = NULL; - const struct nlattr *a; - int rem; + struct nlattr *actions; + struct nlattr *sample_arg; + int rem = nla_len(attr); + const struct sample_arg *arg; + u32 init_probability; + bool clone_flow_key; + int err; - for (a = nla_data(attr), rem = nla_len(attr); rem > 0; - a = nla_next(a, &rem)) { - switch (nla_type(a)) { - case OVS_SAMPLE_ATTR_PROBABILITY: - if (net_random() >= nla_get_u32(a)) - return 0; - break; + /* The first action is always 'OVS_SAMPLE_ATTR_ARG'. */ + sample_arg = nla_data(attr); + arg = nla_data(sample_arg); + actions = nla_next(sample_arg, &rem); + init_probability = OVS_CB(skb)->probability; - case OVS_SAMPLE_ATTR_ACTIONS: - acts_list = a; - break; - } + if ((arg->probability != U32_MAX) && + (!arg->probability || get_random_u32() > arg->probability)) { + if (last) + ovs_kfree_skb_reason(skb, OVS_DROP_LAST_ACTION); + return 0; + } + + OVS_CB(skb)->probability = arg->probability; + + clone_flow_key = !arg->exec; + err = clone_execute(dp, skb, key, 0, actions, rem, last, + clone_flow_key); + + if (!last) + OVS_CB(skb)->probability = init_probability; + + return err; +} + +/* When 'last' is true, clone() should always consume the 'skb'. + * Otherwise, clone() should keep 'skb' intact regardless what + * actions are executed within clone(). + */ +static int clone(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, const struct nlattr *attr, + bool last) +{ + struct nlattr *actions; + struct nlattr *clone_arg; + int rem = nla_len(attr); + bool dont_clone_flow_key; + + /* The first action is always 'OVS_CLONE_ATTR_EXEC'. */ + clone_arg = nla_data(attr); + dont_clone_flow_key = nla_get_u32(clone_arg); + actions = nla_next(clone_arg, &rem); + + return clone_execute(dp, skb, key, 0, actions, rem, last, + !dont_clone_flow_key); +} + +static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, + const struct nlattr *attr) +{ + struct ovs_action_hash *hash_act = nla_data(attr); + u32 hash = 0; + + if (hash_act->hash_alg == OVS_HASH_ALG_L4) { + /* OVS_HASH_ALG_L4 hasing type. */ + hash = skb_get_hash(skb); + } else if (hash_act->hash_alg == OVS_HASH_ALG_SYM_L4) { + /* OVS_HASH_ALG_SYM_L4 hashing type. NOTE: this doesn't + * extend past an encapsulated header. + */ + hash = __skb_get_hash_symmetric(skb); } - return do_execute_actions(dp, skb, nla_data(acts_list), - nla_len(acts_list), true); + hash = jhash_1word(hash, hash_act->hash_basis); + if (!hash) + hash = 0x1; + + key->ovs_flow_hash = hash; } static int execute_set_action(struct sk_buff *skb, - const struct nlattr *nested_attr) + struct sw_flow_key *flow_key, + const struct nlattr *a) +{ + /* Only tunnel set execution is supported without a mask. */ + if (nla_type(a) == OVS_KEY_ATTR_TUNNEL_INFO) { + struct ovs_tunnel_info *tun = nla_data(a); + + skb_dst_drop(skb); + dst_hold((struct dst_entry *)tun->tun_dst); + skb_dst_set(skb, (struct dst_entry *)tun->tun_dst); + return 0; + } + + return -EINVAL; +} + +/* Mask is at the midpoint of the data. */ +#define get_mask(a, type) ((const type)nla_data(a) + 1) + +static int execute_masked_set_action(struct sk_buff *skb, + struct sw_flow_key *flow_key, + const struct nlattr *a) { int err = 0; - switch (nla_type(nested_attr)) { + switch (nla_type(a)) { case OVS_KEY_ATTR_PRIORITY: - skb->priority = nla_get_u32(nested_attr); + OVS_SET_MASKED(skb->priority, nla_get_u32(a), + *get_mask(a, u32 *)); + flow_key->phy.priority = skb->priority; break; case OVS_KEY_ATTR_SKB_MARK: - skb->mark = nla_get_u32(nested_attr); + OVS_SET_MASKED(skb->mark, nla_get_u32(a), *get_mask(a, u32 *)); + flow_key->phy.skb_mark = skb->mark; break; - case OVS_KEY_ATTR_IPV4_TUNNEL: - OVS_CB(skb)->tun_key = nla_data(nested_attr); + case OVS_KEY_ATTR_TUNNEL_INFO: + /* Masked data not supported for tunnel. */ + err = -EINVAL; break; case OVS_KEY_ATTR_ETHERNET: - err = set_eth_addr(skb, nla_data(nested_attr)); + err = set_eth_addr(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_ethernet *)); break; case OVS_KEY_ATTR_IPV4: - err = set_ipv4(skb, nla_data(nested_attr)); + err = set_ipv4(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_ipv4 *)); break; case OVS_KEY_ATTR_IPV6: - err = set_ipv6(skb, nla_data(nested_attr)); + err = set_ipv6(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_ipv6 *)); break; case OVS_KEY_ATTR_TCP: - err = set_tcp(skb, nla_data(nested_attr)); + err = set_tcp(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_tcp *)); break; case OVS_KEY_ATTR_UDP: - err = set_udp(skb, nla_data(nested_attr)); + err = set_udp(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_udp *)); + break; + + case OVS_KEY_ATTR_SCTP: + err = set_sctp(skb, flow_key, nla_data(a), + get_mask(a, struct ovs_key_sctp *)); + break; + + case OVS_KEY_ATTR_MPLS: + err = set_mpls(skb, flow_key, nla_data(a), get_mask(a, + __be32 *)); + break; + + case OVS_KEY_ATTR_CT_STATE: + case OVS_KEY_ATTR_CT_ZONE: + case OVS_KEY_ATTR_CT_MARK: + case OVS_KEY_ATTR_CT_LABELS: + case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4: + case OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6: + case OVS_KEY_ATTR_NSH: + err = -EINVAL; break; } return err; } +static int execute_recirc(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *a, bool last) +{ + u32 recirc_id; + + if (!is_flow_key_valid(key)) { + int err; + + err = ovs_flow_key_update(skb, key); + if (err) + return err; + } + BUG_ON(!is_flow_key_valid(key)); + + recirc_id = nla_get_u32(a); + return clone_execute(dp, skb, key, recirc_id, NULL, 0, last, true); +} + +static int execute_check_pkt_len(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + const struct nlattr *attr, bool last) +{ + struct ovs_skb_cb *ovs_cb = OVS_CB(skb); + const struct nlattr *actions, *cpl_arg; + int len, max_len, rem = nla_len(attr); + const struct check_pkt_len_arg *arg; + bool clone_flow_key; + + /* The first netlink attribute in 'attr' is always + * 'OVS_CHECK_PKT_LEN_ATTR_ARG'. + */ + cpl_arg = nla_data(attr); + arg = nla_data(cpl_arg); + + len = ovs_cb->mru ? ovs_cb->mru + skb->mac_len : skb->len; + max_len = arg->pkt_len; + + if ((skb_is_gso(skb) && skb_gso_validate_mac_len(skb, max_len)) || + len <= max_len) { + /* Second netlink attribute in 'attr' is always + * 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL'. + */ + actions = nla_next(cpl_arg, &rem); + clone_flow_key = !arg->exec_for_lesser_equal; + } else { + /* Third netlink attribute in 'attr' is always + * 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER'. + */ + actions = nla_next(cpl_arg, &rem); + actions = nla_next(actions, &rem); + clone_flow_key = !arg->exec_for_greater; + } + + return clone_execute(dp, skb, key, 0, nla_data(actions), + nla_len(actions), last, clone_flow_key); +} + +static int execute_dec_ttl(struct sk_buff *skb, struct sw_flow_key *key) +{ + int err; + + if (skb->protocol == htons(ETH_P_IPV6)) { + struct ipv6hdr *nh; + + err = skb_ensure_writable(skb, skb_network_offset(skb) + + sizeof(*nh)); + if (unlikely(err)) + return err; + + nh = ipv6_hdr(skb); + + if (nh->hop_limit <= 1) + return -EHOSTUNREACH; + + key->ip.ttl = --nh->hop_limit; + } else if (skb->protocol == htons(ETH_P_IP)) { + struct iphdr *nh; + u8 old_ttl; + + err = skb_ensure_writable(skb, skb_network_offset(skb) + + sizeof(*nh)); + if (unlikely(err)) + return err; + + nh = ip_hdr(skb); + if (nh->ttl <= 1) + return -EHOSTUNREACH; + + old_ttl = nh->ttl--; + csum_replace2(&nh->check, htons(old_ttl << 8), + htons(nh->ttl << 8)); + key->ip.ttl = nh->ttl; + } + return 0; +} + +#if IS_ENABLED(CONFIG_PSAMPLE) +static void execute_psample(struct datapath *dp, struct sk_buff *skb, + const struct nlattr *attr) +{ + struct psample_group psample_group = {}; + struct psample_metadata md = {}; + const struct nlattr *a; + u32 rate; + int rem; + + nla_for_each_attr(a, nla_data(attr), nla_len(attr), rem) { + switch (nla_type(a)) { + case OVS_PSAMPLE_ATTR_GROUP: + psample_group.group_num = nla_get_u32(a); + break; + + case OVS_PSAMPLE_ATTR_COOKIE: + md.user_cookie = nla_data(a); + md.user_cookie_len = nla_len(a); + break; + } + } + + psample_group.net = ovs_dp_get_net(dp); + md.in_ifindex = OVS_CB(skb)->input_vport->dev->ifindex; + md.trunc_size = skb->len - OVS_CB(skb)->cutlen; + md.rate_as_probability = 1; + + rate = OVS_CB(skb)->probability ? OVS_CB(skb)->probability : U32_MAX; + + psample_sample_packet(&psample_group, skb, rate, &md); +} +#else +static void execute_psample(struct datapath *dp, struct sk_buff *skb, + const struct nlattr *attr) +{} +#endif + /* Execute a list of actions against 'skb'. */ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, - const struct nlattr *attr, int len, bool keep_skb) + struct sw_flow_key *key, + const struct nlattr *attr, int len) { - /* Every output action needs a separate clone of 'skb', but the common - * case is just a single output action, so that doing a clone and - * then freeing the original skbuff is wasteful. So the following code - * is slightly obscure just to avoid that. */ - int prev_port = -1; const struct nlattr *a; int rem; @@ -480,61 +1258,340 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, a = nla_next(a, &rem)) { int err = 0; - if (prev_port != -1) { - do_output(dp, skb_clone(skb, GFP_ATOMIC), prev_port); - prev_port = -1; - } + if (trace_ovs_do_execute_action_enabled()) + trace_ovs_do_execute_action(dp, skb, key, a, rem); + /* Actions that rightfully have to consume the skb should do it + * and return directly. + */ switch (nla_type(a)) { - case OVS_ACTION_ATTR_OUTPUT: - prev_port = nla_get_u32(a); + case OVS_ACTION_ATTR_OUTPUT: { + int port = nla_get_u32(a); + struct sk_buff *clone; + + /* Every output action needs a separate clone + * of 'skb', In case the output action is the + * last action, cloning can be avoided. + */ + if (nla_is_last(a, rem)) { + do_output(dp, skb, port, key); + /* 'skb' has been used for output. + */ + return 0; + } + + clone = skb_clone(skb, GFP_ATOMIC); + if (clone) + do_output(dp, clone, port, key); + OVS_CB(skb)->cutlen = 0; break; + } + + case OVS_ACTION_ATTR_TRUNC: { + struct ovs_action_trunc *trunc = nla_data(a); + + if (skb->len > trunc->max_len) + OVS_CB(skb)->cutlen = skb->len - trunc->max_len; + break; + } case OVS_ACTION_ATTR_USERSPACE: - output_userspace(dp, skb, a); + output_userspace(dp, skb, key, a, attr, + len, OVS_CB(skb)->cutlen); + OVS_CB(skb)->cutlen = 0; + if (nla_is_last(a, rem)) { + consume_skb(skb); + return 0; + } + break; + + case OVS_ACTION_ATTR_HASH: + execute_hash(skb, key, a); + break; + + case OVS_ACTION_ATTR_PUSH_MPLS: { + struct ovs_action_push_mpls *mpls = nla_data(a); + + err = push_mpls(skb, key, mpls->mpls_lse, + mpls->mpls_ethertype, skb->mac_len); + break; + } + case OVS_ACTION_ATTR_ADD_MPLS: { + struct ovs_action_add_mpls *mpls = nla_data(a); + __u16 mac_len = 0; + + if (mpls->tun_flags & OVS_MPLS_L3_TUNNEL_FLAG_MASK) + mac_len = skb->mac_len; + + err = push_mpls(skb, key, mpls->mpls_lse, + mpls->mpls_ethertype, mac_len); + break; + } + case OVS_ACTION_ATTR_POP_MPLS: + err = pop_mpls(skb, key, nla_get_be16(a)); break; case OVS_ACTION_ATTR_PUSH_VLAN: - err = push_vlan(skb, nla_data(a)); - if (unlikely(err)) /* skb already freed. */ - return err; + err = push_vlan(skb, key, nla_data(a)); break; case OVS_ACTION_ATTR_POP_VLAN: - err = pop_vlan(skb); + err = pop_vlan(skb, key); + break; + + case OVS_ACTION_ATTR_RECIRC: { + bool last = nla_is_last(a, rem); + + err = execute_recirc(dp, skb, key, a, last); + if (last) { + /* If this is the last action, the skb has + * been consumed or freed. + * Return immediately. + */ + return err; + } break; + } case OVS_ACTION_ATTR_SET: - err = execute_set_action(skb, nla_data(a)); + err = execute_set_action(skb, key, nla_data(a)); + break; + + case OVS_ACTION_ATTR_SET_MASKED: + case OVS_ACTION_ATTR_SET_TO_MASKED: + err = execute_masked_set_action(skb, key, nla_data(a)); + break; + + case OVS_ACTION_ATTR_SAMPLE: { + bool last = nla_is_last(a, rem); + + err = sample(dp, skb, key, a, last); + if (last) + return err; + + break; + } + + case OVS_ACTION_ATTR_CT: + if (!is_flow_key_valid(key)) { + err = ovs_flow_key_update(skb, key); + if (err) + return err; + } + + err = ovs_ct_execute(ovs_dp_get_net(dp), skb, key, + nla_data(a)); + + /* Hide stolen IP fragments from user space. */ + if (err) + return err == -EINPROGRESS ? 0 : err; + break; + + case OVS_ACTION_ATTR_CT_CLEAR: + err = ovs_ct_clear(skb, key); + break; + + case OVS_ACTION_ATTR_PUSH_ETH: + err = push_eth(skb, key, nla_data(a)); + break; + + case OVS_ACTION_ATTR_POP_ETH: + err = pop_eth(skb, key); + break; + + case OVS_ACTION_ATTR_PUSH_NSH: + err = push_nsh(skb, key, nla_data(a)); + break; + + case OVS_ACTION_ATTR_POP_NSH: + err = pop_nsh(skb, key); break; - case OVS_ACTION_ATTR_SAMPLE: - err = sample(dp, skb, a); + case OVS_ACTION_ATTR_METER: + if (ovs_meter_execute(dp, skb, key, nla_get_u32(a))) { + ovs_kfree_skb_reason(skb, OVS_DROP_METER); + return 0; + } + break; + + case OVS_ACTION_ATTR_CLONE: { + bool last = nla_is_last(a, rem); + + err = clone(dp, skb, key, a, last); + if (last) + return err; + + break; + } + + case OVS_ACTION_ATTR_CHECK_PKT_LEN: { + bool last = nla_is_last(a, rem); + + err = execute_check_pkt_len(dp, skb, key, a, last); + if (last) + return err; + + break; + } + + case OVS_ACTION_ATTR_DEC_TTL: + err = execute_dec_ttl(skb, key); + if (err == -EHOSTUNREACH) + return dec_ttl_exception_handler(dp, skb, + key, a); + break; + + case OVS_ACTION_ATTR_DROP: { + enum ovs_drop_reason reason = nla_get_u32(a) + ? OVS_DROP_EXPLICIT_WITH_ERROR + : OVS_DROP_EXPLICIT; + + ovs_kfree_skb_reason(skb, reason); + return 0; + } + + case OVS_ACTION_ATTR_PSAMPLE: + execute_psample(dp, skb, a); + OVS_CB(skb)->cutlen = 0; + if (nla_is_last(a, rem)) { + consume_skb(skb); + return 0; + } break; } if (unlikely(err)) { - kfree_skb(skb); + ovs_kfree_skb_reason(skb, OVS_DROP_ACTION_ERROR); return err; } } - if (prev_port != -1) { - if (keep_skb) - skb = skb_clone(skb, GFP_ATOMIC); + ovs_kfree_skb_reason(skb, OVS_DROP_LAST_ACTION); + return 0; +} - do_output(dp, skb, prev_port); - } else if (!keep_skb) - consume_skb(skb); +/* Execute the actions on the clone of the packet. The effect of the + * execution does not affect the original 'skb' nor the original 'key'. + * + * The execution may be deferred in case the actions can not be executed + * immediately. + */ +static int clone_execute(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, u32 recirc_id, + const struct nlattr *actions, int len, + bool last, bool clone_flow_key) +{ + struct deferred_action *da; + struct sw_flow_key *clone; + + skb = last ? skb : skb_clone(skb, GFP_ATOMIC); + if (!skb) { + /* Out of memory, skip this action. + */ + return 0; + } + + /* When clone_flow_key is false, the 'key' will not be change + * by the actions, then the 'key' can be used directly. + * Otherwise, try to clone key from the next recursion level of + * 'flow_keys'. If clone is successful, execute the actions + * without deferring. + */ + clone = clone_flow_key ? clone_key(key) : key; + if (clone) { + int err = 0; + if (actions) { /* Sample action */ + if (clone_flow_key) + __this_cpu_inc(ovs_pcpu_storage->exec_level); + + err = do_execute_actions(dp, skb, clone, + actions, len); + + if (clone_flow_key) + __this_cpu_dec(ovs_pcpu_storage->exec_level); + } else { /* Recirc action */ + clone->recirc_id = recirc_id; + ovs_dp_process_packet(skb, clone); + } + return err; + } + /* Out of 'flow_keys' space. Defer actions */ + da = add_deferred_actions(skb, key, actions, len); + if (da) { + if (!actions) { /* Recirc action */ + key = &da->pkt_key; + key->recirc_id = recirc_id; + } + } else { + /* Out of per CPU action FIFO space. Drop the 'skb' and + * log an error. + */ + ovs_kfree_skb_reason(skb, OVS_DROP_DEFERRED_LIMIT); + + if (net_ratelimit()) { + if (actions) { /* Sample action */ + pr_warn("%s: deferred action limit reached, drop sample action\n", + ovs_dp_name(dp)); + } else { /* Recirc action */ + pr_warn("%s: deferred action limit reached, drop recirc action (recirc_id=%#x)\n", + ovs_dp_name(dp), recirc_id); + } + } + } return 0; } +static void process_deferred_actions(struct datapath *dp) +{ + struct action_fifo *fifo = this_cpu_ptr(&ovs_pcpu_storage->action_fifos); + + /* Do not touch the FIFO in case there is no deferred actions. */ + if (action_fifo_is_empty(fifo)) + return; + + /* Finishing executing all deferred actions. */ + do { + struct deferred_action *da = action_fifo_get(fifo); + struct sk_buff *skb = da->skb; + struct sw_flow_key *key = &da->pkt_key; + const struct nlattr *actions = da->actions; + int actions_len = da->actions_len; + + if (actions) + do_execute_actions(dp, skb, key, actions, actions_len); + else + ovs_dp_process_packet(skb, key); + } while (!action_fifo_is_empty(fifo)); + + /* Reset FIFO for the next packet. */ + action_fifo_init(fifo); +} + /* Execute a list of actions against 'skb'. */ -int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb) +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_actions *acts, + struct sw_flow_key *key) { - struct sw_flow_actions *acts = rcu_dereference(OVS_CB(skb)->flow->sf_acts); + int err, level; + + level = __this_cpu_inc_return(ovs_pcpu_storage->exec_level); + if (unlikely(level > OVS_RECURSION_LIMIT)) { + net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n", + ovs_dp_name(dp)); + ovs_kfree_skb_reason(skb, OVS_DROP_RECURSION_LIMIT); + err = -ENETDOWN; + goto out; + } - return do_execute_actions(dp, skb, acts->actions, - acts->actions_len, false); + OVS_CB(skb)->acts_origlen = acts->orig_len; + err = do_execute_actions(dp, skb, key, + acts->actions, acts->actions_len); + + if (level == 1) + process_deferred_actions(dp); + +out: + __this_cpu_dec(ovs_pcpu_storage->exec_level); + return err; } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c new file mode 100644 index 000000000000..a0811e1fba65 --- /dev/null +++ b/net/openvswitch/conntrack.c @@ -0,0 +1,2031 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2015 Nicira, Inc. + */ + +#include <linux/module.h> +#include <linux/openvswitch.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/sctp.h> +#include <linux/static_key.h> +#include <linux/string_helpers.h> +#include <net/ip.h> +#include <net/genetlink.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_conntrack_count.h> +#include <net/netfilter/nf_conntrack_helper.h> +#include <net/netfilter/nf_conntrack_labels.h> +#include <net/netfilter/nf_conntrack_seqadj.h> +#include <net/netfilter/nf_conntrack_timeout.h> +#include <net/netfilter/nf_conntrack_zones.h> +#include <net/netfilter/ipv6/nf_defrag_ipv6.h> +#include <net/ipv6_frag.h> + +#if IS_ENABLED(CONFIG_NF_NAT) +#include <net/netfilter/nf_nat.h> +#endif + +#include <net/netfilter/nf_conntrack_act_ct.h> + +#include "datapath.h" +#include "drop.h" +#include "conntrack.h" +#include "flow.h" +#include "flow_netlink.h" + +struct ovs_ct_len_tbl { + int maxlen; + int minlen; +}; + +/* Metadata mark for masked write to conntrack mark */ +struct md_mark { + u32 value; + u32 mask; +}; + +/* Metadata label for masked write to conntrack label. */ +struct md_labels { + struct ovs_key_ct_labels value; + struct ovs_key_ct_labels mask; +}; + +enum ovs_ct_nat { + OVS_CT_NAT = 1 << 0, /* NAT for committed connections only. */ + OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */ + OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */ +}; + +/* Conntrack action context for execution. */ +struct ovs_conntrack_info { + struct nf_conntrack_helper *helper; + struct nf_conntrack_zone zone; + struct nf_conn *ct; + u8 commit : 1; + u8 nat : 3; /* enum ovs_ct_nat */ + u8 force : 1; + u8 have_eventmask : 1; + u16 family; + u32 eventmask; /* Mask of 1 << IPCT_*. */ + struct md_mark mark; + struct md_labels labels; + char timeout[CTNL_TIMEOUT_NAME_MAX]; + struct nf_ct_timeout *nf_ct_timeout; +#if IS_ENABLED(CONFIG_NF_NAT) + struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. */ +#endif +}; + +#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) +#define OVS_CT_LIMIT_UNLIMITED 0 +#define OVS_CT_LIMIT_DEFAULT OVS_CT_LIMIT_UNLIMITED +#define CT_LIMIT_HASH_BUCKETS 512 +static DEFINE_STATIC_KEY_FALSE(ovs_ct_limit_enabled); + +struct ovs_ct_limit { + /* Elements in ovs_ct_limit_info->limits hash table */ + struct hlist_node hlist_node; + struct rcu_head rcu; + u16 zone; + u32 limit; +}; + +struct ovs_ct_limit_info { + u32 default_limit; + struct hlist_head *limits; + struct nf_conncount_data *data; +}; + +static const struct nla_policy ct_limit_policy[OVS_CT_LIMIT_ATTR_MAX + 1] = { + [OVS_CT_LIMIT_ATTR_ZONE_LIMIT] = { .type = NLA_NESTED, }, +}; +#endif + +static bool labels_nonzero(const struct ovs_key_ct_labels *labels); + +static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info); + +static u16 key_to_nfproto(const struct sw_flow_key *key) +{ + switch (ntohs(key->eth.type)) { + case ETH_P_IP: + return NFPROTO_IPV4; + case ETH_P_IPV6: + return NFPROTO_IPV6; + default: + return NFPROTO_UNSPEC; + } +} + +/* Map SKB connection state into the values used by flow definition. */ +static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo) +{ + u8 ct_state = OVS_CS_F_TRACKED; + + switch (ctinfo) { + case IP_CT_ESTABLISHED_REPLY: + case IP_CT_RELATED_REPLY: + ct_state |= OVS_CS_F_REPLY_DIR; + break; + default: + break; + } + + switch (ctinfo) { + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + ct_state |= OVS_CS_F_ESTABLISHED; + break; + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + ct_state |= OVS_CS_F_RELATED; + break; + case IP_CT_NEW: + ct_state |= OVS_CS_F_NEW; + break; + default: + break; + } + + return ct_state; +} + +static u32 ovs_ct_get_mark(const struct nf_conn *ct) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + return ct ? READ_ONCE(ct->mark) : 0; +#else + return 0; +#endif +} + +/* Guard against conntrack labels max size shrinking below 128 bits. */ +#if NF_CT_LABELS_MAX_SIZE < 16 +#error NF_CT_LABELS_MAX_SIZE must be at least 16 bytes +#endif + +static void ovs_ct_get_labels(const struct nf_conn *ct, + struct ovs_key_ct_labels *labels) +{ + struct nf_conn_labels *cl = NULL; + + if (ct) { + if (ct->master && !nf_ct_is_confirmed(ct)) + ct = ct->master; + cl = nf_ct_labels_find(ct); + } + if (cl) + memcpy(labels, cl->bits, OVS_CT_LABELS_LEN); + else + memset(labels, 0, OVS_CT_LABELS_LEN); +} + +static void __ovs_ct_update_key_orig_tp(struct sw_flow_key *key, + const struct nf_conntrack_tuple *orig, + u8 icmp_proto) +{ + key->ct_orig_proto = orig->dst.protonum; + if (orig->dst.protonum == icmp_proto) { + key->ct.orig_tp.src = htons(orig->dst.u.icmp.type); + key->ct.orig_tp.dst = htons(orig->dst.u.icmp.code); + } else { + key->ct.orig_tp.src = orig->src.u.all; + key->ct.orig_tp.dst = orig->dst.u.all; + } +} + +static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state, + const struct nf_conntrack_zone *zone, + const struct nf_conn *ct) +{ + key->ct_state = state; + key->ct_zone = zone->id; + key->ct.mark = ovs_ct_get_mark(ct); + ovs_ct_get_labels(ct, &key->ct.labels); + + if (ct) { + const struct nf_conntrack_tuple *orig; + + /* Use the master if we have one. */ + if (ct->master) + ct = ct->master; + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + + /* IP version must match with the master connection. */ + if (key->eth.type == htons(ETH_P_IP) && + nf_ct_l3num(ct) == NFPROTO_IPV4) { + key->ipv4.ct_orig.src = orig->src.u3.ip; + key->ipv4.ct_orig.dst = orig->dst.u3.ip; + __ovs_ct_update_key_orig_tp(key, orig, IPPROTO_ICMP); + return; + } else if (key->eth.type == htons(ETH_P_IPV6) && + !sw_flow_key_is_nd(key) && + nf_ct_l3num(ct) == NFPROTO_IPV6) { + key->ipv6.ct_orig.src = orig->src.u3.in6; + key->ipv6.ct_orig.dst = orig->dst.u3.in6; + __ovs_ct_update_key_orig_tp(key, orig, NEXTHDR_ICMP); + return; + } + } + /* Clear 'ct_orig_proto' to mark the non-existence of conntrack + * original direction key fields. + */ + key->ct_orig_proto = 0; +} + +/* Update 'key' based on skb->_nfct. If 'post_ct' is true, then OVS has + * previously sent the packet to conntrack via the ct action. If + * 'keep_nat_flags' is true, the existing NAT flags retained, else they are + * initialized from the connection status. + */ +static void ovs_ct_update_key(const struct sk_buff *skb, + const struct ovs_conntrack_info *info, + struct sw_flow_key *key, bool post_ct, + bool keep_nat_flags) +{ + const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt; + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + u8 state = 0; + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + state = ovs_ct_get_state(ctinfo); + /* All unconfirmed entries are NEW connections. */ + if (!nf_ct_is_confirmed(ct)) + state |= OVS_CS_F_NEW; + /* OVS persists the related flag for the duration of the + * connection. + */ + if (ct->master) + state |= OVS_CS_F_RELATED; + if (keep_nat_flags) { + state |= key->ct_state & OVS_CS_F_NAT_MASK; + } else { + if (ct->status & IPS_SRC_NAT) + state |= OVS_CS_F_SRC_NAT; + if (ct->status & IPS_DST_NAT) + state |= OVS_CS_F_DST_NAT; + } + zone = nf_ct_zone(ct); + } else if (post_ct) { + state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID; + if (info) + zone = &info->zone; + } + __ovs_ct_update_key(key, state, zone, ct); +} + +/* This is called to initialize CT key fields possibly coming in from the local + * stack. + */ +void ovs_ct_fill_key(const struct sk_buff *skb, + struct sw_flow_key *key, + bool post_ct) +{ + ovs_ct_update_key(skb, NULL, key, post_ct, false); +} + +int ovs_ct_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb) +{ + if (nla_put_u32(skb, OVS_KEY_ATTR_CT_STATE, output->ct_state)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + nla_put_u16(skb, OVS_KEY_ATTR_CT_ZONE, output->ct_zone)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, output->ct.mark)) + return -EMSGSIZE; + + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + nla_put(skb, OVS_KEY_ATTR_CT_LABELS, sizeof(output->ct.labels), + &output->ct.labels)) + return -EMSGSIZE; + + if (swkey->ct_orig_proto) { + if (swkey->eth.type == htons(ETH_P_IP)) { + struct ovs_key_ct_tuple_ipv4 orig; + + memset(&orig, 0, sizeof(orig)); + orig.ipv4_src = output->ipv4.ct_orig.src; + orig.ipv4_dst = output->ipv4.ct_orig.dst; + orig.src_port = output->ct.orig_tp.src; + orig.dst_port = output->ct.orig_tp.dst; + orig.ipv4_proto = output->ct_orig_proto; + + if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4, + sizeof(orig), &orig)) + return -EMSGSIZE; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + struct ovs_key_ct_tuple_ipv6 orig; + + memset(&orig, 0, sizeof(orig)); + memcpy(orig.ipv6_src, output->ipv6.ct_orig.src.s6_addr32, + sizeof(orig.ipv6_src)); + memcpy(orig.ipv6_dst, output->ipv6.ct_orig.dst.s6_addr32, + sizeof(orig.ipv6_dst)); + orig.src_port = output->ct.orig_tp.src; + orig.dst_port = output->ct.orig_tp.dst; + orig.ipv6_proto = output->ct_orig_proto; + + if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6, + sizeof(orig), &orig)) + return -EMSGSIZE; + } + } + + return 0; +} + +static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key, + u32 ct_mark, u32 mask) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) + u32 new_mark; + + new_mark = ct_mark | (READ_ONCE(ct->mark) & ~(mask)); + if (READ_ONCE(ct->mark) != new_mark) { + WRITE_ONCE(ct->mark, new_mark); + if (nf_ct_is_confirmed(ct)) + nf_conntrack_event_cache(IPCT_MARK, ct); + key->ct.mark = new_mark; + } + + return 0; +#else + return -ENOTSUPP; +#endif +} + +static struct nf_conn_labels *ovs_ct_get_conn_labels(struct nf_conn *ct) +{ + struct nf_conn_labels *cl; + + cl = nf_ct_labels_find(ct); + if (!cl) { + nf_ct_labels_ext_add(ct); + cl = nf_ct_labels_find(ct); + } + + return cl; +} + +/* Initialize labels for a new, yet to be committed conntrack entry. Note that + * since the new connection is not yet confirmed, and thus no-one else has + * access to it's labels, we simply write them over. + */ +static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key, + const struct ovs_key_ct_labels *labels, + const struct ovs_key_ct_labels *mask) +{ + struct nf_conn_labels *cl, *master_cl; + bool have_mask = labels_nonzero(mask); + + /* Inherit master's labels to the related connection? */ + master_cl = ct->master ? nf_ct_labels_find(ct->master) : NULL; + + if (!master_cl && !have_mask) + return 0; /* Nothing to do. */ + + cl = ovs_ct_get_conn_labels(ct); + if (!cl) + return -ENOSPC; + + /* Inherit the master's labels, if any. */ + if (master_cl) + *cl = *master_cl; + + if (have_mask) { + u32 *dst = (u32 *)cl->bits; + int i; + + for (i = 0; i < OVS_CT_LABELS_LEN_32; i++) + dst[i] = (dst[i] & ~mask->ct_labels_32[i]) | + (labels->ct_labels_32[i] + & mask->ct_labels_32[i]); + } + + /* Labels are included in the IPCTNL_MSG_CT_NEW event only if the + * IPCT_LABEL bit is set in the event cache. + */ + nf_conntrack_event_cache(IPCT_LABEL, ct); + + memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN); + + return 0; +} + +static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key, + const struct ovs_key_ct_labels *labels, + const struct ovs_key_ct_labels *mask) +{ + struct nf_conn_labels *cl; + int err; + + cl = ovs_ct_get_conn_labels(ct); + if (!cl) + return -ENOSPC; + + err = nf_connlabels_replace(ct, labels->ct_labels_32, + mask->ct_labels_32, + OVS_CT_LABELS_LEN_32); + if (err) + return err; + + memcpy(&key->ct.labels, cl->bits, OVS_CT_LABELS_LEN); + + return 0; +} + +static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key, + u16 zone, int family, struct sk_buff *skb) +{ + struct ovs_skb_cb ovs_cb = *OVS_CB(skb); + int err; + + err = nf_ct_handle_fragments(net, skb, zone, family, &key->ip.proto, &ovs_cb.mru); + if (err) + return err; + + /* The key extracted from the fragment that completed this datagram + * likely didn't have an L4 header, so regenerate it. + */ + ovs_flow_key_update_l3l4(skb, key); + key->ip.frag = OVS_FRAG_TYPE_NONE; + *OVS_CB(skb) = ovs_cb; + + return 0; +} + +/* This replicates logic from nf_conntrack_core.c that is not exported. */ +static enum ip_conntrack_info +ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h) +{ + const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) + return IP_CT_ESTABLISHED_REPLY; + /* Once we've had two way comms, always ESTABLISHED. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) + return IP_CT_ESTABLISHED; + if (test_bit(IPS_EXPECTED_BIT, &ct->status)) + return IP_CT_RELATED; + return IP_CT_NEW; +} + +/* Find an existing connection which this packet belongs to without + * re-attributing statistics or modifying the connection state. This allows an + * skb->_nfct lost due to an upcall to be recovered during actions execution. + * + * Must be called with rcu_read_lock. + * + * On success, populates skb->_nfct and returns the connection. Returns NULL + * if there is no existing entry. + */ +static struct nf_conn * +ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone, + u8 l3num, struct sk_buff *skb, bool natted) +{ + struct nf_conntrack_tuple tuple; + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + + if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), l3num, + net, &tuple)) { + pr_debug("ovs_ct_find_existing: Can't get tuple\n"); + return NULL; + } + + /* Must invert the tuple if skb has been transformed by NAT. */ + if (natted) { + struct nf_conntrack_tuple inverse; + + if (!nf_ct_invert_tuple(&inverse, &tuple)) { + pr_debug("ovs_ct_find_existing: Inversion failed!\n"); + return NULL; + } + tuple = inverse; + } + + /* look for tuple match */ + h = nf_conntrack_find_get(net, zone, &tuple); + if (!h) + return NULL; /* Not found. */ + + ct = nf_ct_tuplehash_to_ctrack(h); + + /* Inverted packet tuple matches the reverse direction conntrack tuple, + * select the other tuplehash to get the right 'ctinfo' bits for this + * packet. + */ + if (natted) + h = &ct->tuplehash[!h->tuple.dst.dir]; + + nf_ct_set(skb, ct, ovs_ct_get_info(h)); + return ct; +} + +static +struct nf_conn *ovs_ct_executed(struct net *net, + const struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb, + bool *ct_executed) +{ + struct nf_conn *ct = NULL; + + /* If no ct, check if we have evidence that an existing conntrack entry + * might be found for this skb. This happens when we lose a skb->_nfct + * due to an upcall, or if the direction is being forced. If the + * connection was not confirmed, it is not cached and needs to be run + * through conntrack again. + */ + *ct_executed = (key->ct_state & OVS_CS_F_TRACKED) && + !(key->ct_state & OVS_CS_F_INVALID) && + (key->ct_zone == info->zone.id); + + if (*ct_executed || (!key->ct_state && info->force)) { + ct = ovs_ct_find_existing(net, &info->zone, info->family, skb, + !!(key->ct_state & + OVS_CS_F_NAT_MASK)); + } + + return ct; +} + +/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */ +static bool skb_nfct_cached(struct net *net, + const struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + bool ct_executed = true; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + ct = ovs_ct_executed(net, key, info, skb, &ct_executed); + + if (ct) + nf_ct_get(skb, &ctinfo); + else + return false; + + if (!net_eq(net, read_pnet(&ct->ct_net))) + return false; + if (!nf_ct_zone_equal_any(info->ct, nf_ct_zone(ct))) + return false; + if (info->helper) { + struct nf_conn_help *help; + + help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); + if (help && rcu_access_pointer(help->helper) != info->helper) + return false; + } + if (info->nf_ct_timeout) { + struct nf_conn_timeout *timeout_ext; + + timeout_ext = nf_ct_timeout_find(ct); + if (!timeout_ext || info->nf_ct_timeout != + rcu_dereference(timeout_ext->timeout)) + return false; + } + /* Force conntrack entry direction to the current packet? */ + if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { + /* Delete the conntrack entry if confirmed, else just release + * the reference. + */ + if (nf_ct_is_confirmed(ct)) + nf_ct_delete(ct, 0, 0); + + nf_ct_put(ct); + nf_ct_set(skb, NULL, 0); + return false; + } + + return ct_executed; +} + +#if IS_ENABLED(CONFIG_NF_NAT) +static void ovs_nat_update_key(struct sw_flow_key *key, + const struct sk_buff *skb, + enum nf_nat_manip_type maniptype) +{ + if (maniptype == NF_NAT_MANIP_SRC) { + __be16 src; + + key->ct_state |= OVS_CS_F_SRC_NAT; + if (key->eth.type == htons(ETH_P_IP)) + key->ipv4.addr.src = ip_hdr(skb)->saddr; + else if (key->eth.type == htons(ETH_P_IPV6)) + memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr, + sizeof(key->ipv6.addr.src)); + else + return; + + if (key->ip.proto == IPPROTO_UDP) + src = udp_hdr(skb)->source; + else if (key->ip.proto == IPPROTO_TCP) + src = tcp_hdr(skb)->source; + else if (key->ip.proto == IPPROTO_SCTP) + src = sctp_hdr(skb)->source; + else + return; + + key->tp.src = src; + } else { + __be16 dst; + + key->ct_state |= OVS_CS_F_DST_NAT; + if (key->eth.type == htons(ETH_P_IP)) + key->ipv4.addr.dst = ip_hdr(skb)->daddr; + else if (key->eth.type == htons(ETH_P_IPV6)) + memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr, + sizeof(key->ipv6.addr.dst)); + else + return; + + if (key->ip.proto == IPPROTO_UDP) + dst = udp_hdr(skb)->dest; + else if (key->ip.proto == IPPROTO_TCP) + dst = tcp_hdr(skb)->dest; + else if (key->ip.proto == IPPROTO_SCTP) + dst = sctp_hdr(skb)->dest; + else + return; + + key->tp.dst = dst; + } +} + +/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */ +static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + int err, action = 0; + + if (!(info->nat & OVS_CT_NAT)) + return NF_ACCEPT; + if (info->nat & OVS_CT_SRC_NAT) + action |= BIT(NF_NAT_MANIP_SRC); + if (info->nat & OVS_CT_DST_NAT) + action |= BIT(NF_NAT_MANIP_DST); + + err = nf_ct_nat(skb, ct, ctinfo, &action, &info->range, info->commit); + if (err != NF_ACCEPT) + return err; + + if (action & BIT(NF_NAT_MANIP_SRC)) + ovs_nat_update_key(key, skb, NF_NAT_MANIP_SRC); + if (action & BIT(NF_NAT_MANIP_DST)) + ovs_nat_update_key(key, skb, NF_NAT_MANIP_DST); + + return err; +} +#else /* !CONFIG_NF_NAT */ +static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo) +{ + return NF_ACCEPT; +} +#endif + +static int verdict_to_errno(unsigned int verdict) +{ + switch (verdict & NF_VERDICT_MASK) { + case NF_ACCEPT: + return 0; + case NF_DROP: + return -EINVAL; + case NF_STOLEN: + return -EINPROGRESS; + default: + break; + } + + return -EINVAL; +} + +/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if + * not done already. Update key with new CT state after passing the packet + * through conntrack. + * Note that if the packet is deemed invalid by conntrack, skb->_nfct will be + * set to NULL and 0 will be returned. + */ +static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + /* If we are recirculating packets to match on conntrack fields and + * committing with a separate conntrack action, then we don't need to + * actually run the packet through conntrack twice unless it's for a + * different zone. + */ + bool cached = skb_nfct_cached(net, key, info, skb); + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + if (!cached) { + struct nf_hook_state state = { + .hook = NF_INET_PRE_ROUTING, + .pf = info->family, + .net = net, + }; + struct nf_conn *tmpl = info->ct; + int err; + + /* Associate skb with specified zone. */ + if (tmpl) { + ct = nf_ct_get(skb, &ctinfo); + nf_ct_put(ct); + nf_conntrack_get(&tmpl->ct_general); + nf_ct_set(skb, tmpl, IP_CT_NEW); + } + + err = nf_conntrack_in(skb, &state); + if (err != NF_ACCEPT) + return verdict_to_errno(err); + + /* Clear CT state NAT flags to mark that we have not yet done + * NAT after the nf_conntrack_in() call. We can actually clear + * the whole state, as it will be re-initialized below. + */ + key->ct_state = 0; + + /* Update the key, but keep the NAT flags. */ + ovs_ct_update_key(skb, info, key, true, true); + } + + ct = nf_ct_get(skb, &ctinfo); + if (ct) { + bool add_helper = false; + + /* Packets starting a new connection must be NATted before the + * helper, so that the helper knows about the NAT. We enforce + * this by delaying both NAT and helper calls for unconfirmed + * connections until the committing CT action. For later + * packets NAT and Helper may be called in either order. + * + * NAT will be done only if the CT action has NAT, and only + * once per packet (per zone), as guarded by the NAT bits in + * the key->ct_state. + */ + if (info->nat && !(key->ct_state & OVS_CS_F_NAT_MASK) && + (nf_ct_is_confirmed(ct) || info->commit)) { + int err = ovs_ct_nat(net, key, info, skb, ct, ctinfo); + + err = verdict_to_errno(err); + if (err) + return err; + } + + /* Userspace may decide to perform a ct lookup without a helper + * specified followed by a (recirculate and) commit with one, + * or attach a helper in a later commit. Therefore, for + * connections which we will commit, we may need to attach + * the helper here. + */ + if (!nf_ct_is_confirmed(ct) && info->commit && + info->helper && !nfct_help(ct)) { + int err = __nf_ct_try_assign_helper(ct, info->ct, + GFP_ATOMIC); + if (err) + return err; + add_helper = true; + + /* helper installed, add seqadj if NAT is required */ + if (info->nat && !nfct_seqadj(ct)) { + if (!nfct_seqadj_ext_add(ct)) + return -EINVAL; + } + } + + /* Call the helper only if: + * - nf_conntrack_in() was executed above ("!cached") or a + * helper was just attached ("add_helper") for a confirmed + * connection, or + * - When committing an unconfirmed connection. + */ + if ((nf_ct_is_confirmed(ct) ? !cached || add_helper : + info->commit)) { + int err = nf_ct_helper(skb, ct, ctinfo, info->family); + + err = verdict_to_errno(err); + if (err) + return err; + } + + if (nf_ct_protonum(ct) == IPPROTO_TCP && + nf_ct_is_confirmed(ct) && nf_conntrack_tcp_established(ct)) { + /* Be liberal for tcp packets so that out-of-window + * packets are not marked invalid. + */ + nf_ct_set_tcp_be_liberal(ct); + } + + nf_conn_act_ct_ext_fill(skb, ct, ctinfo); + } + + return 0; +} + +/* Lookup connection and read fields into key. */ +static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + struct nf_conn *ct; + int err; + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + + ct = (struct nf_conn *)skb_nfct(skb); + if (ct) + nf_ct_deliver_cached_events(ct); + + return 0; +} + +static bool labels_nonzero(const struct ovs_key_ct_labels *labels) +{ + size_t i; + + for (i = 0; i < OVS_CT_LABELS_LEN_32; i++) + if (labels->ct_labels_32[i]) + return true; + + return false; +} + +#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) +static struct hlist_head *ct_limit_hash_bucket( + const struct ovs_ct_limit_info *info, u16 zone) +{ + return &info->limits[zone & (CT_LIMIT_HASH_BUCKETS - 1)]; +} + +/* Call with ovs_mutex */ +static void ct_limit_set(const struct ovs_ct_limit_info *info, + struct ovs_ct_limit *new_ct_limit) +{ + struct ovs_ct_limit *ct_limit; + struct hlist_head *head; + + head = ct_limit_hash_bucket(info, new_ct_limit->zone); + hlist_for_each_entry_rcu(ct_limit, head, hlist_node) { + if (ct_limit->zone == new_ct_limit->zone) { + hlist_replace_rcu(&ct_limit->hlist_node, + &new_ct_limit->hlist_node); + kfree_rcu(ct_limit, rcu); + return; + } + } + + hlist_add_head_rcu(&new_ct_limit->hlist_node, head); +} + +/* Call with ovs_mutex */ +static void ct_limit_del(const struct ovs_ct_limit_info *info, u16 zone) +{ + struct ovs_ct_limit *ct_limit; + struct hlist_head *head; + struct hlist_node *n; + + head = ct_limit_hash_bucket(info, zone); + hlist_for_each_entry_safe(ct_limit, n, head, hlist_node) { + if (ct_limit->zone == zone) { + hlist_del_rcu(&ct_limit->hlist_node); + kfree_rcu(ct_limit, rcu); + return; + } + } +} + +/* Call with RCU read lock */ +static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone) +{ + struct ovs_ct_limit *ct_limit; + struct hlist_head *head; + + head = ct_limit_hash_bucket(info, zone); + hlist_for_each_entry_rcu(ct_limit, head, hlist_node) { + if (ct_limit->zone == zone) + return ct_limit->limit; + } + + return info->default_limit; +} + +static int ovs_ct_check_limit(struct net *net, + const struct sk_buff *skb, + const struct ovs_conntrack_info *info) +{ + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; + u32 per_zone_limit, connections; + u32 conncount_key; + + conncount_key = info->zone.id; + + per_zone_limit = ct_limit_get(ct_limit_info, info->zone.id); + if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED) + return 0; + + connections = nf_conncount_count_skb(net, skb, info->family, + ct_limit_info->data, + &conncount_key); + if (connections > per_zone_limit) + return -ENOMEM; + + return 0; +} +#endif + +/* Lookup connection and confirm if unconfirmed. */ +static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, + const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + int err; + + err = __ovs_ct_lookup(net, key, info, skb); + if (err) + return err; + + /* The connection could be invalid, in which case this is a no-op.*/ + ct = nf_ct_get(skb, &ctinfo); + if (!ct) + return 0; + +#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) + if (static_branch_unlikely(&ovs_ct_limit_enabled)) { + if (!nf_ct_is_confirmed(ct)) { + err = ovs_ct_check_limit(net, skb, info); + if (err) { + net_warn_ratelimited("openvswitch: zone: %u " + "exceeds conntrack limit\n", + info->zone.id); + return err; + } + } + } +#endif + + /* Set the conntrack event mask if given. NEW and DELETE events have + * their own groups, but the NFNLGRP_CONNTRACK_UPDATE group listener + * typically would receive many kinds of updates. Setting the event + * mask allows those events to be filtered. The set event mask will + * remain in effect for the lifetime of the connection unless changed + * by a further CT action with both the commit flag and the eventmask + * option. */ + if (info->have_eventmask) { + struct nf_conntrack_ecache *cache = nf_ct_ecache_find(ct); + + if (cache) + cache->ctmask = info->eventmask; + } + + /* Apply changes before confirming the connection so that the initial + * conntrack NEW netlink event carries the values given in the CT + * action. + */ + if (info->mark.mask) { + err = ovs_ct_set_mark(ct, key, info->mark.value, + info->mark.mask); + if (err) + return err; + } + if (!nf_ct_is_confirmed(ct)) { + err = ovs_ct_init_labels(ct, key, &info->labels.value, + &info->labels.mask); + if (err) + return err; + + nf_conn_act_ct_ext_add(skb, ct, ctinfo); + } else if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + labels_nonzero(&info->labels.mask)) { + err = ovs_ct_set_labels(ct, key, &info->labels.value, + &info->labels.mask); + if (err) + return err; + } + /* This will take care of sending queued events even if the connection + * is already confirmed. + */ + err = nf_conntrack_confirm(skb); + + return verdict_to_errno(err); +} + +/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero + * value if 'skb' is freed. + */ +int ovs_ct_execute(struct net *net, struct sk_buff *skb, + struct sw_flow_key *key, + const struct ovs_conntrack_info *info) +{ + int nh_ofs; + int err; + + /* The conntrack module expects to be working at L3. */ + nh_ofs = skb_network_offset(skb); + skb_pull_rcsum(skb, nh_ofs); + + err = nf_ct_skb_network_trim(skb, info->family); + if (err) { + kfree_skb(skb); + return err; + } + + if (key->ip.frag != OVS_FRAG_TYPE_NONE) { + err = ovs_ct_handle_fragments(net, key, info->zone.id, + info->family, skb); + if (err) + return err; + } + + if (info->commit) + err = ovs_ct_commit(net, key, info, skb); + else + err = ovs_ct_lookup(net, key, info, skb); + + /* conntrack core returned NF_STOLEN */ + if (err == -EINPROGRESS) + return err; + + skb_push_rcsum(skb, nh_ofs); + if (err) + ovs_kfree_skb_reason(skb, OVS_DROP_CONNTRACK); + return err; +} + +int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct; + + ct = nf_ct_get(skb, &ctinfo); + + nf_ct_put(ct); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); + + if (key) + ovs_ct_fill_key(skb, key, false); + + return 0; +} + +#if IS_ENABLED(CONFIG_NF_NAT) +static int parse_nat(const struct nlattr *attr, + struct ovs_conntrack_info *info, bool log) +{ + struct nlattr *a; + int rem; + bool have_ip_max = false; + bool have_proto_max = false; + bool ip_vers = (info->family == NFPROTO_IPV6); + + nla_for_each_nested(a, attr, rem) { + static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = { + [OVS_NAT_ATTR_SRC] = {0, 0}, + [OVS_NAT_ATTR_DST] = {0, 0}, + [OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr), + sizeof(struct in6_addr)}, + [OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr), + sizeof(struct in6_addr)}, + [OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)}, + [OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)}, + [OVS_NAT_ATTR_PERSISTENT] = {0, 0}, + [OVS_NAT_ATTR_PROTO_HASH] = {0, 0}, + [OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0}, + }; + int type = nla_type(a); + + if (type > OVS_NAT_ATTR_MAX) { + OVS_NLERR(log, "Unknown NAT attribute (type=%d, max=%d)", + type, OVS_NAT_ATTR_MAX); + return -EINVAL; + } + + if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) { + OVS_NLERR(log, "NAT attribute type %d has unexpected length (%d != %d)", + type, nla_len(a), + ovs_nat_attr_lens[type][ip_vers]); + return -EINVAL; + } + + switch (type) { + case OVS_NAT_ATTR_SRC: + case OVS_NAT_ATTR_DST: + if (info->nat) { + OVS_NLERR(log, "Only one type of NAT may be specified"); + return -ERANGE; + } + info->nat |= OVS_CT_NAT; + info->nat |= ((type == OVS_NAT_ATTR_SRC) + ? OVS_CT_SRC_NAT : OVS_CT_DST_NAT); + break; + + case OVS_NAT_ATTR_IP_MIN: + nla_memcpy(&info->range.min_addr, a, + sizeof(info->range.min_addr)); + info->range.flags |= NF_NAT_RANGE_MAP_IPS; + break; + + case OVS_NAT_ATTR_IP_MAX: + have_ip_max = true; + nla_memcpy(&info->range.max_addr, a, + sizeof(info->range.max_addr)); + info->range.flags |= NF_NAT_RANGE_MAP_IPS; + break; + + case OVS_NAT_ATTR_PROTO_MIN: + info->range.min_proto.all = htons(nla_get_u16(a)); + info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + break; + + case OVS_NAT_ATTR_PROTO_MAX: + have_proto_max = true; + info->range.max_proto.all = htons(nla_get_u16(a)); + info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + break; + + case OVS_NAT_ATTR_PERSISTENT: + info->range.flags |= NF_NAT_RANGE_PERSISTENT; + break; + + case OVS_NAT_ATTR_PROTO_HASH: + info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM; + break; + + case OVS_NAT_ATTR_PROTO_RANDOM: + info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY; + break; + + default: + OVS_NLERR(log, "Unknown nat attribute (%d)", type); + return -EINVAL; + } + } + + if (rem > 0) { + OVS_NLERR(log, "NAT attribute has %d unknown bytes", rem); + return -EINVAL; + } + if (!info->nat) { + /* Do not allow flags if no type is given. */ + if (info->range.flags) { + OVS_NLERR(log, + "NAT flags may be given only when NAT range (SRC or DST) is also specified." + ); + return -EINVAL; + } + info->nat = OVS_CT_NAT; /* NAT existing connections. */ + } else if (!info->commit) { + OVS_NLERR(log, + "NAT attributes may be specified only when CT COMMIT flag is also specified." + ); + return -EINVAL; + } + /* Allow missing IP_MAX. */ + if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) { + memcpy(&info->range.max_addr, &info->range.min_addr, + sizeof(info->range.max_addr)); + } + /* Allow missing PROTO_MAX. */ + if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED && + !have_proto_max) { + info->range.max_proto.all = info->range.min_proto.all; + } + return 0; +} +#endif + +static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { + [OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 }, + [OVS_CT_ATTR_FORCE_COMMIT] = { .minlen = 0, .maxlen = 0 }, + [OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16), + .maxlen = sizeof(u16) }, + [OVS_CT_ATTR_MARK] = { .minlen = sizeof(struct md_mark), + .maxlen = sizeof(struct md_mark) }, + [OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels), + .maxlen = sizeof(struct md_labels) }, + [OVS_CT_ATTR_HELPER] = { .minlen = 1, + .maxlen = NF_CT_HELPER_NAME_LEN }, +#if IS_ENABLED(CONFIG_NF_NAT) + /* NAT length is checked when parsing the nested attributes. */ + [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX }, +#endif + [OVS_CT_ATTR_EVENTMASK] = { .minlen = sizeof(u32), + .maxlen = sizeof(u32) }, + [OVS_CT_ATTR_TIMEOUT] = { .minlen = 1, + .maxlen = CTNL_TIMEOUT_NAME_MAX }, +}; + +static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, + const char **helper, bool log) +{ + struct nlattr *a; + int rem; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + int maxlen; + int minlen; + + if (type > OVS_CT_ATTR_MAX) { + OVS_NLERR(log, + "Unknown conntrack attr (type=%d, max=%d)", + type, OVS_CT_ATTR_MAX); + return -EINVAL; + } + + maxlen = ovs_ct_attr_lens[type].maxlen; + minlen = ovs_ct_attr_lens[type].minlen; + if (nla_len(a) < minlen || nla_len(a) > maxlen) { + OVS_NLERR(log, + "Conntrack attr type has unexpected length (type=%d, length=%d, expected=%d)", + type, nla_len(a), maxlen); + return -EINVAL; + } + + switch (type) { + case OVS_CT_ATTR_FORCE_COMMIT: + info->force = true; + fallthrough; + case OVS_CT_ATTR_COMMIT: + info->commit = true; + break; +#ifdef CONFIG_NF_CONNTRACK_ZONES + case OVS_CT_ATTR_ZONE: + info->zone.id = nla_get_u16(a); + break; +#endif +#ifdef CONFIG_NF_CONNTRACK_MARK + case OVS_CT_ATTR_MARK: { + struct md_mark *mark = nla_data(a); + + if (!mark->mask) { + OVS_NLERR(log, "ct_mark mask cannot be 0"); + return -EINVAL; + } + info->mark = *mark; + break; + } +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + case OVS_CT_ATTR_LABELS: { + struct md_labels *labels = nla_data(a); + + if (!labels_nonzero(&labels->mask)) { + OVS_NLERR(log, "ct_labels mask cannot be 0"); + return -EINVAL; + } + info->labels = *labels; + break; + } +#endif + case OVS_CT_ATTR_HELPER: + *helper = nla_data(a); + if (!string_is_terminated(*helper, nla_len(a))) { + OVS_NLERR(log, "Invalid conntrack helper"); + return -EINVAL; + } + break; +#if IS_ENABLED(CONFIG_NF_NAT) + case OVS_CT_ATTR_NAT: { + int err = parse_nat(a, info, log); + + if (err) + return err; + break; + } +#endif + case OVS_CT_ATTR_EVENTMASK: + info->have_eventmask = true; + info->eventmask = nla_get_u32(a); + break; +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + case OVS_CT_ATTR_TIMEOUT: + memcpy(info->timeout, nla_data(a), nla_len(a)); + if (!string_is_terminated(info->timeout, nla_len(a))) { + OVS_NLERR(log, "Invalid conntrack timeout"); + return -EINVAL; + } + break; +#endif + + default: + OVS_NLERR(log, "Unknown conntrack attr (%d)", + type); + return -EINVAL; + } + } + +#ifdef CONFIG_NF_CONNTRACK_MARK + if (!info->commit && info->mark.mask) { + OVS_NLERR(log, + "Setting conntrack mark requires 'commit' flag."); + return -EINVAL; + } +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + if (!info->commit && labels_nonzero(&info->labels.mask)) { + OVS_NLERR(log, + "Setting conntrack labels requires 'commit' flag."); + return -EINVAL; + } +#endif + if (rem > 0) { + OVS_NLERR(log, "Conntrack attr has %d unknown bytes", rem); + return -EINVAL; + } + + return 0; +} + +bool ovs_ct_verify(struct net *net, enum ovs_key_attr attr) +{ + if (attr == OVS_KEY_ATTR_CT_STATE) + return true; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + attr == OVS_KEY_ATTR_CT_ZONE) + return true; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && + attr == OVS_KEY_ATTR_CT_MARK) + return true; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + attr == OVS_KEY_ATTR_CT_LABELS) { + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + return ovs_net->xt_label; + } + + return false; +} + +int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, bool log) +{ + struct ovs_conntrack_info ct_info; + const char *helper = NULL; + u16 family; + int err; + + family = key_to_nfproto(key); + if (family == NFPROTO_UNSPEC) { + OVS_NLERR(log, "ct family unspecified"); + return -EINVAL; + } + + memset(&ct_info, 0, sizeof(ct_info)); + ct_info.family = family; + + nf_ct_zone_init(&ct_info.zone, NF_CT_DEFAULT_ZONE_ID, + NF_CT_DEFAULT_ZONE_DIR, 0); + + err = parse_ct(attr, &ct_info, &helper, log); + if (err) + return err; + + /* Set up template for tracking connections in specific zones. */ + ct_info.ct = nf_ct_tmpl_alloc(net, &ct_info.zone, GFP_KERNEL); + if (!ct_info.ct) { + OVS_NLERR(log, "Failed to allocate conntrack template"); + return -ENOMEM; + } + + if (ct_info.timeout[0]) { + if (nf_ct_set_timeout(net, ct_info.ct, family, key->ip.proto, + ct_info.timeout)) + OVS_NLERR(log, + "Failed to associated timeout policy '%s'", + ct_info.timeout); + else + ct_info.nf_ct_timeout = rcu_dereference( + nf_ct_timeout_find(ct_info.ct)->timeout); + + } + + if (helper) { + err = nf_ct_add_helper(ct_info.ct, helper, ct_info.family, + key->ip.proto, ct_info.nat, &ct_info.helper); + if (err) { + OVS_NLERR(log, "Failed to add %s helper %d", helper, err); + goto err_free_ct; + } + } + + err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info, + sizeof(ct_info), log); + if (err) + goto err_free_ct; + + if (ct_info.commit) + __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status); + return 0; +err_free_ct: + __ovs_ct_free_action(&ct_info); + return err; +} + +#if IS_ENABLED(CONFIG_NF_NAT) +static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start_noflag(skb, OVS_CT_ATTR_NAT); + if (!start) + return false; + + if (info->nat & OVS_CT_SRC_NAT) { + if (nla_put_flag(skb, OVS_NAT_ATTR_SRC)) + return false; + } else if (info->nat & OVS_CT_DST_NAT) { + if (nla_put_flag(skb, OVS_NAT_ATTR_DST)) + return false; + } else { + goto out; + } + + if (info->range.flags & NF_NAT_RANGE_MAP_IPS) { + if (IS_ENABLED(CONFIG_NF_NAT) && + info->family == NFPROTO_IPV4) { + if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN, + info->range.min_addr.ip) || + (info->range.max_addr.ip + != info->range.min_addr.ip && + (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX, + info->range.max_addr.ip)))) + return false; + } else if (IS_ENABLED(CONFIG_IPV6) && + info->family == NFPROTO_IPV6) { + if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN, + &info->range.min_addr.in6) || + (memcmp(&info->range.max_addr.in6, + &info->range.min_addr.in6, + sizeof(info->range.max_addr.in6)) && + (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX, + &info->range.max_addr.in6)))) + return false; + } else { + return false; + } + } + if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED && + (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN, + ntohs(info->range.min_proto.all)) || + (info->range.max_proto.all != info->range.min_proto.all && + nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX, + ntohs(info->range.max_proto.all))))) + return false; + + if (info->range.flags & NF_NAT_RANGE_PERSISTENT && + nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT)) + return false; + if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM && + nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH)) + return false; + if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY && + nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM)) + return false; +out: + nla_nest_end(skb, start); + + return true; +} +#endif + +int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_CT); + if (!start) + return -EMSGSIZE; + + if (ct_info->commit && nla_put_flag(skb, ct_info->force + ? OVS_CT_ATTR_FORCE_COMMIT + : OVS_CT_ATTR_COMMIT)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) && + nla_put_u16(skb, OVS_CT_ATTR_ZONE, ct_info->zone.id)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) && ct_info->mark.mask && + nla_put(skb, OVS_CT_ATTR_MARK, sizeof(ct_info->mark), + &ct_info->mark)) + return -EMSGSIZE; + if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) && + labels_nonzero(&ct_info->labels.mask) && + nla_put(skb, OVS_CT_ATTR_LABELS, sizeof(ct_info->labels), + &ct_info->labels)) + return -EMSGSIZE; + if (ct_info->helper) { + if (nla_put_string(skb, OVS_CT_ATTR_HELPER, + ct_info->helper->name)) + return -EMSGSIZE; + } + if (ct_info->have_eventmask && + nla_put_u32(skb, OVS_CT_ATTR_EVENTMASK, ct_info->eventmask)) + return -EMSGSIZE; + if (ct_info->timeout[0]) { + if (nla_put_string(skb, OVS_CT_ATTR_TIMEOUT, ct_info->timeout)) + return -EMSGSIZE; + } + +#if IS_ENABLED(CONFIG_NF_NAT) + if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb)) + return -EMSGSIZE; +#endif + nla_nest_end(skb, start); + + return 0; +} + +void ovs_ct_free_action(const struct nlattr *a) +{ + struct ovs_conntrack_info *ct_info = nla_data(a); + + __ovs_ct_free_action(ct_info); +} + +static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info) +{ + if (ct_info->helper) { +#if IS_ENABLED(CONFIG_NF_NAT) + if (ct_info->nat) + nf_nat_helper_put(ct_info->helper); +#endif + nf_conntrack_helper_put(ct_info->helper); + } + if (ct_info->ct) { + if (ct_info->timeout[0]) + nf_ct_destroy_timeout(ct_info->ct); + nf_ct_tmpl_free(ct_info->ct); + } +} + +#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) +static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net) +{ + int i, err; + + ovs_net->ct_limit_info = kmalloc(sizeof(*ovs_net->ct_limit_info), + GFP_KERNEL); + if (!ovs_net->ct_limit_info) + return -ENOMEM; + + ovs_net->ct_limit_info->default_limit = OVS_CT_LIMIT_DEFAULT; + ovs_net->ct_limit_info->limits = + kmalloc_array(CT_LIMIT_HASH_BUCKETS, sizeof(struct hlist_head), + GFP_KERNEL); + if (!ovs_net->ct_limit_info->limits) { + kfree(ovs_net->ct_limit_info); + return -ENOMEM; + } + + for (i = 0; i < CT_LIMIT_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(&ovs_net->ct_limit_info->limits[i]); + + ovs_net->ct_limit_info->data = nf_conncount_init(net, sizeof(u32)); + + if (IS_ERR(ovs_net->ct_limit_info->data)) { + err = PTR_ERR(ovs_net->ct_limit_info->data); + kfree(ovs_net->ct_limit_info->limits); + kfree(ovs_net->ct_limit_info); + pr_err("openvswitch: failed to init nf_conncount %d\n", err); + return err; + } + return 0; +} + +static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net) +{ + const struct ovs_ct_limit_info *info = ovs_net->ct_limit_info; + int i; + + nf_conncount_destroy(net, info->data); + for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) { + struct hlist_head *head = &info->limits[i]; + struct ovs_ct_limit *ct_limit; + struct hlist_node *next; + + hlist_for_each_entry_safe(ct_limit, next, head, hlist_node) + kfree_rcu(ct_limit, rcu); + } + kfree(info->limits); + kfree(info); +} + +static struct sk_buff * +ovs_ct_limit_cmd_reply_start(struct genl_info *info, u8 cmd, + struct ovs_header **ovs_reply_header) +{ + struct ovs_header *ovs_header = genl_info_userhdr(info); + struct sk_buff *skb; + + skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOMEM); + + *ovs_reply_header = genlmsg_put(skb, info->snd_portid, + info->snd_seq, + &dp_ct_limit_genl_family, 0, cmd); + + if (!*ovs_reply_header) { + nlmsg_free(skb); + return ERR_PTR(-EMSGSIZE); + } + (*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex; + + return skb; +} + +static bool check_zone_id(int zone_id, u16 *pzone) +{ + if (zone_id >= 0 && zone_id <= 65535) { + *pzone = (u16)zone_id; + return true; + } + return false; +} + +static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit, + struct ovs_ct_limit_info *info) +{ + struct ovs_zone_limit *zone_limit; + int rem; + u16 zone; + + rem = NLA_ALIGN(nla_len(nla_zone_limit)); + zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit); + + while (rem >= sizeof(*zone_limit)) { + if (unlikely(zone_limit->zone_id == + OVS_ZONE_LIMIT_DEFAULT_ZONE)) { + ovs_lock(); + info->default_limit = zone_limit->limit; + ovs_unlock(); + } else if (unlikely(!check_zone_id( + zone_limit->zone_id, &zone))) { + OVS_NLERR(true, "zone id is out of range"); + } else { + struct ovs_ct_limit *ct_limit; + + ct_limit = kmalloc(sizeof(*ct_limit), + GFP_KERNEL_ACCOUNT); + if (!ct_limit) + return -ENOMEM; + + ct_limit->zone = zone; + ct_limit->limit = zone_limit->limit; + + ovs_lock(); + ct_limit_set(info, ct_limit); + ovs_unlock(); + } + rem -= NLA_ALIGN(sizeof(*zone_limit)); + zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit + + NLA_ALIGN(sizeof(*zone_limit))); + } + + if (rem) + OVS_NLERR(true, "set zone limit has %d unknown bytes", rem); + + return 0; +} + +static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit, + struct ovs_ct_limit_info *info) +{ + struct ovs_zone_limit *zone_limit; + int rem; + u16 zone; + + rem = NLA_ALIGN(nla_len(nla_zone_limit)); + zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit); + + while (rem >= sizeof(*zone_limit)) { + if (unlikely(zone_limit->zone_id == + OVS_ZONE_LIMIT_DEFAULT_ZONE)) { + ovs_lock(); + info->default_limit = OVS_CT_LIMIT_DEFAULT; + ovs_unlock(); + } else if (unlikely(!check_zone_id( + zone_limit->zone_id, &zone))) { + OVS_NLERR(true, "zone id is out of range"); + } else { + ovs_lock(); + ct_limit_del(info, zone); + ovs_unlock(); + } + rem -= NLA_ALIGN(sizeof(*zone_limit)); + zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit + + NLA_ALIGN(sizeof(*zone_limit))); + } + + if (rem) + OVS_NLERR(true, "del zone limit has %d unknown bytes", rem); + + return 0; +} + +static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info, + struct sk_buff *reply) +{ + struct ovs_zone_limit zone_limit = { + .zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE, + .limit = info->default_limit, + }; + + return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); +} + +static int __ovs_ct_limit_get_zone_limit(struct net *net, + struct nf_conncount_data *data, + u16 zone_id, u32 limit, + struct sk_buff *reply) +{ + struct nf_conntrack_zone ct_zone; + struct ovs_zone_limit zone_limit; + u32 conncount_key = zone_id; + + zone_limit.zone_id = zone_id; + zone_limit.limit = limit; + nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0); + + zone_limit.count = nf_conncount_count_skb(net, NULL, 0, data, + &conncount_key); + return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit); +} + +static int ovs_ct_limit_get_zone_limit(struct net *net, + struct nlattr *nla_zone_limit, + struct ovs_ct_limit_info *info, + struct sk_buff *reply) +{ + struct ovs_zone_limit *zone_limit; + int rem, err; + u32 limit; + u16 zone; + + rem = NLA_ALIGN(nla_len(nla_zone_limit)); + zone_limit = (struct ovs_zone_limit *)nla_data(nla_zone_limit); + + while (rem >= sizeof(*zone_limit)) { + if (unlikely(zone_limit->zone_id == + OVS_ZONE_LIMIT_DEFAULT_ZONE)) { + err = ovs_ct_limit_get_default_limit(info, reply); + if (err) + return err; + } else if (unlikely(!check_zone_id(zone_limit->zone_id, + &zone))) { + OVS_NLERR(true, "zone id is out of range"); + } else { + rcu_read_lock(); + limit = ct_limit_get(info, zone); + rcu_read_unlock(); + + err = __ovs_ct_limit_get_zone_limit( + net, info->data, zone, limit, reply); + if (err) + return err; + } + rem -= NLA_ALIGN(sizeof(*zone_limit)); + zone_limit = (struct ovs_zone_limit *)((u8 *)zone_limit + + NLA_ALIGN(sizeof(*zone_limit))); + } + + if (rem) + OVS_NLERR(true, "get zone limit has %d unknown bytes", rem); + + return 0; +} + +static int ovs_ct_limit_get_all_zone_limit(struct net *net, + struct ovs_ct_limit_info *info, + struct sk_buff *reply) +{ + struct ovs_ct_limit *ct_limit; + struct hlist_head *head; + int i, err = 0; + + err = ovs_ct_limit_get_default_limit(info, reply); + if (err) + return err; + + rcu_read_lock(); + for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) { + head = &info->limits[i]; + hlist_for_each_entry_rcu(ct_limit, head, hlist_node) { + err = __ovs_ct_limit_get_zone_limit(net, info->data, + ct_limit->zone, ct_limit->limit, reply); + if (err) + goto exit_err; + } + } + +exit_err: + rcu_read_unlock(); + return err; +} + +static int ovs_ct_limit_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct sk_buff *reply; + struct ovs_header *ovs_reply_header; + struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id); + struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; + int err; + + reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_SET, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) { + err = -EINVAL; + goto exit_err; + } + + err = ovs_ct_limit_set_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT], + ct_limit_info); + if (err) + goto exit_err; + + static_branch_enable(&ovs_ct_limit_enabled); + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_err: + nlmsg_free(reply); + return err; +} + +static int ovs_ct_limit_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct sk_buff *reply; + struct ovs_header *ovs_reply_header; + struct ovs_net *ovs_net = net_generic(sock_net(skb->sk), ovs_net_id); + struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; + int err; + + reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_DEL, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + if (!a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) { + err = -EINVAL; + goto exit_err; + } + + err = ovs_ct_limit_del_zone_limit(a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT], + ct_limit_info); + if (err) + goto exit_err; + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_err: + nlmsg_free(reply); + return err; +} + +static int ovs_ct_limit_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct nlattr *nla_reply; + struct sk_buff *reply; + struct ovs_header *ovs_reply_header; + struct net *net = sock_net(skb->sk); + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info; + int err; + + reply = ovs_ct_limit_cmd_reply_start(info, OVS_CT_LIMIT_CMD_GET, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + nla_reply = nla_nest_start_noflag(reply, OVS_CT_LIMIT_ATTR_ZONE_LIMIT); + if (!nla_reply) { + err = -EMSGSIZE; + goto exit_err; + } + + if (a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) { + err = ovs_ct_limit_get_zone_limit( + net, a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT], ct_limit_info, + reply); + if (err) + goto exit_err; + } else { + err = ovs_ct_limit_get_all_zone_limit(net, ct_limit_info, + reply); + if (err) + goto exit_err; + } + + nla_nest_end(reply, nla_reply); + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_err: + nlmsg_free(reply); + return err; +} + +static const struct genl_small_ops ct_limit_genl_ops[] = { + { .cmd = OVS_CT_LIMIT_CMD_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ + .doit = ovs_ct_limit_cmd_set, + }, + { .cmd = OVS_CT_LIMIT_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ + .doit = ovs_ct_limit_cmd_del, + }, + { .cmd = OVS_CT_LIMIT_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = 0, /* OK for unprivileged users. */ + .doit = ovs_ct_limit_cmd_get, + }, +}; + +static const struct genl_multicast_group ovs_ct_limit_multicast_group = { + .name = OVS_CT_LIMIT_MCGROUP, +}; + +struct genl_family dp_ct_limit_genl_family __ro_after_init = { + .hdrsize = sizeof(struct ovs_header), + .name = OVS_CT_LIMIT_FAMILY, + .version = OVS_CT_LIMIT_VERSION, + .maxattr = OVS_CT_LIMIT_ATTR_MAX, + .policy = ct_limit_policy, + .netnsok = true, + .parallel_ops = true, + .small_ops = ct_limit_genl_ops, + .n_small_ops = ARRAY_SIZE(ct_limit_genl_ops), + .resv_start_op = OVS_CT_LIMIT_CMD_GET + 1, + .mcgrps = &ovs_ct_limit_multicast_group, + .n_mcgrps = 1, + .module = THIS_MODULE, +}; +#endif + +int ovs_ct_init(struct net *net) +{ + unsigned int n_bits = sizeof(struct ovs_key_ct_labels) * BITS_PER_BYTE; + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + + if (nf_connlabels_get(net, n_bits - 1)) { + ovs_net->xt_label = false; + OVS_NLERR(true, "Failed to set connlabel length"); + } else { + ovs_net->xt_label = true; + } + +#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) + return ovs_ct_limit_init(net, ovs_net); +#else + return 0; +#endif +} + +void ovs_ct_exit(struct net *net) +{ + struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + +#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) + ovs_ct_limit_exit(net, ovs_net); +#endif + + if (ovs_net->xt_label) + nf_connlabels_put(net); +} diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h new file mode 100644 index 000000000000..317e525c8a11 --- /dev/null +++ b/net/openvswitch/conntrack.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2015 Nicira, Inc. + */ + +#ifndef OVS_CONNTRACK_H +#define OVS_CONNTRACK_H 1 + +#include "flow.h" + +struct ovs_conntrack_info; +struct ovs_ct_limit_info; +enum ovs_key_attr; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) +int ovs_ct_init(struct net *); +void ovs_ct_exit(struct net *); +bool ovs_ct_verify(struct net *, enum ovs_key_attr attr); +int ovs_ct_copy_action(struct net *, const struct nlattr *, + const struct sw_flow_key *, struct sw_flow_actions **, + bool log); +int ovs_ct_action_to_attr(const struct ovs_conntrack_info *, struct sk_buff *); + +int ovs_ct_execute(struct net *, struct sk_buff *, struct sw_flow_key *, + const struct ovs_conntrack_info *); +int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key); + +void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key, + bool post_ct); +int ovs_ct_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, struct sk_buff *skb); +void ovs_ct_free_action(const struct nlattr *a); + +#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \ + OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \ + OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \ + OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT) +#else +#include <linux/errno.h> + +static inline int ovs_ct_init(struct net *net) { return 0; } + +static inline void ovs_ct_exit(struct net *net) { } + +static inline bool ovs_ct_verify(struct net *net, int attr) +{ + return false; +} + +static inline int ovs_ct_copy_action(struct net *net, const struct nlattr *nla, + const struct sw_flow_key *key, + struct sw_flow_actions **acts, bool log) +{ + return -ENOTSUPP; +} + +static inline int ovs_ct_action_to_attr(const struct ovs_conntrack_info *info, + struct sk_buff *skb) +{ + return -ENOTSUPP; +} + +static inline int ovs_ct_execute(struct net *net, struct sk_buff *skb, + struct sw_flow_key *key, + const struct ovs_conntrack_info *info) +{ + kfree_skb(skb); + return -ENOTSUPP; +} + +static inline int ovs_ct_clear(struct sk_buff *skb, + struct sw_flow_key *key) +{ + return -ENOTSUPP; +} + +static inline void ovs_ct_fill_key(const struct sk_buff *skb, + struct sw_flow_key *key, + bool post_ct) +{ + key->ct_state = 0; + key->ct_zone = 0; + key->ct.mark = 0; + memset(&key->ct.labels, 0, sizeof(key->ct.labels)); + /* Clear 'ct_orig_proto' to mark the non-existence of original + * direction key fields. + */ + key->ct_orig_proto = 0; +} + +static inline int ovs_ct_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, + struct sk_buff *skb) +{ + return 0; +} + +static inline void ovs_ct_free_action(const struct nlattr *a) { } + +#define CT_SUPPORTED_MASK 0 +#endif /* CONFIG_NF_CONNTRACK */ + +#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) +extern struct genl_family dp_ct_limit_genl_family; +#endif +#endif /* ovs_conntrack.h */ diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f7e3a0d84c40..d5b6e2002bc1 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2007-2012 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * Copyright (c) 2007-2014 Nicira, Inc. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -28,7 +15,6 @@ #include <linux/delay.h> #include <linux/time.h> #include <linux/etherdevice.h> -#include <linux/genetlink.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/mutex.h> @@ -44,32 +30,58 @@ #include <linux/netfilter_ipv4.h> #include <linux/inetdevice.h> #include <linux/list.h> -#include <linux/lockdep.h> #include <linux/openvswitch.h> #include <linux/rculist.h> #include <linux/dmi.h> -#include <linux/workqueue.h> #include <net/genetlink.h> +#include <net/gso.h> #include <net/net_namespace.h> #include <net/netns/generic.h> +#include <net/pkt_cls.h> #include "datapath.h" +#include "drop.h" #include "flow.h" +#include "flow_table.h" +#include "flow_netlink.h" +#include "meter.h" +#include "openvswitch_trace.h" #include "vport-internal_dev.h" #include "vport-netdev.h" +unsigned int ovs_net_id __read_mostly; + +static struct genl_family dp_packet_genl_family; +static struct genl_family dp_flow_genl_family; +static struct genl_family dp_datapath_genl_family; + +static const struct nla_policy flow_policy[]; + +static const struct genl_multicast_group ovs_dp_flow_multicast_group = { + .name = OVS_FLOW_MCGROUP, +}; + +static const struct genl_multicast_group ovs_dp_datapath_multicast_group = { + .name = OVS_DATAPATH_MCGROUP, +}; -#define REHASH_FLOW_INTERVAL (10 * 60 * HZ) -static void rehash_flow_table(struct work_struct *work); -static DECLARE_DELAYED_WORK(rehash_flow_wq, rehash_flow_table); +static const struct genl_multicast_group ovs_dp_vport_multicast_group = { + .name = OVS_VPORT_MCGROUP, +}; -int ovs_net_id __read_mostly; +/* Check if need to build a reply message. + * OVS userspace sets the NLM_F_ECHO flag if it needs the reply. */ +static bool ovs_must_notify(struct genl_family *family, struct genl_info *info, + unsigned int group) +{ + return info->nlhdr->nlmsg_flags & NLM_F_ECHO || + genl_has_listeners(family, genl_info_net(info), group); +} -static void ovs_notify(struct sk_buff *skb, struct genl_info *info, - struct genl_multicast_group *grp) +static void ovs_notify(struct genl_family *family, + struct sk_buff *skb, struct genl_info *info) { - genl_notify(skb, genl_info_net(info), info->snd_portid, - grp->id, info->nlhdr, GFP_KERNEL); + genl_notify(family, skb, info, 0, GFP_KERNEL); } /** @@ -112,38 +124,27 @@ int lockdep_ovsl_is_held(void) #endif static struct vport *new_vport(const struct vport_parms *); -static int queue_gso_packets(struct net *, int dp_ifindex, struct sk_buff *, - const struct dp_upcall_info *); -static int queue_userspace_packet(struct net *, int dp_ifindex, - struct sk_buff *, - const struct dp_upcall_info *); +static int queue_gso_packets(struct datapath *dp, struct sk_buff *, + const struct sw_flow_key *, + const struct dp_upcall_info *, + uint32_t cutlen); +static int queue_userspace_packet(struct datapath *dp, struct sk_buff *, + const struct sw_flow_key *, + const struct dp_upcall_info *, + uint32_t cutlen); -/* Must be called with rcu_read_lock or ovs_mutex. */ -static struct datapath *get_dp(struct net *net, int dp_ifindex) -{ - struct datapath *dp = NULL; - struct net_device *dev; +static void ovs_dp_masks_rebalance(struct work_struct *work); - rcu_read_lock(); - dev = dev_get_by_index_rcu(net, dp_ifindex); - if (dev) { - struct vport *vport = ovs_internal_dev_get_vport(dev); - if (vport) - dp = vport->dp; - } - rcu_read_unlock(); - - return dp; -} +static int ovs_dp_set_upcall_portids(struct datapath *, const struct nlattr *); /* Must be called with rcu_read_lock or ovs_mutex. */ const char *ovs_dp_name(const struct datapath *dp) { struct vport *vport = ovs_vport_ovsl_rcu(dp, OVSP_LOCAL); - return vport->ops->get_name(vport); + return ovs_vport_name(vport); } -static int get_dpifindex(struct datapath *dp) +static int get_dpifindex(const struct datapath *dp) { struct vport *local; int ifindex; @@ -152,7 +153,7 @@ static int get_dpifindex(struct datapath *dp) local = ovs_vport_rcu(dp, OVSP_LOCAL); if (local) - ifindex = netdev_vport_priv(local)->dev->ifindex; + ifindex = local->dev->ifindex; else ifindex = 0; @@ -165,10 +166,11 @@ static void destroy_dp_rcu(struct rcu_head *rcu) { struct datapath *dp = container_of(rcu, struct datapath, rcu); - ovs_flow_tbl_destroy((__force struct flow_table *)dp->table); + ovs_flow_tbl_destroy(&dp->table); free_percpu(dp->stats_percpu); - release_net(ovs_dp_get_net(dp)); kfree(dp->ports); + ovs_meters_exit(dp); + kfree(rcu_dereference_raw(dp->upcall_portids)); kfree(dp); } @@ -178,13 +180,15 @@ static struct hlist_head *vport_hash_bucket(const struct datapath *dp, return &dp->ports[port_no & (DP_VPORT_HASH_BUCKETS - 1)]; } +/* Called with ovs_mutex or RCU read lock. */ struct vport *ovs_lookup_vport(const struct datapath *dp, u16 port_no) { struct vport *vport; struct hlist_head *head; head = vport_hash_bucket(dp, port_no); - hlist_for_each_entry_rcu(vport, head, dp_hash_node) { + hlist_for_each_entry_rcu(vport, head, dp_hash_node, + lockdep_ovsl_is_held()) { if (vport->port_no == port_no) return vport; } @@ -206,6 +210,26 @@ static struct vport *new_vport(const struct vport_parms *parms) return vport; } +static void ovs_vport_update_upcall_stats(struct sk_buff *skb, + const struct dp_upcall_info *upcall_info, + bool upcall_result) +{ + struct vport *p = OVS_CB(skb)->input_vport; + struct vport_upcall_stats_percpu *stats; + + if (upcall_info->cmd != OVS_PACKET_CMD_MISS && + upcall_info->cmd != OVS_PACKET_CMD_ACTION) + return; + + stats = this_cpu_ptr(p->upcall_stats); + u64_stats_update_begin(&stats->syncp); + if (upcall_result) + u64_stats_inc(&stats->n_success); + else + u64_stats_inc(&stats->n_fail); + u64_stats_update_end(&stats->syncp); +} + void ovs_dp_detach_port(struct vport *p) { ASSERT_OVSL(); @@ -218,85 +242,112 @@ void ovs_dp_detach_port(struct vport *p) } /* Must be called with rcu_read_lock. */ -void ovs_dp_process_received_packet(struct vport *p, struct sk_buff *skb) +void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key) { + struct ovs_pcpu_storage *ovs_pcpu = this_cpu_ptr(ovs_pcpu_storage); + const struct vport *p = OVS_CB(skb)->input_vport; struct datapath *dp = p->dp; struct sw_flow *flow; + struct sw_flow_actions *sf_acts; struct dp_stats_percpu *stats; - struct sw_flow_key key; + bool ovs_pcpu_locked = false; u64 *stats_counter; + u32 n_mask_hit; + u32 n_cache_hit; int error; - int key_len; stats = this_cpu_ptr(dp->stats_percpu); - /* Extract flow from 'skb' into 'key'. */ - error = ovs_flow_extract(skb, p->port_no, &key, &key_len); - if (unlikely(error)) { - kfree_skb(skb); - return; - } - /* Look up flow. */ - flow = ovs_flow_tbl_lookup(rcu_dereference(dp->table), &key, key_len); + flow = ovs_flow_tbl_lookup_stats(&dp->table, key, skb_get_hash(skb), + &n_mask_hit, &n_cache_hit); if (unlikely(!flow)) { struct dp_upcall_info upcall; + memset(&upcall, 0, sizeof(upcall)); upcall.cmd = OVS_PACKET_CMD_MISS; - upcall.key = &key; - upcall.userdata = NULL; - upcall.portid = p->upcall_portid; - ovs_dp_upcall(dp, skb, &upcall); - consume_skb(skb); + + if (OVS_CB(skb)->upcall_pid) + upcall.portid = OVS_CB(skb)->upcall_pid; + else if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU) + upcall.portid = + ovs_dp_get_upcall_portid(dp, smp_processor_id()); + else + upcall.portid = ovs_vport_find_upcall_portid(p, skb); + + upcall.mru = OVS_CB(skb)->mru; + error = ovs_dp_upcall(dp, skb, key, &upcall, 0); + switch (error) { + case 0: + case -EAGAIN: + case -ERESTARTSYS: + case -EINTR: + consume_skb(skb); + break; + default: + kfree_skb(skb); + break; + } stats_counter = &stats->n_missed; goto out; } - OVS_CB(skb)->flow = flow; + ovs_flow_stats_update(flow, key->tp.flags, skb); + sf_acts = rcu_dereference(flow->sf_acts); + /* This path can be invoked recursively: Use the current task to + * identify recursive invocation - the lock must be acquired only once. + * Even with disabled bottom halves this can be preempted on PREEMPT_RT. + * Limit the locking to RT to avoid assigning `owner' if it can be + * avoided. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && ovs_pcpu->owner != current) { + local_lock_nested_bh(&ovs_pcpu_storage->bh_lock); + ovs_pcpu->owner = current; + ovs_pcpu_locked = true; + } + + error = ovs_execute_actions(dp, skb, sf_acts, key); + if (unlikely(error)) + net_dbg_ratelimited("ovs: action execution error on datapath %s: %d\n", + ovs_dp_name(dp), error); + if (ovs_pcpu_locked) { + ovs_pcpu->owner = NULL; + local_unlock_nested_bh(&ovs_pcpu_storage->bh_lock); + } stats_counter = &stats->n_hit; - ovs_flow_used(OVS_CB(skb)->flow, skb); - ovs_execute_actions(dp, skb); out: /* Update datapath statistics. */ - u64_stats_update_begin(&stats->sync); + u64_stats_update_begin(&stats->syncp); (*stats_counter)++; - u64_stats_update_end(&stats->sync); + stats->n_mask_hit += n_mask_hit; + stats->n_cache_hit += n_cache_hit; + u64_stats_update_end(&stats->syncp); } -static struct genl_family dp_packet_genl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = sizeof(struct ovs_header), - .name = OVS_PACKET_FAMILY, - .version = OVS_PACKET_VERSION, - .maxattr = OVS_PACKET_ATTR_MAX, - .netnsok = true, - .parallel_ops = true, -}; - int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, - const struct dp_upcall_info *upcall_info) + const struct sw_flow_key *key, + const struct dp_upcall_info *upcall_info, + uint32_t cutlen) { struct dp_stats_percpu *stats; - int dp_ifindex; int err; + if (trace_ovs_dp_upcall_enabled()) + trace_ovs_dp_upcall(dp, skb, key, upcall_info); + if (upcall_info->portid == 0) { err = -ENOTCONN; goto err; } - dp_ifindex = get_dpifindex(dp); - if (!dp_ifindex) { - err = -ENODEV; - goto err; - } - if (!skb_is_gso(skb)) - err = queue_userspace_packet(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info); + err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); else - err = queue_gso_packets(ovs_dp_get_net(dp), dp_ifindex, skb, upcall_info); + err = queue_gso_packets(dp, skb, key, upcall_info, cutlen); + + ovs_vport_update_upcall_stats(skb, upcall_info, !err); if (err) goto err; @@ -305,117 +356,125 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff *skb, err: stats = this_cpu_ptr(dp->stats_percpu); - u64_stats_update_begin(&stats->sync); + u64_stats_update_begin(&stats->syncp); stats->n_lost++; - u64_stats_update_end(&stats->sync); + u64_stats_update_end(&stats->syncp); return err; } -static int queue_gso_packets(struct net *net, int dp_ifindex, - struct sk_buff *skb, - const struct dp_upcall_info *upcall_info) +static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_key *key, + const struct dp_upcall_info *upcall_info, + uint32_t cutlen) { - unsigned short gso_type = skb_shinfo(skb)->gso_type; - struct dp_upcall_info later_info; + unsigned int gso_type = skb_shinfo(skb)->gso_type; struct sw_flow_key later_key; struct sk_buff *segs, *nskb; int err; - segs = __skb_gso_segment(skb, NETIF_F_SG | NETIF_F_HW_CSUM, false); + BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_GSO_CB_OFFSET); + segs = __skb_gso_segment(skb, NETIF_F_SG, false); if (IS_ERR(segs)) return PTR_ERR(segs); + if (segs == NULL) + return -EINVAL; + + if (gso_type & SKB_GSO_UDP) { + /* The initial flow key extracted by ovs_flow_key_extract() + * in this case is for a first fragment, so we need to + * properly mark later fragments. + */ + later_key = *key; + later_key.ip.frag = OVS_FRAG_TYPE_LATER; + } /* Queue all of the segments. */ - skb = segs; - do { - err = queue_userspace_packet(net, dp_ifindex, skb, upcall_info); + skb_list_walk_safe(segs, skb, nskb) { + if (gso_type & SKB_GSO_UDP && skb != segs) + key = &later_key; + + err = queue_userspace_packet(dp, skb, key, upcall_info, cutlen); if (err) break; - if (skb == segs && gso_type & SKB_GSO_UDP) { - /* The initial flow key extracted by ovs_flow_extract() - * in this case is for a first fragment, so we need to - * properly mark later fragments. - */ - later_key = *upcall_info->key; - later_key.ip.frag = OVS_FRAG_TYPE_LATER; - - later_info = *upcall_info; - later_info.key = &later_key; - upcall_info = &later_info; - } - } while ((skb = skb->next)); + } /* Free all of the segments. */ - skb = segs; - do { - nskb = skb->next; + skb_list_walk_safe(segs, skb, nskb) { if (err) kfree_skb(skb); else consume_skb(skb); - } while ((skb = nskb)); + } return err; } -static size_t key_attr_size(void) -{ - return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ - + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ - + nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */ - + nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */ - + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ - + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ - + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ - + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ - + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ - + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ - + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ - + nla_total_size(4) /* OVS_KEY_ATTR_8021Q */ - + nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */ - + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ - + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ - + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ - + nla_total_size(28); /* OVS_KEY_ATTR_ND */ -} - -static size_t upcall_msg_size(const struct sk_buff *skb, - const struct nlattr *userdata) +static size_t upcall_msg_size(const struct dp_upcall_info *upcall_info, + unsigned int hdrlen, int actions_attrlen) { size_t size = NLMSG_ALIGN(sizeof(struct ovs_header)) - + nla_total_size(skb->len) /* OVS_PACKET_ATTR_PACKET */ - + nla_total_size(key_attr_size()); /* OVS_PACKET_ATTR_KEY */ + + nla_total_size(hdrlen) /* OVS_PACKET_ATTR_PACKET */ + + nla_total_size(ovs_key_attr_size()) /* OVS_PACKET_ATTR_KEY */ + + nla_total_size(sizeof(unsigned int)) /* OVS_PACKET_ATTR_LEN */ + + nla_total_size(sizeof(u64)); /* OVS_PACKET_ATTR_HASH */ /* OVS_PACKET_ATTR_USERDATA */ - if (userdata) - size += NLA_ALIGN(userdata->nla_len); + if (upcall_info->userdata) + size += NLA_ALIGN(upcall_info->userdata->nla_len); + + /* OVS_PACKET_ATTR_EGRESS_TUN_KEY */ + if (upcall_info->egress_tun_info) + size += nla_total_size(ovs_tun_key_attr_size()); + + /* OVS_PACKET_ATTR_ACTIONS */ + if (upcall_info->actions_len) + size += nla_total_size(actions_attrlen); + + /* OVS_PACKET_ATTR_MRU */ + if (upcall_info->mru) + size += nla_total_size(sizeof(upcall_info->mru)); return size; } -static int queue_userspace_packet(struct net *net, int dp_ifindex, - struct sk_buff *skb, - const struct dp_upcall_info *upcall_info) +static void pad_packet(struct datapath *dp, struct sk_buff *skb) +{ + if (!(dp->user_features & OVS_DP_F_UNALIGNED)) { + size_t plen = NLA_ALIGN(skb->len) - skb->len; + + if (plen > 0) + skb_put_zero(skb, plen); + } +} + +static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_key *key, + const struct dp_upcall_info *upcall_info, + uint32_t cutlen) { struct ovs_header *upcall; struct sk_buff *nskb = NULL; - struct sk_buff *user_skb; /* to be queued to userspace */ + struct sk_buff *user_skb = NULL; /* to be queued to userspace */ struct nlattr *nla; - int err; + size_t len; + unsigned int hlen; + int err, dp_ifindex; + u64 hash; + + dp_ifindex = get_dpifindex(dp); + if (!dp_ifindex) + return -ENODEV; - if (vlan_tx_tag_present(skb)) { + if (skb_vlan_tag_present(skb)) { nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) return -ENOMEM; - nskb = __vlan_put_tag(nskb, nskb->vlan_proto, vlan_tx_tag_get(nskb)); + nskb = __vlan_hwaccel_push_inside(nskb); if (!nskb) return -ENOMEM; - nskb->vlan_tci = 0; skb = nskb; } @@ -424,7 +483,23 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, goto out; } - user_skb = genlmsg_new(upcall_msg_size(skb, upcall_info->userdata), GFP_ATOMIC); + /* Complete checksum if needed */ + if (skb->ip_summed == CHECKSUM_PARTIAL && + (err = skb_csum_hwoffload_help(skb, 0))) + goto out; + + /* Older versions of OVS user space enforce alignment of the last + * Netlink attribute to NLA_ALIGNTO which would require extensive + * padding logic. Only perform zerocopy if padding is not required. + */ + if (dp->user_features & OVS_DP_F_UNALIGNED) + hlen = skb_zerocopy_headlen(skb); + else + hlen = skb->len; + + len = upcall_msg_size(upcall_info, hlen - cutlen, + OVS_CB(skb)->acts_origlen); + user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { err = -ENOMEM; goto out; @@ -432,432 +507,122 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, upcall = genlmsg_put(user_skb, 0, 0, &dp_packet_genl_family, 0, upcall_info->cmd); + if (!upcall) { + err = -EINVAL; + goto out; + } upcall->dp_ifindex = dp_ifindex; - nla = nla_nest_start(user_skb, OVS_PACKET_ATTR_KEY); - ovs_flow_to_nlattrs(upcall_info->key, user_skb); - nla_nest_end(user_skb, nla); + err = ovs_nla_put_key(key, key, OVS_PACKET_ATTR_KEY, false, user_skb); + if (err) + goto out; if (upcall_info->userdata) __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, nla_len(upcall_info->userdata), nla_data(upcall_info->userdata)); - nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len); - - skb_copy_and_csum_dev(skb, nla_data(nla)); - - genlmsg_end(user_skb, upcall); - err = genlmsg_unicast(net, user_skb, upcall_info->portid); - -out: - kfree_skb(nskb); - return err; -} - -/* Called with ovs_mutex. */ -static int flush_flows(struct datapath *dp) -{ - struct flow_table *old_table; - struct flow_table *new_table; - - old_table = ovsl_dereference(dp->table); - new_table = ovs_flow_tbl_alloc(TBL_MIN_BUCKETS); - if (!new_table) - return -ENOMEM; - - rcu_assign_pointer(dp->table, new_table); - - ovs_flow_tbl_deferred_destroy(old_table); - return 0; -} + if (upcall_info->egress_tun_info) { + nla = nla_nest_start_noflag(user_skb, + OVS_PACKET_ATTR_EGRESS_TUN_KEY); + if (!nla) { + err = -EMSGSIZE; + goto out; + } + err = ovs_nla_put_tunnel_info(user_skb, + upcall_info->egress_tun_info); + if (err) + goto out; -static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, int attr_len) -{ + nla_nest_end(user_skb, nla); + } - struct sw_flow_actions *acts; - int new_acts_size; - int req_size = NLA_ALIGN(attr_len); - int next_offset = offsetof(struct sw_flow_actions, actions) + - (*sfa)->actions_len; + if (upcall_info->actions_len) { + nla = nla_nest_start_noflag(user_skb, OVS_PACKET_ATTR_ACTIONS); + if (!nla) { + err = -EMSGSIZE; + goto out; + } + err = ovs_nla_put_actions(upcall_info->actions, + upcall_info->actions_len, + user_skb); + if (!err) + nla_nest_end(user_skb, nla); + else + nla_nest_cancel(user_skb, nla); + } - if (req_size <= (ksize(*sfa) - next_offset)) + /* Add OVS_PACKET_ATTR_MRU */ + if (upcall_info->mru && + nla_put_u16(user_skb, OVS_PACKET_ATTR_MRU, upcall_info->mru)) { + err = -ENOBUFS; goto out; - - new_acts_size = ksize(*sfa) * 2; - - if (new_acts_size > MAX_ACTIONS_BUFSIZE) { - if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size) - return ERR_PTR(-EMSGSIZE); - new_acts_size = MAX_ACTIONS_BUFSIZE; } - acts = ovs_flow_actions_alloc(new_acts_size); - if (IS_ERR(acts)) - return (void *)acts; - - memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); - acts->actions_len = (*sfa)->actions_len; - kfree(*sfa); - *sfa = acts; - -out: - (*sfa)->actions_len += req_size; - return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset); -} - -static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len) -{ - struct nlattr *a; - - a = reserve_sfa_size(sfa, nla_attr_size(len)); - if (IS_ERR(a)) - return PTR_ERR(a); - - a->nla_type = attrtype; - a->nla_len = nla_attr_size(len); - - if (data) - memcpy(nla_data(a), data, len); - memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); - - return 0; -} - -static inline int add_nested_action_start(struct sw_flow_actions **sfa, int attrtype) -{ - int used = (*sfa)->actions_len; - int err; - - err = add_action(sfa, attrtype, NULL, 0); - if (err) - return err; - - return used; -} - -static inline void add_nested_action_end(struct sw_flow_actions *sfa, int st_offset) -{ - struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + st_offset); - - a->nla_len = sfa->actions_len - st_offset; -} + /* Add OVS_PACKET_ATTR_LEN when packet is truncated */ + if (cutlen > 0 && + nla_put_u32(user_skb, OVS_PACKET_ATTR_LEN, skb->len)) { + err = -ENOBUFS; + goto out; + } -static int validate_and_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, int depth, - struct sw_flow_actions **sfa); + /* Add OVS_PACKET_ATTR_HASH */ + hash = skb_get_hash_raw(skb); + if (skb->sw_hash) + hash |= OVS_PACKET_HASH_SW_BIT; -static int validate_and_copy_sample(const struct nlattr *attr, - const struct sw_flow_key *key, int depth, - struct sw_flow_actions **sfa) -{ - const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; - const struct nlattr *probability, *actions; - const struct nlattr *a; - int rem, start, err, st_acts; + if (skb->l4_hash) + hash |= OVS_PACKET_HASH_L4_BIT; - memset(attrs, 0, sizeof(attrs)); - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type]) - return -EINVAL; - attrs[type] = a; + if (nla_put(user_skb, OVS_PACKET_ATTR_HASH, sizeof (u64), &hash)) { + err = -ENOBUFS; + goto out; } - if (rem) - return -EINVAL; - - probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY]; - if (!probability || nla_len(probability) != sizeof(u32)) - return -EINVAL; - actions = attrs[OVS_SAMPLE_ATTR_ACTIONS]; - if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) - return -EINVAL; - - /* validation done, copy sample action. */ - start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE); - if (start < 0) - return start; - err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, nla_data(probability), sizeof(u32)); - if (err) - return err; - st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS); - if (st_acts < 0) - return st_acts; + /* Only reserve room for attribute header, packet data is added + * in skb_zerocopy() */ + if (!(nla = nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, 0))) { + err = -ENOBUFS; + goto out; + } + nla->nla_len = nla_attr_size(skb->len - cutlen); - err = validate_and_copy_actions(actions, key, depth + 1, sfa); + err = skb_zerocopy(user_skb, skb, skb->len - cutlen, hlen); if (err) - return err; - - add_nested_action_end(*sfa, st_acts); - add_nested_action_end(*sfa, start); - - return 0; -} - -static int validate_tp_port(const struct sw_flow_key *flow_key) -{ - if (flow_key->eth.type == htons(ETH_P_IP)) { - if (flow_key->ipv4.tp.src || flow_key->ipv4.tp.dst) - return 0; - } else if (flow_key->eth.type == htons(ETH_P_IPV6)) { - if (flow_key->ipv6.tp.src || flow_key->ipv6.tp.dst) - return 0; - } + goto out; - return -EINVAL; -} + /* Pad OVS_PACKET_ATTR_PACKET if linear copy was performed */ + pad_packet(dp, user_skb); -static int validate_and_copy_set_tun(const struct nlattr *attr, - struct sw_flow_actions **sfa) -{ - struct ovs_key_ipv4_tunnel tun_key; - int err, start; + ((struct nlmsghdr *) user_skb->data)->nlmsg_len = user_skb->len; - err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &tun_key); + err = genlmsg_unicast(ovs_dp_get_net(dp), user_skb, upcall_info->portid); + user_skb = NULL; +out: if (err) - return err; - - start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET); - if (start < 0) - return start; - - err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &tun_key, sizeof(tun_key)); - add_nested_action_end(*sfa, start); + skb_tx_error(skb); + consume_skb(user_skb); + consume_skb(nskb); return err; } -static int validate_set(const struct nlattr *a, - const struct sw_flow_key *flow_key, - struct sw_flow_actions **sfa, - bool *set_tun) -{ - const struct nlattr *ovs_key = nla_data(a); - int key_type = nla_type(ovs_key); - - /* There can be only one key in a action */ - if (nla_total_size(nla_len(ovs_key)) != nla_len(a)) - return -EINVAL; - - if (key_type > OVS_KEY_ATTR_MAX || - (ovs_key_lens[key_type] != nla_len(ovs_key) && - ovs_key_lens[key_type] != -1)) - return -EINVAL; - - switch (key_type) { - const struct ovs_key_ipv4 *ipv4_key; - const struct ovs_key_ipv6 *ipv6_key; - int err; - - case OVS_KEY_ATTR_PRIORITY: - case OVS_KEY_ATTR_SKB_MARK: - case OVS_KEY_ATTR_ETHERNET: - break; - - case OVS_KEY_ATTR_TUNNEL: - *set_tun = true; - err = validate_and_copy_set_tun(a, sfa); - if (err) - return err; - break; - - case OVS_KEY_ATTR_IPV4: - if (flow_key->eth.type != htons(ETH_P_IP)) - return -EINVAL; - - if (!flow_key->ip.proto) - return -EINVAL; - - ipv4_key = nla_data(ovs_key); - if (ipv4_key->ipv4_proto != flow_key->ip.proto) - return -EINVAL; - - if (ipv4_key->ipv4_frag != flow_key->ip.frag) - return -EINVAL; - - break; - - case OVS_KEY_ATTR_IPV6: - if (flow_key->eth.type != htons(ETH_P_IPV6)) - return -EINVAL; - - if (!flow_key->ip.proto) - return -EINVAL; - - ipv6_key = nla_data(ovs_key); - if (ipv6_key->ipv6_proto != flow_key->ip.proto) - return -EINVAL; - - if (ipv6_key->ipv6_frag != flow_key->ip.frag) - return -EINVAL; - - if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000) - return -EINVAL; - - break; - - case OVS_KEY_ATTR_TCP: - if (flow_key->ip.proto != IPPROTO_TCP) - return -EINVAL; - - return validate_tp_port(flow_key); - - case OVS_KEY_ATTR_UDP: - if (flow_key->ip.proto != IPPROTO_UDP) - return -EINVAL; - - return validate_tp_port(flow_key); - - default: - return -EINVAL; - } - - return 0; -} - -static int validate_userspace(const struct nlattr *attr) -{ - static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { - [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, - [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC }, - }; - struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; - int error; - - error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX, - attr, userspace_policy); - if (error) - return error; - - if (!a[OVS_USERSPACE_ATTR_PID] || - !nla_get_u32(a[OVS_USERSPACE_ATTR_PID])) - return -EINVAL; - - return 0; -} - -static int copy_action(const struct nlattr *from, - struct sw_flow_actions **sfa) -{ - int totlen = NLA_ALIGN(from->nla_len); - struct nlattr *to; - - to = reserve_sfa_size(sfa, from->nla_len); - if (IS_ERR(to)) - return PTR_ERR(to); - - memcpy(to, from, totlen); - return 0; -} - -static int validate_and_copy_actions(const struct nlattr *attr, - const struct sw_flow_key *key, - int depth, - struct sw_flow_actions **sfa) -{ - const struct nlattr *a; - int rem, err; - - if (depth >= SAMPLE_ACTION_DEPTH) - return -EOVERFLOW; - - nla_for_each_nested(a, attr, rem) { - /* Expected argument lengths, (u32)-1 for variable length. */ - static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { - [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), - [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, - [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), - [OVS_ACTION_ATTR_POP_VLAN] = 0, - [OVS_ACTION_ATTR_SET] = (u32)-1, - [OVS_ACTION_ATTR_SAMPLE] = (u32)-1 - }; - const struct ovs_action_push_vlan *vlan; - int type = nla_type(a); - bool skip_copy; - - if (type > OVS_ACTION_ATTR_MAX || - (action_lens[type] != nla_len(a) && - action_lens[type] != (u32)-1)) - return -EINVAL; - - skip_copy = false; - switch (type) { - case OVS_ACTION_ATTR_UNSPEC: - return -EINVAL; - - case OVS_ACTION_ATTR_USERSPACE: - err = validate_userspace(a); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_OUTPUT: - if (nla_get_u32(a) >= DP_MAX_PORTS) - return -EINVAL; - break; - - - case OVS_ACTION_ATTR_POP_VLAN: - break; - - case OVS_ACTION_ATTR_PUSH_VLAN: - vlan = nla_data(a); - if (vlan->vlan_tpid != htons(ETH_P_8021Q)) - return -EINVAL; - if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT))) - return -EINVAL; - break; - - case OVS_ACTION_ATTR_SET: - err = validate_set(a, key, sfa, &skip_copy); - if (err) - return err; - break; - - case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(a, key, depth, sfa); - if (err) - return err; - skip_copy = true; - break; - - default: - return -EINVAL; - } - if (!skip_copy) { - err = copy_action(a, sfa); - if (err) - return err; - } - } - - if (rem > 0) - return -EINVAL; - - return 0; -} - -static void clear_stats(struct sw_flow *flow) -{ - flow->used = 0; - flow->tcp_flags = 0; - flow->packet_count = 0; - flow->byte_count = 0; -} - static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) { - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; struct sw_flow_actions *acts; struct sk_buff *packet; struct sw_flow *flow; + struct sw_flow_actions *sf_acts; struct datapath *dp; - struct ethhdr *eth; + struct vport *input_vport; + u16 mru = 0; + u64 hash; int len; int err; - int key_len; + bool log = !a[OVS_PACKET_ATTR_PROBE]; err = -EINVAL; if (!a[OVS_PACKET_ATTR_PACKET] || !a[OVS_PACKET_ATTR_KEY] || @@ -873,16 +638,23 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) nla_memcpy(__skb_put(packet, len), a[OVS_PACKET_ATTR_PACKET], len); - skb_reset_mac_header(packet); - eth = eth_hdr(packet); + /* Set packet's mru */ + if (a[OVS_PACKET_ATTR_MRU]) { + mru = nla_get_u16(a[OVS_PACKET_ATTR_MRU]); + packet->ignore_df = 1; + } + OVS_CB(packet)->mru = mru; - /* Normally, setting the skb 'protocol' field would be handled by a - * call to eth_type_trans(), but it assumes there's a sending - * device, which we may not have. */ - if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) - packet->protocol = eth->h_proto; - else - packet->protocol = htons(ETH_P_802_2); + if (a[OVS_PACKET_ATTR_HASH]) { + hash = nla_get_u64(a[OVS_PACKET_ATTR_HASH]); + + __skb_set_hash(packet, hash & 0xFFFFFFFFULL, + !!(hash & OVS_PACKET_HASH_SW_BIT), + !!(hash & OVS_PACKET_HASH_L4_BIT)); + } + + OVS_CB(packet)->upcall_pid = + nla_get_u32_default(a[OVS_PACKET_ATTR_UPCALL_PID], 0); /* Build an sw_flow for sending this packet. */ flow = ovs_flow_alloc(); @@ -890,45 +662,55 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) if (IS_ERR(flow)) goto err_kfree_skb; - err = ovs_flow_extract(packet, -1, &flow->key, &key_len); + err = ovs_flow_key_extract_userspace(net, a[OVS_PACKET_ATTR_KEY], + packet, &flow->key, log); if (err) goto err_flow_free; - err = ovs_flow_metadata_from_nlattrs(flow, key_len, a[OVS_PACKET_ATTR_KEY]); + err = ovs_nla_copy_actions(net, a[OVS_PACKET_ATTR_ACTIONS], + &flow->key, &acts, log); if (err) goto err_flow_free; - acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS])); - err = PTR_ERR(acts); - if (IS_ERR(acts)) - goto err_flow_free; - err = validate_and_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts); rcu_assign_pointer(flow->sf_acts, acts); - if (err) - goto err_flow_free; - - OVS_CB(packet)->flow = flow; packet->priority = flow->key.phy.priority; packet->mark = flow->key.phy.skb_mark; rcu_read_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + dp = get_dp_rcu(net, ovs_header->dp_ifindex); err = -ENODEV; if (!dp) goto err_unlock; + input_vport = ovs_vport_rcu(dp, flow->key.phy.in_port); + if (!input_vport) + input_vport = ovs_vport_rcu(dp, OVSP_LOCAL); + + if (!input_vport) + goto err_unlock; + + packet->dev = input_vport->dev; + OVS_CB(packet)->input_vport = input_vport; + sf_acts = rcu_dereference(flow->sf_acts); + local_bh_disable(); - err = ovs_execute_actions(dp, packet); + local_lock_nested_bh(&ovs_pcpu_storage->bh_lock); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_write(ovs_pcpu_storage->owner, current); + err = ovs_execute_actions(dp, packet, sf_acts, &flow->key); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + this_cpu_write(ovs_pcpu_storage->owner, NULL); + local_unlock_nested_bh(&ovs_pcpu_storage->bh_lock); local_bh_enable(); rcu_read_unlock(); - ovs_flow_free(flow); + ovs_flow_free(flow, false); return err; err_unlock: rcu_read_unlock(); err_flow_free: - ovs_flow_free(flow); + ovs_flow_free(flow, false); err_kfree_skb: kfree_skb(packet); err: @@ -939,24 +721,46 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = { [OVS_PACKET_ATTR_PACKET] = { .len = ETH_HLEN }, [OVS_PACKET_ATTR_KEY] = { .type = NLA_NESTED }, [OVS_PACKET_ATTR_ACTIONS] = { .type = NLA_NESTED }, + [OVS_PACKET_ATTR_PROBE] = { .type = NLA_FLAG }, + [OVS_PACKET_ATTR_MRU] = { .type = NLA_U16 }, + [OVS_PACKET_ATTR_HASH] = { .type = NLA_U64 }, + [OVS_PACKET_ATTR_UPCALL_PID] = { .type = NLA_U32 }, }; -static struct genl_ops dp_packet_genl_ops[] = { +static const struct genl_small_ops dp_packet_genl_ops[] = { { .cmd = OVS_PACKET_CMD_EXECUTE, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = packet_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_packet_cmd_execute } }; -static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats) +static struct genl_family dp_packet_genl_family __ro_after_init = { + .hdrsize = sizeof(struct ovs_header), + .name = OVS_PACKET_FAMILY, + .version = OVS_PACKET_VERSION, + .maxattr = OVS_PACKET_ATTR_MAX, + .policy = packet_policy, + .netnsok = true, + .parallel_ops = true, + .small_ops = dp_packet_genl_ops, + .n_small_ops = ARRAY_SIZE(dp_packet_genl_ops), + .resv_start_op = OVS_PACKET_CMD_EXECUTE + 1, + .module = THIS_MODULE, +}; + +static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, + struct ovs_dp_megaflow_stats *mega_stats) { int i; - struct flow_table *table = ovsl_dereference(dp->table); - stats->n_flows = ovs_flow_tbl_count(table); + memset(mega_stats, 0, sizeof(*mega_stats)); + + stats->n_flows = ovs_flow_tbl_count(&dp->table); + mega_stats->n_masks = ovs_flow_tbl_num_masks(&dp->table); stats->n_hit = stats->n_missed = stats->n_lost = 0; + for_each_possible_cpu(i) { const struct dp_stats_percpu *percpu_stats; struct dp_stats_percpu local_stats; @@ -965,189 +769,100 @@ static void get_dp_stats(struct datapath *dp, struct ovs_dp_stats *stats) percpu_stats = per_cpu_ptr(dp->stats_percpu, i); do { - start = u64_stats_fetch_begin_bh(&percpu_stats->sync); + start = u64_stats_fetch_begin(&percpu_stats->syncp); local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_bh(&percpu_stats->sync, start)); + } while (u64_stats_fetch_retry(&percpu_stats->syncp, start)); stats->n_hit += local_stats.n_hit; stats->n_missed += local_stats.n_missed; stats->n_lost += local_stats.n_lost; + mega_stats->n_mask_hit += local_stats.n_mask_hit; + mega_stats->n_cache_hit += local_stats.n_cache_hit; } } -static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { - [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, - [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, - [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, -}; - -static struct genl_family dp_flow_genl_family = { - .id = GENL_ID_GENERATE, - .hdrsize = sizeof(struct ovs_header), - .name = OVS_FLOW_FAMILY, - .version = OVS_FLOW_VERSION, - .maxattr = OVS_FLOW_ATTR_MAX, - .netnsok = true, - .parallel_ops = true, -}; - -static struct genl_multicast_group ovs_dp_flow_multicast_group = { - .name = OVS_FLOW_MCGROUP -}; - -static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb); -static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) +static bool should_fill_key(const struct sw_flow_id *sfid, uint32_t ufid_flags) { - const struct nlattr *a; - struct nlattr *start; - int err = 0, rem; - - start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE); - if (!start) - return -EMSGSIZE; - - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - struct nlattr *st_sample; - - switch (type) { - case OVS_SAMPLE_ATTR_PROBABILITY: - if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY, sizeof(u32), nla_data(a))) - return -EMSGSIZE; - break; - case OVS_SAMPLE_ATTR_ACTIONS: - st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS); - if (!st_sample) - return -EMSGSIZE; - err = actions_to_attr(nla_data(a), nla_len(a), skb); - if (err) - return err; - nla_nest_end(skb, st_sample); - break; - } - } - - nla_nest_end(skb, start); - return err; + return ovs_identifier_is_ufid(sfid) && + !(ufid_flags & OVS_UFID_F_OMIT_KEY); } -static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) +static bool should_fill_mask(uint32_t ufid_flags) { - const struct nlattr *ovs_key = nla_data(a); - int key_type = nla_type(ovs_key); - struct nlattr *start; - int err; - - switch (key_type) { - case OVS_KEY_ATTR_IPV4_TUNNEL: - start = nla_nest_start(skb, OVS_ACTION_ATTR_SET); - if (!start) - return -EMSGSIZE; - - err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key)); - if (err) - return err; - nla_nest_end(skb, start); - break; - default: - if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) - return -EMSGSIZE; - break; - } + return !(ufid_flags & OVS_UFID_F_OMIT_MASK); +} - return 0; +static bool should_fill_actions(uint32_t ufid_flags) +{ + return !(ufid_flags & OVS_UFID_F_OMIT_ACTIONS); } -static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb) +static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts, + const struct sw_flow_id *sfid, + uint32_t ufid_flags) { - const struct nlattr *a; - int rem, err; + size_t len = NLMSG_ALIGN(sizeof(struct ovs_header)); - nla_for_each_attr(a, attr, len, rem) { - int type = nla_type(a); + /* OVS_FLOW_ATTR_UFID, or unmasked flow key as fallback + * see ovs_nla_put_identifier() + */ + if (sfid && ovs_identifier_is_ufid(sfid)) + len += nla_total_size(sfid->ufid_len); + else + len += nla_total_size(ovs_key_attr_size()); - switch (type) { - case OVS_ACTION_ATTR_SET: - err = set_action_to_attr(a, skb); - if (err) - return err; - break; + /* OVS_FLOW_ATTR_KEY */ + if (!sfid || should_fill_key(sfid, ufid_flags)) + len += nla_total_size(ovs_key_attr_size()); - case OVS_ACTION_ATTR_SAMPLE: - err = sample_action_to_attr(a, skb); - if (err) - return err; - break; - default: - if (nla_put(skb, type, nla_len(a), nla_data(a))) - return -EMSGSIZE; - break; - } - } + /* OVS_FLOW_ATTR_MASK */ + if (should_fill_mask(ufid_flags)) + len += nla_total_size(ovs_key_attr_size()); - return 0; -} + /* OVS_FLOW_ATTR_ACTIONS */ + if (should_fill_actions(ufid_flags)) + len += nla_total_size(acts->orig_len); -static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts) -{ - return NLMSG_ALIGN(sizeof(struct ovs_header)) - + nla_total_size(key_attr_size()) /* OVS_FLOW_ATTR_KEY */ - + nla_total_size(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + return len + + nla_total_size_64bit(sizeof(struct ovs_flow_stats)) /* OVS_FLOW_ATTR_STATS */ + nla_total_size(1) /* OVS_FLOW_ATTR_TCP_FLAGS */ - + nla_total_size(8) /* OVS_FLOW_ATTR_USED */ - + nla_total_size(acts->actions_len); /* OVS_FLOW_ATTR_ACTIONS */ + + nla_total_size_64bit(8); /* OVS_FLOW_ATTR_USED */ } -/* Called with ovs_mutex. */ -static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, - struct sk_buff *skb, u32 portid, - u32 seq, u32 flags, u8 cmd) +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_stats(const struct sw_flow *flow, + struct sk_buff *skb) { - const int skb_orig_len = skb->len; - const struct sw_flow_actions *sf_acts; - struct nlattr *start; struct ovs_flow_stats stats; - struct ovs_header *ovs_header; - struct nlattr *nla; + __be16 tcp_flags; unsigned long used; - u8 tcp_flags; - int err; - sf_acts = ovsl_dereference(flow->sf_acts); + ovs_flow_stats_get(flow, &stats, &used, &tcp_flags); - ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, flags, cmd); - if (!ovs_header) + if (used && + nla_put_u64_64bit(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used), + OVS_FLOW_ATTR_PAD)) return -EMSGSIZE; - ovs_header->dp_ifindex = get_dpifindex(dp); - - nla = nla_nest_start(skb, OVS_FLOW_ATTR_KEY); - if (!nla) - goto nla_put_failure; - err = ovs_flow_to_nlattrs(&flow->key, skb); - if (err) - goto error; - nla_nest_end(skb, nla); - - spin_lock_bh(&flow->lock); - used = flow->used; - stats.n_packets = flow->packet_count; - stats.n_bytes = flow->byte_count; - tcp_flags = flow->tcp_flags; - spin_unlock_bh(&flow->lock); + if (stats.n_packets && + nla_put_64bit(skb, OVS_FLOW_ATTR_STATS, + sizeof(struct ovs_flow_stats), &stats, + OVS_FLOW_ATTR_PAD)) + return -EMSGSIZE; - if (used && - nla_put_u64(skb, OVS_FLOW_ATTR_USED, ovs_flow_used_time(used))) - goto nla_put_failure; + if ((u8)ntohs(tcp_flags) && + nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, (u8)ntohs(tcp_flags))) + return -EMSGSIZE; - if (stats.n_packets && - nla_put(skb, OVS_FLOW_ATTR_STATS, - sizeof(struct ovs_flow_stats), &stats)) - goto nla_put_failure; + return 0; +} - if (tcp_flags && - nla_put_u8(skb, OVS_FLOW_ATTR_TCP_FLAGS, tcp_flags)) - goto nla_put_failure; +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_actions(const struct sw_flow *flow, + struct sk_buff *skb, int skb_orig_len) +{ + struct nlattr *start; + int err; /* If OVS_FLOW_ATTR_ACTIONS doesn't fit, skip dumping the actions if * this is the first flow to be dumped into 'skb'. This is unusual for @@ -1159,134 +874,229 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp, * This can only fail for dump operations because the skb is always * properly sized for single flows. */ - start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS); + start = nla_nest_start_noflag(skb, OVS_FLOW_ATTR_ACTIONS); if (start) { - err = actions_to_attr(sf_acts->actions, sf_acts->actions_len, skb); + const struct sw_flow_actions *sf_acts; + + sf_acts = rcu_dereference_ovsl(flow->sf_acts); + err = ovs_nla_put_actions(sf_acts->actions, + sf_acts->actions_len, skb); + if (!err) nla_nest_end(skb, start); else { if (skb_orig_len) - goto error; + return err; nla_nest_cancel(skb, start); } - } else if (skb_orig_len) - goto nla_put_failure; + } else if (skb_orig_len) { + return -EMSGSIZE; + } - return genlmsg_end(skb, ovs_header); + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +static int ovs_flow_cmd_fill_info(const struct sw_flow *flow, int dp_ifindex, + struct sk_buff *skb, u32 portid, + u32 seq, u32 flags, u8 cmd, u32 ufid_flags) +{ + const int skb_orig_len = skb->len; + struct ovs_header *ovs_header; + int err; + + ovs_header = genlmsg_put(skb, portid, seq, &dp_flow_genl_family, + flags, cmd); + if (!ovs_header) + return -EMSGSIZE; + + ovs_header->dp_ifindex = dp_ifindex; + + err = ovs_nla_put_identifier(flow, skb); + if (err) + goto error; + + if (should_fill_key(&flow->id, ufid_flags)) { + err = ovs_nla_put_masked_key(flow, skb); + if (err) + goto error; + } + + if (should_fill_mask(ufid_flags)) { + err = ovs_nla_put_mask(flow, skb); + if (err) + goto error; + } + + err = ovs_flow_cmd_fill_stats(flow, skb); + if (err) + goto error; + + if (should_fill_actions(ufid_flags)) { + err = ovs_flow_cmd_fill_actions(flow, skb, skb_orig_len); + if (err) + goto error; + } + + genlmsg_end(skb, ovs_header); + return 0; -nla_put_failure: - err = -EMSGSIZE; error: genlmsg_cancel(skb, ovs_header); return err; } -static struct sk_buff *ovs_flow_cmd_alloc_info(struct sw_flow *flow) +/* May not be called with RCU read lock. */ +static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *acts, + const struct sw_flow_id *sfid, + struct genl_info *info, + bool always, + uint32_t ufid_flags) { - const struct sw_flow_actions *sf_acts; + struct sk_buff *skb; + size_t len; + + if (!always && !ovs_must_notify(&dp_flow_genl_family, info, 0)) + return NULL; - sf_acts = ovsl_dereference(flow->sf_acts); + len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags); + skb = genlmsg_new(len, GFP_KERNEL); + if (!skb) + return ERR_PTR(-ENOMEM); - return genlmsg_new(ovs_flow_cmd_msg_size(sf_acts), GFP_KERNEL); + return skb; } -static struct sk_buff *ovs_flow_cmd_build_info(struct sw_flow *flow, - struct datapath *dp, - u32 portid, u32 seq, u8 cmd) +/* Called with ovs_mutex. */ +static struct sk_buff *ovs_flow_cmd_build_info(const struct sw_flow *flow, + int dp_ifindex, + struct genl_info *info, u8 cmd, + bool always, u32 ufid_flags) { struct sk_buff *skb; int retval; - skb = ovs_flow_cmd_alloc_info(flow); - if (!skb) - return ERR_PTR(-ENOMEM); + skb = ovs_flow_cmd_alloc_info(ovsl_dereference(flow->sf_acts), + &flow->id, info, always, ufid_flags); + if (IS_ERR_OR_NULL(skb)) + return skb; - retval = ovs_flow_cmd_fill_info(flow, dp, skb, portid, seq, 0, cmd); - BUG_ON(retval < 0); + retval = ovs_flow_cmd_fill_info(flow, dp_ifindex, skb, + info->snd_portid, info->snd_seq, 0, + cmd, ufid_flags); + if (WARN_ON_ONCE(retval < 0)) { + kfree_skb(skb); + skb = ERR_PTR(retval); + } return skb; } -static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) +static int ovs_flow_cmd_new(struct sk_buff *skb, struct genl_info *info) { + struct net *net = sock_net(skb->sk); struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; - struct sw_flow_key key; - struct sw_flow *flow; + struct ovs_header *ovs_header = genl_info_userhdr(info); + struct sw_flow *flow = NULL, *new_flow; + struct sw_flow_mask mask; struct sk_buff *reply; struct datapath *dp; - struct flow_table *table; - struct sw_flow_actions *acts = NULL; + struct sw_flow_key *key; + struct sw_flow_actions *acts; + struct sw_flow_match match; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int error; - int key_len; + bool log = !a[OVS_FLOW_ATTR_PROBE]; - /* Extract key. */ + /* Must have key and actions. */ error = -EINVAL; - if (!a[OVS_FLOW_ATTR_KEY]) + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR(log, "Flow key attr not present in new flow."); goto error; - error = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); - if (error) + } + if (!a[OVS_FLOW_ATTR_ACTIONS]) { + OVS_NLERR(log, "Flow actions attr not present in new flow."); goto error; + } - /* Validate actions. */ - if (a[OVS_FLOW_ATTR_ACTIONS]) { - acts = ovs_flow_actions_alloc(nla_len(a[OVS_FLOW_ATTR_ACTIONS])); - error = PTR_ERR(acts); - if (IS_ERR(acts)) - goto error; - - error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0, &acts); - if (error) - goto err_kfree; - } else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) { - error = -EINVAL; + /* Most of the time we need to allocate a new flow, do it before + * locking. + */ + new_flow = ovs_flow_alloc(); + if (IS_ERR(new_flow)) { + error = PTR_ERR(new_flow); goto error; } - ovs_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - error = -ENODEV; - if (!dp) - goto err_unlock_ovs; + /* Extract key. */ + key = kzalloc(sizeof(*key), GFP_KERNEL); + if (!key) { + error = -ENOMEM; + goto err_kfree_flow; + } - table = ovsl_dereference(dp->table); - flow = ovs_flow_tbl_lookup(table, &key, key_len); - if (!flow) { - /* Bail out if we're not allowed to create a new flow. */ - error = -ENOENT; - if (info->genlhdr->cmd == OVS_FLOW_CMD_SET) - goto err_unlock_ovs; + ovs_match_init(&match, key, false, &mask); + error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], + a[OVS_FLOW_ATTR_MASK], log); + if (error) + goto err_kfree_key; - /* Expand table, if necessary, to make room. */ - if (ovs_flow_tbl_need_to_expand(table)) { - struct flow_table *new_table; + ovs_flow_mask_key(&new_flow->key, key, true, &mask); - new_table = ovs_flow_tbl_expand(table); - if (!IS_ERR(new_table)) { - rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_deferred_destroy(table); - table = ovsl_dereference(dp->table); - } - } + /* Extract flow identifier. */ + error = ovs_nla_get_identifier(&new_flow->id, a[OVS_FLOW_ATTR_UFID], + key, log); + if (error) + goto err_kfree_key; - /* Allocate flow. */ - flow = ovs_flow_alloc(); - if (IS_ERR(flow)) { - error = PTR_ERR(flow); - goto err_unlock_ovs; - } - clear_stats(flow); + /* Validate actions. */ + error = ovs_nla_copy_actions(net, a[OVS_FLOW_ATTR_ACTIONS], + &new_flow->key, &acts, log); + if (error) { + OVS_NLERR(log, "Flow actions may not be safe on all matching packets."); + goto err_kfree_key; + } - rcu_assign_pointer(flow->sf_acts, acts); + reply = ovs_flow_cmd_alloc_info(acts, &new_flow->id, info, false, + ufid_flags); + if (IS_ERR(reply)) { + error = PTR_ERR(reply); + goto err_kfree_acts; + } + + ovs_lock(); + dp = get_dp(net, ovs_header->dp_ifindex); + if (unlikely(!dp)) { + error = -ENODEV; + goto err_unlock_ovs; + } + + /* Check if this is a duplicate flow */ + if (ovs_identifier_is_ufid(&new_flow->id)) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &new_flow->id); + if (!flow) + flow = ovs_flow_tbl_lookup(&dp->table, key); + if (likely(!flow)) { + rcu_assign_pointer(new_flow->sf_acts, acts); /* Put flow in bucket. */ - ovs_flow_tbl_insert(table, flow, &key, key_len); + error = ovs_flow_tbl_insert(&dp->table, new_flow, &mask); + if (unlikely(error)) { + acts = NULL; + goto err_unlock_ovs; + } - reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, - info->snd_seq, - OVS_FLOW_CMD_NEW); + if (unlikely(reply)) { + error = ovs_flow_cmd_fill_info(new_flow, + ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_NEW, + ufid_flags); + BUG_ON(error < 0); + } + ovs_unlock(); } else { - /* We found a matching flow. */ struct sw_flow_actions *old_acts; /* Bail out if we're not allowed to modify an existing flow. @@ -1295,39 +1105,238 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info) * request. We also accept NLM_F_EXCL in case that bug ever * gets fixed. */ - error = -EEXIST; - if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW && - info->nlhdr->nlmsg_flags & (NLM_F_CREATE | NLM_F_EXCL)) + if (unlikely(info->nlhdr->nlmsg_flags & (NLM_F_CREATE + | NLM_F_EXCL))) { + error = -EEXIST; goto err_unlock_ovs; - + } + /* The flow identifier has to be the same for flow updates. + * Look for any overlapping flow. + */ + if (unlikely(!ovs_flow_cmp(flow, &match))) { + if (ovs_identifier_is_key(&flow->id)) + flow = ovs_flow_tbl_lookup_exact(&dp->table, + &match); + else /* UFID matches but key is different */ + flow = NULL; + if (!flow) { + error = -ENOENT; + goto err_unlock_ovs; + } + } /* Update actions. */ old_acts = ovsl_dereference(flow->sf_acts); rcu_assign_pointer(flow->sf_acts, acts); - ovs_flow_deferred_free_acts(old_acts); - - reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, - info->snd_seq, OVS_FLOW_CMD_NEW); - /* Clear stats. */ - if (a[OVS_FLOW_ATTR_CLEAR]) { - spin_lock_bh(&flow->lock); - clear_stats(flow); - spin_unlock_bh(&flow->lock); + if (unlikely(reply)) { + error = ovs_flow_cmd_fill_info(flow, + ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_NEW, + ufid_flags); + BUG_ON(error < 0); } + ovs_unlock(); + + ovs_nla_free_flow_actions_rcu(old_acts); + ovs_flow_free(new_flow, false); } + + if (reply) + ovs_notify(&dp_flow_genl_family, reply, info); + + kfree(key); + return 0; + +err_unlock_ovs: ovs_unlock(); + kfree_skb(reply); +err_kfree_acts: + ovs_nla_free_flow_actions(acts); +err_kfree_key: + kfree(key); +err_kfree_flow: + ovs_flow_free(new_flow, false); +error: + return error; +} + +/* Factor out action copy to avoid "Wframe-larger-than=1024" warning. */ +static noinline_for_stack +struct sw_flow_actions *get_flow_actions(struct net *net, + const struct nlattr *a, + const struct sw_flow_key *key, + const struct sw_flow_mask *mask, + bool log) +{ + struct sw_flow_actions *acts; + struct sw_flow_key masked_key; + int error; - if (!IS_ERR(reply)) - ovs_notify(reply, info, &ovs_dp_flow_multicast_group); + ovs_flow_mask_key(&masked_key, key, true, mask); + error = ovs_nla_copy_actions(net, a, &masked_key, &acts, log); + if (error) { + OVS_NLERR(log, + "Actions may not be safe on all matching packets"); + return ERR_PTR(error); + } + + return acts; +} + +/* Factor out match-init and action-copy to avoid + * "Wframe-larger-than=1024" warning. Because mask is only + * used to get actions, we new a function to save some + * stack space. + * + * If there are not key and action attrs, we return 0 + * directly. In the case, the caller will also not use the + * match as before. If there is action attr, we try to get + * actions and save them to *acts. Before returning from + * the function, we reset the match->mask pointer. Because + * we should not to return match object with dangling reference + * to mask. + * */ +static noinline_for_stack int +ovs_nla_init_match_and_action(struct net *net, + struct sw_flow_match *match, + struct sw_flow_key *key, + struct nlattr **a, + struct sw_flow_actions **acts, + bool log) +{ + struct sw_flow_mask mask; + int error = 0; + + if (a[OVS_FLOW_ATTR_KEY]) { + ovs_match_init(match, key, true, &mask); + error = ovs_nla_get_match(net, match, a[OVS_FLOW_ATTR_KEY], + a[OVS_FLOW_ATTR_MASK], log); + if (error) + goto error; + } + + if (a[OVS_FLOW_ATTR_ACTIONS]) { + if (!a[OVS_FLOW_ATTR_KEY]) { + OVS_NLERR(log, + "Flow key attribute not present in set flow."); + error = -EINVAL; + goto error; + } + + *acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], key, + &mask, log); + if (IS_ERR(*acts)) { + error = PTR_ERR(*acts); + goto error; + } + } + + /* On success, error is 0. */ +error: + match->mask = NULL; + return error; +} + +static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = sock_net(skb->sk); + struct nlattr **a = info->attrs; + struct ovs_header *ovs_header = genl_info_userhdr(info); + struct sw_flow_key key; + struct sw_flow *flow; + struct sk_buff *reply = NULL; + struct datapath *dp; + struct sw_flow_actions *old_acts = NULL, *acts = NULL; + struct sw_flow_match match; + struct sw_flow_id sfid; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); + int error = 0; + bool log = !a[OVS_FLOW_ATTR_PROBE]; + bool ufid_present; + + ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log); + if (!a[OVS_FLOW_ATTR_KEY] && !ufid_present) { + OVS_NLERR(log, + "Flow set message rejected, Key attribute missing."); + return -EINVAL; + } + + error = ovs_nla_init_match_and_action(net, &match, &key, a, + &acts, log); + if (error) + goto error; + + if (acts) { + /* Can allocate before locking if have acts. */ + reply = ovs_flow_cmd_alloc_info(acts, &sfid, info, false, + ufid_flags); + if (IS_ERR(reply)) { + error = PTR_ERR(reply); + goto err_kfree_acts; + } + } + + ovs_lock(); + dp = get_dp(net, ovs_header->dp_ifindex); + if (unlikely(!dp)) { + error = -ENODEV; + goto err_unlock_ovs; + } + /* Check that the flow exists. */ + if (ufid_present) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &sfid); else - netlink_set_err(sock_net(skb->sk)->genl_sock, 0, - ovs_dp_flow_multicast_group.id, PTR_ERR(reply)); + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (unlikely(!flow)) { + error = -ENOENT; + goto err_unlock_ovs; + } + + /* Update actions, if present. */ + if (likely(acts)) { + old_acts = ovsl_dereference(flow->sf_acts); + rcu_assign_pointer(flow->sf_acts, acts); + + if (unlikely(reply)) { + error = ovs_flow_cmd_fill_info(flow, + ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_SET, + ufid_flags); + BUG_ON(error < 0); + } + } else { + /* Could not alloc without acts before locking. */ + reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, + info, OVS_FLOW_CMD_SET, false, + ufid_flags); + + if (IS_ERR(reply)) { + error = PTR_ERR(reply); + goto err_unlock_ovs; + } + } + + /* Clear stats. */ + if (a[OVS_FLOW_ATTR_CLEAR]) + ovs_flow_stats_clear(flow); + ovs_unlock(); + + if (reply) + ovs_notify(&dp_flow_genl_family, reply, info); + if (old_acts) + ovs_nla_free_flow_actions_rcu(old_acts); + return 0; err_unlock_ovs: ovs_unlock(); -err_kfree: - kfree(acts); + kfree_skb(reply); +err_kfree_acts: + ovs_nla_free_flow_actions(acts); error: return error; } @@ -1335,18 +1344,29 @@ error: static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); + struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; struct sw_flow *flow; struct datapath *dp; - struct flow_table *table; - int err; - int key_len; - - if (!a[OVS_FLOW_ATTR_KEY]) - return -EINVAL; - err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); + struct sw_flow_match match; + struct sw_flow_id ufid; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); + int err = 0; + bool log = !a[OVS_FLOW_ATTR_PROBE]; + bool ufid_present; + + ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); + if (a[OVS_FLOW_ATTR_KEY]) { + ovs_match_init(&match, &key, true, NULL); + err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], NULL, + log); + } else if (!ufid_present) { + OVS_NLERR(log, + "Flow get message rejected, Key attribute missing."); + err = -EINVAL; + } if (err) return err; @@ -1357,15 +1377,17 @@ static int ovs_flow_cmd_get(struct sk_buff *skb, struct genl_info *info) goto unlock; } - table = ovsl_dereference(dp->table); - flow = ovs_flow_tbl_lookup(table, &key, key_len); + if (ufid_present) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); + else + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); if (!flow) { err = -ENOENT; goto unlock; } - reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid, - info->snd_seq, OVS_FLOW_CMD_NEW); + reply = ovs_flow_cmd_build_info(flow, ovs_header->dp_ifindex, info, + OVS_FLOW_CMD_GET, true, ufid_flags); if (IS_ERR(reply)) { err = PTR_ERR(reply); goto unlock; @@ -1381,53 +1403,77 @@ unlock: static int ovs_flow_cmd_del(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); + struct net *net = sock_net(skb->sk); struct sw_flow_key key; struct sk_buff *reply; - struct sw_flow *flow; + struct sw_flow *flow = NULL; struct datapath *dp; - struct flow_table *table; + struct sw_flow_match match; + struct sw_flow_id ufid; + u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); int err; - int key_len; + bool log = !a[OVS_FLOW_ATTR_PROBE]; + bool ufid_present; + + ufid_present = ovs_nla_get_ufid(&ufid, a[OVS_FLOW_ATTR_UFID], log); + if (a[OVS_FLOW_ATTR_KEY]) { + ovs_match_init(&match, &key, true, NULL); + err = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY], + NULL, log); + if (unlikely(err)) + return err; + } ovs_lock(); dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - if (!dp) { + if (unlikely(!dp)) { err = -ENODEV; goto unlock; } - if (!a[OVS_FLOW_ATTR_KEY]) { - err = flush_flows(dp); + if (unlikely(!a[OVS_FLOW_ATTR_KEY] && !ufid_present)) { + err = ovs_flow_tbl_flush(&dp->table); goto unlock; } - err = ovs_flow_from_nlattrs(&key, &key_len, a[OVS_FLOW_ATTR_KEY]); - if (err) - goto unlock; - table = ovsl_dereference(dp->table); - flow = ovs_flow_tbl_lookup(table, &key, key_len); - if (!flow) { + if (ufid_present) + flow = ovs_flow_tbl_lookup_ufid(&dp->table, &ufid); + else + flow = ovs_flow_tbl_lookup_exact(&dp->table, &match); + if (unlikely(!flow)) { err = -ENOENT; goto unlock; } - reply = ovs_flow_cmd_alloc_info(flow); - if (!reply) { - err = -ENOMEM; - goto unlock; - } - - ovs_flow_tbl_remove(table, flow); + ovs_flow_tbl_remove(&dp->table, flow); + ovs_unlock(); - err = ovs_flow_cmd_fill_info(flow, dp, reply, info->snd_portid, - info->snd_seq, 0, OVS_FLOW_CMD_DEL); - BUG_ON(err < 0); + reply = ovs_flow_cmd_alloc_info((const struct sw_flow_actions __force *) flow->sf_acts, + &flow->id, info, false, ufid_flags); + if (likely(reply)) { + if (!IS_ERR(reply)) { + rcu_read_lock(); /*To keep RCU checker happy. */ + err = ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, + reply, info->snd_portid, + info->snd_seq, 0, + OVS_FLOW_CMD_DEL, + ufid_flags); + rcu_read_unlock(); + if (WARN_ON_ONCE(err < 0)) { + kfree_skb(reply); + goto out_free; + } - ovs_flow_deferred_free(flow); - ovs_unlock(); + ovs_notify(&dp_flow_genl_family, reply, info); + } else { + netlink_set_err(sock_net(skb->sk)->genl_sock, 0, 0, + PTR_ERR(reply)); + } + } - ovs_notify(reply, info, &ovs_dp_flow_multicast_group); +out_free: + ovs_flow_free(flow, true); return 0; unlock: ovs_unlock(); @@ -1436,83 +1482,98 @@ unlock: static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) { + struct nlattr *a[__OVS_FLOW_ATTR_MAX]; struct ovs_header *ovs_header = genlmsg_data(nlmsg_data(cb->nlh)); + struct table_instance *ti; struct datapath *dp; - struct flow_table *table; + u32 ufid_flags; + int err; - ovs_lock(); - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + err = genlmsg_parse_deprecated(cb->nlh, &dp_flow_genl_family, a, + OVS_FLOW_ATTR_MAX, flow_policy, NULL); + if (err) + return err; + ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); + + rcu_read_lock(); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); if (!dp) { - ovs_unlock(); + rcu_read_unlock(); return -ENODEV; } - table = ovsl_dereference(dp->table); - + ti = rcu_dereference(dp->table.ti); for (;;) { struct sw_flow *flow; u32 bucket, obj; bucket = cb->args[0]; obj = cb->args[1]; - flow = ovs_flow_tbl_next(table, &bucket, &obj); + flow = ovs_flow_tbl_dump_next(ti, &bucket, &obj); if (!flow) break; - if (ovs_flow_cmd_fill_info(flow, dp, skb, + if (ovs_flow_cmd_fill_info(flow, ovs_header->dp_ifindex, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - OVS_FLOW_CMD_NEW) < 0) + OVS_FLOW_CMD_GET, ufid_flags) < 0) break; cb->args[0] = bucket; cb->args[1] = obj; } - ovs_unlock(); + rcu_read_unlock(); return skb->len; } -static struct genl_ops dp_flow_genl_ops[] = { +static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = { + [OVS_FLOW_ATTR_KEY] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_MASK] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_ACTIONS] = { .type = NLA_NESTED }, + [OVS_FLOW_ATTR_CLEAR] = { .type = NLA_FLAG }, + [OVS_FLOW_ATTR_PROBE] = { .type = NLA_FLAG }, + [OVS_FLOW_ATTR_UFID] = { .type = NLA_UNSPEC, .len = 1 }, + [OVS_FLOW_ATTR_UFID_FLAGS] = { .type = NLA_U32 }, +}; + +static const struct genl_small_ops dp_flow_genl_ops[] = { { .cmd = OVS_FLOW_CMD_NEW, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = flow_policy, - .doit = ovs_flow_cmd_new_or_set + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .doit = ovs_flow_cmd_new }, { .cmd = OVS_FLOW_CMD_DEL, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = flow_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_flow_cmd_del }, { .cmd = OVS_FLOW_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ - .policy = flow_policy, .doit = ovs_flow_cmd_get, .dumpit = ovs_flow_cmd_dump }, { .cmd = OVS_FLOW_CMD_SET, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = flow_policy, - .doit = ovs_flow_cmd_new_or_set, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ + .doit = ovs_flow_cmd_set, }, }; -static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { - [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, - [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, -}; - -static struct genl_family dp_datapath_genl_family = { - .id = GENL_ID_GENERATE, +static struct genl_family dp_flow_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), - .name = OVS_DATAPATH_FAMILY, - .version = OVS_DATAPATH_VERSION, - .maxattr = OVS_DP_ATTR_MAX, + .name = OVS_FLOW_FAMILY, + .version = OVS_FLOW_VERSION, + .maxattr = OVS_FLOW_ATTR_MAX, + .policy = flow_policy, .netnsok = true, .parallel_ops = true, -}; - -static struct genl_multicast_group ovs_dp_datapath_multicast_group = { - .name = OVS_DATAPATH_MCGROUP + .small_ops = dp_flow_genl_ops, + .n_small_ops = ARRAY_SIZE(dp_flow_genl_ops), + .resv_start_op = OVS_FLOW_CMD_SET + 1, + .mcgrps = &ovs_dp_flow_multicast_group, + .n_mcgrps = 1, + .module = THIS_MODULE, }; static size_t ovs_dp_cmd_msg_size(void) @@ -1520,36 +1581,61 @@ static size_t ovs_dp_cmd_msg_size(void) size_t msgsize = NLMSG_ALIGN(sizeof(struct ovs_header)); msgsize += nla_total_size(IFNAMSIZ); - msgsize += nla_total_size(sizeof(struct ovs_dp_stats)); + msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_stats)); + msgsize += nla_total_size_64bit(sizeof(struct ovs_dp_megaflow_stats)); + msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_USER_FEATURES */ + msgsize += nla_total_size(sizeof(u32)); /* OVS_DP_ATTR_MASKS_CACHE_SIZE */ + msgsize += nla_total_size(sizeof(u32) * nr_cpu_ids); /* OVS_DP_ATTR_PER_CPU_PIDS */ return msgsize; } +/* Called with ovs_mutex. */ static int ovs_dp_cmd_fill_info(struct datapath *dp, struct sk_buff *skb, u32 portid, u32 seq, u32 flags, u8 cmd) { struct ovs_header *ovs_header; struct ovs_dp_stats dp_stats; - int err; + struct ovs_dp_megaflow_stats dp_megaflow_stats; + struct dp_nlsk_pids *pids = ovsl_dereference(dp->upcall_portids); + int err, pids_len; ovs_header = genlmsg_put(skb, portid, seq, &dp_datapath_genl_family, - flags, cmd); + flags, cmd); if (!ovs_header) goto error; ovs_header->dp_ifindex = get_dpifindex(dp); - rcu_read_lock(); err = nla_put_string(skb, OVS_DP_ATTR_NAME, ovs_dp_name(dp)); - rcu_read_unlock(); if (err) goto nla_put_failure; - get_dp_stats(dp, &dp_stats); - if (nla_put(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), &dp_stats)) + get_dp_stats(dp, &dp_stats, &dp_megaflow_stats); + if (nla_put_64bit(skb, OVS_DP_ATTR_STATS, sizeof(struct ovs_dp_stats), + &dp_stats, OVS_DP_ATTR_PAD)) + goto nla_put_failure; + + if (nla_put_64bit(skb, OVS_DP_ATTR_MEGAFLOW_STATS, + sizeof(struct ovs_dp_megaflow_stats), + &dp_megaflow_stats, OVS_DP_ATTR_PAD)) goto nla_put_failure; - return genlmsg_end(skb, ovs_header); + if (nla_put_u32(skb, OVS_DP_ATTR_USER_FEATURES, dp->user_features)) + goto nla_put_failure; + + if (nla_put_u32(skb, OVS_DP_ATTR_MASKS_CACHE_SIZE, + ovs_flow_tbl_masks_cache_size(&dp->table))) + goto nla_put_failure; + + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && pids) { + pids_len = min(pids->n_pids, nr_cpu_ids) * sizeof(u32); + if (nla_put(skb, OVS_DP_ATTR_PER_CPU_PIDS, pids_len, &pids->pids)) + goto nla_put_failure; + } + + genlmsg_end(skb, ovs_header); + return 0; nla_put_failure: genlmsg_cancel(skb, ovs_header); @@ -1557,27 +1643,14 @@ error: return -EMSGSIZE; } -static struct sk_buff *ovs_dp_cmd_build_info(struct datapath *dp, u32 portid, - u32 seq, u8 cmd) +static struct sk_buff *ovs_dp_cmd_alloc_info(void) { - struct sk_buff *skb; - int retval; - - skb = genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); - if (!skb) - return ERR_PTR(-ENOMEM); - - retval = ovs_dp_cmd_fill_info(dp, skb, portid, seq, 0, cmd); - if (retval < 0) { - kfree_skb(skb); - return ERR_PTR(retval); - } - return skb; + return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL); } -/* Called with ovs_mutex. */ +/* Called with rcu_read_lock or ovs_mutex. */ static struct datapath *lookup_datapath(struct net *net, - struct ovs_header *ovs_header, + const struct ovs_header *ovs_header, struct nlattr *a[OVS_DP_ATTR_MAX + 1]) { struct datapath *dp; @@ -1587,14 +1660,155 @@ static struct datapath *lookup_datapath(struct net *net, else { struct vport *vport; - rcu_read_lock(); vport = ovs_vport_locate(net, nla_data(a[OVS_DP_ATTR_NAME])); dp = vport && vport->port_no == OVSP_LOCAL ? vport->dp : NULL; - rcu_read_unlock(); } return dp ? dp : ERR_PTR(-ENODEV); } +static void ovs_dp_reset_user_features(struct sk_buff *skb, + struct genl_info *info) +{ + struct datapath *dp; + + dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), + info->attrs); + if (IS_ERR(dp)) + return; + + pr_warn("%s: Dropping previously announced user features\n", + ovs_dp_name(dp)); + dp->user_features = 0; +} + +static int ovs_dp_set_upcall_portids(struct datapath *dp, + const struct nlattr *ids) +{ + struct dp_nlsk_pids *old, *dp_nlsk_pids; + + if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) + return -EINVAL; + + old = ovsl_dereference(dp->upcall_portids); + + dp_nlsk_pids = kmalloc(sizeof(*dp_nlsk_pids) + nla_len(ids), + GFP_KERNEL); + if (!dp_nlsk_pids) + return -ENOMEM; + + dp_nlsk_pids->n_pids = nla_len(ids) / sizeof(u32); + nla_memcpy(dp_nlsk_pids->pids, ids, nla_len(ids)); + + rcu_assign_pointer(dp->upcall_portids, dp_nlsk_pids); + + kfree_rcu(old, rcu); + + return 0; +} + +u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id) +{ + struct dp_nlsk_pids *dp_nlsk_pids; + + dp_nlsk_pids = rcu_dereference(dp->upcall_portids); + + if (dp_nlsk_pids) { + if (cpu_id < dp_nlsk_pids->n_pids) { + return dp_nlsk_pids->pids[cpu_id]; + } else if (dp_nlsk_pids->n_pids > 0 && + cpu_id >= dp_nlsk_pids->n_pids) { + /* If the number of netlink PIDs is mismatched with + * the number of CPUs as seen by the kernel, log this + * and send the upcall to an arbitrary socket (0) in + * order to not drop packets + */ + pr_info_ratelimited("cpu_id mismatch with handler threads"); + return dp_nlsk_pids->pids[cpu_id % + dp_nlsk_pids->n_pids]; + } else { + return 0; + } + } else { + return 0; + } +} + +static int ovs_dp_change(struct datapath *dp, struct nlattr *a[]) +{ + u32 user_features = 0, old_features = dp->user_features; + int err; + + if (a[OVS_DP_ATTR_USER_FEATURES]) { + user_features = nla_get_u32(a[OVS_DP_ATTR_USER_FEATURES]); + + if (user_features & ~(OVS_DP_F_VPORT_PIDS | + OVS_DP_F_UNALIGNED | + OVS_DP_F_TC_RECIRC_SHARING | + OVS_DP_F_DISPATCH_UPCALL_PER_CPU)) + return -EOPNOTSUPP; + +#if !IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + if (user_features & OVS_DP_F_TC_RECIRC_SHARING) + return -EOPNOTSUPP; +#endif + } + + if (a[OVS_DP_ATTR_MASKS_CACHE_SIZE]) { + int err; + u32 cache_size; + + cache_size = nla_get_u32(a[OVS_DP_ATTR_MASKS_CACHE_SIZE]); + err = ovs_flow_tbl_masks_cache_resize(&dp->table, cache_size); + if (err) + return err; + } + + dp->user_features = user_features; + + if (dp->user_features & OVS_DP_F_DISPATCH_UPCALL_PER_CPU && + a[OVS_DP_ATTR_PER_CPU_PIDS]) { + /* Upcall Netlink Port IDs have been updated */ + err = ovs_dp_set_upcall_portids(dp, + a[OVS_DP_ATTR_PER_CPU_PIDS]); + if (err) + return err; + } + + if ((dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) && + !(old_features & OVS_DP_F_TC_RECIRC_SHARING)) + tc_skb_ext_tc_enable(); + else if (!(dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) && + (old_features & OVS_DP_F_TC_RECIRC_SHARING)) + tc_skb_ext_tc_disable(); + + return 0; +} + +static int ovs_dp_stats_init(struct datapath *dp) +{ + dp->stats_percpu = netdev_alloc_pcpu_stats(struct dp_stats_percpu); + if (!dp->stats_percpu) + return -ENOMEM; + + return 0; +} + +static int ovs_dp_vport_init(struct datapath *dp) +{ + int i; + + dp->ports = kmalloc_array(DP_VPORT_HASH_BUCKETS, + sizeof(struct hlist_head), + GFP_KERNEL); + if (!dp->ports) + return -ENOMEM; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(&dp->ports[i]); + + return 0; +} + static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; @@ -1603,42 +1817,39 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; struct vport *vport; struct ovs_net *ovs_net; - int err, i; + int err; err = -EINVAL; if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID]) goto err; - ovs_lock(); + reply = ovs_dp_cmd_alloc_info(); + if (!reply) + return -ENOMEM; err = -ENOMEM; dp = kzalloc(sizeof(*dp), GFP_KERNEL); if (dp == NULL) - goto err_unlock_ovs; + goto err_destroy_reply; - ovs_dp_set_net(dp, hold_net(sock_net(skb->sk))); + ovs_dp_set_net(dp, sock_net(skb->sk)); /* Allocate table. */ - err = -ENOMEM; - rcu_assign_pointer(dp->table, ovs_flow_tbl_alloc(TBL_MIN_BUCKETS)); - if (!dp->table) - goto err_free_dp; + err = ovs_flow_tbl_init(&dp->table); + if (err) + goto err_destroy_dp; - dp->stats_percpu = alloc_percpu(struct dp_stats_percpu); - if (!dp->stats_percpu) { - err = -ENOMEM; + err = ovs_dp_stats_init(dp); + if (err) goto err_destroy_table; - } - dp->ports = kmalloc(DP_VPORT_HASH_BUCKETS * sizeof(struct hlist_head), - GFP_KERNEL); - if (!dp->ports) { - err = -ENOMEM; - goto err_destroy_percpu; - } + err = ovs_dp_vport_init(dp); + if (err) + goto err_destroy_stats; - for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dp->ports[i]); + err = ovs_meters_init(dp); + if (err) + goto err_destroy_ports; /* Set up our datapath device. */ parms.name = nla_data(a[OVS_DP_ATTR_NAME]); @@ -1646,7 +1857,15 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) parms.options = NULL; parms.dp = dp; parms.port_no = OVSP_LOCAL; - parms.upcall_portid = nla_get_u32(a[OVS_DP_ATTR_UPCALL_PID]); + parms.upcall_portids = a[OVS_DP_ATTR_UPCALL_PID]; + parms.desired_ifindex = nla_get_s32_default(a[OVS_DP_ATTR_IFINDEX], 0); + + /* So far only local changes have been made, now need the lock. */ + ovs_lock(); + + err = ovs_dp_change(dp, a); + if (err) + goto err_unlock_and_destroy_meters; vport = new_vport(&parms); if (IS_ERR(vport)) { @@ -1654,36 +1873,45 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info) if (err == -EBUSY) err = -EEXIST; - goto err_destroy_ports_array; + if (err == -EEXIST) { + /* An outdated user space instance that does not understand + * the concept of user_features has attempted to create a new + * datapath and is likely to reuse it. Drop all user features. + */ + if (info->genlhdr->version < OVS_DP_VER_FEATURES) + ovs_dp_reset_user_features(skb, info); + } + + goto err_destroy_portids; } - reply = ovs_dp_cmd_build_info(dp, info->snd_portid, - info->snd_seq, OVS_DP_CMD_NEW); - err = PTR_ERR(reply); - if (IS_ERR(reply)) - goto err_destroy_local_port; + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_NEW); + BUG_ON(err < 0); ovs_net = net_generic(ovs_dp_get_net(dp), ovs_net_id); - list_add_tail(&dp->list_node, &ovs_net->dps); + list_add_tail_rcu(&dp->list_node, &ovs_net->dps); ovs_unlock(); - ovs_notify(reply, info, &ovs_dp_datapath_multicast_group); + ovs_notify(&dp_datapath_genl_family, reply, info); return 0; -err_destroy_local_port: - ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); -err_destroy_ports_array: +err_destroy_portids: + kfree(rcu_dereference_raw(dp->upcall_portids)); +err_unlock_and_destroy_meters: + ovs_unlock(); + ovs_meters_exit(dp); +err_destroy_ports: kfree(dp->ports); -err_destroy_percpu: +err_destroy_stats: free_percpu(dp->stats_percpu); err_destroy_table: - ovs_flow_tbl_destroy(ovsl_dereference(dp->table)); -err_free_dp: - release_net(ovs_dp_get_net(dp)); + ovs_flow_tbl_destroy(&dp->table); +err_destroy_dp: kfree(dp); -err_unlock_ovs: - ovs_unlock(); +err_destroy_reply: + kfree_skb(reply); err: return err; } @@ -1691,8 +1919,12 @@ err: /* Called with ovs_mutex. */ static void __dp_destroy(struct datapath *dp) { + struct flow_table *table = &dp->table; int i; + if (dp->user_features & OVS_DP_F_TC_RECIRC_SHARING) + tc_skb_ext_tc_disable(); + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; struct hlist_node *n; @@ -1702,13 +1934,21 @@ static void __dp_destroy(struct datapath *dp) ovs_dp_detach_port(vport); } - list_del(&dp->list_node); + list_del_rcu(&dp->list_node); /* OVSP_LOCAL is datapath internal port. We need to make sure that - * all port in datapath are destroyed first before freeing datapath. + * all ports in datapath are destroyed first before freeing datapath. */ ovs_dp_detach_port(ovs_vport_ovsl(dp, OVSP_LOCAL)); + /* Flush sw_flow in the tables. RCU cb only releases resource + * such as dp, ports and tables. That may avoid some issues + * such as RCU usage warning. + */ + table_instance_flow_flush(table, ovsl_dereference(table->ti), + ovsl_dereference(table->ufid_ti)); + + /* RCU destroy the ports, meters and flow tables. */ call_rcu(&dp->rcu, destroy_dp_rcu); } @@ -1718,26 +1958,31 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; + reply = ovs_dp_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + ovs_lock(); - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), + info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) - goto unlock; + goto err_unlock_free; - reply = ovs_dp_cmd_build_info(dp, info->snd_portid, - info->snd_seq, OVS_DP_CMD_DEL); - err = PTR_ERR(reply); - if (IS_ERR(reply)) - goto unlock; + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_DEL); + BUG_ON(err < 0); __dp_destroy(dp); ovs_unlock(); - ovs_notify(reply, info, &ovs_dp_datapath_multicast_group); + ovs_notify(&dp_datapath_genl_family, reply, info); return 0; -unlock: + +err_unlock_free: ovs_unlock(); + kfree_skb(reply); return err; } @@ -1747,28 +1992,33 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; + reply = ovs_dp_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + ovs_lock(); - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), + info->attrs); err = PTR_ERR(dp); if (IS_ERR(dp)) - goto unlock; + goto err_unlock_free; - reply = ovs_dp_cmd_build_info(dp, info->snd_portid, - info->snd_seq, OVS_DP_CMD_NEW); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); - netlink_set_err(sock_net(skb->sk)->genl_sock, 0, - ovs_dp_datapath_multicast_group.id, err); - err = 0; - goto unlock; - } + err = ovs_dp_change(dp, info->attrs); + if (err) + goto err_unlock_free; + + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_SET); + BUG_ON(err < 0); ovs_unlock(); - ovs_notify(reply, info, &ovs_dp_datapath_multicast_group); + ovs_notify(&dp_datapath_genl_family, reply, info); return 0; -unlock: + +err_unlock_free: ovs_unlock(); + kfree_skb(reply); return err; } @@ -1778,25 +2028,27 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info) struct datapath *dp; int err; + reply = ovs_dp_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + ovs_lock(); - dp = lookup_datapath(sock_net(skb->sk), info->userhdr, info->attrs); + dp = lookup_datapath(sock_net(skb->sk), genl_info_userhdr(info), + info->attrs); if (IS_ERR(dp)) { err = PTR_ERR(dp); - goto unlock; - } - - reply = ovs_dp_cmd_build_info(dp, info->snd_portid, - info->snd_seq, OVS_DP_CMD_NEW); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); - goto unlock; + goto err_unlock_free; } - + err = ovs_dp_cmd_fill_info(dp, reply, info->snd_portid, + info->snd_seq, 0, OVS_DP_CMD_GET); + BUG_ON(err < 0); ovs_unlock(); + return genlmsg_reply(reply, info); -unlock: +err_unlock_free: ovs_unlock(); + kfree_skb(reply); return err; } @@ -1812,7 +2064,7 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) if (i >= skip && ovs_dp_cmd_fill_info(dp, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - OVS_DP_CMD_NEW) < 0) + OVS_DP_CMD_GET) < 0) break; i++; } @@ -1823,59 +2075,63 @@ static int ovs_dp_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -static struct genl_ops dp_datapath_genl_ops[] = { +static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = { + [OVS_DP_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, + [OVS_DP_ATTR_UPCALL_PID] = { .type = NLA_U32 }, + [OVS_DP_ATTR_USER_FEATURES] = { .type = NLA_U32 }, + [OVS_DP_ATTR_MASKS_CACHE_SIZE] = NLA_POLICY_RANGE(NLA_U32, 0, + PCPU_MIN_UNIT_SIZE / sizeof(struct mask_cache_entry)), + [OVS_DP_ATTR_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 0), +}; + +static const struct genl_small_ops dp_datapath_genl_ops[] = { { .cmd = OVS_DP_CMD_NEW, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = datapath_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_dp_cmd_new }, { .cmd = OVS_DP_CMD_DEL, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = datapath_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_dp_cmd_del }, { .cmd = OVS_DP_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ - .policy = datapath_policy, .doit = ovs_dp_cmd_get, .dumpit = ovs_dp_cmd_dump }, { .cmd = OVS_DP_CMD_SET, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = datapath_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_dp_cmd_set, }, }; -static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { - [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, - [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, - [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_U32 }, - [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, -}; - -static struct genl_family dp_vport_genl_family = { - .id = GENL_ID_GENERATE, +static struct genl_family dp_datapath_genl_family __ro_after_init = { .hdrsize = sizeof(struct ovs_header), - .name = OVS_VPORT_FAMILY, - .version = OVS_VPORT_VERSION, - .maxattr = OVS_VPORT_ATTR_MAX, + .name = OVS_DATAPATH_FAMILY, + .version = OVS_DATAPATH_VERSION, + .maxattr = OVS_DP_ATTR_MAX, + .policy = datapath_policy, .netnsok = true, .parallel_ops = true, -}; - -struct genl_multicast_group ovs_dp_vport_multicast_group = { - .name = OVS_VPORT_MCGROUP + .small_ops = dp_datapath_genl_ops, + .n_small_ops = ARRAY_SIZE(dp_datapath_genl_ops), + .resv_start_op = OVS_DP_CMD_SET + 1, + .mcgrps = &ovs_dp_datapath_multicast_group, + .n_mcgrps = 1, + .module = THIS_MODULE, }; /* Called with ovs_mutex or RCU read lock. */ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, - u32 portid, u32 seq, u32 flags, u8 cmd) + struct net *net, u32 portid, u32 seq, + u32 flags, u8 cmd, gfp_t gfp) { struct ovs_header *ovs_header; struct ovs_vport_stats vport_stats; + struct net *net_vport; int err; ovs_header = genlmsg_put(skb, portid, seq, &dp_vport_genl_family, @@ -1887,21 +2143,42 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb, if (nla_put_u32(skb, OVS_VPORT_ATTR_PORT_NO, vport->port_no) || nla_put_u32(skb, OVS_VPORT_ATTR_TYPE, vport->ops->type) || - nla_put_string(skb, OVS_VPORT_ATTR_NAME, vport->ops->get_name(vport)) || - nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, vport->upcall_portid)) + nla_put_string(skb, OVS_VPORT_ATTR_NAME, + ovs_vport_name(vport)) || + nla_put_u32(skb, OVS_VPORT_ATTR_IFINDEX, vport->dev->ifindex)) goto nla_put_failure; + rcu_read_lock(); + net_vport = dev_net_rcu(vport->dev); + if (!net_eq(net, net_vport)) { + int id = peernet2id_alloc(net, net_vport, GFP_ATOMIC); + + if (nla_put_s32(skb, OVS_VPORT_ATTR_NETNSID, id)) + goto nla_put_failure_unlock; + } + rcu_read_unlock(); + ovs_vport_get_stats(vport, &vport_stats); - if (nla_put(skb, OVS_VPORT_ATTR_STATS, sizeof(struct ovs_vport_stats), - &vport_stats)) + if (nla_put_64bit(skb, OVS_VPORT_ATTR_STATS, + sizeof(struct ovs_vport_stats), &vport_stats, + OVS_VPORT_ATTR_PAD)) + goto nla_put_failure; + + if (ovs_vport_get_upcall_stats(vport, skb)) + goto nla_put_failure; + + if (ovs_vport_get_upcall_portids(vport, skb)) goto nla_put_failure; err = ovs_vport_get_options(vport, skb); if (err == -EMSGSIZE) goto error; - return genlmsg_end(skb, ovs_header); + genlmsg_end(skb, ovs_header); + return 0; +nla_put_failure_unlock: + rcu_read_unlock(); nla_put_failure: err = -EMSGSIZE; error: @@ -1909,18 +2186,24 @@ error: return err; } -/* Called with ovs_mutex or RCU read lock. */ -struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, - u32 seq, u8 cmd) +static struct sk_buff *ovs_vport_cmd_alloc_info(void) +{ + return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); +} + +/* Called with ovs_mutex, only via ovs_dp_notify_wq(). */ +struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, + u32 portid, u32 seq, u8 cmd) { struct sk_buff *skb; int retval; - skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); - retval = ovs_vport_cmd_fill_info(vport, skb, portid, seq, 0, cmd); + retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd, + GFP_KERNEL); BUG_ON(retval < 0); return skb; @@ -1928,12 +2211,14 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, u32 portid, /* Called with ovs_mutex or RCU read lock. */ static struct vport *lookup_vport(struct net *net, - struct ovs_header *ovs_header, + const struct ovs_header *ovs_header, struct nlattr *a[OVS_VPORT_ATTR_MAX + 1]) { struct datapath *dp; struct vport *vport; + if (a[OVS_VPORT_ATTR_IFINDEX]) + return ERR_PTR(-EOPNOTSUPP); if (a[OVS_VPORT_ATTR_NAME]) { vport = ovs_vport_locate(net, nla_data(a[OVS_VPORT_ATTR_NAME])); if (!vport) @@ -1958,46 +2243,89 @@ static struct vport *lookup_vport(struct net *net, return vport; } else return ERR_PTR(-EINVAL); + +} + +static unsigned int ovs_get_max_headroom(struct datapath *dp) +{ + unsigned int dev_headroom, max_headroom = 0; + struct net_device *dev; + struct vport *vport; + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, + lockdep_ovsl_is_held()) { + dev = vport->dev; + dev_headroom = netdev_get_fwd_headroom(dev); + if (dev_headroom > max_headroom) + max_headroom = dev_headroom; + } + } + + return max_headroom; +} + +/* Called with ovs_mutex */ +static void ovs_update_headroom(struct datapath *dp, unsigned int new_headroom) +{ + struct vport *vport; + int i; + + dp->max_headroom = new_headroom; + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node, + lockdep_ovsl_is_held()) + netdev_set_rx_headroom(vport->dev, new_headroom); + } } static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct vport_parms parms; struct sk_buff *reply; struct vport *vport; struct datapath *dp; + unsigned int new_headroom; u32 port_no; int err; - err = -EINVAL; if (!a[OVS_VPORT_ATTR_NAME] || !a[OVS_VPORT_ATTR_TYPE] || !a[OVS_VPORT_ATTR_UPCALL_PID]) - goto exit; + return -EINVAL; + + parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); + + if (a[OVS_VPORT_ATTR_IFINDEX] && parms.type != OVS_VPORT_TYPE_INTERNAL) + return -EOPNOTSUPP; + + port_no = nla_get_u32_default(a[OVS_VPORT_ATTR_PORT_NO], 0); + if (port_no >= DP_MAX_PORTS) + return -EFBIG; + + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; ovs_lock(); +restart: dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); err = -ENODEV; if (!dp) - goto exit_unlock; - - if (a[OVS_VPORT_ATTR_PORT_NO]) { - port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); - - err = -EFBIG; - if (port_no >= DP_MAX_PORTS) - goto exit_unlock; + goto exit_unlock_free; + if (port_no) { vport = ovs_vport_ovsl(dp, port_no); err = -EBUSY; if (vport) - goto exit_unlock; + goto exit_unlock_free; } else { for (port_no = 1; ; port_no++) { if (port_no >= DP_MAX_PORTS) { err = -EFBIG; - goto exit_unlock; + goto exit_unlock_free; } vport = ovs_vport_ovsl(dp, port_no); if (!vport) @@ -2006,31 +2334,41 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info) } parms.name = nla_data(a[OVS_VPORT_ATTR_NAME]); - parms.type = nla_get_u32(a[OVS_VPORT_ATTR_TYPE]); parms.options = a[OVS_VPORT_ATTR_OPTIONS]; parms.dp = dp; parms.port_no = port_no; - parms.upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]); + parms.upcall_portids = a[OVS_VPORT_ATTR_UPCALL_PID]; + parms.desired_ifindex = nla_get_s32_default(a[OVS_VPORT_ATTR_IFINDEX], + 0); vport = new_vport(&parms); err = PTR_ERR(vport); - if (IS_ERR(vport)) - goto exit_unlock; - - err = 0; - reply = ovs_vport_cmd_build_info(vport, info->snd_portid, info->snd_seq, - OVS_VPORT_CMD_NEW); - if (IS_ERR(reply)) { - err = PTR_ERR(reply); - ovs_dp_detach_port(vport); - goto exit_unlock; + if (IS_ERR(vport)) { + if (err == -EAGAIN) + goto restart; + goto exit_unlock_free; } - ovs_notify(reply, info, &ovs_dp_vport_multicast_group); + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_NEW, GFP_KERNEL); + + new_headroom = netdev_get_fwd_headroom(vport->dev); + + if (new_headroom > dp->max_headroom) + ovs_update_headroom(dp, new_headroom); + else + netdev_set_rx_headroom(vport->dev, dp->max_headroom); + + BUG_ON(err < 0); + ovs_unlock(); + + ovs_notify(&dp_vport_genl_family, reply, info); + return 0; -exit_unlock: +exit_unlock_free: ovs_unlock(); -exit: + kfree_skb(reply); return err; } @@ -2041,111 +2379,135 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info) struct vport *vport; int err; + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + ovs_lock(); - vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); + vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a); err = PTR_ERR(vport); if (IS_ERR(vport)) - goto exit_unlock; + goto exit_unlock_free; if (a[OVS_VPORT_ATTR_TYPE] && nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) { err = -EINVAL; - goto exit_unlock; - } - - reply = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!reply) { - err = -ENOMEM; - goto exit_unlock; + goto exit_unlock_free; } if (a[OVS_VPORT_ATTR_OPTIONS]) { err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]); if (err) - goto exit_free; + goto exit_unlock_free; } - if (a[OVS_VPORT_ATTR_UPCALL_PID]) - vport->upcall_portid = nla_get_u32(a[OVS_VPORT_ATTR_UPCALL_PID]); - err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid, - info->snd_seq, 0, OVS_VPORT_CMD_NEW); + if (a[OVS_VPORT_ATTR_UPCALL_PID]) { + struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID]; + + err = ovs_vport_set_upcall_portids(vport, ids); + if (err) + goto exit_unlock_free; + } + + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_SET, GFP_KERNEL); BUG_ON(err < 0); ovs_unlock(); - ovs_notify(reply, info, &ovs_dp_vport_multicast_group); - return 0; - - rtnl_unlock(); + ovs_notify(&dp_vport_genl_family, reply, info); return 0; -exit_free: - kfree_skb(reply); -exit_unlock: +exit_unlock_free: ovs_unlock(); + kfree_skb(reply); return err; } static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info) { + bool update_headroom = false; struct nlattr **a = info->attrs; struct sk_buff *reply; + struct datapath *dp; struct vport *vport; + unsigned int new_headroom; int err; + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + ovs_lock(); - vport = lookup_vport(sock_net(skb->sk), info->userhdr, a); + vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a); err = PTR_ERR(vport); if (IS_ERR(vport)) - goto exit_unlock; + goto exit_unlock_free; if (vport->port_no == OVSP_LOCAL) { err = -EINVAL; - goto exit_unlock; + goto exit_unlock_free; } - reply = ovs_vport_cmd_build_info(vport, info->snd_portid, - info->snd_seq, OVS_VPORT_CMD_DEL); - err = PTR_ERR(reply); - if (IS_ERR(reply)) - goto exit_unlock; + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_DEL, GFP_KERNEL); + BUG_ON(err < 0); + + /* the vport deletion may trigger dp headroom update */ + dp = vport->dp; + if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom) + update_headroom = true; - err = 0; + netdev_reset_rx_headroom(vport->dev); ovs_dp_detach_port(vport); - ovs_notify(reply, info, &ovs_dp_vport_multicast_group); + if (update_headroom) { + new_headroom = ovs_get_max_headroom(dp); -exit_unlock: + if (new_headroom < dp->max_headroom) + ovs_update_headroom(dp, new_headroom); + } ovs_unlock(); + + ovs_notify(&dp_vport_genl_family, reply, info); + return 0; + +exit_unlock_free: + ovs_unlock(); + kfree_skb(reply); return err; } static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info) { struct nlattr **a = info->attrs; - struct ovs_header *ovs_header = info->userhdr; + struct ovs_header *ovs_header = genl_info_userhdr(info); struct sk_buff *reply; struct vport *vport; int err; + reply = ovs_vport_cmd_alloc_info(); + if (!reply) + return -ENOMEM; + rcu_read_lock(); vport = lookup_vport(sock_net(skb->sk), ovs_header, a); err = PTR_ERR(vport); if (IS_ERR(vport)) - goto exit_unlock; - - reply = ovs_vport_cmd_build_info(vport, info->snd_portid, - info->snd_seq, OVS_VPORT_CMD_NEW); - err = PTR_ERR(reply); - if (IS_ERR(reply)) - goto exit_unlock; - + goto exit_unlock_free; + err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info), + info->snd_portid, info->snd_seq, 0, + OVS_VPORT_CMD_GET, GFP_ATOMIC); + BUG_ON(err < 0); rcu_read_unlock(); return genlmsg_reply(reply, info); -exit_unlock: +exit_unlock_free: rcu_read_unlock(); + kfree_skb(reply); return err; } @@ -2156,11 +2518,12 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) int bucket = cb->args[0], skip = cb->args[1]; int i, j = 0; - dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); - if (!dp) - return -ENODEV; - rcu_read_lock(); + dp = get_dp_rcu(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + rcu_read_unlock(); + return -ENODEV; + } for (i = bucket; i < DP_VPORT_HASH_BUCKETS; i++) { struct vport *vport; @@ -2168,10 +2531,12 @@ static int ovs_vport_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) { if (j >= skip && ovs_vport_cmd_fill_info(vport, skb, + sock_net(skb->sk), NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, - OVS_VPORT_CMD_NEW) < 0) + OVS_VPORT_CMD_GET, + GFP_ATOMIC) < 0) goto out; j++; @@ -2187,50 +2552,84 @@ out: return skb->len; } -static struct genl_ops dp_vport_genl_ops[] = { +static void ovs_dp_masks_rebalance(struct work_struct *work) +{ + struct ovs_net *ovs_net = container_of(work, struct ovs_net, + masks_rebalance.work); + struct datapath *dp; + + ovs_lock(); + + list_for_each_entry(dp, &ovs_net->dps, list_node) + ovs_flow_masks_rebalance(&dp->table); + + ovs_unlock(); + + schedule_delayed_work(&ovs_net->masks_rebalance, + msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); +} + +static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = { + [OVS_VPORT_ATTR_NAME] = { .type = NLA_NUL_STRING, .len = IFNAMSIZ - 1 }, + [OVS_VPORT_ATTR_STATS] = { .len = sizeof(struct ovs_vport_stats) }, + [OVS_VPORT_ATTR_PORT_NO] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_TYPE] = { .type = NLA_U32 }, + [OVS_VPORT_ATTR_UPCALL_PID] = { .type = NLA_UNSPEC }, + [OVS_VPORT_ATTR_OPTIONS] = { .type = NLA_NESTED }, + [OVS_VPORT_ATTR_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 0), + [OVS_VPORT_ATTR_NETNSID] = { .type = NLA_S32 }, + [OVS_VPORT_ATTR_UPCALL_STATS] = { .type = NLA_NESTED }, +}; + +static const struct genl_small_ops dp_vport_genl_ops[] = { { .cmd = OVS_VPORT_CMD_NEW, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = vport_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_vport_cmd_new }, { .cmd = OVS_VPORT_CMD_DEL, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = vport_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_vport_cmd_del }, { .cmd = OVS_VPORT_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .flags = 0, /* OK for unprivileged users. */ - .policy = vport_policy, .doit = ovs_vport_cmd_get, .dumpit = ovs_vport_cmd_dump }, { .cmd = OVS_VPORT_CMD_SET, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ - .policy = vport_policy, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */ .doit = ovs_vport_cmd_set, }, }; -struct genl_family_and_ops { - struct genl_family *family; - struct genl_ops *ops; - int n_ops; - struct genl_multicast_group *group; +struct genl_family dp_vport_genl_family __ro_after_init = { + .hdrsize = sizeof(struct ovs_header), + .name = OVS_VPORT_FAMILY, + .version = OVS_VPORT_VERSION, + .maxattr = OVS_VPORT_ATTR_MAX, + .policy = vport_policy, + .netnsok = true, + .parallel_ops = true, + .small_ops = dp_vport_genl_ops, + .n_small_ops = ARRAY_SIZE(dp_vport_genl_ops), + .resv_start_op = OVS_VPORT_CMD_SET + 1, + .mcgrps = &ovs_dp_vport_multicast_group, + .n_mcgrps = 1, + .module = THIS_MODULE, }; -static const struct genl_family_and_ops dp_genl_families[] = { - { &dp_datapath_genl_family, - dp_datapath_genl_ops, ARRAY_SIZE(dp_datapath_genl_ops), - &ovs_dp_datapath_multicast_group }, - { &dp_vport_genl_family, - dp_vport_genl_ops, ARRAY_SIZE(dp_vport_genl_ops), - &ovs_dp_vport_multicast_group }, - { &dp_flow_genl_family, - dp_flow_genl_ops, ARRAY_SIZE(dp_flow_genl_ops), - &ovs_dp_flow_multicast_group }, - { &dp_packet_genl_family, - dp_packet_genl_ops, ARRAY_SIZE(dp_packet_genl_ops), - NULL }, +static struct genl_family * const dp_genl_families[] = { + &dp_datapath_genl_family, + &dp_vport_genl_family, + &dp_flow_genl_family, + &dp_packet_genl_family, + &dp_meter_genl_family, +#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) + &dp_ct_limit_genl_family, +#endif }; static void dp_unregister_genl(int n_families) @@ -2238,84 +2637,98 @@ static void dp_unregister_genl(int n_families) int i; for (i = 0; i < n_families; i++) - genl_unregister_family(dp_genl_families[i].family); + genl_unregister_family(dp_genl_families[i]); } -static int dp_register_genl(void) +static int __init dp_register_genl(void) { - int n_registered; int err; int i; - n_registered = 0; for (i = 0; i < ARRAY_SIZE(dp_genl_families); i++) { - const struct genl_family_and_ops *f = &dp_genl_families[i]; - err = genl_register_family_with_ops(f->family, f->ops, - f->n_ops); + err = genl_register_family(dp_genl_families[i]); if (err) goto error; - n_registered++; - - if (f->group) { - err = genl_register_mc_group(f->family, f->group); - if (err) - goto error; - } } return 0; error: - dp_unregister_genl(n_registered); + dp_unregister_genl(i); return err; } -static void rehash_flow_table(struct work_struct *work) -{ - struct datapath *dp; - struct net *net; - - ovs_lock(); - rtnl_lock(); - for_each_net(net) { - struct ovs_net *ovs_net = net_generic(net, ovs_net_id); - - list_for_each_entry(dp, &ovs_net->dps, list_node) { - struct flow_table *old_table = ovsl_dereference(dp->table); - struct flow_table *new_table; - - new_table = ovs_flow_tbl_rehash(old_table); - if (!IS_ERR(new_table)) { - rcu_assign_pointer(dp->table, new_table); - ovs_flow_tbl_deferred_destroy(old_table); - } - } - } - rtnl_unlock(); - ovs_unlock(); - schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL); -} - static int __net_init ovs_init_net(struct net *net) { struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + int err; INIT_LIST_HEAD(&ovs_net->dps); INIT_WORK(&ovs_net->dp_notify_work, ovs_dp_notify_wq); + INIT_DELAYED_WORK(&ovs_net->masks_rebalance, ovs_dp_masks_rebalance); + + err = ovs_ct_init(net); + if (err) + return err; + + schedule_delayed_work(&ovs_net->masks_rebalance, + msecs_to_jiffies(DP_MASKS_REBALANCE_INTERVAL)); return 0; } -static void __net_exit ovs_exit_net(struct net *net) +static void __net_exit list_vports_from_net(struct net *net, struct net *dnet, + struct list_head *head) { - struct datapath *dp, *dp_next; struct ovs_net *ovs_net = net_generic(net, ovs_net_id); + struct datapath *dp; + + list_for_each_entry(dp, &ovs_net->dps, list_node) { + int i; + + for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) { + struct vport *vport; + + hlist_for_each_entry(vport, &dp->ports[i], dp_hash_node) { + if (vport->ops->type != OVS_VPORT_TYPE_INTERNAL) + continue; + + if (dev_net(vport->dev) == dnet) + list_add(&vport->detach_list, head); + } + } + } +} + +static void __net_exit ovs_exit_net(struct net *dnet) +{ + struct datapath *dp, *dp_next; + struct ovs_net *ovs_net = net_generic(dnet, ovs_net_id); + struct vport *vport, *vport_next; + struct net *net; + LIST_HEAD(head); ovs_lock(); + + ovs_ct_exit(dnet); + list_for_each_entry_safe(dp, dp_next, &ovs_net->dps, list_node) __dp_destroy(dp); + + down_read(&net_rwsem); + for_each_net(net) + list_vports_from_net(net, dnet, &head); + up_read(&net_rwsem); + + /* Detach all vports from given namespace. */ + list_for_each_entry_safe(vport, vport_next, &head, detach_list) { + list_del(&vport->detach_list); + ovs_dp_detach_port(vport); + } + ovs_unlock(); + cancel_delayed_work_sync(&ovs_net->masks_rebalance); cancel_work_sync(&ovs_net->dp_notify_work); } @@ -2326,18 +2739,60 @@ static struct pernet_operations ovs_net_ops = { .size = sizeof(struct ovs_net), }; +static const char * const ovs_drop_reasons[] = { +#define S(x) [(x) & ~SKB_DROP_REASON_SUBSYS_MASK] = (#x), + OVS_DROP_REASONS(S) +#undef S +}; + +static struct drop_reason_list drop_reason_list_ovs = { + .reasons = ovs_drop_reasons, + .n_reasons = ARRAY_SIZE(ovs_drop_reasons), +}; + +static int __init ovs_alloc_percpu_storage(void) +{ + unsigned int cpu; + + ovs_pcpu_storage = alloc_percpu(*ovs_pcpu_storage); + if (!ovs_pcpu_storage) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct ovs_pcpu_storage *ovs_pcpu; + + ovs_pcpu = per_cpu_ptr(ovs_pcpu_storage, cpu); + local_lock_init(&ovs_pcpu->bh_lock); + } + return 0; +} + +static void ovs_free_percpu_storage(void) +{ + free_percpu(ovs_pcpu_storage); +} + static int __init dp_init(void) { int err; - BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > FIELD_SIZEOF(struct sk_buff, cb)); + BUILD_BUG_ON(sizeof(struct ovs_skb_cb) > + sizeof_field(struct sk_buff, cb)); pr_info("Open vSwitch switching datapath\n"); - err = ovs_flow_init(); + err = ovs_alloc_percpu_storage(); + if (err) + goto error; + + err = ovs_internal_dev_rtnl_link_register(); if (err) goto error; + err = ovs_flow_init(); + if (err) + goto error_unreg_rtnl_link; + err = ovs_vport_init(); if (err) goto error_flow_exit; @@ -2350,14 +2805,21 @@ static int __init dp_init(void) if (err) goto error_netns_exit; + err = ovs_netdev_init(); + if (err) + goto error_unreg_notifier; + err = dp_register_genl(); if (err < 0) - goto error_unreg_notifier; + goto error_unreg_netdev; - schedule_delayed_work(&rehash_flow_wq, REHASH_FLOW_INTERVAL); + drop_reasons_register_subsys(SKB_DROP_REASON_SUBSYS_OPENVSWITCH, + &drop_reason_list_ovs); return 0; +error_unreg_netdev: + ovs_netdev_exit(); error_unreg_notifier: unregister_netdevice_notifier(&ovs_dp_device_notifier); error_netns_exit: @@ -2366,19 +2828,25 @@ error_vport_exit: ovs_vport_exit(); error_flow_exit: ovs_flow_exit(); +error_unreg_rtnl_link: + ovs_internal_dev_rtnl_link_unregister(); error: + ovs_free_percpu_storage(); return err; } static void dp_cleanup(void) { - cancel_delayed_work_sync(&rehash_flow_wq); dp_unregister_genl(ARRAY_SIZE(dp_genl_families)); + ovs_netdev_exit(); unregister_netdevice_notifier(&ovs_dp_device_notifier); unregister_pernet_device(&ovs_net_ops); + drop_reasons_unregister_subsys(SKB_DROP_REASON_SUBSYS_OPENVSWITCH); rcu_barrier(); ovs_vport_exit(); ovs_flow_exit(); + ovs_internal_dev_rtnl_link_unregister(); + ovs_free_percpu_storage(); } module_init(dp_init); @@ -2386,3 +2854,9 @@ module_exit(dp_cleanup); MODULE_DESCRIPTION("Open vSwitch switching datapath"); MODULE_LICENSE("GPL"); +MODULE_ALIAS_GENL_FAMILY(OVS_DATAPATH_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_VPORT_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_FLOW_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_PACKET_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_METER_FAMILY); +MODULE_ALIAS_GENL_FAMILY(OVS_CT_LIMIT_FAMILY); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index a91486484916..db0c3e69d66c 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -1,19 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2007-2012 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * Copyright (c) 2007-2014 Nicira, Inc. */ #ifndef DATAPATH_H @@ -25,43 +12,75 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/u64_stats_sync.h> +#include <net/ip_tunnels.h> +#include <net/mpls.h> +#include "conntrack.h" #include "flow.h" -#include "vport.h" - -#define DP_MAX_PORTS USHRT_MAX -#define DP_VPORT_HASH_BUCKETS 1024 +#include "flow_table.h" +#include "meter.h" +#include "vport-internal_dev.h" -#define SAMPLE_ACTION_DEPTH 3 +#define DP_MAX_PORTS USHRT_MAX +#define DP_VPORT_HASH_BUCKETS 1024 +#define DP_MASKS_REBALANCE_INTERVAL 4000 /** * struct dp_stats_percpu - per-cpu packet processing statistics for a given * datapath. * @n_hit: Number of received packets for which a matching flow was found in * the flow table. - * @n_miss: Number of received packets that had no matching flow in the flow - * table. The sum of @n_hit and @n_miss is the number of packets that have + * @n_missed: Number of received packets that had no matching flow in the flow + * table. The sum of @n_hit and @n_missed is the number of packets that have * been received by the datapath. * @n_lost: Number of received packets that had no matching flow in the flow * table that could not be sent to userspace (normally due to an overflow in * one of the datapath's queues). + * @n_mask_hit: Number of masks looked up for flow match. + * @n_mask_hit / (@n_hit + @n_missed) will be the average masks looked + * up per packet. + * @n_cache_hit: The number of received packets that had their mask found using + * the mask cache. + * @syncp: Synchronization point for 64bit counters. */ struct dp_stats_percpu { u64 n_hit; u64 n_missed; u64 n_lost; - struct u64_stats_sync sync; + u64 n_mask_hit; + u64 n_cache_hit; + struct u64_stats_sync syncp; +}; + +/** + * struct dp_nlsk_pids - array of netlink portids of for a datapath. + * This is used when OVS_DP_F_DISPATCH_UPCALL_PER_CPU + * is enabled and must be protected by rcu. + * @rcu: RCU callback head for deferred destruction. + * @n_pids: Size of @pids array. + * @pids: Array storing the Netlink socket PIDs indexed by CPU ID for packets + * that miss the flow table. + */ +struct dp_nlsk_pids { + struct rcu_head rcu; + u32 n_pids; + u32 pids[]; }; /** * struct datapath - datapath for flow-based packet switching * @rcu: RCU callback head for deferred destruction. * @list_node: Element in global 'dps' list. - * @table: Current flow table. Protected by ovs_mutex and RCU. + * @table: flow table. * @ports: Hash table for ports. %OVSP_LOCAL port always exists. Protected by * ovs_mutex and RCU. * @stats_percpu: Per-CPU datapath statistics. * @net: Reference to net namespace. + * @user_features: Bitmap of enabled %OVS_DP_F_* features. + * @max_headroom: The maximum headroom of all vports in this datapath; it will + * be used by all the internal vports in this dp. + * @meter_tbl: Meter table. + * @upcall_portids: RCU protected 'struct dp_nlsk_pids'. * * Context: See the comment on locking at the top of datapath.c for additional * locking information. @@ -71,7 +90,7 @@ struct datapath { struct list_head list_node; /* Flow table. */ - struct flow_table __rcu *table; + struct flow_table table; /* Switch ports. */ struct hlist_head *ports; @@ -79,53 +98,147 @@ struct datapath { /* Stats. */ struct dp_stats_percpu __percpu *stats_percpu; -#ifdef CONFIG_NET_NS /* Network namespace ref. */ - struct net *net; -#endif + possible_net_t net; + + u32 user_features; + + u32 max_headroom; + + /* Switch meters. */ + struct dp_meter_table meter_tbl; + + struct dp_nlsk_pids __rcu *upcall_portids; }; /** * struct ovs_skb_cb - OVS data in skb CB - * @flow: The flow associated with this packet. May be %NULL if no flow. - * @tun_key: Key for the tunnel that encapsulated this packet. NULL if the - * packet is not being tunneled. + * @input_vport: The original vport packet came in on. This value is cached + * when a packet is received by OVS. + * @mru: The maximum received fragement size; 0 if the packet is not + * fragmented. + * @acts_origlen: The netlink size of the flow actions applied to this skb. + * @cutlen: The number of bytes from the packet end to be removed. + * @probability: The sampling probability that was applied to this skb; 0 means + * no sampling has occurred; U32_MAX means 100% probability. + * @upcall_pid: Netlink socket PID to use for sending this packet to userspace; + * 0 means "not set" and default per-CPU or per-vport dispatch should be used. */ struct ovs_skb_cb { - struct sw_flow *flow; - struct ovs_key_ipv4_tunnel *tun_key; + struct vport *input_vport; + u16 mru; + u16 acts_origlen; + u32 cutlen; + u32 probability; + u32 upcall_pid; }; #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb) /** - * struct dp_upcall - metadata to include with a packet to send to userspace + * struct dp_upcall_info - metadata to include with a packet sent to userspace * @cmd: One of %OVS_PACKET_CMD_*. - * @key: Becomes %OVS_PACKET_ATTR_KEY. Must be nonnull. * @userdata: If nonnull, its variable-length value is passed to userspace as * %OVS_PACKET_ATTR_USERDATA. - * @pid: Netlink PID to which packet should be sent. If @pid is 0 then no - * packet is sent and the packet is accounted in the datapath's @n_lost + * @actions: If nonnull, its variable-length value is passed to userspace as + * %OVS_PACKET_ATTR_ACTIONS. + * @actions_len: The length of the @actions. + * @portid: Netlink portid to which packet should be sent. If @portid is 0 + * then no packet is sent and the packet is accounted in the datapath's @n_lost * counter. + * @egress_tun_info: If nonnull, becomes %OVS_PACKET_ATTR_EGRESS_TUN_KEY. + * @mru: If not zero, Maximum received IP fragment size. */ struct dp_upcall_info { - u8 cmd; - const struct sw_flow_key *key; + struct ip_tunnel_info *egress_tun_info; const struct nlattr *userdata; + const struct nlattr *actions; + int actions_len; u32 portid; + u8 cmd; + u16 mru; }; /** * struct ovs_net - Per net-namespace data for ovs. * @dps: List of datapaths to enable dumping them all out. * Protected by genl_mutex. + * @dp_notify_work: A work notifier to handle port unregistering. + * @masks_rebalance: A work to periodically optimize flow table caches. + * @ct_limit_info: A hash table of conntrack zone connection limits. + * @xt_label: Whether connlables are configured for the network or not. */ struct ovs_net { struct list_head dps; struct work_struct dp_notify_work; - struct vport_net vport_net; + struct delayed_work masks_rebalance; +#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT) + struct ovs_ct_limit_info *ct_limit_info; +#endif + bool xt_label; +}; + +#define MAX_L2_LEN (VLAN_ETH_HLEN + 3 * MPLS_HLEN) +struct ovs_frag_data { + unsigned long dst; + struct vport *vport; + struct ovs_skb_cb cb; + __be16 inner_protocol; + u16 network_offset; /* valid only for MPLS */ + u16 vlan_tci; + __be16 vlan_proto; + unsigned int l2_len; + u8 mac_proto; + u8 l2_data[MAX_L2_LEN]; +}; + +struct deferred_action { + struct sk_buff *skb; + const struct nlattr *actions; + int actions_len; + + /* Store pkt_key clone when creating deferred action. */ + struct sw_flow_key pkt_key; +}; + +#define DEFERRED_ACTION_FIFO_SIZE 10 +#define OVS_RECURSION_LIMIT 5 +#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2) + +struct action_fifo { + int head; + int tail; + /* Deferred action fifo queue storage. */ + struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; }; -extern int ovs_net_id; +struct action_flow_keys { + struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; +}; + +struct ovs_pcpu_storage { + struct action_fifo action_fifos; + struct action_flow_keys flow_keys; + struct ovs_frag_data frag_data; + int exec_level; + struct task_struct *owner; + local_lock_t bh_lock; +}; + +extern struct ovs_pcpu_storage __percpu *ovs_pcpu_storage; + +/** + * enum ovs_pkt_hash_types - hash info to include with a packet + * to send to userspace. + * @OVS_PACKET_HASH_SW_BIT: indicates hash was computed in software stack. + * @OVS_PACKET_HASH_L4_BIT: indicates hash is a canonical 4-tuple hash + * over transport ports. + */ +enum ovs_pkt_hash_types { + OVS_PACKET_HASH_SW_BIT = (1ULL << 32), + OVS_PACKET_HASH_L4_BIT = (1ULL << 33), +}; + +extern unsigned int ovs_net_id; void ovs_lock(void); void ovs_unlock(void); @@ -135,11 +248,13 @@ int lockdep_ovsl_is_held(void); #define lockdep_ovsl_is_held() 1 #endif -#define ASSERT_OVSL() WARN_ON(unlikely(!lockdep_ovsl_is_held())) +#define ASSERT_OVSL() WARN_ON(!lockdep_ovsl_is_held()) #define ovsl_dereference(p) \ rcu_dereference_protected(p, lockdep_ovsl_is_held()) +#define rcu_dereference_ovsl(p) \ + rcu_dereference_check(p, lockdep_ovsl_is_held()) -static inline struct net *ovs_dp_get_net(struct datapath *dp) +static inline struct net *ovs_dp_get_net(const struct datapath *dp) { return read_pnet(&dp->net); } @@ -169,18 +284,63 @@ static inline struct vport *ovs_vport_ovsl(const struct datapath *dp, int port_n return ovs_lookup_vport(dp, port_no); } +/* Must be called with rcu_read_lock. */ +static inline struct datapath *get_dp_rcu(struct net *net, int dp_ifindex) +{ + struct net_device *dev = dev_get_by_index_rcu(net, dp_ifindex); + + if (dev) { + struct vport *vport = ovs_internal_dev_get_vport(dev); + + if (vport) + return vport->dp; + } + + return NULL; +} + +/* The caller must hold either ovs_mutex or rcu_read_lock to keep the + * returned dp pointer valid. + */ +static inline struct datapath *get_dp(struct net *net, int dp_ifindex) +{ + struct datapath *dp; + + WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_ovsl_is_held()); + rcu_read_lock(); + dp = get_dp_rcu(net, dp_ifindex); + rcu_read_unlock(); + + return dp; +} + extern struct notifier_block ovs_dp_device_notifier; -extern struct genl_multicast_group ovs_dp_vport_multicast_group; +extern struct genl_family dp_vport_genl_family; -void ovs_dp_process_received_packet(struct vport *, struct sk_buff *); +void ovs_dp_process_packet(struct sk_buff *skb, struct sw_flow_key *key); void ovs_dp_detach_port(struct vport *); int ovs_dp_upcall(struct datapath *, struct sk_buff *, - const struct dp_upcall_info *); + const struct sw_flow_key *, const struct dp_upcall_info *, + uint32_t cutlen); + +u32 ovs_dp_get_upcall_portid(const struct datapath *dp, uint32_t cpu_id); const char *ovs_dp_name(const struct datapath *dp); -struct sk_buff *ovs_vport_cmd_build_info(struct vport *, u32 pid, u32 seq, - u8 cmd); +struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net, + u32 portid, u32 seq, u8 cmd); + +int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_actions *, struct sw_flow_key *); -int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb); void ovs_dp_notify_wq(struct work_struct *work); + +/* 'KEY' must not have any bits set outside of the 'MASK' */ +#define OVS_MASKED(OLD, KEY, MASK) ((KEY) | ((OLD) & ~(MASK))) +#define OVS_SET_MASKED(OLD, KEY, MASK) ((OLD) = OVS_MASKED(OLD, KEY, MASK)) + +#define OVS_NLERR(logging_allowed, fmt, ...) \ +do { \ + if (logging_allowed && net_ratelimit()) \ + pr_info("netlink: " fmt "\n", ##__VA_ARGS__); \ +} while (0) #endif /* datapath.h */ diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index c3235675f359..a2af90ee99af 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2012 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA */ #include <linux/netdevice.h> @@ -30,19 +17,18 @@ static void dp_detach_port_notify(struct vport *vport) struct datapath *dp; dp = vport->dp; - notify = ovs_vport_cmd_build_info(vport, 0, 0, - OVS_VPORT_CMD_DEL); + notify = ovs_vport_cmd_build_info(vport, ovs_dp_get_net(dp), + 0, 0, OVS_VPORT_CMD_DEL); ovs_dp_detach_port(vport); if (IS_ERR(notify)) { - netlink_set_err(ovs_dp_get_net(dp)->genl_sock, 0, - ovs_dp_vport_multicast_group.id, - PTR_ERR(notify)); + genl_set_err(&dp_vport_genl_family, ovs_dp_get_net(dp), 0, + 0, PTR_ERR(notify)); return; } - genlmsg_multicast_netns(ovs_dp_get_net(dp), notify, 0, - ovs_dp_vport_multicast_group.id, - GFP_KERNEL); + genlmsg_multicast_netns(&dp_vport_genl_family, + ovs_dp_get_net(dp), notify, 0, + 0, GFP_KERNEL); } void ovs_dp_notify_wq(struct work_struct *work) @@ -59,14 +45,10 @@ void ovs_dp_notify_wq(struct work_struct *work) struct hlist_node *n; hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) { - struct netdev_vport *netdev_vport; - - if (vport->ops->type != OVS_VPORT_TYPE_NETDEV) + if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL) continue; - netdev_vport = netdev_vport_priv(vport); - if (netdev_vport->dev->reg_state == NETREG_UNREGISTERED || - netdev_vport->dev->reg_state == NETREG_UNREGISTERING) + if (!(netif_is_ovs_port(vport->dev))) dp_detach_port_notify(vport); } } @@ -88,8 +70,12 @@ static int dp_device_event(struct notifier_block *unused, unsigned long event, return NOTIFY_DONE; if (event == NETDEV_UNREGISTER) { + /* upper_dev_unlink and decrement promisc immediately */ + ovs_netdev_detach_dev(vport); + + /* schedule vport destroy, dev_put and genl notification */ ovs_net = net_generic(dev_net(dev), ovs_net_id); - queue_work(system_wq, &ovs_net->dp_notify_work); + queue_work(system_percpu_wq, &ovs_net->dp_notify_work); } return NOTIFY_DONE; diff --git a/net/openvswitch/drop.h b/net/openvswitch/drop.h new file mode 100644 index 000000000000..cedf9b7b5796 --- /dev/null +++ b/net/openvswitch/drop.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * OpenvSwitch drop reason list. + */ + +#ifndef OPENVSWITCH_DROP_H +#define OPENVSWITCH_DROP_H +#include <linux/skbuff.h> +#include <net/dropreason.h> + +#define OVS_DROP_REASONS(R) \ + R(OVS_DROP_LAST_ACTION) \ + R(OVS_DROP_ACTION_ERROR) \ + R(OVS_DROP_EXPLICIT) \ + R(OVS_DROP_EXPLICIT_WITH_ERROR) \ + R(OVS_DROP_METER) \ + R(OVS_DROP_RECURSION_LIMIT) \ + R(OVS_DROP_DEFERRED_LIMIT) \ + R(OVS_DROP_FRAG_L2_TOO_LONG) \ + R(OVS_DROP_FRAG_INVALID_PROTO) \ + R(OVS_DROP_CONNTRACK) \ + R(OVS_DROP_IP_TTL) \ + /* deliberate comment for trailing \ */ + +enum ovs_drop_reason { + __OVS_DROP_REASON = SKB_DROP_REASON_SUBSYS_OPENVSWITCH << + SKB_DROP_REASON_SUBSYS_SHIFT, +#define ENUM(x) x, + OVS_DROP_REASONS(ENUM) +#undef ENUM + + OVS_DROP_MAX, +}; + +static inline void +ovs_kfree_skb_reason(struct sk_buff *skb, enum ovs_drop_reason reason) +{ + kfree_skb_reason(skb, (u32)reason); +} + +#endif /* OPENVSWITCH_DROP_H */ diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 5c519b121e1b..66366982f604 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -1,23 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2007-2011 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * Copyright (c) 2007-2014 Nicira, Inc. */ -#include "flow.h" -#include "datapath.h" #include <linux/uaccess.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> @@ -31,9 +16,13 @@ #include <linux/module.h> #include <linux/in.h> #include <linux/rcupdate.h> +#include <linux/cpumask.h> #include <linux/if_arp.h> #include <linux/ip.h> #include <linux/ipv6.h> +#include <linux/mpls.h> +#include <linux/sctp.h> +#include <linux/smp.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/icmp.h> @@ -42,9 +31,146 @@ #include <net/ip.h> #include <net/ip_tunnels.h> #include <net/ipv6.h> +#include <net/mpls.h> #include <net/ndisc.h> +#include <net/nsh.h> +#include <net/pkt_cls.h> +#include <net/netfilter/nf_conntrack_zones.h> + +#include "conntrack.h" +#include "datapath.h" +#include "flow.h" +#include "flow_netlink.h" +#include "vport.h" + +u64 ovs_flow_used_time(unsigned long flow_jiffies) +{ + struct timespec64 cur_ts; + u64 cur_ms, idle_ms; + + ktime_get_ts64(&cur_ts); + idle_ms = jiffies_to_msecs(jiffies - flow_jiffies); + cur_ms = (u64)(u32)cur_ts.tv_sec * MSEC_PER_SEC + + cur_ts.tv_nsec / NSEC_PER_MSEC; -static struct kmem_cache *flow_cache; + return cur_ms - idle_ms; +} + +#define TCP_FLAGS_BE16(tp) (*(__be16 *)&tcp_flag_word(tp) & htons(0x0FFF)) + +void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags, + const struct sk_buff *skb) +{ + struct sw_flow_stats *stats; + unsigned int cpu = smp_processor_id(); + int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0); + + stats = rcu_dereference(flow->stats[cpu]); + + /* Check if already have CPU-specific stats. */ + if (likely(stats)) { + spin_lock(&stats->lock); + /* Mark if we write on the pre-allocated stats. */ + if (cpu == 0 && unlikely(flow->stats_last_writer != cpu)) + flow->stats_last_writer = cpu; + } else { + stats = rcu_dereference(flow->stats[0]); /* Pre-allocated. */ + spin_lock(&stats->lock); + + /* If the current CPU is the only writer on the + * pre-allocated stats keep using them. + */ + if (unlikely(flow->stats_last_writer != cpu)) { + /* A previous locker may have already allocated the + * stats, so we need to check again. If CPU-specific + * stats were already allocated, we update the pre- + * allocated stats as we have already locked them. + */ + if (likely(flow->stats_last_writer != -1) && + likely(!rcu_access_pointer(flow->stats[cpu]))) { + /* Try to allocate CPU-specific stats. */ + struct sw_flow_stats *new_stats; + + new_stats = + kmem_cache_alloc_node(flow_stats_cache, + GFP_NOWAIT | + __GFP_THISNODE | + __GFP_NOWARN | + __GFP_NOMEMALLOC, + numa_node_id()); + if (likely(new_stats)) { + new_stats->used = jiffies; + new_stats->packet_count = 1; + new_stats->byte_count = len; + new_stats->tcp_flags = tcp_flags; + spin_lock_init(&new_stats->lock); + + rcu_assign_pointer(flow->stats[cpu], + new_stats); + cpumask_set_cpu(cpu, + flow->cpu_used_mask); + goto unlock; + } + } + flow->stats_last_writer = cpu; + } + } + + stats->used = jiffies; + stats->packet_count++; + stats->byte_count += len; + stats->tcp_flags |= tcp_flags; +unlock: + spin_unlock(&stats->lock); +} + +/* Must be called with rcu_read_lock or ovs_mutex. */ +void ovs_flow_stats_get(const struct sw_flow *flow, + struct ovs_flow_stats *ovs_stats, + unsigned long *used, __be16 *tcp_flags) +{ + unsigned int cpu; + + *used = 0; + *tcp_flags = 0; + memset(ovs_stats, 0, sizeof(*ovs_stats)); + + for_each_cpu(cpu, flow->cpu_used_mask) { + struct sw_flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]); + + if (stats) { + /* Local CPU may write on non-local stats, so we must + * block bottom-halves here. + */ + spin_lock_bh(&stats->lock); + if (!*used || time_after(stats->used, *used)) + *used = stats->used; + *tcp_flags |= stats->tcp_flags; + ovs_stats->n_packets += stats->packet_count; + ovs_stats->n_bytes += stats->byte_count; + spin_unlock_bh(&stats->lock); + } + } +} + +/* Called with ovs_mutex. */ +void ovs_flow_stats_clear(struct sw_flow *flow) +{ + unsigned int cpu; + + for_each_cpu(cpu, flow->cpu_used_mask) { + struct sw_flow_stats *stats = ovsl_dereference(flow->stats[cpu]); + + if (stats) { + spin_lock_bh(&stats->lock); + stats->used = 0; + stats->packet_count = 0; + stats->byte_count = 0; + stats->tcp_flags = 0; + spin_unlock_bh(&stats->lock); + } + } +} static int check_header(struct sk_buff *skb, int len) { @@ -102,49 +228,172 @@ static bool udphdr_ok(struct sk_buff *skb) sizeof(struct udphdr)); } +static bool sctphdr_ok(struct sk_buff *skb) +{ + return pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct sctphdr)); +} + static bool icmphdr_ok(struct sk_buff *skb) { return pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct icmphdr)); } -u64 ovs_flow_used_time(unsigned long flow_jiffies) +/** + * get_ipv6_ext_hdrs() - Parses packet and sets IPv6 extension header flags. + * + * @skb: buffer where extension header data starts in packet + * @nh: ipv6 header + * @ext_hdrs: flags are stored here + * + * OFPIEH12_UNREP is set if more than one of a given IPv6 extension header + * is unexpectedly encountered. (Two destination options headers may be + * expected and would not cause this bit to be set.) + * + * OFPIEH12_UNSEQ is set if IPv6 extension headers were not in the order + * preferred (but not required) by RFC 2460: + * + * When more than one extension header is used in the same packet, it is + * recommended that those headers appear in the following order: + * IPv6 header + * Hop-by-Hop Options header + * Destination Options header + * Routing header + * Fragment header + * Authentication header + * Encapsulating Security Payload header + * Destination Options header + * upper-layer header + */ +static void get_ipv6_ext_hdrs(struct sk_buff *skb, struct ipv6hdr *nh, + u16 *ext_hdrs) { - struct timespec cur_ts; - u64 cur_ms, idle_ms; + u8 next_type = nh->nexthdr; + unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr); + int dest_options_header_count = 0; + + *ext_hdrs = 0; + + while (ipv6_ext_hdr(next_type)) { + struct ipv6_opt_hdr _hdr, *hp; + + switch (next_type) { + case IPPROTO_NONE: + *ext_hdrs |= OFPIEH12_NONEXT; + /* stop parsing */ + return; + + case IPPROTO_ESP: + if (*ext_hdrs & OFPIEH12_ESP) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & ~(OFPIEH12_HOP | OFPIEH12_DEST | + OFPIEH12_ROUTER | IPPROTO_FRAGMENT | + OFPIEH12_AUTH | OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_ESP; + break; - ktime_get_ts(&cur_ts); - idle_ms = jiffies_to_msecs(jiffies - flow_jiffies); - cur_ms = (u64)cur_ts.tv_sec * MSEC_PER_SEC + - cur_ts.tv_nsec / NSEC_PER_MSEC; + case IPPROTO_AH: + if (*ext_hdrs & OFPIEH12_AUTH) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & + ~(OFPIEH12_HOP | OFPIEH12_DEST | OFPIEH12_ROUTER | + IPPROTO_FRAGMENT | OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_AUTH; + break; - return cur_ms - idle_ms; -} + case IPPROTO_DSTOPTS: + if (dest_options_header_count == 0) { + if (*ext_hdrs & + ~(OFPIEH12_HOP | OFPIEH12_UNREP)) + *ext_hdrs |= OFPIEH12_UNSEQ; + *ext_hdrs |= OFPIEH12_DEST; + } else if (dest_options_header_count == 1) { + if (*ext_hdrs & + ~(OFPIEH12_HOP | OFPIEH12_DEST | + OFPIEH12_ROUTER | OFPIEH12_FRAG | + OFPIEH12_AUTH | OFPIEH12_ESP | + OFPIEH12_UNREP)) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + } else { + *ext_hdrs |= OFPIEH12_UNREP; + } + dest_options_header_count++; + break; + + case IPPROTO_FRAGMENT: + if (*ext_hdrs & OFPIEH12_FRAG) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & ~(OFPIEH12_HOP | + OFPIEH12_DEST | + OFPIEH12_ROUTER | + OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_FRAG; + break; + + case IPPROTO_ROUTING: + if (*ext_hdrs & OFPIEH12_ROUTER) + *ext_hdrs |= OFPIEH12_UNREP; + if ((*ext_hdrs & ~(OFPIEH12_HOP | + OFPIEH12_DEST | + OFPIEH12_UNREP)) || + dest_options_header_count >= 2) { + *ext_hdrs |= OFPIEH12_UNSEQ; + } + *ext_hdrs |= OFPIEH12_ROUTER; + break; + + case IPPROTO_HOPOPTS: + if (*ext_hdrs & OFPIEH12_HOP) + *ext_hdrs |= OFPIEH12_UNREP; + /* OFPIEH12_HOP is set to 1 if a hop-by-hop IPv6 + * extension header is present as the first + * extension header in the packet. + */ + if (*ext_hdrs == 0) + *ext_hdrs |= OFPIEH12_HOP; + else + *ext_hdrs |= OFPIEH12_UNSEQ; + break; + + default: + return; + } -#define SW_FLOW_KEY_OFFSET(field) \ - (offsetof(struct sw_flow_key, field) + \ - FIELD_SIZEOF(struct sw_flow_key, field)) + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (!hp) + break; + next_type = hp->nexthdr; + start += ipv6_optlen(hp); + } +} -static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key, - int *key_lenp) +static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key) { + unsigned short frag_off; + unsigned int payload_ofs = 0; unsigned int nh_ofs = skb_network_offset(skb); unsigned int nh_len; - int payload_ofs; struct ipv6hdr *nh; - uint8_t nexthdr; - __be16 frag_off; - int err; - - *key_lenp = SW_FLOW_KEY_OFFSET(ipv6.label); + int err, nexthdr, flags = 0; err = check_header(skb, nh_ofs + sizeof(*nh)); if (unlikely(err)) return err; nh = ipv6_hdr(skb); - nexthdr = nh->nexthdr; - payload_ofs = (u8 *)(nh + 1) - skb->data; + + get_ipv6_ext_hdrs(skb, nh, &key->ipv6.exthdrs); key->ip.proto = NEXTHDR_NONE; key->ip.tos = ipv6_get_dsfield(nh); @@ -153,17 +402,25 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct sw_flow_key *key, key->ipv6.addr.src = nh->saddr; key->ipv6.addr.dst = nh->daddr; - payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off); - if (unlikely(payload_ofs < 0)) - return -EINVAL; - - if (frag_off) { - if (frag_off & htons(~0x7)) + nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags); + if (flags & IP6_FH_F_FRAG) { + if (frag_off) { key->ip.frag = OVS_FRAG_TYPE_LATER; - else - key->ip.frag = OVS_FRAG_TYPE_FIRST; + key->ip.proto = NEXTHDR_FRAGMENT; + return 0; + } + key->ip.frag = OVS_FRAG_TYPE_FIRST; + } else { + key->ip.frag = OVS_FRAG_TYPE_NONE; } + /* Delayed handling of error in ipv6_find_hdr() as it + * always sets flags and frag_off to a valid value which may be + * used to set key->ip.frag above. + */ + if (unlikely(nexthdr < 0)) + return -EPROTO; + nh_len = payload_ofs - nh_ofs; skb_set_transport_header(skb, nh_ofs + nh_len); key->ip.proto = nexthdr; @@ -176,284 +433,78 @@ static bool icmp6hdr_ok(struct sk_buff *skb) sizeof(struct icmp6hdr)); } -#define TCP_FLAGS_OFFSET 13 -#define TCP_FLAG_MASK 0x3f - -void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb) -{ - u8 tcp_flags = 0; - - if ((flow->key.eth.type == htons(ETH_P_IP) || - flow->key.eth.type == htons(ETH_P_IPV6)) && - flow->key.ip.proto == IPPROTO_TCP && - likely(skb->len >= skb_transport_offset(skb) + sizeof(struct tcphdr))) { - u8 *tcp = (u8 *)tcp_hdr(skb); - tcp_flags = *(tcp + TCP_FLAGS_OFFSET) & TCP_FLAG_MASK; - } - - spin_lock(&flow->lock); - flow->used = jiffies; - flow->packet_count++; - flow->byte_count += skb->len; - flow->tcp_flags |= tcp_flags; - spin_unlock(&flow->lock); -} - -struct sw_flow_actions *ovs_flow_actions_alloc(int size) -{ - struct sw_flow_actions *sfa; - - if (size > MAX_ACTIONS_BUFSIZE) - return ERR_PTR(-EINVAL); - - sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); - if (!sfa) - return ERR_PTR(-ENOMEM); - - sfa->actions_len = 0; - return sfa; -} - -struct sw_flow *ovs_flow_alloc(void) -{ - struct sw_flow *flow; - - flow = kmem_cache_alloc(flow_cache, GFP_KERNEL); - if (!flow) - return ERR_PTR(-ENOMEM); - - spin_lock_init(&flow->lock); - flow->sf_acts = NULL; - - return flow; -} - -static struct hlist_head *find_bucket(struct flow_table *table, u32 hash) -{ - hash = jhash_1word(hash, table->hash_seed); - return flex_array_get(table->buckets, - (hash & (table->n_buckets - 1))); -} - -static struct flex_array *alloc_buckets(unsigned int n_buckets) -{ - struct flex_array *buckets; - int i, err; - - buckets = flex_array_alloc(sizeof(struct hlist_head *), - n_buckets, GFP_KERNEL); - if (!buckets) - return NULL; - - err = flex_array_prealloc(buckets, 0, n_buckets, GFP_KERNEL); - if (err) { - flex_array_free(buckets); - return NULL; - } - - for (i = 0; i < n_buckets; i++) - INIT_HLIST_HEAD((struct hlist_head *) - flex_array_get(buckets, i)); - - return buckets; -} - -static void free_buckets(struct flex_array *buckets) -{ - flex_array_free(buckets); -} - -struct flow_table *ovs_flow_tbl_alloc(int new_size) -{ - struct flow_table *table = kmalloc(sizeof(*table), GFP_KERNEL); - - if (!table) - return NULL; - - table->buckets = alloc_buckets(new_size); - - if (!table->buckets) { - kfree(table); - return NULL; - } - table->n_buckets = new_size; - table->count = 0; - table->node_ver = 0; - table->keep_flows = false; - get_random_bytes(&table->hash_seed, sizeof(u32)); - - return table; -} - -void ovs_flow_tbl_destroy(struct flow_table *table) -{ - int i; - - if (!table) - return; - - if (table->keep_flows) - goto skip_flows; - - for (i = 0; i < table->n_buckets; i++) { - struct sw_flow *flow; - struct hlist_head *head = flex_array_get(table->buckets, i); - struct hlist_node *n; - int ver = table->node_ver; - - hlist_for_each_entry_safe(flow, n, head, hash_node[ver]) { - hlist_del_rcu(&flow->hash_node[ver]); - ovs_flow_free(flow); - } - } - -skip_flows: - free_buckets(table->buckets); - kfree(table); -} - -static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) -{ - struct flow_table *table = container_of(rcu, struct flow_table, rcu); - - ovs_flow_tbl_destroy(table); -} - -void ovs_flow_tbl_deferred_destroy(struct flow_table *table) -{ - if (!table) - return; - - call_rcu(&table->rcu, flow_tbl_destroy_rcu_cb); -} - -struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *last) +/** + * parse_vlan_tag - Parse vlan tag from vlan header. + * @skb: skb containing frame to parse + * @key_vh: pointer to parsed vlan tag + * @untag_vlan: should the vlan header be removed from the frame + * + * Return: ERROR on memory error. + * %0 if it encounters a non-vlan or incomplete packet. + * %1 after successfully parsing vlan tag. + */ +static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh, + bool untag_vlan) { - struct sw_flow *flow; - struct hlist_head *head; - int ver; - int i; - - ver = table->node_ver; - while (*bucket < table->n_buckets) { - i = 0; - head = flex_array_get(table->buckets, *bucket); - hlist_for_each_entry_rcu(flow, head, hash_node[ver]) { - if (i < *last) { - i++; - continue; - } - *last = i + 1; - return flow; - } - (*bucket)++; - *last = 0; - } + struct vlan_head *vh = (struct vlan_head *)skb->data; - return NULL; -} - -static void __flow_tbl_insert(struct flow_table *table, struct sw_flow *flow) -{ - struct hlist_head *head; - head = find_bucket(table, flow->hash); - hlist_add_head_rcu(&flow->hash_node[table->node_ver], head); - table->count++; -} + if (likely(!eth_type_vlan(vh->tpid))) + return 0; -static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new) -{ - int old_ver; - int i; + if (unlikely(skb->len < sizeof(struct vlan_head) + sizeof(__be16))) + return 0; - old_ver = old->node_ver; - new->node_ver = !old_ver; + if (unlikely(!pskb_may_pull(skb, sizeof(struct vlan_head) + + sizeof(__be16)))) + return -ENOMEM; - /* Insert in new table. */ - for (i = 0; i < old->n_buckets; i++) { - struct sw_flow *flow; - struct hlist_head *head; + vh = (struct vlan_head *)skb->data; + key_vh->tci = vh->tci | htons(VLAN_CFI_MASK); + key_vh->tpid = vh->tpid; - head = flex_array_get(old->buckets, i); + if (unlikely(untag_vlan)) { + int offset = skb->data - skb_mac_header(skb); + u16 tci; + int err; - hlist_for_each_entry(flow, head, hash_node[old_ver]) - __flow_tbl_insert(new, flow); + __skb_push(skb, offset); + err = __skb_vlan_pop(skb, &tci); + __skb_pull(skb, offset); + if (err) + return err; + __vlan_hwaccel_put_tag(skb, key_vh->tpid, tci); + } else { + __skb_pull(skb, sizeof(struct vlan_head)); } - old->keep_flows = true; -} - -static struct flow_table *__flow_tbl_rehash(struct flow_table *table, int n_buckets) -{ - struct flow_table *new_table; - - new_table = ovs_flow_tbl_alloc(n_buckets); - if (!new_table) - return ERR_PTR(-ENOMEM); - - flow_table_copy_flows(table, new_table); - - return new_table; -} - -struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table) -{ - return __flow_tbl_rehash(table, table->n_buckets); -} - -struct flow_table *ovs_flow_tbl_expand(struct flow_table *table) -{ - return __flow_tbl_rehash(table, table->n_buckets * 2); -} - -void ovs_flow_free(struct sw_flow *flow) -{ - if (unlikely(!flow)) - return; - - kfree((struct sf_flow_acts __force *)flow->sf_acts); - kmem_cache_free(flow_cache, flow); + return 1; } -/* RCU callback used by ovs_flow_deferred_free. */ -static void rcu_free_flow_callback(struct rcu_head *rcu) +static void clear_vlan(struct sw_flow_key *key) { - struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); - - ovs_flow_free(flow); -} - -/* Schedules 'flow' to be freed after the next RCU grace period. - * The caller must hold rcu_read_lock for this to be sensible. */ -void ovs_flow_deferred_free(struct sw_flow *flow) -{ - call_rcu(&flow->rcu, rcu_free_flow_callback); -} - -/* Schedules 'sf_acts' to be freed after the next RCU grace period. - * The caller must hold rcu_read_lock for this to be sensible. */ -void ovs_flow_deferred_free_acts(struct sw_flow_actions *sf_acts) -{ - kfree_rcu(sf_acts, rcu); + key->eth.vlan.tci = 0; + key->eth.vlan.tpid = 0; + key->eth.cvlan.tci = 0; + key->eth.cvlan.tpid = 0; } static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) { - struct qtag_prefix { - __be16 eth_type; /* ETH_P_8021Q */ - __be16 tci; - }; - struct qtag_prefix *qp; - - if (unlikely(skb->len < sizeof(struct qtag_prefix) + sizeof(__be16))) - return 0; + int res; - if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) + - sizeof(__be16)))) - return -ENOMEM; + if (skb_vlan_tag_present(skb)) { + key->eth.vlan.tci = htons(skb->vlan_tci) | htons(VLAN_CFI_MASK); + key->eth.vlan.tpid = skb->vlan_proto; + } else { + /* Parse outer vlan tag in the non-accelerated case. */ + res = parse_vlan_tag(skb, &key->eth.vlan, true); + if (res <= 0) + return res; + } - qp = (struct qtag_prefix *) skb->data; - key->eth.tci = qp->tci | htons(VLAN_TAG_PRESENT); - __skb_pull(skb, sizeof(struct qtag_prefix)); + /* Parse inner vlan tag. */ + res = parse_vlan_tag(skb, &key->eth.cvlan, false); + if (res <= 0) + return res; return 0; } @@ -473,7 +524,7 @@ static __be16 parse_ethertype(struct sk_buff *skb) proto = *(__be16 *) skb->data; __skb_pull(skb, sizeof(__be16)); - if (ntohs(proto) >= ETH_P_802_3_MIN) + if (eth_proto_is_802_3(proto)) return proto; if (skb->len < sizeof(struct llc_snap_hdr)) @@ -490,25 +541,22 @@ static __be16 parse_ethertype(struct sk_buff *skb) __skb_pull(skb, sizeof(struct llc_snap_hdr)); - if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN) + if (eth_proto_is_802_3(llc->ethertype)) return llc->ethertype; return htons(ETH_P_802_2); } static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, - int *key_lenp, int nh_len) + int nh_len) { struct icmp6hdr *icmp = icmp6_hdr(skb); - int error = 0; - int key_len; /* The ICMPv6 type and code fields use the 16-bit transport port * fields, so we need to store them in 16-bit network byte order. */ - key->ipv6.tp.src = htons(icmp->icmp6_type); - key->ipv6.tp.dst = htons(icmp->icmp6_code); - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); + key->tp.src = htons(icmp->icmp6_type); + key->tp.dst = htons(icmp->icmp6_code); if (icmp->icmp6_code == 0 && (icmp->icmp6_type == NDISC_NEIGHBOUR_SOLICITATION || @@ -517,21 +565,19 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, struct nd_msg *nd; int offset; - key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); + memset(&key->ipv6.nd, 0, sizeof(key->ipv6.nd)); /* In order to process neighbor discovery options, we need the * entire packet. */ if (unlikely(icmp_len < sizeof(*nd))) - goto out; - if (unlikely(skb_linearize(skb))) { - error = -ENOMEM; - goto out; - } + return 0; + + if (unlikely(skb_linearize(skb))) + return -ENOMEM; nd = (struct nd_msg *)skb_transport_header(skb); key->ipv6.nd.target = nd->target; - key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); icmp_len -= sizeof(*nd); offset = 0; @@ -541,7 +587,7 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, int opt_len = nd_opt->nd_opt_len * 8; if (unlikely(!opt_len || opt_len > icmp_len)) - goto invalid; + return 0; /* Store the link layer address if the appropriate * option is provided. It is considered an error if @@ -551,14 +597,14 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, && opt_len == 8) { if (unlikely(!is_zero_ether_addr(key->ipv6.nd.sll))) goto invalid; - memcpy(key->ipv6.nd.sll, - &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN); + ether_addr_copy(key->ipv6.nd.sll, + &nd->opt[offset+sizeof(*nd_opt)]); } else if (nd_opt->nd_opt_type == ND_OPT_TARGET_LL_ADDR && opt_len == 8) { if (unlikely(!is_zero_ether_addr(key->ipv6.nd.tll))) goto invalid; - memcpy(key->ipv6.nd.tll, - &nd->opt[offset+sizeof(*nd_opt)], ETH_ALEN); + ether_addr_copy(key->ipv6.nd.tll, + &nd->opt[offset+sizeof(*nd_opt)]); } icmp_len -= opt_len; @@ -566,98 +612,88 @@ static int parse_icmpv6(struct sk_buff *skb, struct sw_flow_key *key, } } - goto out; + return 0; invalid: memset(&key->ipv6.nd.target, 0, sizeof(key->ipv6.nd.target)); memset(key->ipv6.nd.sll, 0, sizeof(key->ipv6.nd.sll)); memset(key->ipv6.nd.tll, 0, sizeof(key->ipv6.nd.tll)); -out: - *key_lenp = key_len; - return error; + return 0; } -/** - * ovs_flow_extract - extracts a flow key from an Ethernet frame. - * @skb: sk_buff that contains the frame, with skb->data pointing to the - * Ethernet header - * @in_port: port number on which @skb was received. - * @key: output flow key - * @key_lenp: length of output flow key - * - * The caller must ensure that skb->len >= ETH_HLEN. - * - * Returns 0 if successful, otherwise a negative errno value. - * - * Initializes @skb header pointers as follows: - * - * - skb->mac_header: the Ethernet header. - * - * - skb->network_header: just past the Ethernet header, or just past the - * VLAN header, to the first byte of the Ethernet payload. - * - * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6 - * on output, then just past the IP header, if one is present and - * of a correct length, otherwise the same as skb->network_header. - * For other key->eth.type values it is left untouched. - */ -int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, - int *key_lenp) +static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key) { - int error = 0; - int key_len = SW_FLOW_KEY_OFFSET(eth); - struct ethhdr *eth; - - memset(key, 0, sizeof(*key)); + struct nshhdr *nh; + unsigned int nh_ofs = skb_network_offset(skb); + u8 version, length; + int err; - key->phy.priority = skb->priority; - if (OVS_CB(skb)->tun_key) - memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key)); - key->phy.in_port = in_port; - key->phy.skb_mark = skb->mark; + err = check_header(skb, nh_ofs + NSH_BASE_HDR_LEN); + if (unlikely(err)) + return err; - skb_reset_mac_header(skb); + nh = nsh_hdr(skb); + version = nsh_get_ver(nh); + length = nsh_hdr_len(nh); - /* Link layer. We are guaranteed to have at least the 14 byte Ethernet - * header in the linear data area. - */ - eth = eth_hdr(skb); - memcpy(key->eth.src, eth->h_source, ETH_ALEN); - memcpy(key->eth.dst, eth->h_dest, ETH_ALEN); + if (version != 0) + return -EINVAL; - __skb_pull(skb, 2 * ETH_ALEN); - /* We are going to push all headers that we pull, so no need to - * update skb->csum here. - */ + err = check_header(skb, nh_ofs + length); + if (unlikely(err)) + return err; - if (vlan_tx_tag_present(skb)) - key->eth.tci = htons(skb->vlan_tci); - else if (eth->h_proto == htons(ETH_P_8021Q)) - if (unlikely(parse_vlan(skb, key))) - return -ENOMEM; + nh = nsh_hdr(skb); + key->nsh.base.flags = nsh_get_flags(nh); + key->nsh.base.ttl = nsh_get_ttl(nh); + key->nsh.base.mdtype = nh->mdtype; + key->nsh.base.np = nh->np; + key->nsh.base.path_hdr = nh->path_hdr; + switch (key->nsh.base.mdtype) { + case NSH_M_TYPE1: + if (length != NSH_M_TYPE1_LEN) + return -EINVAL; + memcpy(key->nsh.context, nh->md1.context, + sizeof(nh->md1)); + break; + case NSH_M_TYPE2: + memset(key->nsh.context, 0, + sizeof(nh->md1)); + break; + default: + return -EINVAL; + } - key->eth.type = parse_ethertype(skb); - if (unlikely(key->eth.type == htons(0))) - return -ENOMEM; + return 0; +} - skb_reset_network_header(skb); - __skb_push(skb, skb->data - skb_mac_header(skb)); +/** + * key_extract_l3l4 - extracts L3/L4 header information. + * @skb: sk_buff that contains the frame, with skb->data pointing to the + * L3 header + * @key: output flow key + * + * Return: %0 if successful, otherwise a negative errno value. + */ +static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) +{ + int error; /* Network layer. */ if (key->eth.type == htons(ETH_P_IP)) { struct iphdr *nh; __be16 offset; - key_len = SW_FLOW_KEY_OFFSET(ipv4.addr); - error = check_iphdr(skb); if (unlikely(error)) { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv4, 0, sizeof(key->ipv4)); if (error == -EINVAL) { skb->transport_header = skb->network_header; error = 0; } - goto out; + return error; } nh = ip_hdr(skb); @@ -671,836 +707,410 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key, offset = nh->frag_off & htons(IP_OFFSET); if (offset) { key->ip.frag = OVS_FRAG_TYPE_LATER; - goto out; + memset(&key->tp, 0, sizeof(key->tp)); + return 0; } if (nh->frag_off & htons(IP_MF) || - skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + skb_shinfo(skb)->gso_type & SKB_GSO_UDP) key->ip.frag = OVS_FRAG_TYPE_FIRST; + else + key->ip.frag = OVS_FRAG_TYPE_NONE; /* Transport layer. */ if (key->ip.proto == IPPROTO_TCP) { - key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); if (tcphdr_ok(skb)) { struct tcphdr *tcp = tcp_hdr(skb); - key->ipv4.tp.src = tcp->source; - key->ipv4.tp.dst = tcp->dest; + key->tp.src = tcp->source; + key->tp.dst = tcp->dest; + key->tp.flags = TCP_FLAGS_BE16(tcp); + } else { + memset(&key->tp, 0, sizeof(key->tp)); } + } else if (key->ip.proto == IPPROTO_UDP) { - key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); - key->ipv4.tp.src = udp->source; - key->ipv4.tp.dst = udp->dest; + key->tp.src = udp->source; + key->tp.dst = udp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } else if (key->ip.proto == IPPROTO_SCTP) { + if (sctphdr_ok(skb)) { + struct sctphdr *sctp = sctp_hdr(skb); + key->tp.src = sctp->source; + key->tp.dst = sctp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == IPPROTO_ICMP) { - key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); if (icmphdr_ok(skb)) { struct icmphdr *icmp = icmp_hdr(skb); /* The ICMP type and code fields use the 16-bit * transport port fields, so we need to store * them in 16-bit network byte order. */ - key->ipv4.tp.src = htons(icmp->type); - key->ipv4.tp.dst = htons(icmp->code); + key->tp.src = htons(icmp->type); + key->tp.dst = htons(icmp->code); + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } - } else if ((key->eth.type == htons(ETH_P_ARP) || - key->eth.type == htons(ETH_P_RARP)) && arphdr_ok(skb)) { + } else if (key->eth.type == htons(ETH_P_ARP) || + key->eth.type == htons(ETH_P_RARP)) { struct arp_eth_header *arp; + bool arp_available = arphdr_ok(skb); arp = (struct arp_eth_header *)skb_network_header(skb); - if (arp->ar_hrd == htons(ARPHRD_ETHER) - && arp->ar_pro == htons(ETH_P_IP) - && arp->ar_hln == ETH_ALEN - && arp->ar_pln == 4) { + if (arp_available && + arp->ar_hrd == htons(ARPHRD_ETHER) && + arp->ar_pro == htons(ETH_P_IP) && + arp->ar_hln == ETH_ALEN && + arp->ar_pln == 4) { /* We only match on the lower 8 bits of the opcode. */ if (ntohs(arp->ar_op) <= 0xff) key->ip.proto = ntohs(arp->ar_op); + else + key->ip.proto = 0; + memcpy(&key->ipv4.addr.src, arp->ar_sip, sizeof(key->ipv4.addr.src)); memcpy(&key->ipv4.addr.dst, arp->ar_tip, sizeof(key->ipv4.addr.dst)); - memcpy(key->ipv4.arp.sha, arp->ar_sha, ETH_ALEN); - memcpy(key->ipv4.arp.tha, arp->ar_tha, ETH_ALEN); - key_len = SW_FLOW_KEY_OFFSET(ipv4.arp); + ether_addr_copy(key->ipv4.arp.sha, arp->ar_sha); + ether_addr_copy(key->ipv4.arp.tha, arp->ar_tha); + } else { + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv4, 0, sizeof(key->ipv4)); } + } else if (eth_p_mpls(key->eth.type)) { + size_t label_count = 1; + + memset(&key->mpls, 0, sizeof(key->mpls)); + skb_set_inner_network_header(skb, skb->mac_len); + while (1) { + __be32 lse; + + error = check_header(skb, skb->mac_len + + label_count * MPLS_HLEN); + if (unlikely(error)) + return 0; + + memcpy(&lse, skb_inner_network_header(skb), MPLS_HLEN); + + if (label_count <= MPLS_LABEL_DEPTH) + memcpy(&key->mpls.lse[label_count - 1], &lse, + MPLS_HLEN); + + skb_set_inner_network_header(skb, skb->mac_len + + label_count * MPLS_HLEN); + if (lse & htonl(MPLS_LS_S_MASK)) + break; + + label_count++; + } + if (label_count > MPLS_LABEL_DEPTH) + label_count = MPLS_LABEL_DEPTH; + + key->mpls.num_labels_mask = GENMASK(label_count - 1, 0); } else if (key->eth.type == htons(ETH_P_IPV6)) { int nh_len; /* IPv6 Header + Extensions */ - nh_len = parse_ipv6hdr(skb, key, &key_len); + nh_len = parse_ipv6hdr(skb, key); if (unlikely(nh_len < 0)) { - if (nh_len == -EINVAL) + switch (nh_len) { + case -EINVAL: + memset(&key->ip, 0, sizeof(key->ip)); + memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr)); + fallthrough; + case -EPROTO: skb->transport_header = skb->network_header; - else + error = 0; + break; + default: error = nh_len; - goto out; + } + return error; } - if (key->ip.frag == OVS_FRAG_TYPE_LATER) - goto out; + if (key->ip.frag == OVS_FRAG_TYPE_LATER) { + memset(&key->tp, 0, sizeof(key->tp)); + return 0; + } if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP) key->ip.frag = OVS_FRAG_TYPE_FIRST; /* Transport layer. */ if (key->ip.proto == NEXTHDR_TCP) { - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); if (tcphdr_ok(skb)) { struct tcphdr *tcp = tcp_hdr(skb); - key->ipv6.tp.src = tcp->source; - key->ipv6.tp.dst = tcp->dest; + key->tp.src = tcp->source; + key->tp.dst = tcp->dest; + key->tp.flags = TCP_FLAGS_BE16(tcp); + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == NEXTHDR_UDP) { - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); if (udphdr_ok(skb)) { struct udphdr *udp = udp_hdr(skb); - key->ipv6.tp.src = udp->source; - key->ipv6.tp.dst = udp->dest; + key->tp.src = udp->source; + key->tp.dst = udp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); + } + } else if (key->ip.proto == NEXTHDR_SCTP) { + if (sctphdr_ok(skb)) { + struct sctphdr *sctp = sctp_hdr(skb); + key->tp.src = sctp->source; + key->tp.dst = sctp->dest; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } else if (key->ip.proto == NEXTHDR_ICMP) { - key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); if (icmp6hdr_ok(skb)) { - error = parse_icmpv6(skb, key, &key_len, nh_len); - if (error < 0) - goto out; + error = parse_icmpv6(skb, key, nh_len); + if (error) + return error; + } else { + memset(&key->tp, 0, sizeof(key->tp)); } } + } else if (key->eth.type == htons(ETH_P_NSH)) { + error = parse_nsh(skb, key); + if (error) + return error; } - -out: - *key_lenp = key_len; - return error; -} - -static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, int key_len) -{ - return jhash2((u32 *)((u8 *)key + key_start), - DIV_ROUND_UP(key_len - key_start, sizeof(u32)), 0); -} - -static int flow_key_start(struct sw_flow_key *key) -{ - if (key->tun_key.ipv4_dst) - return 0; - else - return offsetof(struct sw_flow_key, phy); -} - -struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table, - struct sw_flow_key *key, int key_len) -{ - struct sw_flow *flow; - struct hlist_head *head; - u8 *_key; - int key_start; - u32 hash; - - key_start = flow_key_start(key); - hash = ovs_flow_hash(key, key_start, key_len); - - _key = (u8 *) key + key_start; - head = find_bucket(table, hash); - hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) { - - if (flow->hash == hash && - !memcmp((u8 *)&flow->key + key_start, _key, key_len - key_start)) { - return flow; - } - } - return NULL; -} - -void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, - struct sw_flow_key *key, int key_len) -{ - flow->hash = ovs_flow_hash(key, flow_key_start(key), key_len); - memcpy(&flow->key, key, sizeof(flow->key)); - __flow_tbl_insert(table, flow); -} - -void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) -{ - BUG_ON(table->count == 0); - hlist_del_rcu(&flow->hash_node[table->node_ver]); - table->count--; + return 0; } -/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ -const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { - [OVS_KEY_ATTR_ENCAP] = -1, - [OVS_KEY_ATTR_PRIORITY] = sizeof(u32), - [OVS_KEY_ATTR_IN_PORT] = sizeof(u32), - [OVS_KEY_ATTR_SKB_MARK] = sizeof(u32), - [OVS_KEY_ATTR_ETHERNET] = sizeof(struct ovs_key_ethernet), - [OVS_KEY_ATTR_VLAN] = sizeof(__be16), - [OVS_KEY_ATTR_ETHERTYPE] = sizeof(__be16), - [OVS_KEY_ATTR_IPV4] = sizeof(struct ovs_key_ipv4), - [OVS_KEY_ATTR_IPV6] = sizeof(struct ovs_key_ipv6), - [OVS_KEY_ATTR_TCP] = sizeof(struct ovs_key_tcp), - [OVS_KEY_ATTR_UDP] = sizeof(struct ovs_key_udp), - [OVS_KEY_ATTR_ICMP] = sizeof(struct ovs_key_icmp), - [OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6), - [OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp), - [OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd), - [OVS_KEY_ATTR_TUNNEL] = -1, -}; - -static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, - const struct nlattr *a[], u32 *attrs) +/** + * key_extract - extracts a flow key from an Ethernet frame. + * @skb: sk_buff that contains the frame, with skb->data pointing to the + * Ethernet header + * @key: output flow key + * + * The caller must ensure that skb->len >= ETH_HLEN. + * + * Initializes @skb header fields as follows: + * + * - skb->mac_header: the L2 header. + * + * - skb->network_header: just past the L2 header, or just past the + * VLAN header, to the first byte of the L2 payload. + * + * - skb->transport_header: If key->eth.type is ETH_P_IP or ETH_P_IPV6 + * on output, then just past the IP header, if one is present and + * of a correct length, otherwise the same as skb->network_header. + * For other key->eth.type values it is left untouched. + * + * - skb->protocol: the type of the data starting at skb->network_header. + * Equals to key->eth.type. + * + * Return: %0 if successful, otherwise a negative errno value. + */ +static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) { - const struct ovs_key_icmp *icmp_key; - const struct ovs_key_tcp *tcp_key; - const struct ovs_key_udp *udp_key; - - switch (swkey->ip.proto) { - case IPPROTO_TCP: - if (!(*attrs & (1 << OVS_KEY_ATTR_TCP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_TCP); - - *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); - tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); - swkey->ipv4.tp.src = tcp_key->tcp_src; - swkey->ipv4.tp.dst = tcp_key->tcp_dst; - break; + struct ethhdr *eth; - case IPPROTO_UDP: - if (!(*attrs & (1 << OVS_KEY_ATTR_UDP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_UDP); + /* Flags are always used as part of stats */ + key->tp.flags = 0; - *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); - udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); - swkey->ipv4.tp.src = udp_key->udp_src; - swkey->ipv4.tp.dst = udp_key->udp_dst; - break; + skb_reset_mac_header(skb); - case IPPROTO_ICMP: - if (!(*attrs & (1 << OVS_KEY_ATTR_ICMP))) + /* Link layer. */ + clear_vlan(key); + if (ovs_key_mac_proto(key) == MAC_PROTO_NONE) { + if (unlikely(eth_type_vlan(skb->protocol))) return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_ICMP); - *key_len = SW_FLOW_KEY_OFFSET(ipv4.tp); - icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); - swkey->ipv4.tp.src = htons(icmp_key->icmp_type); - swkey->ipv4.tp.dst = htons(icmp_key->icmp_code); - break; - } - - return 0; -} - -static int ipv6_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len, - const struct nlattr *a[], u32 *attrs) -{ - const struct ovs_key_icmpv6 *icmpv6_key; - const struct ovs_key_tcp *tcp_key; - const struct ovs_key_udp *udp_key; + skb_reset_network_header(skb); + key->eth.type = skb->protocol; + } else { + eth = eth_hdr(skb); + ether_addr_copy(key->eth.src, eth->h_source); + ether_addr_copy(key->eth.dst, eth->h_dest); - switch (swkey->ip.proto) { - case IPPROTO_TCP: - if (!(*attrs & (1 << OVS_KEY_ATTR_TCP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_TCP); + __skb_pull(skb, 2 * ETH_ALEN); + /* We are going to push all headers that we pull, so no need to + * update skb->csum here. + */ - *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); - tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); - swkey->ipv6.tp.src = tcp_key->tcp_src; - swkey->ipv6.tp.dst = tcp_key->tcp_dst; - break; + if (unlikely(parse_vlan(skb, key))) + return -ENOMEM; - case IPPROTO_UDP: - if (!(*attrs & (1 << OVS_KEY_ATTR_UDP))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_UDP); + key->eth.type = parse_ethertype(skb); + if (unlikely(key->eth.type == htons(0))) + return -ENOMEM; - *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); - udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); - swkey->ipv6.tp.src = udp_key->udp_src; - swkey->ipv6.tp.dst = udp_key->udp_dst; - break; + /* Multiple tagged packets need to retain TPID to satisfy + * skb_vlan_pop(), which will later shift the ethertype into + * skb->protocol. + */ + if (key->eth.cvlan.tci & htons(VLAN_CFI_MASK)) + skb->protocol = key->eth.cvlan.tpid; + else + skb->protocol = key->eth.type; - case IPPROTO_ICMPV6: - if (!(*attrs & (1 << OVS_KEY_ATTR_ICMPV6))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); - - *key_len = SW_FLOW_KEY_OFFSET(ipv6.tp); - icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); - swkey->ipv6.tp.src = htons(icmpv6_key->icmpv6_type); - swkey->ipv6.tp.dst = htons(icmpv6_key->icmpv6_code); - - if (swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || - swkey->ipv6.tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { - const struct ovs_key_nd *nd_key; - - if (!(*attrs & (1 << OVS_KEY_ATTR_ND))) - return -EINVAL; - *attrs &= ~(1 << OVS_KEY_ATTR_ND); - - *key_len = SW_FLOW_KEY_OFFSET(ipv6.nd); - nd_key = nla_data(a[OVS_KEY_ATTR_ND]); - memcpy(&swkey->ipv6.nd.target, nd_key->nd_target, - sizeof(swkey->ipv6.nd.target)); - memcpy(swkey->ipv6.nd.sll, nd_key->nd_sll, ETH_ALEN); - memcpy(swkey->ipv6.nd.tll, nd_key->nd_tll, ETH_ALEN); - } - break; + skb_reset_network_header(skb); + __skb_push(skb, skb->data - skb_mac_header(skb)); } - return 0; + skb_reset_mac_len(skb); + + /* Fill out L3/L4 key info, if any */ + return key_extract_l3l4(skb, key); } -static int parse_flow_nlattrs(const struct nlattr *attr, - const struct nlattr *a[], u32 *attrsp) +/* In the case of conntrack fragment handling it expects L3 headers, + * add a helper. + */ +int ovs_flow_key_update_l3l4(struct sk_buff *skb, struct sw_flow_key *key) { - const struct nlattr *nla; - u32 attrs; - int rem; - - attrs = 0; - nla_for_each_nested(nla, attr, rem) { - u16 type = nla_type(nla); - int expected_len; - - if (type > OVS_KEY_ATTR_MAX || attrs & (1 << type)) - return -EINVAL; - - expected_len = ovs_key_lens[type]; - if (nla_len(nla) != expected_len && expected_len != -1) - return -EINVAL; - - attrs |= 1 << type; - a[type] = nla; - } - if (rem) - return -EINVAL; - - *attrsp = attrs; - return 0; + return key_extract_l3l4(skb, key); } -int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, - struct ovs_key_ipv4_tunnel *tun_key) +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key) { - struct nlattr *a; - int rem; - bool ttl = false; - - memset(tun_key, 0, sizeof(*tun_key)); - - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { - [OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64), - [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32), - [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32), - [OVS_TUNNEL_KEY_ATTR_TOS] = 1, - [OVS_TUNNEL_KEY_ATTR_TTL] = 1, - [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0, - [OVS_TUNNEL_KEY_ATTR_CSUM] = 0, - }; - - if (type > OVS_TUNNEL_KEY_ATTR_MAX || - ovs_tunnel_key_lens[type] != nla_len(a)) - return -EINVAL; - - switch (type) { - case OVS_TUNNEL_KEY_ATTR_ID: - tun_key->tun_id = nla_get_be64(a); - tun_key->tun_flags |= TUNNEL_KEY; - break; - case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: - tun_key->ipv4_src = nla_get_be32(a); - break; - case OVS_TUNNEL_KEY_ATTR_IPV4_DST: - tun_key->ipv4_dst = nla_get_be32(a); - break; - case OVS_TUNNEL_KEY_ATTR_TOS: - tun_key->ipv4_tos = nla_get_u8(a); - break; - case OVS_TUNNEL_KEY_ATTR_TTL: - tun_key->ipv4_ttl = nla_get_u8(a); - ttl = true; - break; - case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: - tun_key->tun_flags |= TUNNEL_DONT_FRAGMENT; - break; - case OVS_TUNNEL_KEY_ATTR_CSUM: - tun_key->tun_flags |= TUNNEL_CSUM; - break; - default: - return -EINVAL; + int res; - } - } - if (rem > 0) - return -EINVAL; - - if (!tun_key->ipv4_dst) - return -EINVAL; + res = key_extract(skb, key); + if (!res) + key->mac_proto &= ~SW_FLOW_KEY_INVALID; - if (!ttl) - return -EINVAL; - - return 0; + return res; } -int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key) +static int key_extract_mac_proto(struct sk_buff *skb) { - struct nlattr *nla; - - nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL); - if (!nla) - return -EMSGSIZE; - - if (tun_key->tun_flags & TUNNEL_KEY && - nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, tun_key->tun_id)) - return -EMSGSIZE; - if (tun_key->ipv4_src && - nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, tun_key->ipv4_src)) - return -EMSGSIZE; - if (nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, tun_key->ipv4_dst)) - return -EMSGSIZE; - if (tun_key->ipv4_tos && - nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, tun_key->ipv4_tos)) - return -EMSGSIZE; - if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, tun_key->ipv4_ttl)) - return -EMSGSIZE; - if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) - return -EMSGSIZE; - if ((tun_key->tun_flags & TUNNEL_CSUM) && - nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) - return -EMSGSIZE; - - nla_nest_end(skb, nla); - return 0; + switch (skb->dev->type) { + case ARPHRD_ETHER: + return MAC_PROTO_ETHERNET; + case ARPHRD_NONE: + if (skb->protocol == htons(ETH_P_TEB)) + return MAC_PROTO_ETHERNET; + return MAC_PROTO_NONE; + } + WARN_ON_ONCE(1); + return -EINVAL; } -/** - * ovs_flow_from_nlattrs - parses Netlink attributes into a flow key. - * @swkey: receives the extracted flow key. - * @key_lenp: number of bytes used in @swkey. - * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute - * sequence. - */ -int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, - const struct nlattr *attr) +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, + struct sk_buff *skb, struct sw_flow_key *key) { - const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; - const struct ovs_key_ethernet *eth_key; - int key_len; - u32 attrs; - int err; - - memset(swkey, 0, sizeof(struct sw_flow_key)); - key_len = SW_FLOW_KEY_OFFSET(eth); - - err = parse_flow_nlattrs(attr, a, &attrs); - if (err) - return err; - - /* Metadata attributes. */ - if (attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { - swkey->phy.priority = nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]); - attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY); - } - if (attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { - u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); - if (in_port >= DP_MAX_PORTS) - return -EINVAL; - swkey->phy.in_port = in_port; - attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); - } else { - swkey->phy.in_port = DP_MAX_PORTS; - } - if (attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { - swkey->phy.skb_mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); - attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); - } - - if (attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { - err = ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], &swkey->tun_key); - if (err) - return err; - - attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); - } - - /* Data attributes. */ - if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET))) - return -EINVAL; - attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); - - eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); - memcpy(swkey->eth.src, eth_key->eth_src, ETH_ALEN); - memcpy(swkey->eth.dst, eth_key->eth_dst, ETH_ALEN); - - if (attrs & (1u << OVS_KEY_ATTR_ETHERTYPE) && - nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]) == htons(ETH_P_8021Q)) { - const struct nlattr *encap; - __be16 tci; - - if (attrs != ((1 << OVS_KEY_ATTR_VLAN) | - (1 << OVS_KEY_ATTR_ETHERTYPE) | - (1 << OVS_KEY_ATTR_ENCAP))) - return -EINVAL; - - encap = a[OVS_KEY_ATTR_ENCAP]; - tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); - if (tci & htons(VLAN_TAG_PRESENT)) { - swkey->eth.tci = tci; - - err = parse_flow_nlattrs(encap, a, &attrs); - if (err) - return err; - } else if (!tci) { - /* Corner case for truncated 802.1Q header. */ - if (nla_len(encap)) - return -EINVAL; - - swkey->eth.type = htons(ETH_P_8021Q); - *key_lenp = key_len; - return 0; +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + struct tc_skb_ext *tc_ext; +#endif + bool post_ct = false, post_ct_snat = false, post_ct_dnat = false; + int res, err; + u16 zone = 0; + + /* Extract metadata from packet. */ + if (tun_info) { + key->tun_proto = ip_tunnel_info_af(tun_info); + memcpy(&key->tun_key, &tun_info->key, sizeof(key->tun_key)); + + if (tun_info->options_len) { + BUILD_BUG_ON((1 << (sizeof(tun_info->options_len) * + 8)) - 1 + > sizeof(key->tun_opts)); + + ip_tunnel_info_opts_get(TUN_METADATA_OPTS(key, tun_info->options_len), + tun_info); + key->tun_opts_len = tun_info->options_len; } else { - return -EINVAL; + key->tun_opts_len = 0; } + } else { + key->tun_proto = 0; + key->tun_opts_len = 0; + memset(&key->tun_key, 0, sizeof(key->tun_key)); } - if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { - swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - if (ntohs(swkey->eth.type) < ETH_P_802_3_MIN) - return -EINVAL; - attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + key->phy.priority = skb->priority; + key->phy.in_port = OVS_CB(skb)->input_vport->port_no; + key->phy.skb_mark = skb->mark; + key->ovs_flow_hash = 0; + res = key_extract_mac_proto(skb); + if (res < 0) + return res; + key->mac_proto = res; + +#if IS_ENABLED(CONFIG_NET_TC_SKB_EXT) + if (tc_skb_ext_tc_enabled()) { + tc_ext = skb_ext_find(skb, TC_SKB_EXT); + key->recirc_id = tc_ext && !tc_ext->act_miss ? + tc_ext->chain : 0; + OVS_CB(skb)->mru = tc_ext ? tc_ext->mru : 0; + post_ct = tc_ext ? tc_ext->post_ct : false; + post_ct_snat = post_ct ? tc_ext->post_ct_snat : false; + post_ct_dnat = post_ct ? tc_ext->post_ct_dnat : false; + zone = post_ct ? tc_ext->zone : 0; } else { - swkey->eth.type = htons(ETH_P_802_2); - } - - if (swkey->eth.type == htons(ETH_P_IP)) { - const struct ovs_key_ipv4 *ipv4_key; - - if (!(attrs & (1 << OVS_KEY_ATTR_IPV4))) - return -EINVAL; - attrs &= ~(1 << OVS_KEY_ATTR_IPV4); - - key_len = SW_FLOW_KEY_OFFSET(ipv4.addr); - ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); - if (ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) - return -EINVAL; - swkey->ip.proto = ipv4_key->ipv4_proto; - swkey->ip.tos = ipv4_key->ipv4_tos; - swkey->ip.ttl = ipv4_key->ipv4_ttl; - swkey->ip.frag = ipv4_key->ipv4_frag; - swkey->ipv4.addr.src = ipv4_key->ipv4_src; - swkey->ipv4.addr.dst = ipv4_key->ipv4_dst; - - if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) { - err = ipv4_flow_from_nlattrs(swkey, &key_len, a, &attrs); - if (err) - return err; - } - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - const struct ovs_key_ipv6 *ipv6_key; - - if (!(attrs & (1 << OVS_KEY_ATTR_IPV6))) - return -EINVAL; - attrs &= ~(1 << OVS_KEY_ATTR_IPV6); - - key_len = SW_FLOW_KEY_OFFSET(ipv6.label); - ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); - if (ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) - return -EINVAL; - swkey->ipv6.label = ipv6_key->ipv6_label; - swkey->ip.proto = ipv6_key->ipv6_proto; - swkey->ip.tos = ipv6_key->ipv6_tclass; - swkey->ip.ttl = ipv6_key->ipv6_hlimit; - swkey->ip.frag = ipv6_key->ipv6_frag; - memcpy(&swkey->ipv6.addr.src, ipv6_key->ipv6_src, - sizeof(swkey->ipv6.addr.src)); - memcpy(&swkey->ipv6.addr.dst, ipv6_key->ipv6_dst, - sizeof(swkey->ipv6.addr.dst)); - - if (swkey->ip.frag != OVS_FRAG_TYPE_LATER) { - err = ipv6_flow_from_nlattrs(swkey, &key_len, a, &attrs); - if (err) - return err; - } - } else if (swkey->eth.type == htons(ETH_P_ARP) || - swkey->eth.type == htons(ETH_P_RARP)) { - const struct ovs_key_arp *arp_key; - - if (!(attrs & (1 << OVS_KEY_ATTR_ARP))) - return -EINVAL; - attrs &= ~(1 << OVS_KEY_ATTR_ARP); - - key_len = SW_FLOW_KEY_OFFSET(ipv4.arp); - arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); - swkey->ipv4.addr.src = arp_key->arp_sip; - swkey->ipv4.addr.dst = arp_key->arp_tip; - if (arp_key->arp_op & htons(0xff00)) - return -EINVAL; - swkey->ip.proto = ntohs(arp_key->arp_op); - memcpy(swkey->ipv4.arp.sha, arp_key->arp_sha, ETH_ALEN); - memcpy(swkey->ipv4.arp.tha, arp_key->arp_tha, ETH_ALEN); + key->recirc_id = 0; } - - if (attrs) - return -EINVAL; - *key_lenp = key_len; - - return 0; -} - -/** - * ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key. - * @flow: Receives extracted in_port, priority, tun_key and skb_mark. - * @key_len: Length of key in @flow. Used for calculating flow hash. - * @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute - * sequence. - * - * This parses a series of Netlink attributes that form a flow key, which must - * take the same form accepted by flow_from_nlattrs(), but only enough of it to - * get the metadata, that is, the parts of the flow key that cannot be - * extracted from the packet itself. - */ -int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len, - const struct nlattr *attr) -{ - struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key; - const struct nlattr *nla; - int rem; - - flow->key.phy.in_port = DP_MAX_PORTS; - flow->key.phy.priority = 0; - flow->key.phy.skb_mark = 0; - memset(tun_key, 0, sizeof(flow->key.tun_key)); - - nla_for_each_nested(nla, attr, rem) { - int type = nla_type(nla); - - if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) { - int err; - - if (nla_len(nla) != ovs_key_lens[type]) - return -EINVAL; - - switch (type) { - case OVS_KEY_ATTR_PRIORITY: - flow->key.phy.priority = nla_get_u32(nla); - break; - - case OVS_KEY_ATTR_TUNNEL: - err = ovs_ipv4_tun_from_nlattr(nla, tun_key); - if (err) - return err; - break; - - case OVS_KEY_ATTR_IN_PORT: - if (nla_get_u32(nla) >= DP_MAX_PORTS) - return -EINVAL; - flow->key.phy.in_port = nla_get_u32(nla); - break; - - case OVS_KEY_ATTR_SKB_MARK: - flow->key.phy.skb_mark = nla_get_u32(nla); - break; +#else + key->recirc_id = 0; +#endif + + err = key_extract(skb, key); + if (!err) { + ovs_ct_fill_key(skb, key, post_ct); /* Must be after key_extract(). */ + if (post_ct) { + if (!skb_get_nfct(skb)) { + key->ct_zone = zone; + } else { + if (!post_ct_dnat) + key->ct_state &= ~OVS_CS_F_DST_NAT; + if (!post_ct_snat) + key->ct_state &= ~OVS_CS_F_SRC_NAT; } } } - if (rem) - return -EINVAL; - - flow->hash = ovs_flow_hash(&flow->key, - flow_key_start(&flow->key), key_len); - - return 0; + return err; } -int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb) +int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, + struct sk_buff *skb, + struct sw_flow_key *key, bool log) { - struct ovs_key_ethernet *eth_key; - struct nlattr *nla, *encap; - - if (swkey->phy.priority && - nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority)) - goto nla_put_failure; - - if (swkey->tun_key.ipv4_dst && - ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key)) - goto nla_put_failure; - - if (swkey->phy.in_port != DP_MAX_PORTS && - nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port)) - goto nla_put_failure; - - if (swkey->phy.skb_mark && - nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, swkey->phy.skb_mark)) - goto nla_put_failure; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); - if (!nla) - goto nla_put_failure; - eth_key = nla_data(nla); - memcpy(eth_key->eth_src, swkey->eth.src, ETH_ALEN); - memcpy(eth_key->eth_dst, swkey->eth.dst, ETH_ALEN); - - if (swkey->eth.tci || swkey->eth.type == htons(ETH_P_8021Q)) { - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, htons(ETH_P_8021Q)) || - nla_put_be16(skb, OVS_KEY_ATTR_VLAN, swkey->eth.tci)) - goto nla_put_failure; - encap = nla_nest_start(skb, OVS_KEY_ATTR_ENCAP); - if (!swkey->eth.tci) - goto unencap; - } else { - encap = NULL; - } - - if (swkey->eth.type == htons(ETH_P_802_2)) - goto unencap; - - if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, swkey->eth.type)) - goto nla_put_failure; - - if (swkey->eth.type == htons(ETH_P_IP)) { - struct ovs_key_ipv4 *ipv4_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key)); - if (!nla) - goto nla_put_failure; - ipv4_key = nla_data(nla); - ipv4_key->ipv4_src = swkey->ipv4.addr.src; - ipv4_key->ipv4_dst = swkey->ipv4.addr.dst; - ipv4_key->ipv4_proto = swkey->ip.proto; - ipv4_key->ipv4_tos = swkey->ip.tos; - ipv4_key->ipv4_ttl = swkey->ip.ttl; - ipv4_key->ipv4_frag = swkey->ip.frag; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - struct ovs_key_ipv6 *ipv6_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key)); - if (!nla) - goto nla_put_failure; - ipv6_key = nla_data(nla); - memcpy(ipv6_key->ipv6_src, &swkey->ipv6.addr.src, - sizeof(ipv6_key->ipv6_src)); - memcpy(ipv6_key->ipv6_dst, &swkey->ipv6.addr.dst, - sizeof(ipv6_key->ipv6_dst)); - ipv6_key->ipv6_label = swkey->ipv6.label; - ipv6_key->ipv6_proto = swkey->ip.proto; - ipv6_key->ipv6_tclass = swkey->ip.tos; - ipv6_key->ipv6_hlimit = swkey->ip.ttl; - ipv6_key->ipv6_frag = swkey->ip.frag; - } else if (swkey->eth.type == htons(ETH_P_ARP) || - swkey->eth.type == htons(ETH_P_RARP)) { - struct ovs_key_arp *arp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key)); - if (!nla) - goto nla_put_failure; - arp_key = nla_data(nla); - memset(arp_key, 0, sizeof(struct ovs_key_arp)); - arp_key->arp_sip = swkey->ipv4.addr.src; - arp_key->arp_tip = swkey->ipv4.addr.dst; - arp_key->arp_op = htons(swkey->ip.proto); - memcpy(arp_key->arp_sha, swkey->ipv4.arp.sha, ETH_ALEN); - memcpy(arp_key->arp_tha, swkey->ipv4.arp.tha, ETH_ALEN); - } + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + u64 attrs = 0; + int err; - if ((swkey->eth.type == htons(ETH_P_IP) || - swkey->eth.type == htons(ETH_P_IPV6)) && - swkey->ip.frag != OVS_FRAG_TYPE_LATER) { - - if (swkey->ip.proto == IPPROTO_TCP) { - struct ovs_key_tcp *tcp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key)); - if (!nla) - goto nla_put_failure; - tcp_key = nla_data(nla); - if (swkey->eth.type == htons(ETH_P_IP)) { - tcp_key->tcp_src = swkey->ipv4.tp.src; - tcp_key->tcp_dst = swkey->ipv4.tp.dst; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - tcp_key->tcp_src = swkey->ipv6.tp.src; - tcp_key->tcp_dst = swkey->ipv6.tp.dst; - } - } else if (swkey->ip.proto == IPPROTO_UDP) { - struct ovs_key_udp *udp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key)); - if (!nla) - goto nla_put_failure; - udp_key = nla_data(nla); - if (swkey->eth.type == htons(ETH_P_IP)) { - udp_key->udp_src = swkey->ipv4.tp.src; - udp_key->udp_dst = swkey->ipv4.tp.dst; - } else if (swkey->eth.type == htons(ETH_P_IPV6)) { - udp_key->udp_src = swkey->ipv6.tp.src; - udp_key->udp_dst = swkey->ipv6.tp.dst; - } - } else if (swkey->eth.type == htons(ETH_P_IP) && - swkey->ip.proto == IPPROTO_ICMP) { - struct ovs_key_icmp *icmp_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key)); - if (!nla) - goto nla_put_failure; - icmp_key = nla_data(nla); - icmp_key->icmp_type = ntohs(swkey->ipv4.tp.src); - icmp_key->icmp_code = ntohs(swkey->ipv4.tp.dst); - } else if (swkey->eth.type == htons(ETH_P_IPV6) && - swkey->ip.proto == IPPROTO_ICMPV6) { - struct ovs_key_icmpv6 *icmpv6_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6, - sizeof(*icmpv6_key)); - if (!nla) - goto nla_put_failure; - icmpv6_key = nla_data(nla); - icmpv6_key->icmpv6_type = ntohs(swkey->ipv6.tp.src); - icmpv6_key->icmpv6_code = ntohs(swkey->ipv6.tp.dst); - - if (icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_SOLICITATION || - icmpv6_key->icmpv6_type == NDISC_NEIGHBOUR_ADVERTISEMENT) { - struct ovs_key_nd *nd_key; - - nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key)); - if (!nla) - goto nla_put_failure; - nd_key = nla_data(nla); - memcpy(nd_key->nd_target, &swkey->ipv6.nd.target, - sizeof(nd_key->nd_target)); - memcpy(nd_key->nd_sll, swkey->ipv6.nd.sll, ETH_ALEN); - memcpy(nd_key->nd_tll, swkey->ipv6.nd.tll, ETH_ALEN); - } - } - } + err = parse_flow_nlattrs(attr, a, &attrs, log); + if (err) + return -EINVAL; -unencap: - if (encap) - nla_nest_end(skb, encap); + /* Extract metadata from netlink attributes. */ + err = ovs_nla_get_flow_metadata(net, a, attrs, key, log); + if (err) + return err; - return 0; + /* key_extract assumes that skb->protocol is set-up for + * layer 3 packets which is the case for other callers, + * in particular packets received from the network stack. + * Here the correct value can be set from the metadata + * extracted above. + * For L2 packet key eth type would be zero. skb protocol + * would be set to correct value later during key-extact. + */ -nla_put_failure: - return -EMSGSIZE; -} + skb->protocol = key->eth.type; + err = key_extract(skb, key); + if (err) + return err; -/* Initializes the flow module. - * Returns zero if successful or a negative error code. */ -int ovs_flow_init(void) -{ - flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow), 0, - 0, NULL); - if (flow_cache == NULL) - return -ENOMEM; + /* Check that we have conntrack original direction tuple metadata only + * for packets for which it makes sense. Otherwise the key may be + * corrupted due to overlapping key fields. + */ + if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) && + key->eth.type != htons(ETH_P_IP)) + return -EINVAL; + if (attrs & (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) && + (key->eth.type != htons(ETH_P_IPV6) || + sw_flow_key_is_nd(key))) + return -EINVAL; return 0; } - -/* Uninitializes the flow module. */ -void ovs_flow_exit(void) -{ - kmem_cache_destroy(flow_cache); -} diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h index 66ef7220293e..b5711aff6e76 100644 --- a/net/openvswitch/flow.h +++ b/net/openvswitch/flow.h @@ -1,24 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (c) 2007-2011 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * Copyright (c) 2007-2017 Nicira, Inc. */ #ifndef FLOW_H #define FLOW_H 1 +#include <linux/cache.h> #include <linux/kernel.h> #include <linux/netlink.h> #include <linux/openvswitch.h> @@ -29,66 +17,100 @@ #include <linux/in6.h> #include <linux/jiffies.h> #include <linux/time.h> -#include <linux/flex_array.h> +#include <linux/cpumask.h> #include <net/inet_ecn.h> +#include <net/ip_tunnels.h> +#include <net/dst_metadata.h> +#include <net/nsh.h> struct sk_buff; -struct sw_flow_actions { - struct rcu_head rcu; - u32 actions_len; - struct nlattr actions[]; +enum sw_flow_mac_proto { + MAC_PROTO_NONE = 0, + MAC_PROTO_ETHERNET, }; +#define SW_FLOW_KEY_INVALID 0x80 +#define MPLS_LABEL_DEPTH 3 -/* Used to memset ovs_key_ipv4_tunnel padding. */ -#define OVS_TUNNEL_KEY_SIZE \ - (offsetof(struct ovs_key_ipv4_tunnel, ipv4_ttl) + \ - FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, ipv4_ttl)) - -struct ovs_key_ipv4_tunnel { - __be64 tun_id; - __be32 ipv4_src; - __be32 ipv4_dst; - __be16 tun_flags; - u8 ipv4_tos; - u8 ipv4_ttl; +/* Bit definitions for IPv6 Extension Header pseudo-field. */ +enum ofp12_ipv6exthdr_flags { + OFPIEH12_NONEXT = 1 << 0, /* "No next header" encountered. */ + OFPIEH12_ESP = 1 << 1, /* Encrypted Sec Payload header present. */ + OFPIEH12_AUTH = 1 << 2, /* Authentication header present. */ + OFPIEH12_DEST = 1 << 3, /* 1 or 2 dest headers present. */ + OFPIEH12_FRAG = 1 << 4, /* Fragment header present. */ + OFPIEH12_ROUTER = 1 << 5, /* Router header present. */ + OFPIEH12_HOP = 1 << 6, /* Hop-by-hop header present. */ + OFPIEH12_UNREP = 1 << 7, /* Unexpected repeats encountered. */ + OFPIEH12_UNSEQ = 1 << 8 /* Unexpected sequencing encountered. */ }; -static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key, - const struct iphdr *iph, __be64 tun_id, - __be16 tun_flags) -{ - tun_key->tun_id = tun_id; - tun_key->ipv4_src = iph->saddr; - tun_key->ipv4_dst = iph->daddr; - tun_key->ipv4_tos = iph->tos; - tun_key->ipv4_ttl = iph->ttl; - tun_key->tun_flags = tun_flags; - - /* clear struct padding. */ - memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0, - sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE); -} +/* Store options at the end of the array if they are less than the + * maximum size. This allows us to get the benefits of variable length + * matching for small options. + */ +#define TUN_METADATA_OFFSET(opt_len) \ + (sizeof_field(struct sw_flow_key, tun_opts) - opt_len) +#define TUN_METADATA_OPTS(flow_key, opt_len) \ + ((void *)((flow_key)->tun_opts + TUN_METADATA_OFFSET(opt_len))) + +struct ovs_tunnel_info { + struct metadata_dst *tun_dst; +}; + +struct vlan_head { + __be16 tpid; /* Vlan type. Generally 802.1q or 802.1ad.*/ + __be16 tci; /* 0 if no VLAN, VLAN_CFI_MASK set otherwise. */ +}; + +#define OVS_SW_FLOW_KEY_METADATA_SIZE \ + (offsetof(struct sw_flow_key, recirc_id) + \ + sizeof_field(struct sw_flow_key, recirc_id)) + +struct ovs_key_nsh { + struct ovs_nsh_key_base base; + __be32 context[NSH_MD1_CONTEXT_SIZE]; +}; struct sw_flow_key { - struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */ + u8 tun_opts[IP_TUNNEL_OPTS_MAX]; + u8 tun_opts_len; + struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */ struct { u32 priority; /* Packet QoS priority. */ u32 skb_mark; /* SKB mark. */ u16 in_port; /* Input switch port (or DP_MAX_PORTS). */ - } phy; + } __packed phy; /* Safe when right after 'tun_key'. */ + u8 mac_proto; /* MAC layer protocol (e.g. Ethernet). */ + u8 tun_proto; /* Protocol of encapsulating tunnel. */ + u32 ovs_flow_hash; /* Datapath computed hash value. */ + u32 recirc_id; /* Recirculation ID. */ struct { u8 src[ETH_ALEN]; /* Ethernet source address. */ u8 dst[ETH_ALEN]; /* Ethernet destination address. */ - __be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */ + struct vlan_head vlan; + struct vlan_head cvlan; __be16 type; /* Ethernet frame type. */ } eth; + /* Filling a hole of two bytes. */ + u8 ct_state; + u8 ct_orig_proto; /* CT original direction tuple IP + * protocol. + */ + union { + struct { + u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ + u8 tos; /* IP ToS. */ + u8 ttl; /* IP TTL/hop limit. */ + u8 frag; /* One of OVS_FRAG_TYPE_*. */ + } ip; + }; + u16 ct_zone; /* Conntrack zone. */ struct { - u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */ - u8 tos; /* IP ToS. */ - u8 ttl; /* IP TTL/hop limit. */ - u8 frag; /* One of OVS_FRAG_TYPE_*. */ - } ip; + __be16 src; /* TCP/UDP/SCTP source port. */ + __be16 dst; /* TCP/UDP/SCTP destination port. */ + __be16 flags; /* TCP flags. */ + } tp; union { struct { struct { @@ -97,9 +119,9 @@ struct sw_flow_key { } addr; union { struct { - __be16 src; /* TCP/UDP source port. */ - __be16 dst; /* TCP/UDP destination port. */ - } tp; + __be32 src; + __be32 dst; + } ct_orig; /* Conntrack original direction fields. */ struct { u8 sha[ETH_ALEN]; /* ARP source hardware address. */ u8 tha[ETH_ALEN]; /* ARP target hardware address. */ @@ -112,32 +134,109 @@ struct sw_flow_key { struct in6_addr dst; /* IPv6 destination address. */ } addr; __be32 label; /* IPv6 flow label. */ - struct { - __be16 src; /* TCP/UDP source port. */ - __be16 dst; /* TCP/UDP destination port. */ - } tp; - struct { - struct in6_addr target; /* ND target address. */ - u8 sll[ETH_ALEN]; /* ND source link layer address. */ - u8 tll[ETH_ALEN]; /* ND target link layer address. */ - } nd; + u16 exthdrs; /* IPv6 extension header flags */ + union { + struct { + struct in6_addr src; + struct in6_addr dst; + } ct_orig; /* Conntrack original direction fields. */ + struct { + struct in6_addr target; /* ND target address. */ + u8 sll[ETH_ALEN]; /* ND source link layer address. */ + u8 tll[ETH_ALEN]; /* ND target link layer address. */ + } nd; + }; } ipv6; + struct { + u32 num_labels_mask; /* labels present bitmap of effective length MPLS_LABEL_DEPTH */ + __be32 lse[MPLS_LABEL_DEPTH]; /* label stack entry */ + } mpls; + + struct ovs_key_nsh nsh; /* network service header */ }; + struct { + /* Connection tracking fields not packed above. */ + struct { + __be16 src; /* CT orig tuple tp src port. */ + __be16 dst; /* CT orig tuple tp dst port. */ + } orig_tp; + u32 mark; + struct ovs_key_ct_labels labels; + } ct; + +} __aligned(BITS_PER_LONG/8); /* Ensure that we can do comparisons as longs. */ + +static inline bool sw_flow_key_is_nd(const struct sw_flow_key *key) +{ + return key->eth.type == htons(ETH_P_IPV6) && + key->ip.proto == NEXTHDR_ICMP && + key->tp.dst == 0 && + (key->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || + key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)); +} + +struct sw_flow_key_range { + unsigned short int start; + unsigned short int end; }; -struct sw_flow { +struct sw_flow_mask { + int ref_count; struct rcu_head rcu; - struct hlist_node hash_node[2]; - u32 hash; + struct sw_flow_key_range range; + struct sw_flow_key key; +}; +struct sw_flow_match { + struct sw_flow_key *key; + struct sw_flow_key_range range; + struct sw_flow_mask *mask; +}; + +#define MAX_UFID_LENGTH 16 /* 128 bits */ + +struct sw_flow_id { + u32 ufid_len; + union { + u32 ufid[MAX_UFID_LENGTH / 4]; + struct sw_flow_key *unmasked_key; + }; +}; + +struct sw_flow_actions { + struct rcu_head rcu; + size_t orig_len; /* From flow_cmd_new netlink actions size */ + u32 actions_len; + struct nlattr actions[]; +}; + +struct sw_flow_stats { + u64 packet_count; /* Number of packets matched. */ + u64 byte_count; /* Number of bytes matched. */ + unsigned long used; /* Last used time (in jiffies). */ + spinlock_t lock; /* Lock for atomic stats update. */ + __be16 tcp_flags; /* Union of seen TCP flags. */ +}; + +struct sw_flow { + struct rcu_head rcu; + struct { + struct hlist_node node[2]; + u32 hash; + } flow_table, ufid_table; + int stats_last_writer; /* CPU id of the last writer on + * 'stats[0]'. + */ struct sw_flow_key key; + struct sw_flow_id id; + struct cpumask *cpu_used_mask; + struct sw_flow_mask *mask; struct sw_flow_actions __rcu *sf_acts; - - spinlock_t lock; /* Lock for values below. */ - unsigned long used; /* Last used time (in jiffies). */ - u64 packet_count; /* Number of packets matched. */ - u64 byte_count; /* Number of bytes matched. */ - u8 tcp_flags; /* Union of seen TCP flags. */ + struct sw_flow_stats __rcu *stats[]; /* One for each CPU. First one + * is allocated at flow creation time, + * the rest are allocated on demand + * while holding the 'stats[0].lock'. + */ }; struct arp_eth_header { @@ -154,65 +253,46 @@ struct arp_eth_header { unsigned char ar_tip[4]; /* target IP address */ } __packed; -int ovs_flow_init(void); -void ovs_flow_exit(void); - -struct sw_flow *ovs_flow_alloc(void); -void ovs_flow_deferred_free(struct sw_flow *); -void ovs_flow_free(struct sw_flow *flow); - -struct sw_flow_actions *ovs_flow_actions_alloc(int actions_len); -void ovs_flow_deferred_free_acts(struct sw_flow_actions *); - -int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *, - int *key_lenp); -void ovs_flow_used(struct sw_flow *, struct sk_buff *); -u64 ovs_flow_used_time(unsigned long flow_jiffies); - -int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *); -int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, - const struct nlattr *); -int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len, - const struct nlattr *attr); +static inline u8 ovs_key_mac_proto(const struct sw_flow_key *key) +{ + return key->mac_proto & ~SW_FLOW_KEY_INVALID; +} -#define MAX_ACTIONS_BUFSIZE (32 * 1024) -#define TBL_MIN_BUCKETS 1024 +static inline u16 __ovs_mac_header_len(u8 mac_proto) +{ + return mac_proto == MAC_PROTO_ETHERNET ? ETH_HLEN : 0; +} -struct flow_table { - struct flex_array *buckets; - unsigned int count, n_buckets; - struct rcu_head rcu; - int node_ver; - u32 hash_seed; - bool keep_flows; -}; +static inline u16 ovs_mac_header_len(const struct sw_flow_key *key) +{ + return __ovs_mac_header_len(ovs_key_mac_proto(key)); +} -static inline int ovs_flow_tbl_count(struct flow_table *table) +static inline bool ovs_identifier_is_ufid(const struct sw_flow_id *sfid) { - return table->count; + return sfid->ufid_len; } -static inline int ovs_flow_tbl_need_to_expand(struct flow_table *table) +static inline bool ovs_identifier_is_key(const struct sw_flow_id *sfid) { - return (table->count > table->n_buckets); + return !ovs_identifier_is_ufid(sfid); } -struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table, - struct sw_flow_key *key, int len); -void ovs_flow_tbl_destroy(struct flow_table *table); -void ovs_flow_tbl_deferred_destroy(struct flow_table *table); -struct flow_table *ovs_flow_tbl_alloc(int new_size); -struct flow_table *ovs_flow_tbl_expand(struct flow_table *table); -struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table); -void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, - struct sw_flow_key *key, int key_len); -void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); - -struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx); -extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1]; -int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr, - struct ovs_key_ipv4_tunnel *tun_key); -int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb, - const struct ovs_key_ipv4_tunnel *tun_key); +void ovs_flow_stats_update(struct sw_flow *, __be16 tcp_flags, + const struct sk_buff *); +void ovs_flow_stats_get(const struct sw_flow *, struct ovs_flow_stats *, + unsigned long *used, __be16 *tcp_flags); +void ovs_flow_stats_clear(struct sw_flow *); +u64 ovs_flow_used_time(unsigned long flow_jiffies); + +int ovs_flow_key_update(struct sk_buff *skb, struct sw_flow_key *key); +int ovs_flow_key_update_l3l4(struct sk_buff *skb, struct sw_flow_key *key); +int ovs_flow_key_extract(const struct ip_tunnel_info *tun_info, + struct sk_buff *skb, + struct sw_flow_key *key); +/* Extract key from packet coming from userspace. */ +int ovs_flow_key_extract_userspace(struct net *net, const struct nlattr *attr, + struct sk_buff *skb, + struct sw_flow_key *key, bool log); #endif /* flow.h */ diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c new file mode 100644 index 000000000000..1cb4f97335d8 --- /dev/null +++ b/net/openvswitch/flow_netlink.c @@ -0,0 +1,3794 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2007-2017 Nicira, Inc. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include "flow.h" +#include "datapath.h" +#include <linux/uaccess.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <net/llc_pdu.h> +#include <linux/kernel.h> +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/llc.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/rcupdate.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/sctp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/rculist.h> +#include <net/geneve.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ndisc.h> +#include <net/mpls.h> +#include <net/vxlan.h> +#include <net/tun_proto.h> +#include <net/erspan.h> + +#include "drop.h" +#include "flow_netlink.h" + +struct ovs_len_tbl { + int len; + const struct ovs_len_tbl *next; +}; + +#define OVS_ATTR_NESTED -1 +#define OVS_ATTR_VARIABLE -2 +#define OVS_COPY_ACTIONS_MAX_DEPTH 16 + +static bool actions_may_change_flow(const struct nlattr *actions) +{ + struct nlattr *nla; + int rem; + + nla_for_each_nested(nla, actions, rem) { + u16 action = nla_type(nla); + + switch (action) { + case OVS_ACTION_ATTR_OUTPUT: + case OVS_ACTION_ATTR_RECIRC: + case OVS_ACTION_ATTR_TRUNC: + case OVS_ACTION_ATTR_USERSPACE: + case OVS_ACTION_ATTR_DROP: + case OVS_ACTION_ATTR_PSAMPLE: + break; + + case OVS_ACTION_ATTR_CT: + case OVS_ACTION_ATTR_CT_CLEAR: + case OVS_ACTION_ATTR_HASH: + case OVS_ACTION_ATTR_POP_ETH: + case OVS_ACTION_ATTR_POP_MPLS: + case OVS_ACTION_ATTR_POP_NSH: + case OVS_ACTION_ATTR_POP_VLAN: + case OVS_ACTION_ATTR_PUSH_ETH: + case OVS_ACTION_ATTR_PUSH_MPLS: + case OVS_ACTION_ATTR_PUSH_NSH: + case OVS_ACTION_ATTR_PUSH_VLAN: + case OVS_ACTION_ATTR_SAMPLE: + case OVS_ACTION_ATTR_SET: + case OVS_ACTION_ATTR_SET_MASKED: + case OVS_ACTION_ATTR_METER: + case OVS_ACTION_ATTR_CHECK_PKT_LEN: + case OVS_ACTION_ATTR_ADD_MPLS: + case OVS_ACTION_ATTR_DEC_TTL: + default: + return true; + } + } + return false; +} + +static void update_range(struct sw_flow_match *match, + size_t offset, size_t size, bool is_mask) +{ + struct sw_flow_key_range *range; + size_t start = rounddown(offset, sizeof(long)); + size_t end = roundup(offset + size, sizeof(long)); + + if (!is_mask) + range = &match->range; + else + range = &match->mask->range; + + if (range->start == range->end) { + range->start = start; + range->end = end; + return; + } + + if (range->start > start) + range->start = start; + + if (range->end < end) + range->end = end; +} + +#define SW_FLOW_KEY_PUT(match, field, value, is_mask) \ + do { \ + update_range(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) \ + (match)->mask->key.field = value; \ + else \ + (match)->key->field = value; \ + } while (0) + +#define SW_FLOW_KEY_MEMCPY_OFFSET(match, offset, value_p, len, is_mask) \ + do { \ + update_range(match, offset, len, is_mask); \ + if (is_mask) \ + memcpy((u8 *)&(match)->mask->key + offset, value_p, \ + len); \ + else \ + memcpy((u8 *)(match)->key + offset, value_p, len); \ + } while (0) + +#define SW_FLOW_KEY_MEMCPY(match, field, value_p, len, is_mask) \ + SW_FLOW_KEY_MEMCPY_OFFSET(match, offsetof(struct sw_flow_key, field), \ + value_p, len, is_mask) + +#define SW_FLOW_KEY_MEMSET_FIELD(match, field, value, is_mask) \ + do { \ + update_range(match, offsetof(struct sw_flow_key, field), \ + sizeof((match)->key->field), is_mask); \ + if (is_mask) \ + memset((u8 *)&(match)->mask->key.field, value, \ + sizeof((match)->mask->key.field)); \ + else \ + memset((u8 *)&(match)->key->field, value, \ + sizeof((match)->key->field)); \ + } while (0) + +#define SW_FLOW_KEY_BITMAP_COPY(match, field, value_p, nbits, is_mask) ({ \ + update_range(match, offsetof(struct sw_flow_key, field), \ + bitmap_size(nbits), is_mask); \ + bitmap_copy(is_mask ? (match)->mask->key.field : (match)->key->field, \ + value_p, nbits); \ +}) + +static bool match_validate(const struct sw_flow_match *match, + u64 key_attrs, u64 mask_attrs, bool log) +{ + u64 key_expected = 0; + u64 mask_allowed = key_attrs; /* At most allow all key attributes */ + + /* The following mask attributes allowed only if they + * pass the validation tests. */ + mask_allowed &= ~((1 << OVS_KEY_ATTR_IPV4) + | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4) + | (1 << OVS_KEY_ATTR_IPV6) + | (1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6) + | (1 << OVS_KEY_ATTR_TCP) + | (1 << OVS_KEY_ATTR_TCP_FLAGS) + | (1 << OVS_KEY_ATTR_UDP) + | (1 << OVS_KEY_ATTR_SCTP) + | (1 << OVS_KEY_ATTR_ICMP) + | (1 << OVS_KEY_ATTR_ICMPV6) + | (1 << OVS_KEY_ATTR_ARP) + | (1 << OVS_KEY_ATTR_ND) + | (1 << OVS_KEY_ATTR_MPLS) + | (1 << OVS_KEY_ATTR_NSH)); + + /* Always allowed mask fields. */ + mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL) + | (1 << OVS_KEY_ATTR_IN_PORT) + | (1 << OVS_KEY_ATTR_ETHERTYPE)); + + /* Check key attributes. */ + if (match->key->eth.type == htons(ETH_P_ARP) + || match->key->eth.type == htons(ETH_P_RARP)) { + key_expected |= 1 << OVS_KEY_ATTR_ARP; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ARP; + } + + if (eth_p_mpls(match->key->eth.type)) { + key_expected |= 1 << OVS_KEY_ATTR_MPLS; + if (match->mask && (match->mask->key.eth.type == htons(0xffff))) + mask_allowed |= 1 << OVS_KEY_ATTR_MPLS; + } + + if (match->key->eth.type == htons(ETH_P_IP)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV4; + if (match->mask && match->mask->key.eth.type == htons(0xffff)) { + mask_allowed |= 1 << OVS_KEY_ATTR_IPV4; + mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4; + } + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << OVS_KEY_ATTR_TCP; + key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + if (match->mask && (match->mask->key.ip.proto == 0xff)) { + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + } + } + + if (match->key->ip.proto == IPPROTO_ICMP) { + key_expected |= 1 << OVS_KEY_ATTR_ICMP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMP; + } + } + } + + if (match->key->eth.type == htons(ETH_P_IPV6)) { + key_expected |= 1 << OVS_KEY_ATTR_IPV6; + if (match->mask && match->mask->key.eth.type == htons(0xffff)) { + mask_allowed |= 1 << OVS_KEY_ATTR_IPV6; + mask_allowed |= 1 << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6; + } + + if (match->key->ip.frag != OVS_FRAG_TYPE_LATER) { + if (match->key->ip.proto == IPPROTO_UDP) { + key_expected |= 1 << OVS_KEY_ATTR_UDP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_UDP; + } + + if (match->key->ip.proto == IPPROTO_SCTP) { + key_expected |= 1 << OVS_KEY_ATTR_SCTP; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_SCTP; + } + + if (match->key->ip.proto == IPPROTO_TCP) { + key_expected |= 1 << OVS_KEY_ATTR_TCP; + key_expected |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + if (match->mask && (match->mask->key.ip.proto == 0xff)) { + mask_allowed |= 1 << OVS_KEY_ATTR_TCP; + mask_allowed |= 1 << OVS_KEY_ATTR_TCP_FLAGS; + } + } + + if (match->key->ip.proto == IPPROTO_ICMPV6) { + key_expected |= 1 << OVS_KEY_ATTR_ICMPV6; + if (match->mask && (match->mask->key.ip.proto == 0xff)) + mask_allowed |= 1 << OVS_KEY_ATTR_ICMPV6; + + if (match->key->tp.src == + htons(NDISC_NEIGHBOUR_SOLICITATION) || + match->key->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { + key_expected |= 1 << OVS_KEY_ATTR_ND; + /* Original direction conntrack tuple + * uses the same space as the ND fields + * in the key, so both are not allowed + * at the same time. + */ + mask_allowed &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6); + if (match->mask && (match->mask->key.tp.src == htons(0xff))) + mask_allowed |= 1 << OVS_KEY_ATTR_ND; + } + } + } + } + + if (match->key->eth.type == htons(ETH_P_NSH)) { + key_expected |= 1 << OVS_KEY_ATTR_NSH; + if (match->mask && + match->mask->key.eth.type == htons(0xffff)) { + mask_allowed |= 1 << OVS_KEY_ATTR_NSH; + } + } + + if ((key_attrs & key_expected) != key_expected) { + /* Key attributes check failed. */ + OVS_NLERR(log, "Missing key (keys=%llx, expected=%llx)", + (unsigned long long)key_attrs, + (unsigned long long)key_expected); + return false; + } + + if ((mask_attrs & mask_allowed) != mask_attrs) { + /* Mask attributes check failed. */ + OVS_NLERR(log, "Unexpected mask (mask=%llx, allowed=%llx)", + (unsigned long long)mask_attrs, + (unsigned long long)mask_allowed); + return false; + } + + return true; +} + +size_t ovs_tun_key_attr_size(void) +{ + /* Whenever adding new OVS_TUNNEL_KEY_ FIELDS, we should consider + * updating this function. + */ + return nla_total_size_64bit(8) /* OVS_TUNNEL_KEY_ATTR_ID */ + + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_SRC */ + + nla_total_size(16) /* OVS_TUNNEL_KEY_ATTR_IPV[46]_DST */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */ + + nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */ + + nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_OAM */ + + nla_total_size(256) /* OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS */ + /* OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS and + * OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS is mutually exclusive with + * OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS and covered by it. + */ + + nla_total_size(2) /* OVS_TUNNEL_KEY_ATTR_TP_SRC */ + + nla_total_size(2); /* OVS_TUNNEL_KEY_ATTR_TP_DST */ +} + +static size_t ovs_nsh_key_attr_size(void) +{ + /* Whenever adding new OVS_NSH_KEY_ FIELDS, we should consider + * updating this function. + */ + return nla_total_size(NSH_BASE_HDR_LEN) /* OVS_NSH_KEY_ATTR_BASE */ + /* OVS_NSH_KEY_ATTR_MD1 and OVS_NSH_KEY_ATTR_MD2 are + * mutually exclusive, so the bigger one can cover + * the small one. + */ + + nla_total_size(NSH_CTX_HDRS_MAX_LEN); +} + +size_t ovs_key_attr_size(void) +{ + /* Whenever adding new OVS_KEY_ FIELDS, we should consider + * updating this function. + */ + BUILD_BUG_ON(OVS_KEY_ATTR_MAX != 32); + + return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */ + + nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */ + + ovs_tun_key_attr_size() + + nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */ + + nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */ + + nla_total_size(4) /* OVS_KEY_ATTR_DP_HASH */ + + nla_total_size(4) /* OVS_KEY_ATTR_RECIRC_ID */ + + nla_total_size(4) /* OVS_KEY_ATTR_CT_STATE */ + + nla_total_size(2) /* OVS_KEY_ATTR_CT_ZONE */ + + nla_total_size(4) /* OVS_KEY_ATTR_CT_MARK */ + + nla_total_size(16) /* OVS_KEY_ATTR_CT_LABELS */ + + nla_total_size(40) /* OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6 */ + + nla_total_size(0) /* OVS_KEY_ATTR_NSH */ + + ovs_nsh_key_attr_size() + + nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(4) /* OVS_KEY_ATTR_VLAN */ + + nla_total_size(0) /* OVS_KEY_ATTR_ENCAP */ + + nla_total_size(2) /* OVS_KEY_ATTR_ETHERTYPE */ + + nla_total_size(40) /* OVS_KEY_ATTR_IPV6 */ + + nla_total_size(2) /* OVS_KEY_ATTR_ICMPV6 */ + + nla_total_size(28) /* OVS_KEY_ATTR_ND */ + + nla_total_size(2); /* OVS_KEY_ATTR_IPV6_EXTHDRS */ +} + +static const struct ovs_len_tbl ovs_vxlan_ext_key_lens[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .len = sizeof(u32) }, +}; + +static const struct ovs_len_tbl ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = { + [OVS_TUNNEL_KEY_ATTR_ID] = { .len = sizeof(u64) }, + [OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = { .len = sizeof(u32) }, + [OVS_TUNNEL_KEY_ATTR_IPV4_DST] = { .len = sizeof(u32) }, + [OVS_TUNNEL_KEY_ATTR_TOS] = { .len = 1 }, + [OVS_TUNNEL_KEY_ATTR_TTL] = { .len = 1 }, + [OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = { .len = 0 }, + [OVS_TUNNEL_KEY_ATTR_CSUM] = { .len = 0 }, + [OVS_TUNNEL_KEY_ATTR_TP_SRC] = { .len = sizeof(u16) }, + [OVS_TUNNEL_KEY_ATTR_TP_DST] = { .len = sizeof(u16) }, + [OVS_TUNNEL_KEY_ATTR_OAM] = { .len = 0 }, + [OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS] = { .len = OVS_ATTR_VARIABLE }, + [OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS] = { .len = OVS_ATTR_NESTED, + .next = ovs_vxlan_ext_key_lens }, + [OVS_TUNNEL_KEY_ATTR_IPV6_SRC] = { .len = sizeof(struct in6_addr) }, + [OVS_TUNNEL_KEY_ATTR_IPV6_DST] = { .len = sizeof(struct in6_addr) }, + [OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS] = { .len = OVS_ATTR_VARIABLE }, + [OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE] = { .len = 0 }, +}; + +static const struct ovs_len_tbl +ovs_nsh_key_attr_lens[OVS_NSH_KEY_ATTR_MAX + 1] = { + [OVS_NSH_KEY_ATTR_BASE] = { .len = sizeof(struct ovs_nsh_key_base) }, + [OVS_NSH_KEY_ATTR_MD1] = { .len = sizeof(struct ovs_nsh_key_md1) }, + [OVS_NSH_KEY_ATTR_MD2] = { .len = OVS_ATTR_VARIABLE }, +}; + +/* The size of the argument for each %OVS_KEY_ATTR_* Netlink attribute. */ +static const struct ovs_len_tbl ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = { + [OVS_KEY_ATTR_ENCAP] = { .len = OVS_ATTR_NESTED }, + [OVS_KEY_ATTR_PRIORITY] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_IN_PORT] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_SKB_MARK] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_ETHERNET] = { .len = sizeof(struct ovs_key_ethernet) }, + [OVS_KEY_ATTR_VLAN] = { .len = sizeof(__be16) }, + [OVS_KEY_ATTR_ETHERTYPE] = { .len = sizeof(__be16) }, + [OVS_KEY_ATTR_IPV4] = { .len = sizeof(struct ovs_key_ipv4) }, + [OVS_KEY_ATTR_IPV6] = { .len = sizeof(struct ovs_key_ipv6) }, + [OVS_KEY_ATTR_TCP] = { .len = sizeof(struct ovs_key_tcp) }, + [OVS_KEY_ATTR_TCP_FLAGS] = { .len = sizeof(__be16) }, + [OVS_KEY_ATTR_UDP] = { .len = sizeof(struct ovs_key_udp) }, + [OVS_KEY_ATTR_SCTP] = { .len = sizeof(struct ovs_key_sctp) }, + [OVS_KEY_ATTR_ICMP] = { .len = sizeof(struct ovs_key_icmp) }, + [OVS_KEY_ATTR_ICMPV6] = { .len = sizeof(struct ovs_key_icmpv6) }, + [OVS_KEY_ATTR_ARP] = { .len = sizeof(struct ovs_key_arp) }, + [OVS_KEY_ATTR_ND] = { .len = sizeof(struct ovs_key_nd) }, + [OVS_KEY_ATTR_RECIRC_ID] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_DP_HASH] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_TUNNEL] = { .len = OVS_ATTR_NESTED, + .next = ovs_tunnel_key_lens, }, + [OVS_KEY_ATTR_MPLS] = { .len = OVS_ATTR_VARIABLE }, + [OVS_KEY_ATTR_CT_STATE] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_CT_ZONE] = { .len = sizeof(u16) }, + [OVS_KEY_ATTR_CT_MARK] = { .len = sizeof(u32) }, + [OVS_KEY_ATTR_CT_LABELS] = { .len = sizeof(struct ovs_key_ct_labels) }, + [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4] = { + .len = sizeof(struct ovs_key_ct_tuple_ipv4) }, + [OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6] = { + .len = sizeof(struct ovs_key_ct_tuple_ipv6) }, + [OVS_KEY_ATTR_NSH] = { .len = OVS_ATTR_NESTED, + .next = ovs_nsh_key_attr_lens, }, + [OVS_KEY_ATTR_IPV6_EXTHDRS] = { + .len = sizeof(struct ovs_key_ipv6_exthdrs) }, +}; + +static bool check_attr_len(unsigned int attr_len, unsigned int expected_len) +{ + return expected_len == attr_len || + expected_len == OVS_ATTR_NESTED || + expected_len == OVS_ATTR_VARIABLE; +} + +static bool is_all_zero(const u8 *fp, size_t size) +{ + int i; + + if (!fp) + return false; + + for (i = 0; i < size; i++) + if (fp[i]) + return false; + + return true; +} + +static int __parse_flow_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], + u64 *attrsp, bool log, bool nz) +{ + const struct nlattr *nla; + u64 attrs; + int rem; + + attrs = *attrsp; + nla_for_each_nested(nla, attr, rem) { + u16 type = nla_type(nla); + int expected_len; + + if (type > OVS_KEY_ATTR_MAX) { + OVS_NLERR(log, "Key type %d is out of range max %d", + type, OVS_KEY_ATTR_MAX); + return -EINVAL; + } + + if (type == OVS_KEY_ATTR_PACKET_TYPE || + type == OVS_KEY_ATTR_ND_EXTENSIONS || + type == OVS_KEY_ATTR_TUNNEL_INFO) { + OVS_NLERR(log, "Key type %d is not supported", type); + return -EINVAL; + } + + if (attrs & (1ULL << type)) { + OVS_NLERR(log, "Duplicate key (type %d).", type); + return -EINVAL; + } + + expected_len = ovs_key_lens[type].len; + if (!check_attr_len(nla_len(nla), expected_len)) { + OVS_NLERR(log, "Key %d has unexpected len %d expected %d", + type, nla_len(nla), expected_len); + return -EINVAL; + } + + if (!nz || !is_all_zero(nla_data(nla), nla_len(nla))) { + attrs |= 1ULL << type; + a[type] = nla; + } + } + if (rem) { + OVS_NLERR(log, "Message has %d unknown bytes.", rem); + return -EINVAL; + } + + *attrsp = attrs; + return 0; +} + +static int parse_flow_mask_nlattrs(const struct nlattr *attr, + const struct nlattr *a[], u64 *attrsp, + bool log) +{ + return __parse_flow_nlattrs(attr, a, attrsp, log, true); +} + +int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[], + u64 *attrsp, bool log) +{ + return __parse_flow_nlattrs(attr, a, attrsp, log, false); +} + +static int genev_tun_opt_from_nlattr(const struct nlattr *a, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + unsigned long opt_key_offset; + + if (nla_len(a) > sizeof(match->key->tun_opts)) { + OVS_NLERR(log, "Geneve option length err (len %d, max %zu).", + nla_len(a), sizeof(match->key->tun_opts)); + return -EINVAL; + } + + if (nla_len(a) % 4 != 0) { + OVS_NLERR(log, "Geneve opt len %d is not a multiple of 4.", + nla_len(a)); + return -EINVAL; + } + + /* We need to record the length of the options passed + * down, otherwise packets with the same format but + * additional options will be silently matched. + */ + if (!is_mask) { + SW_FLOW_KEY_PUT(match, tun_opts_len, nla_len(a), + false); + } else { + /* This is somewhat unusual because it looks at + * both the key and mask while parsing the + * attributes (and by extension assumes the key + * is parsed first). Normally, we would verify + * that each is the correct length and that the + * attributes line up in the validate function. + * However, that is difficult because this is + * variable length and we won't have the + * information later. + */ + if (match->key->tun_opts_len != nla_len(a)) { + OVS_NLERR(log, "Geneve option len %d != mask len %d", + match->key->tun_opts_len, nla_len(a)); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); + } + + opt_key_offset = TUN_METADATA_OFFSET(nla_len(a)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a), + nla_len(a), is_mask); + return 0; +} + +static int vxlan_tun_opt_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + struct nlattr *a; + int rem; + unsigned long opt_key_offset; + struct vxlan_metadata opts; + + BUILD_BUG_ON(sizeof(opts) > sizeof(match->key->tun_opts)); + + memset(&opts, 0, sizeof(opts)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + + if (type > OVS_VXLAN_EXT_MAX) { + OVS_NLERR(log, "VXLAN extension %d out of range max %d", + type, OVS_VXLAN_EXT_MAX); + return -EINVAL; + } + + if (!check_attr_len(nla_len(a), + ovs_vxlan_ext_key_lens[type].len)) { + OVS_NLERR(log, "VXLAN extension %d has unexpected len %d expected %d", + type, nla_len(a), + ovs_vxlan_ext_key_lens[type].len); + return -EINVAL; + } + + switch (type) { + case OVS_VXLAN_EXT_GBP: + opts.gbp = nla_get_u32(a); + break; + default: + OVS_NLERR(log, "Unknown VXLAN extension attribute %d", + type); + return -EINVAL; + } + } + if (rem) { + OVS_NLERR(log, "VXLAN extension message has %d unknown bytes.", + rem); + return -EINVAL; + } + + if (!is_mask) + SW_FLOW_KEY_PUT(match, tun_opts_len, sizeof(opts), false); + else + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); + + opt_key_offset = TUN_METADATA_OFFSET(sizeof(opts)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, &opts, sizeof(opts), + is_mask); + return 0; +} + +static int erspan_tun_opt_from_nlattr(const struct nlattr *a, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + unsigned long opt_key_offset; + + BUILD_BUG_ON(sizeof(struct erspan_metadata) > + sizeof(match->key->tun_opts)); + + if (nla_len(a) > sizeof(match->key->tun_opts)) { + OVS_NLERR(log, "ERSPAN option length err (len %d, max %zu).", + nla_len(a), sizeof(match->key->tun_opts)); + return -EINVAL; + } + + if (!is_mask) + SW_FLOW_KEY_PUT(match, tun_opts_len, + sizeof(struct erspan_metadata), false); + else + SW_FLOW_KEY_PUT(match, tun_opts_len, 0xff, true); + + opt_key_offset = TUN_METADATA_OFFSET(nla_len(a)); + SW_FLOW_KEY_MEMCPY_OFFSET(match, opt_key_offset, nla_data(a), + nla_len(a), is_mask); + return 0; +} + +static int ip_tun_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool log) +{ + bool ttl = false, ipv4 = false, ipv6 = false; + IP_TUNNEL_DECLARE_FLAGS(tun_flags) = { }; + bool info_bridge_mode = false; + int opts_type = 0; + struct nlattr *a; + int rem; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + int err; + + if (type > OVS_TUNNEL_KEY_ATTR_MAX) { + OVS_NLERR(log, "Tunnel attr %d out of range max %d", + type, OVS_TUNNEL_KEY_ATTR_MAX); + return -EINVAL; + } + + if (!check_attr_len(nla_len(a), + ovs_tunnel_key_lens[type].len)) { + OVS_NLERR(log, "Tunnel attr %d has unexpected len %d expected %d", + type, nla_len(a), ovs_tunnel_key_lens[type].len); + return -EINVAL; + } + + switch (type) { + case OVS_TUNNEL_KEY_ATTR_ID: + SW_FLOW_KEY_PUT(match, tun_key.tun_id, + nla_get_be64(a), is_mask); + __set_bit(IP_TUNNEL_KEY_BIT, tun_flags); + break; + case OVS_TUNNEL_KEY_ATTR_IPV4_SRC: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.src, + nla_get_in_addr(a), is_mask); + ipv4 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV4_DST: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv4.dst, + nla_get_in_addr(a), is_mask); + ipv4 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV6_SRC: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.src, + nla_get_in6_addr(a), is_mask); + ipv6 = true; + break; + case OVS_TUNNEL_KEY_ATTR_IPV6_DST: + SW_FLOW_KEY_PUT(match, tun_key.u.ipv6.dst, + nla_get_in6_addr(a), is_mask); + ipv6 = true; + break; + case OVS_TUNNEL_KEY_ATTR_TOS: + SW_FLOW_KEY_PUT(match, tun_key.tos, + nla_get_u8(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_TTL: + SW_FLOW_KEY_PUT(match, tun_key.ttl, + nla_get_u8(a), is_mask); + ttl = true; + break; + case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT: + __set_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, tun_flags); + break; + case OVS_TUNNEL_KEY_ATTR_CSUM: + __set_bit(IP_TUNNEL_CSUM_BIT, tun_flags); + break; + case OVS_TUNNEL_KEY_ATTR_TP_SRC: + SW_FLOW_KEY_PUT(match, tun_key.tp_src, + nla_get_be16(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_TP_DST: + SW_FLOW_KEY_PUT(match, tun_key.tp_dst, + nla_get_be16(a), is_mask); + break; + case OVS_TUNNEL_KEY_ATTR_OAM: + __set_bit(IP_TUNNEL_OAM_BIT, tun_flags); + break; + case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + + err = genev_tun_opt_from_nlattr(a, match, is_mask, log); + if (err) + return err; + + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, tun_flags); + opts_type = type; + break; + case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + + err = vxlan_tun_opt_from_nlattr(a, match, is_mask, log); + if (err) + return err; + + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, tun_flags); + opts_type = type; + break; + case OVS_TUNNEL_KEY_ATTR_PAD: + break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + if (opts_type) { + OVS_NLERR(log, "Multiple metadata blocks provided"); + return -EINVAL; + } + + err = erspan_tun_opt_from_nlattr(a, match, is_mask, + log); + if (err) + return err; + + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, tun_flags); + opts_type = type; + break; + case OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE: + info_bridge_mode = true; + ipv4 = true; + break; + default: + OVS_NLERR(log, "Unknown IP tunnel attribute %d", + type); + return -EINVAL; + } + } + + SW_FLOW_KEY_BITMAP_COPY(match, tun_key.tun_flags, tun_flags, + __IP_TUNNEL_FLAG_NUM, is_mask); + if (is_mask) + SW_FLOW_KEY_MEMSET_FIELD(match, tun_proto, 0xff, true); + else + SW_FLOW_KEY_PUT(match, tun_proto, ipv6 ? AF_INET6 : AF_INET, + false); + + if (rem > 0) { + OVS_NLERR(log, "IP tunnel attribute has %d unknown bytes.", + rem); + return -EINVAL; + } + + if (ipv4 && ipv6) { + OVS_NLERR(log, "Mixed IPv4 and IPv6 tunnel attributes"); + return -EINVAL; + } + + if (!is_mask) { + if (!ipv4 && !ipv6) { + OVS_NLERR(log, "IP tunnel dst address not specified"); + return -EINVAL; + } + if (ipv4) { + if (info_bridge_mode) { + __clear_bit(IP_TUNNEL_KEY_BIT, tun_flags); + + if (match->key->tun_key.u.ipv4.src || + match->key->tun_key.u.ipv4.dst || + match->key->tun_key.tp_src || + match->key->tun_key.tp_dst || + match->key->tun_key.ttl || + match->key->tun_key.tos || + !ip_tunnel_flags_empty(tun_flags)) { + OVS_NLERR(log, "IPv4 tun info is not correct"); + return -EINVAL; + } + } else if (!match->key->tun_key.u.ipv4.dst) { + OVS_NLERR(log, "IPv4 tunnel dst address is zero"); + return -EINVAL; + } + } + if (ipv6 && ipv6_addr_any(&match->key->tun_key.u.ipv6.dst)) { + OVS_NLERR(log, "IPv6 tunnel dst address is zero"); + return -EINVAL; + } + + if (!ttl && !info_bridge_mode) { + OVS_NLERR(log, "IP tunnel TTL not specified."); + return -EINVAL; + } + } + + return opts_type; +} + +static int vxlan_opt_to_nlattr(struct sk_buff *skb, + const void *tun_opts, int swkey_tun_opts_len) +{ + const struct vxlan_metadata *opts = tun_opts; + struct nlattr *nla; + + nla = nla_nest_start_noflag(skb, OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS); + if (!nla) + return -EMSGSIZE; + + if (nla_put_u32(skb, OVS_VXLAN_EXT_GBP, opts->gbp) < 0) + return -EMSGSIZE; + + nla_nest_end(skb, nla); + return 0; +} + +static int __ip_tun_to_nlattr(struct sk_buff *skb, + const struct ip_tunnel_key *output, + const void *tun_opts, int swkey_tun_opts_len, + unsigned short tun_proto, u8 mode) +{ + if (test_bit(IP_TUNNEL_KEY_BIT, output->tun_flags) && + nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, output->tun_id, + OVS_TUNNEL_KEY_ATTR_PAD)) + return -EMSGSIZE; + + if (mode & IP_TUNNEL_INFO_BRIDGE) + return nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_IPV4_INFO_BRIDGE) + ? -EMSGSIZE : 0; + + switch (tun_proto) { + case AF_INET: + if (output->u.ipv4.src && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, + output->u.ipv4.src)) + return -EMSGSIZE; + if (output->u.ipv4.dst && + nla_put_in_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, + output->u.ipv4.dst)) + return -EMSGSIZE; + break; + case AF_INET6: + if (!ipv6_addr_any(&output->u.ipv6.src) && + nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_SRC, + &output->u.ipv6.src)) + return -EMSGSIZE; + if (!ipv6_addr_any(&output->u.ipv6.dst) && + nla_put_in6_addr(skb, OVS_TUNNEL_KEY_ATTR_IPV6_DST, + &output->u.ipv6.dst)) + return -EMSGSIZE; + break; + } + if (output->tos && + nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, output->tos)) + return -EMSGSIZE; + if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, output->ttl)) + return -EMSGSIZE; + if (test_bit(IP_TUNNEL_DONT_FRAGMENT_BIT, output->tun_flags) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT)) + return -EMSGSIZE; + if (test_bit(IP_TUNNEL_CSUM_BIT, output->tun_flags) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM)) + return -EMSGSIZE; + if (output->tp_src && + nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_SRC, output->tp_src)) + return -EMSGSIZE; + if (output->tp_dst && + nla_put_be16(skb, OVS_TUNNEL_KEY_ATTR_TP_DST, output->tp_dst)) + return -EMSGSIZE; + if (test_bit(IP_TUNNEL_OAM_BIT, output->tun_flags) && + nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_OAM)) + return -EMSGSIZE; + if (swkey_tun_opts_len) { + if (test_bit(IP_TUNNEL_GENEVE_OPT_BIT, output->tun_flags) && + nla_put(skb, OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS, + swkey_tun_opts_len, tun_opts)) + return -EMSGSIZE; + else if (test_bit(IP_TUNNEL_VXLAN_OPT_BIT, + output->tun_flags) && + vxlan_opt_to_nlattr(skb, tun_opts, swkey_tun_opts_len)) + return -EMSGSIZE; + else if (test_bit(IP_TUNNEL_ERSPAN_OPT_BIT, + output->tun_flags) && + nla_put(skb, OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS, + swkey_tun_opts_len, tun_opts)) + return -EMSGSIZE; + } + + return 0; +} + +static int ip_tun_to_nlattr(struct sk_buff *skb, + const struct ip_tunnel_key *output, + const void *tun_opts, int swkey_tun_opts_len, + unsigned short tun_proto, u8 mode) +{ + struct nlattr *nla; + int err; + + nla = nla_nest_start_noflag(skb, OVS_KEY_ATTR_TUNNEL); + if (!nla) + return -EMSGSIZE; + + err = __ip_tun_to_nlattr(skb, output, tun_opts, swkey_tun_opts_len, + tun_proto, mode); + if (err) + return err; + + nla_nest_end(skb, nla); + return 0; +} + +int ovs_nla_put_tunnel_info(struct sk_buff *skb, + struct ip_tunnel_info *tun_info) +{ + return __ip_tun_to_nlattr(skb, &tun_info->key, + ip_tunnel_info_opts(tun_info), + tun_info->options_len, + ip_tunnel_info_af(tun_info), tun_info->mode); +} + +static int encode_vlan_from_nlattrs(struct sw_flow_match *match, + const struct nlattr *a[], + bool is_mask, bool inner) +{ + __be16 tci = 0; + __be16 tpid = 0; + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (a[OVS_KEY_ATTR_ETHERTYPE]) + tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (likely(!inner)) { + SW_FLOW_KEY_PUT(match, eth.vlan.tpid, tpid, is_mask); + SW_FLOW_KEY_PUT(match, eth.vlan.tci, tci, is_mask); + } else { + SW_FLOW_KEY_PUT(match, eth.cvlan.tpid, tpid, is_mask); + SW_FLOW_KEY_PUT(match, eth.cvlan.tci, tci, is_mask); + } + return 0; +} + +static int validate_vlan_from_nlattrs(const struct sw_flow_match *match, + u64 key_attrs, bool inner, + const struct nlattr **a, bool log) +{ + __be16 tci = 0; + + if (!((key_attrs & (1 << OVS_KEY_ATTR_ETHERNET)) && + (key_attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) && + eth_type_vlan(nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE])))) { + /* Not a VLAN. */ + return 0; + } + + if (!((key_attrs & (1 << OVS_KEY_ATTR_VLAN)) && + (key_attrs & (1 << OVS_KEY_ATTR_ENCAP)))) { + OVS_NLERR(log, "Invalid %s frame", (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (!(tci & htons(VLAN_CFI_MASK))) { + if (tci) { + OVS_NLERR(log, "%s TCI does not have VLAN_CFI_MASK bit set.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } else if (nla_len(a[OVS_KEY_ATTR_ENCAP])) { + /* Corner case for truncated VLAN header. */ + OVS_NLERR(log, "Truncated %s header has non-zero encap attribute.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + } + + return 1; +} + +static int validate_vlan_mask_from_nlattrs(const struct sw_flow_match *match, + u64 key_attrs, bool inner, + const struct nlattr **a, bool log) +{ + __be16 tci = 0; + __be16 tpid = 0; + bool encap_valid = !!(match->key->eth.vlan.tci & + htons(VLAN_CFI_MASK)); + bool i_encap_valid = !!(match->key->eth.cvlan.tci & + htons(VLAN_CFI_MASK)); + + if (!(key_attrs & (1 << OVS_KEY_ATTR_ENCAP))) { + /* Not a VLAN. */ + return 0; + } + + if ((!inner && !encap_valid) || (inner && !i_encap_valid)) { + OVS_NLERR(log, "Encap mask attribute is set for non-%s frame.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + + if (a[OVS_KEY_ATTR_VLAN]) + tci = nla_get_be16(a[OVS_KEY_ATTR_VLAN]); + + if (a[OVS_KEY_ATTR_ETHERTYPE]) + tpid = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + + if (tpid != htons(0xffff)) { + OVS_NLERR(log, "Must have an exact match on %s TPID (mask=%x).", + (inner) ? "C-VLAN" : "VLAN", ntohs(tpid)); + return -EINVAL; + } + if (!(tci & htons(VLAN_CFI_MASK))) { + OVS_NLERR(log, "%s TCI mask does not have exact match for VLAN_CFI_MASK bit.", + (inner) ? "C-VLAN" : "VLAN"); + return -EINVAL; + } + + return 1; +} + +static int __parse_vlan_from_nlattrs(struct sw_flow_match *match, + u64 *key_attrs, bool inner, + const struct nlattr **a, bool is_mask, + bool log) +{ + int err; + const struct nlattr *encap; + + if (!is_mask) + err = validate_vlan_from_nlattrs(match, *key_attrs, inner, + a, log); + else + err = validate_vlan_mask_from_nlattrs(match, *key_attrs, inner, + a, log); + if (err <= 0) + return err; + + err = encode_vlan_from_nlattrs(match, a, is_mask, inner); + if (err) + return err; + + *key_attrs &= ~(1 << OVS_KEY_ATTR_ENCAP); + *key_attrs &= ~(1 << OVS_KEY_ATTR_VLAN); + *key_attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + + encap = a[OVS_KEY_ATTR_ENCAP]; + + if (!is_mask) + err = parse_flow_nlattrs(encap, a, key_attrs, log); + else + err = parse_flow_mask_nlattrs(encap, a, key_attrs, log); + + return err; +} + +static int parse_vlan_from_nlattrs(struct sw_flow_match *match, + u64 *key_attrs, const struct nlattr **a, + bool is_mask, bool log) +{ + int err; + bool encap_valid = false; + + err = __parse_vlan_from_nlattrs(match, key_attrs, false, a, + is_mask, log); + if (err) + return err; + + encap_valid = !!(match->key->eth.vlan.tci & htons(VLAN_CFI_MASK)); + if (encap_valid) { + err = __parse_vlan_from_nlattrs(match, key_attrs, true, a, + is_mask, log); + if (err) + return err; + } + + return 0; +} + +static int parse_eth_type_from_nlattrs(struct sw_flow_match *match, + u64 *attrs, const struct nlattr **a, + bool is_mask, bool log) +{ + __be16 eth_type; + + eth_type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); + if (is_mask) { + /* Always exact match EtherType. */ + eth_type = htons(0xffff); + } else if (!eth_proto_is_802_3(eth_type)) { + OVS_NLERR(log, "EtherType %x is less than min %x", + ntohs(eth_type), ETH_P_802_3_MIN); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, eth.type, eth_type, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); + return 0; +} + +static int metadata_from_nlattrs(struct net *net, struct sw_flow_match *match, + u64 *attrs, const struct nlattr **a, + bool is_mask, bool log) +{ + u8 mac_proto = MAC_PROTO_ETHERNET; + + if (*attrs & (1 << OVS_KEY_ATTR_DP_HASH)) { + u32 hash_val = nla_get_u32(a[OVS_KEY_ATTR_DP_HASH]); + + SW_FLOW_KEY_PUT(match, ovs_flow_hash, hash_val, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_DP_HASH); + } + + if (*attrs & (1 << OVS_KEY_ATTR_RECIRC_ID)) { + u32 recirc_id = nla_get_u32(a[OVS_KEY_ATTR_RECIRC_ID]); + + SW_FLOW_KEY_PUT(match, recirc_id, recirc_id, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_RECIRC_ID); + } + + if (*attrs & (1 << OVS_KEY_ATTR_PRIORITY)) { + SW_FLOW_KEY_PUT(match, phy.priority, + nla_get_u32(a[OVS_KEY_ATTR_PRIORITY]), is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_PRIORITY); + } + + if (*attrs & (1 << OVS_KEY_ATTR_IN_PORT)) { + u32 in_port = nla_get_u32(a[OVS_KEY_ATTR_IN_PORT]); + + if (is_mask) { + in_port = 0xffffffff; /* Always exact match in_port. */ + } else if (in_port >= DP_MAX_PORTS) { + OVS_NLERR(log, "Port %d exceeds max allowable %d", + in_port, DP_MAX_PORTS); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, phy.in_port, in_port, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_IN_PORT); + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, phy.in_port, DP_MAX_PORTS, is_mask); + } + + if (*attrs & (1 << OVS_KEY_ATTR_SKB_MARK)) { + uint32_t mark = nla_get_u32(a[OVS_KEY_ATTR_SKB_MARK]); + + SW_FLOW_KEY_PUT(match, phy.skb_mark, mark, is_mask); + *attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK); + } + if (*attrs & (1 << OVS_KEY_ATTR_TUNNEL)) { + if (ip_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], match, + is_mask, log) < 0) + return -EINVAL; + *attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL); + } + + if (*attrs & (1 << OVS_KEY_ATTR_CT_STATE) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_STATE)) { + u32 ct_state = nla_get_u32(a[OVS_KEY_ATTR_CT_STATE]); + + if (ct_state & ~CT_SUPPORTED_MASK) { + OVS_NLERR(log, "ct_state flags %08x unsupported", + ct_state); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, ct_state, ct_state, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_STATE); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_ZONE) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_ZONE)) { + u16 ct_zone = nla_get_u16(a[OVS_KEY_ATTR_CT_ZONE]); + + SW_FLOW_KEY_PUT(match, ct_zone, ct_zone, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ZONE); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_MARK) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_MARK)) { + u32 mark = nla_get_u32(a[OVS_KEY_ATTR_CT_MARK]); + + SW_FLOW_KEY_PUT(match, ct.mark, mark, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_MARK); + } + if (*attrs & (1 << OVS_KEY_ATTR_CT_LABELS) && + ovs_ct_verify(net, OVS_KEY_ATTR_CT_LABELS)) { + const struct ovs_key_ct_labels *cl; + + cl = nla_data(a[OVS_KEY_ATTR_CT_LABELS]); + SW_FLOW_KEY_MEMCPY(match, ct.labels, cl->ct_labels, + sizeof(*cl), is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_LABELS); + } + if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4)) { + const struct ovs_key_ct_tuple_ipv4 *ct; + + ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4]); + + SW_FLOW_KEY_PUT(match, ipv4.ct_orig.src, ct->ipv4_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.ct_orig.dst, ct->ipv4_dst, is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask); + SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv4_proto, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4); + } + if (*attrs & (1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6)) { + const struct ovs_key_ct_tuple_ipv6 *ct; + + ct = nla_data(a[OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6]); + + SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.src, &ct->ipv6_src, + sizeof(match->key->ipv6.ct_orig.src), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.ct_orig.dst, &ct->ipv6_dst, + sizeof(match->key->ipv6.ct_orig.dst), + is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.src, ct->src_port, is_mask); + SW_FLOW_KEY_PUT(match, ct.orig_tp.dst, ct->dst_port, is_mask); + SW_FLOW_KEY_PUT(match, ct_orig_proto, ct->ipv6_proto, is_mask); + *attrs &= ~(1ULL << OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6); + } + + /* For layer 3 packets the Ethernet type is provided + * and treated as metadata but no MAC addresses are provided. + */ + if (!(*attrs & (1ULL << OVS_KEY_ATTR_ETHERNET)) && + (*attrs & (1ULL << OVS_KEY_ATTR_ETHERTYPE))) + mac_proto = MAC_PROTO_NONE; + + /* Always exact match mac_proto */ + SW_FLOW_KEY_PUT(match, mac_proto, is_mask ? 0xff : mac_proto, is_mask); + + if (mac_proto == MAC_PROTO_NONE) + return parse_eth_type_from_nlattrs(match, attrs, a, is_mask, + log); + + return 0; +} + +/* + * Constructs NSH header 'nh' from attributes of OVS_ACTION_ATTR_PUSH_NSH, + * where 'nh' points to a memory block of 'size' bytes. It's assumed that + * attributes were previously validated with validate_push_nsh(). + */ +int nsh_hdr_from_nlattr(const struct nlattr *attr, + struct nshhdr *nh, size_t size) +{ + struct nlattr *a; + int rem; + u8 flags = 0; + u8 ttl = 0; + int mdlen = 0; + + if (size < NSH_BASE_HDR_LEN) + return -ENOBUFS; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + + switch (type) { + case OVS_NSH_KEY_ATTR_BASE: { + const struct ovs_nsh_key_base *base = nla_data(a); + + flags = base->flags; + ttl = base->ttl; + nh->np = base->np; + nh->mdtype = base->mdtype; + nh->path_hdr = base->path_hdr; + break; + } + case OVS_NSH_KEY_ATTR_MD1: + mdlen = nla_len(a); + if (mdlen > size - NSH_BASE_HDR_LEN) + return -ENOBUFS; + memcpy(&nh->md1, nla_data(a), mdlen); + break; + + case OVS_NSH_KEY_ATTR_MD2: + mdlen = nla_len(a); + if (mdlen > size - NSH_BASE_HDR_LEN) + return -ENOBUFS; + memcpy(&nh->md2, nla_data(a), mdlen); + break; + + default: + return -EINVAL; + } + } + + /* nsh header length = NSH_BASE_HDR_LEN + mdlen */ + nh->ver_flags_ttl_len = 0; + nsh_set_flags_ttl_len(nh, flags, ttl, NSH_BASE_HDR_LEN + mdlen); + + return 0; +} + +static int nsh_key_put_from_nlattr(const struct nlattr *attr, + struct sw_flow_match *match, bool is_mask, + bool is_push_nsh, bool log) +{ + struct nlattr *a; + int rem; + bool has_base = false; + bool has_md1 = false; + bool has_md2 = false; + u8 mdtype = 0; + int mdlen = 0; + + if (WARN_ON(is_push_nsh && is_mask)) + return -EINVAL; + + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + int i; + + if (type > OVS_NSH_KEY_ATTR_MAX) { + OVS_NLERR(log, "nsh attr %d is out of range max %d", + type, OVS_NSH_KEY_ATTR_MAX); + return -EINVAL; + } + + if (!check_attr_len(nla_len(a), + ovs_nsh_key_attr_lens[type].len)) { + OVS_NLERR( + log, + "nsh attr %d has unexpected len %d expected %d", + type, + nla_len(a), + ovs_nsh_key_attr_lens[type].len + ); + return -EINVAL; + } + + switch (type) { + case OVS_NSH_KEY_ATTR_BASE: { + const struct ovs_nsh_key_base *base = nla_data(a); + + has_base = true; + mdtype = base->mdtype; + SW_FLOW_KEY_PUT(match, nsh.base.flags, + base->flags, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.ttl, + base->ttl, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.mdtype, + base->mdtype, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.np, + base->np, is_mask); + SW_FLOW_KEY_PUT(match, nsh.base.path_hdr, + base->path_hdr, is_mask); + break; + } + case OVS_NSH_KEY_ATTR_MD1: { + const struct ovs_nsh_key_md1 *md1 = nla_data(a); + + has_md1 = true; + for (i = 0; i < NSH_MD1_CONTEXT_SIZE; i++) + SW_FLOW_KEY_PUT(match, nsh.context[i], + md1->context[i], is_mask); + break; + } + case OVS_NSH_KEY_ATTR_MD2: + if (!is_push_nsh) /* Not supported MD type 2 yet */ + return -ENOTSUPP; + + has_md2 = true; + mdlen = nla_len(a); + if (mdlen > NSH_CTX_HDRS_MAX_LEN || mdlen <= 0) { + OVS_NLERR( + log, + "Invalid MD length %d for MD type %d", + mdlen, + mdtype + ); + return -EINVAL; + } + break; + default: + OVS_NLERR(log, "Unknown nsh attribute %d", + type); + return -EINVAL; + } + } + + if (rem > 0) { + OVS_NLERR(log, "nsh attribute has %d unknown bytes.", rem); + return -EINVAL; + } + + if (has_md1 && has_md2) { + OVS_NLERR( + 1, + "invalid nsh attribute: md1 and md2 are exclusive." + ); + return -EINVAL; + } + + if (!is_mask) { + if ((has_md1 && mdtype != NSH_M_TYPE1) || + (has_md2 && mdtype != NSH_M_TYPE2)) { + OVS_NLERR(1, "nsh attribute has unmatched MD type %d.", + mdtype); + return -EINVAL; + } + + if (is_push_nsh && + (!has_base || (!has_md1 && !has_md2))) { + OVS_NLERR( + 1, + "push_nsh: missing base or metadata attributes" + ); + return -EINVAL; + } + } + + return 0; +} + +static int ovs_key_from_nlattrs(struct net *net, struct sw_flow_match *match, + u64 attrs, const struct nlattr **a, + bool is_mask, bool log) +{ + int err; + + err = metadata_from_nlattrs(net, match, &attrs, a, is_mask, log); + if (err) + return err; + + if (attrs & (1 << OVS_KEY_ATTR_ETHERNET)) { + const struct ovs_key_ethernet *eth_key; + + eth_key = nla_data(a[OVS_KEY_ATTR_ETHERNET]); + SW_FLOW_KEY_MEMCPY(match, eth.src, + eth_key->eth_src, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, eth.dst, + eth_key->eth_dst, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ETHERNET); + + if (attrs & (1 << OVS_KEY_ATTR_VLAN)) { + /* VLAN attribute is always parsed before getting here since it + * may occur multiple times. + */ + OVS_NLERR(log, "VLAN attribute unexpected."); + return -EINVAL; + } + + if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { + err = parse_eth_type_from_nlattrs(match, &attrs, a, is_mask, + log); + if (err) + return err; + } else if (!is_mask) { + SW_FLOW_KEY_PUT(match, eth.type, htons(ETH_P_802_2), is_mask); + } + } else if (!match->key->eth.type) { + OVS_NLERR(log, "Either Ethernet header or EtherType is required."); + return -EINVAL; + } + + if (attrs & (1 << OVS_KEY_ATTR_IPV4)) { + const struct ovs_key_ipv4 *ipv4_key; + + ipv4_key = nla_data(a[OVS_KEY_ATTR_IPV4]); + if (!is_mask && ipv4_key->ipv4_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR(log, "IPv4 frag type %d is out of range max %d", + ipv4_key->ipv4_frag, OVS_FRAG_TYPE_MAX); + return -EINVAL; + } + SW_FLOW_KEY_PUT(match, ip.proto, + ipv4_key->ipv4_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv4_key->ipv4_tos, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv4_key->ipv4_ttl, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv4_key->ipv4_frag, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + ipv4_key->ipv4_src, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + ipv4_key->ipv4_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_IPV4); + } + + if (attrs & (1 << OVS_KEY_ATTR_IPV6)) { + const struct ovs_key_ipv6 *ipv6_key; + + ipv6_key = nla_data(a[OVS_KEY_ATTR_IPV6]); + if (!is_mask && ipv6_key->ipv6_frag > OVS_FRAG_TYPE_MAX) { + OVS_NLERR(log, "IPv6 frag type %d is out of range max %d", + ipv6_key->ipv6_frag, OVS_FRAG_TYPE_MAX); + return -EINVAL; + } + + if (!is_mask && ipv6_key->ipv6_label & htonl(0xFFF00000)) { + OVS_NLERR(log, "IPv6 flow label %x is out of range (max=%x)", + ntohl(ipv6_key->ipv6_label), (1 << 20) - 1); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, ipv6.label, + ipv6_key->ipv6_label, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ipv6_key->ipv6_proto, is_mask); + SW_FLOW_KEY_PUT(match, ip.tos, + ipv6_key->ipv6_tclass, is_mask); + SW_FLOW_KEY_PUT(match, ip.ttl, + ipv6_key->ipv6_hlimit, is_mask); + SW_FLOW_KEY_PUT(match, ip.frag, + ipv6_key->ipv6_frag, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.addr.src, + ipv6_key->ipv6_src, + sizeof(match->key->ipv6.addr.src), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.addr.dst, + ipv6_key->ipv6_dst, + sizeof(match->key->ipv6.addr.dst), + is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_IPV6); + } + + if (attrs & (1ULL << OVS_KEY_ATTR_IPV6_EXTHDRS)) { + const struct ovs_key_ipv6_exthdrs *ipv6_exthdrs_key; + + ipv6_exthdrs_key = nla_data(a[OVS_KEY_ATTR_IPV6_EXTHDRS]); + + SW_FLOW_KEY_PUT(match, ipv6.exthdrs, + ipv6_exthdrs_key->hdrs, is_mask); + + attrs &= ~(1ULL << OVS_KEY_ATTR_IPV6_EXTHDRS); + } + + if (attrs & (1 << OVS_KEY_ATTR_ARP)) { + const struct ovs_key_arp *arp_key; + + arp_key = nla_data(a[OVS_KEY_ATTR_ARP]); + if (!is_mask && (arp_key->arp_op & htons(0xff00))) { + OVS_NLERR(log, "Unknown ARP opcode (opcode=%d).", + arp_key->arp_op); + return -EINVAL; + } + + SW_FLOW_KEY_PUT(match, ipv4.addr.src, + arp_key->arp_sip, is_mask); + SW_FLOW_KEY_PUT(match, ipv4.addr.dst, + arp_key->arp_tip, is_mask); + SW_FLOW_KEY_PUT(match, ip.proto, + ntohs(arp_key->arp_op), is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.sha, + arp_key->arp_sha, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv4.arp.tha, + arp_key->arp_tha, ETH_ALEN, is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_ARP); + } + + if (attrs & (1 << OVS_KEY_ATTR_NSH)) { + if (nsh_key_put_from_nlattr(a[OVS_KEY_ATTR_NSH], match, + is_mask, false, log) < 0) + return -EINVAL; + attrs &= ~(1 << OVS_KEY_ATTR_NSH); + } + + if (attrs & (1 << OVS_KEY_ATTR_MPLS)) { + const struct ovs_key_mpls *mpls_key; + u32 hdr_len; + u32 label_count, label_count_mask, i; + + mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]); + hdr_len = nla_len(a[OVS_KEY_ATTR_MPLS]); + label_count = hdr_len / sizeof(struct ovs_key_mpls); + + if (label_count == 0 || label_count > MPLS_LABEL_DEPTH || + hdr_len % sizeof(struct ovs_key_mpls)) + return -EINVAL; + + label_count_mask = GENMASK(label_count - 1, 0); + + for (i = 0 ; i < label_count; i++) + SW_FLOW_KEY_PUT(match, mpls.lse[i], + mpls_key[i].mpls_lse, is_mask); + + SW_FLOW_KEY_PUT(match, mpls.num_labels_mask, + label_count_mask, is_mask); + + attrs &= ~(1 << OVS_KEY_ATTR_MPLS); + } + + if (attrs & (1 << OVS_KEY_ATTR_TCP)) { + const struct ovs_key_tcp *tcp_key; + + tcp_key = nla_data(a[OVS_KEY_ATTR_TCP]); + SW_FLOW_KEY_PUT(match, tp.src, tcp_key->tcp_src, is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, tcp_key->tcp_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_TCP); + } + + if (attrs & (1 << OVS_KEY_ATTR_TCP_FLAGS)) { + SW_FLOW_KEY_PUT(match, tp.flags, + nla_get_be16(a[OVS_KEY_ATTR_TCP_FLAGS]), + is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_TCP_FLAGS); + } + + if (attrs & (1 << OVS_KEY_ATTR_UDP)) { + const struct ovs_key_udp *udp_key; + + udp_key = nla_data(a[OVS_KEY_ATTR_UDP]); + SW_FLOW_KEY_PUT(match, tp.src, udp_key->udp_src, is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, udp_key->udp_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_UDP); + } + + if (attrs & (1 << OVS_KEY_ATTR_SCTP)) { + const struct ovs_key_sctp *sctp_key; + + sctp_key = nla_data(a[OVS_KEY_ATTR_SCTP]); + SW_FLOW_KEY_PUT(match, tp.src, sctp_key->sctp_src, is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, sctp_key->sctp_dst, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_SCTP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMP)) { + const struct ovs_key_icmp *icmp_key; + + icmp_key = nla_data(a[OVS_KEY_ATTR_ICMP]); + SW_FLOW_KEY_PUT(match, tp.src, + htons(icmp_key->icmp_type), is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, + htons(icmp_key->icmp_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMP); + } + + if (attrs & (1 << OVS_KEY_ATTR_ICMPV6)) { + const struct ovs_key_icmpv6 *icmpv6_key; + + icmpv6_key = nla_data(a[OVS_KEY_ATTR_ICMPV6]); + SW_FLOW_KEY_PUT(match, tp.src, + htons(icmpv6_key->icmpv6_type), is_mask); + SW_FLOW_KEY_PUT(match, tp.dst, + htons(icmpv6_key->icmpv6_code), is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ICMPV6); + } + + if (attrs & (1 << OVS_KEY_ATTR_ND)) { + const struct ovs_key_nd *nd_key; + + nd_key = nla_data(a[OVS_KEY_ATTR_ND]); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.target, + nd_key->nd_target, + sizeof(match->key->ipv6.nd.target), + is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.sll, + nd_key->nd_sll, ETH_ALEN, is_mask); + SW_FLOW_KEY_MEMCPY(match, ipv6.nd.tll, + nd_key->nd_tll, ETH_ALEN, is_mask); + attrs &= ~(1 << OVS_KEY_ATTR_ND); + } + + if (attrs != 0) { + OVS_NLERR(log, "Unknown key attributes %llx", + (unsigned long long)attrs); + return -EINVAL; + } + + return 0; +} + +static void nlattr_set(struct nlattr *attr, u8 val, + const struct ovs_len_tbl *tbl) +{ + struct nlattr *nla; + int rem; + + /* The nlattr stream should already have been validated */ + nla_for_each_nested(nla, attr, rem) { + if (tbl[nla_type(nla)].len == OVS_ATTR_NESTED) + nlattr_set(nla, val, tbl[nla_type(nla)].next ? : tbl); + else + memset(nla_data(nla), val, nla_len(nla)); + + if (nla_type(nla) == OVS_KEY_ATTR_CT_STATE) + *(u32 *)nla_data(nla) &= CT_SUPPORTED_MASK; + } +} + +static void mask_set_nlattr(struct nlattr *attr, u8 val) +{ + nlattr_set(attr, val, ovs_key_lens); +} + +/** + * ovs_nla_get_match - parses Netlink attributes into a flow key and + * mask. In case the 'mask' is NULL, the flow is treated as exact match + * flow. Otherwise, it is treated as a wildcarded flow, except the mask + * does not include any don't care bit. + * @net: Used to determine per-namespace field support. + * @match: receives the extracted flow match information. + * @nla_key: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute + * sequence. The fields should of the packet that triggered the creation + * of this flow. + * @nla_mask: Optional. Netlink attribute holding nested %OVS_KEY_ATTR_* + * Netlink attribute specifies the mask field of the wildcarded flow. + * @log: Boolean to allow kernel error logging. Normally true, but when + * probing for feature compatibility this should be passed in as false to + * suppress unnecessary error logging. + */ +int ovs_nla_get_match(struct net *net, struct sw_flow_match *match, + const struct nlattr *nla_key, + const struct nlattr *nla_mask, + bool log) +{ + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1]; + struct nlattr *newmask = NULL; + u64 key_attrs = 0; + u64 mask_attrs = 0; + int err; + + err = parse_flow_nlattrs(nla_key, a, &key_attrs, log); + if (err) + return err; + + err = parse_vlan_from_nlattrs(match, &key_attrs, a, false, log); + if (err) + return err; + + err = ovs_key_from_nlattrs(net, match, key_attrs, a, false, log); + if (err) + return err; + + if (match->mask) { + if (!nla_mask) { + /* Create an exact match mask. We need to set to 0xff + * all the 'match->mask' fields that have been touched + * in 'match->key'. We cannot simply memset + * 'match->mask', because padding bytes and fields not + * specified in 'match->key' should be left to 0. + * Instead, we use a stream of netlink attributes, + * copied from 'key' and set to 0xff. + * ovs_key_from_nlattrs() will take care of filling + * 'match->mask' appropriately. + */ + newmask = kmemdup(nla_key, + nla_total_size(nla_len(nla_key)), + GFP_KERNEL); + if (!newmask) + return -ENOMEM; + + mask_set_nlattr(newmask, 0xff); + + /* The userspace does not send tunnel attributes that + * are 0, but we should not wildcard them nonetheless. + */ + if (match->key->tun_proto) + SW_FLOW_KEY_MEMSET_FIELD(match, tun_key, + 0xff, true); + + nla_mask = newmask; + } + + err = parse_flow_mask_nlattrs(nla_mask, a, &mask_attrs, log); + if (err) + goto free_newmask; + + /* Always match on tci. */ + SW_FLOW_KEY_PUT(match, eth.vlan.tci, htons(0xffff), true); + SW_FLOW_KEY_PUT(match, eth.cvlan.tci, htons(0xffff), true); + + err = parse_vlan_from_nlattrs(match, &mask_attrs, a, true, log); + if (err) + goto free_newmask; + + err = ovs_key_from_nlattrs(net, match, mask_attrs, a, true, + log); + if (err) + goto free_newmask; + } + + if (!match_validate(match, key_attrs, mask_attrs, log)) + err = -EINVAL; + +free_newmask: + kfree(newmask); + return err; +} + +static size_t get_ufid_len(const struct nlattr *attr, bool log) +{ + size_t len; + + if (!attr) + return 0; + + len = nla_len(attr); + if (len < 1 || len > MAX_UFID_LENGTH) { + OVS_NLERR(log, "ufid size %u bytes exceeds the range (1, %d)", + nla_len(attr), MAX_UFID_LENGTH); + return 0; + } + + return len; +} + +/* Initializes 'flow->ufid', returning true if 'attr' contains a valid UFID, + * or false otherwise. + */ +bool ovs_nla_get_ufid(struct sw_flow_id *sfid, const struct nlattr *attr, + bool log) +{ + sfid->ufid_len = get_ufid_len(attr, log); + if (sfid->ufid_len) + memcpy(sfid->ufid, nla_data(attr), sfid->ufid_len); + + return sfid->ufid_len; +} + +int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, + const struct sw_flow_key *key, bool log) +{ + struct sw_flow_key *new_key; + + if (ovs_nla_get_ufid(sfid, ufid, log)) + return 0; + + /* If UFID was not provided, use unmasked key. */ + new_key = kmalloc(sizeof(*new_key), GFP_KERNEL); + if (!new_key) + return -ENOMEM; + memcpy(new_key, key, sizeof(*key)); + sfid->unmasked_key = new_key; + + return 0; +} + +u32 ovs_nla_get_ufid_flags(const struct nlattr *attr) +{ + return nla_get_u32_default(attr, 0); +} + +/** + * ovs_nla_get_flow_metadata - parses Netlink attributes into a flow key. + * @net: Network namespace. + * @key: Receives extracted in_port, priority, tun_key, skb_mark and conntrack + * metadata. + * @a: Array of netlink attributes holding parsed %OVS_KEY_ATTR_* Netlink + * attributes. + * @attrs: Bit mask for the netlink attributes included in @a. + * @log: Boolean to allow kernel error logging. Normally true, but when + * probing for feature compatibility this should be passed in as false to + * suppress unnecessary error logging. + * + * This parses a series of Netlink attributes that form a flow key, which must + * take the same form accepted by flow_from_nlattrs(), but only enough of it to + * get the metadata, that is, the parts of the flow key that cannot be + * extracted from the packet itself. + * + * This must be called before the packet key fields are filled in 'key'. + */ + +int ovs_nla_get_flow_metadata(struct net *net, + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1], + u64 attrs, struct sw_flow_key *key, bool log) +{ + struct sw_flow_match match; + + memset(&match, 0, sizeof(match)); + match.key = key; + + key->ct_state = 0; + key->ct_zone = 0; + key->ct_orig_proto = 0; + memset(&key->ct, 0, sizeof(key->ct)); + memset(&key->ipv4.ct_orig, 0, sizeof(key->ipv4.ct_orig)); + memset(&key->ipv6.ct_orig, 0, sizeof(key->ipv6.ct_orig)); + + key->phy.in_port = DP_MAX_PORTS; + + return metadata_from_nlattrs(net, &match, &attrs, a, false, log); +} + +static int ovs_nla_put_vlan(struct sk_buff *skb, const struct vlan_head *vh, + bool is_mask) +{ + __be16 eth_type = !is_mask ? vh->tpid : htons(0xffff); + + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, eth_type) || + nla_put_be16(skb, OVS_KEY_ATTR_VLAN, vh->tci)) + return -EMSGSIZE; + return 0; +} + +static int nsh_key_to_nlattr(const struct ovs_key_nsh *nsh, bool is_mask, + struct sk_buff *skb) +{ + struct nlattr *start; + + start = nla_nest_start_noflag(skb, OVS_KEY_ATTR_NSH); + if (!start) + return -EMSGSIZE; + + if (nla_put(skb, OVS_NSH_KEY_ATTR_BASE, sizeof(nsh->base), &nsh->base)) + goto nla_put_failure; + + if (is_mask || nsh->base.mdtype == NSH_M_TYPE1) { + if (nla_put(skb, OVS_NSH_KEY_ATTR_MD1, + sizeof(nsh->context), nsh->context)) + goto nla_put_failure; + } + + /* Don't support MD type 2 yet */ + + nla_nest_end(skb, start); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +static int __ovs_nla_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, bool is_mask, + struct sk_buff *skb) +{ + struct ovs_key_ethernet *eth_key; + struct nlattr *nla; + struct nlattr *encap = NULL; + struct nlattr *in_encap = NULL; + + if (nla_put_u32(skb, OVS_KEY_ATTR_RECIRC_ID, output->recirc_id)) + goto nla_put_failure; + + if (nla_put_u32(skb, OVS_KEY_ATTR_DP_HASH, output->ovs_flow_hash)) + goto nla_put_failure; + + if (nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, output->phy.priority)) + goto nla_put_failure; + + if ((swkey->tun_proto || is_mask)) { + const void *opts = NULL; + + if (ip_tunnel_is_options_present(output->tun_key.tun_flags)) + opts = TUN_METADATA_OPTS(output, swkey->tun_opts_len); + + if (ip_tun_to_nlattr(skb, &output->tun_key, opts, + swkey->tun_opts_len, swkey->tun_proto, 0)) + goto nla_put_failure; + } + + if (swkey->phy.in_port == DP_MAX_PORTS) { + if (is_mask && (output->phy.in_port == 0xffff)) + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, 0xffffffff)) + goto nla_put_failure; + } else { + u16 upper_u16; + upper_u16 = !is_mask ? 0 : 0xffff; + + if (nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, + (upper_u16 << 16) | output->phy.in_port)) + goto nla_put_failure; + } + + if (nla_put_u32(skb, OVS_KEY_ATTR_SKB_MARK, output->phy.skb_mark)) + goto nla_put_failure; + + if (ovs_ct_put_key(swkey, output, skb)) + goto nla_put_failure; + + if (ovs_key_mac_proto(swkey) == MAC_PROTO_ETHERNET) { + nla = nla_reserve(skb, OVS_KEY_ATTR_ETHERNET, sizeof(*eth_key)); + if (!nla) + goto nla_put_failure; + + eth_key = nla_data(nla); + ether_addr_copy(eth_key->eth_src, output->eth.src); + ether_addr_copy(eth_key->eth_dst, output->eth.dst); + + if (swkey->eth.vlan.tci || eth_type_vlan(swkey->eth.type)) { + if (ovs_nla_put_vlan(skb, &output->eth.vlan, is_mask)) + goto nla_put_failure; + encap = nla_nest_start_noflag(skb, OVS_KEY_ATTR_ENCAP); + if (!swkey->eth.vlan.tci) + goto unencap; + + if (swkey->eth.cvlan.tci || eth_type_vlan(swkey->eth.type)) { + if (ovs_nla_put_vlan(skb, &output->eth.cvlan, is_mask)) + goto nla_put_failure; + in_encap = nla_nest_start_noflag(skb, + OVS_KEY_ATTR_ENCAP); + if (!swkey->eth.cvlan.tci) + goto unencap; + } + } + + if (swkey->eth.type == htons(ETH_P_802_2)) { + /* + * Ethertype 802.2 is represented in the netlink with omitted + * OVS_KEY_ATTR_ETHERTYPE in the flow key attribute, and + * 0xffff in the mask attribute. Ethertype can also + * be wildcarded. + */ + if (is_mask && output->eth.type) + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, + output->eth.type)) + goto nla_put_failure; + goto unencap; + } + } + + if (nla_put_be16(skb, OVS_KEY_ATTR_ETHERTYPE, output->eth.type)) + goto nla_put_failure; + + if (eth_type_vlan(swkey->eth.type)) { + /* There are 3 VLAN tags, we don't know anything about the rest + * of the packet, so truncate here. + */ + WARN_ON_ONCE(!(encap && in_encap)); + goto unencap; + } + + if (swkey->eth.type == htons(ETH_P_IP)) { + struct ovs_key_ipv4 *ipv4_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV4, sizeof(*ipv4_key)); + if (!nla) + goto nla_put_failure; + ipv4_key = nla_data(nla); + ipv4_key->ipv4_src = output->ipv4.addr.src; + ipv4_key->ipv4_dst = output->ipv4.addr.dst; + ipv4_key->ipv4_proto = output->ip.proto; + ipv4_key->ipv4_tos = output->ip.tos; + ipv4_key->ipv4_ttl = output->ip.ttl; + ipv4_key->ipv4_frag = output->ip.frag; + } else if (swkey->eth.type == htons(ETH_P_IPV6)) { + struct ovs_key_ipv6 *ipv6_key; + struct ovs_key_ipv6_exthdrs *ipv6_exthdrs_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6, sizeof(*ipv6_key)); + if (!nla) + goto nla_put_failure; + ipv6_key = nla_data(nla); + memcpy(ipv6_key->ipv6_src, &output->ipv6.addr.src, + sizeof(ipv6_key->ipv6_src)); + memcpy(ipv6_key->ipv6_dst, &output->ipv6.addr.dst, + sizeof(ipv6_key->ipv6_dst)); + ipv6_key->ipv6_label = output->ipv6.label; + ipv6_key->ipv6_proto = output->ip.proto; + ipv6_key->ipv6_tclass = output->ip.tos; + ipv6_key->ipv6_hlimit = output->ip.ttl; + ipv6_key->ipv6_frag = output->ip.frag; + + nla = nla_reserve(skb, OVS_KEY_ATTR_IPV6_EXTHDRS, + sizeof(*ipv6_exthdrs_key)); + if (!nla) + goto nla_put_failure; + ipv6_exthdrs_key = nla_data(nla); + ipv6_exthdrs_key->hdrs = output->ipv6.exthdrs; + } else if (swkey->eth.type == htons(ETH_P_NSH)) { + if (nsh_key_to_nlattr(&output->nsh, is_mask, skb)) + goto nla_put_failure; + } else if (swkey->eth.type == htons(ETH_P_ARP) || + swkey->eth.type == htons(ETH_P_RARP)) { + struct ovs_key_arp *arp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ARP, sizeof(*arp_key)); + if (!nla) + goto nla_put_failure; + arp_key = nla_data(nla); + memset(arp_key, 0, sizeof(struct ovs_key_arp)); + arp_key->arp_sip = output->ipv4.addr.src; + arp_key->arp_tip = output->ipv4.addr.dst; + arp_key->arp_op = htons(output->ip.proto); + ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha); + ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha); + } else if (eth_p_mpls(swkey->eth.type)) { + u8 i, num_labels; + struct ovs_key_mpls *mpls_key; + + num_labels = hweight_long(output->mpls.num_labels_mask); + nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, + num_labels * sizeof(*mpls_key)); + if (!nla) + goto nla_put_failure; + + mpls_key = nla_data(nla); + for (i = 0; i < num_labels; i++) + mpls_key[i].mpls_lse = output->mpls.lse[i]; + } + + if ((swkey->eth.type == htons(ETH_P_IP) || + swkey->eth.type == htons(ETH_P_IPV6)) && + swkey->ip.frag != OVS_FRAG_TYPE_LATER) { + + if (swkey->ip.proto == IPPROTO_TCP) { + struct ovs_key_tcp *tcp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_TCP, sizeof(*tcp_key)); + if (!nla) + goto nla_put_failure; + tcp_key = nla_data(nla); + tcp_key->tcp_src = output->tp.src; + tcp_key->tcp_dst = output->tp.dst; + if (nla_put_be16(skb, OVS_KEY_ATTR_TCP_FLAGS, + output->tp.flags)) + goto nla_put_failure; + } else if (swkey->ip.proto == IPPROTO_UDP) { + struct ovs_key_udp *udp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_UDP, sizeof(*udp_key)); + if (!nla) + goto nla_put_failure; + udp_key = nla_data(nla); + udp_key->udp_src = output->tp.src; + udp_key->udp_dst = output->tp.dst; + } else if (swkey->ip.proto == IPPROTO_SCTP) { + struct ovs_key_sctp *sctp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_SCTP, sizeof(*sctp_key)); + if (!nla) + goto nla_put_failure; + sctp_key = nla_data(nla); + sctp_key->sctp_src = output->tp.src; + sctp_key->sctp_dst = output->tp.dst; + } else if (swkey->eth.type == htons(ETH_P_IP) && + swkey->ip.proto == IPPROTO_ICMP) { + struct ovs_key_icmp *icmp_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ICMP, sizeof(*icmp_key)); + if (!nla) + goto nla_put_failure; + icmp_key = nla_data(nla); + icmp_key->icmp_type = ntohs(output->tp.src); + icmp_key->icmp_code = ntohs(output->tp.dst); + } else if (swkey->eth.type == htons(ETH_P_IPV6) && + swkey->ip.proto == IPPROTO_ICMPV6) { + struct ovs_key_icmpv6 *icmpv6_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ICMPV6, + sizeof(*icmpv6_key)); + if (!nla) + goto nla_put_failure; + icmpv6_key = nla_data(nla); + icmpv6_key->icmpv6_type = ntohs(output->tp.src); + icmpv6_key->icmpv6_code = ntohs(output->tp.dst); + + if (swkey->tp.src == htons(NDISC_NEIGHBOUR_SOLICITATION) || + swkey->tp.src == htons(NDISC_NEIGHBOUR_ADVERTISEMENT)) { + struct ovs_key_nd *nd_key; + + nla = nla_reserve(skb, OVS_KEY_ATTR_ND, sizeof(*nd_key)); + if (!nla) + goto nla_put_failure; + nd_key = nla_data(nla); + memcpy(nd_key->nd_target, &output->ipv6.nd.target, + sizeof(nd_key->nd_target)); + ether_addr_copy(nd_key->nd_sll, output->ipv6.nd.sll); + ether_addr_copy(nd_key->nd_tll, output->ipv6.nd.tll); + } + } + } + +unencap: + if (in_encap) + nla_nest_end(skb, in_encap); + if (encap) + nla_nest_end(skb, encap); + + return 0; + +nla_put_failure: + return -EMSGSIZE; +} + +int ovs_nla_put_key(const struct sw_flow_key *swkey, + const struct sw_flow_key *output, int attr, bool is_mask, + struct sk_buff *skb) +{ + int err; + struct nlattr *nla; + + nla = nla_nest_start_noflag(skb, attr); + if (!nla) + return -EMSGSIZE; + err = __ovs_nla_put_key(swkey, output, is_mask, skb); + if (err) + return err; + nla_nest_end(skb, nla); + + return 0; +} + +/* Called with ovs_mutex or RCU read lock. */ +int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb) +{ + if (ovs_identifier_is_ufid(&flow->id)) + return nla_put(skb, OVS_FLOW_ATTR_UFID, flow->id.ufid_len, + flow->id.ufid); + + return ovs_nla_put_key(flow->id.unmasked_key, flow->id.unmasked_key, + OVS_FLOW_ATTR_KEY, false, skb); +} + +/* Called with ovs_mutex or RCU read lock. */ +int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb) +{ + return ovs_nla_put_key(&flow->key, &flow->key, + OVS_FLOW_ATTR_KEY, false, skb); +} + +/* Called with ovs_mutex or RCU read lock. */ +int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb) +{ + return ovs_nla_put_key(&flow->key, &flow->mask->key, + OVS_FLOW_ATTR_MASK, true, skb); +} + +static struct sw_flow_actions *nla_alloc_flow_actions(int size) +{ + struct sw_flow_actions *sfa; + + sfa = kmalloc(kmalloc_size_roundup(sizeof(*sfa) + size), GFP_KERNEL); + if (!sfa) + return ERR_PTR(-ENOMEM); + + sfa->actions_len = 0; + return sfa; +} + +static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len); + +static void ovs_nla_free_check_pkt_len_action(const struct nlattr *action) +{ + const struct nlattr *a; + int rem; + + nla_for_each_nested(a, action, rem) { + switch (nla_type(a)) { + case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL: + case OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER: + ovs_nla_free_nested_actions(nla_data(a), nla_len(a)); + break; + } + } +} + +static void ovs_nla_free_clone_action(const struct nlattr *action) +{ + const struct nlattr *a = nla_data(action); + int rem = nla_len(action); + + switch (nla_type(a)) { + case OVS_CLONE_ATTR_EXEC: + /* The real list of actions follows this attribute. */ + a = nla_next(a, &rem); + ovs_nla_free_nested_actions(a, rem); + break; + } +} + +static void ovs_nla_free_dec_ttl_action(const struct nlattr *action) +{ + const struct nlattr *a = nla_data(action); + + switch (nla_type(a)) { + case OVS_DEC_TTL_ATTR_ACTION: + ovs_nla_free_nested_actions(nla_data(a), nla_len(a)); + break; + } +} + +static void ovs_nla_free_sample_action(const struct nlattr *action) +{ + const struct nlattr *a = nla_data(action); + int rem = nla_len(action); + + switch (nla_type(a)) { + case OVS_SAMPLE_ATTR_ARG: + /* The real list of actions follows this attribute. */ + a = nla_next(a, &rem); + ovs_nla_free_nested_actions(a, rem); + break; + } +} + +static void ovs_nla_free_set_action(const struct nlattr *a) +{ + const struct nlattr *ovs_key = nla_data(a); + struct ovs_tunnel_info *ovs_tun; + + switch (nla_type(ovs_key)) { + case OVS_KEY_ATTR_TUNNEL_INFO: + ovs_tun = nla_data(ovs_key); + dst_release((struct dst_entry *)ovs_tun->tun_dst); + break; + } +} + +static void ovs_nla_free_nested_actions(const struct nlattr *actions, int len) +{ + const struct nlattr *a; + int rem; + + /* Whenever new actions are added, the need to update this + * function should be considered. + */ + BUILD_BUG_ON(OVS_ACTION_ATTR_MAX != 25); + + if (!actions) + return; + + nla_for_each_attr(a, actions, len, rem) { + switch (nla_type(a)) { + case OVS_ACTION_ATTR_CHECK_PKT_LEN: + ovs_nla_free_check_pkt_len_action(a); + break; + + case OVS_ACTION_ATTR_CLONE: + ovs_nla_free_clone_action(a); + break; + + case OVS_ACTION_ATTR_CT: + ovs_ct_free_action(a); + break; + + case OVS_ACTION_ATTR_DEC_TTL: + ovs_nla_free_dec_ttl_action(a); + break; + + case OVS_ACTION_ATTR_SAMPLE: + ovs_nla_free_sample_action(a); + break; + + case OVS_ACTION_ATTR_SET: + ovs_nla_free_set_action(a); + break; + } + } +} + +void ovs_nla_free_flow_actions(struct sw_flow_actions *sf_acts) +{ + if (!sf_acts) + return; + + ovs_nla_free_nested_actions(sf_acts->actions, sf_acts->actions_len); + kfree(sf_acts); +} + +static void __ovs_nla_free_flow_actions(struct rcu_head *head) +{ + ovs_nla_free_flow_actions(container_of(head, struct sw_flow_actions, rcu)); +} + +/* Schedules 'sf_acts' to be freed after the next RCU grace period. + * The caller must hold rcu_read_lock for this to be sensible. */ +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *sf_acts) +{ + call_rcu(&sf_acts->rcu, __ovs_nla_free_flow_actions); +} + +static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, + int attr_len, bool log) +{ + + struct sw_flow_actions *acts; + int new_acts_size; + size_t req_size = NLA_ALIGN(attr_len); + int next_offset = offsetof(struct sw_flow_actions, actions) + + (*sfa)->actions_len; + + if (req_size <= (ksize(*sfa) - next_offset)) + goto out; + + new_acts_size = max(next_offset + req_size, ksize(*sfa) * 2); + + acts = nla_alloc_flow_actions(new_acts_size); + if (IS_ERR(acts)) + return ERR_CAST(acts); + + memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len); + acts->actions_len = (*sfa)->actions_len; + acts->orig_len = (*sfa)->orig_len; + kfree(*sfa); + *sfa = acts; + +out: + (*sfa)->actions_len += req_size; + return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset); +} + +static struct nlattr *__add_action(struct sw_flow_actions **sfa, + int attrtype, void *data, int len, bool log) +{ + struct nlattr *a; + + a = reserve_sfa_size(sfa, nla_attr_size(len), log); + if (IS_ERR(a)) + return a; + + a->nla_type = attrtype; + a->nla_len = nla_attr_size(len); + + if (data) + memcpy(nla_data(a), data, len); + memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len)); + + return a; +} + +int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, void *data, + int len, bool log) +{ + struct nlattr *a; + + a = __add_action(sfa, attrtype, data, len, log); + + return PTR_ERR_OR_ZERO(a); +} + +static inline int add_nested_action_start(struct sw_flow_actions **sfa, + int attrtype, bool log) +{ + int used = (*sfa)->actions_len; + int err; + + err = ovs_nla_add_action(sfa, attrtype, NULL, 0, log); + if (err) + return err; + + return used; +} + +static inline void add_nested_action_end(struct sw_flow_actions *sfa, + int st_offset) +{ + struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + + st_offset); + + a->nla_len = sfa->actions_len - st_offset; +} + +static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log, + u32 depth); + +static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log, bool last, + u32 depth) +{ + const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; + const struct nlattr *probability, *actions; + const struct nlattr *a; + int rem, start, err; + struct sample_arg arg; + + memset(attrs, 0, sizeof(attrs)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + if (!type || type > OVS_SAMPLE_ATTR_MAX || attrs[type]) + return -EINVAL; + attrs[type] = a; + } + if (rem) + return -EINVAL; + + probability = attrs[OVS_SAMPLE_ATTR_PROBABILITY]; + if (!probability || nla_len(probability) != sizeof(u32)) + return -EINVAL; + + actions = attrs[OVS_SAMPLE_ATTR_ACTIONS]; + if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) + return -EINVAL; + + /* validation done, copy sample action. */ + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log); + if (start < 0) + return start; + + /* When both skb and flow may be changed, put the sample + * into a deferred fifo. On the other hand, if only skb + * may be modified, the actions can be executed in place. + * + * Do this analysis at the flow installation time. + * Set 'clone_action->exec' to true if the actions can be + * executed without being deferred. + * + * If the sample is the last action, it can always be excuted + * rather than deferred. + */ + arg.exec = last || !actions_may_change_flow(actions); + arg.probability = nla_get_u32(probability); + + err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_ARG, &arg, sizeof(arg), + log); + if (err) + return err; + + err = __ovs_nla_copy_actions(net, actions, key, sfa, + eth_type, vlan_tci, mpls_label_count, log, + depth + 1); + + if (err) + return err; + + add_nested_action_end(*sfa, start); + + return 0; +} + +static int validate_and_copy_dec_ttl(struct net *net, + const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log, + u32 depth) +{ + const struct nlattr *attrs[OVS_DEC_TTL_ATTR_MAX + 1]; + int start, action_start, err, rem; + const struct nlattr *a, *actions; + + memset(attrs, 0, sizeof(attrs)); + nla_for_each_nested(a, attr, rem) { + int type = nla_type(a); + + /* Ignore unknown attributes to be future proof. */ + if (type > OVS_DEC_TTL_ATTR_MAX) + continue; + + if (!type || attrs[type]) { + OVS_NLERR(log, "Duplicate or invalid key (type %d).", + type); + return -EINVAL; + } + + attrs[type] = a; + } + + if (rem) { + OVS_NLERR(log, "Message has %d unknown bytes.", rem); + return -EINVAL; + } + + actions = attrs[OVS_DEC_TTL_ATTR_ACTION]; + if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN)) { + OVS_NLERR(log, "Missing valid actions attribute."); + return -EINVAL; + } + + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_DEC_TTL, log); + if (start < 0) + return start; + + action_start = add_nested_action_start(sfa, OVS_DEC_TTL_ATTR_ACTION, log); + if (action_start < 0) + return action_start; + + err = __ovs_nla_copy_actions(net, actions, key, sfa, eth_type, + vlan_tci, mpls_label_count, log, + depth + 1); + if (err) + return err; + + add_nested_action_end(*sfa, action_start); + add_nested_action_end(*sfa, start); + return 0; +} + +static int validate_and_copy_clone(struct net *net, + const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log, bool last, + u32 depth) +{ + int start, err; + u32 exec; + + if (nla_len(attr) && nla_len(attr) < NLA_HDRLEN) + return -EINVAL; + + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_CLONE, log); + if (start < 0) + return start; + + exec = last || !actions_may_change_flow(attr); + + err = ovs_nla_add_action(sfa, OVS_CLONE_ATTR_EXEC, &exec, + sizeof(exec), log); + if (err) + return err; + + err = __ovs_nla_copy_actions(net, attr, key, sfa, + eth_type, vlan_tci, mpls_label_count, log, + depth + 1); + if (err) + return err; + + add_nested_action_end(*sfa, start); + + return 0; +} + +void ovs_match_init(struct sw_flow_match *match, + struct sw_flow_key *key, + bool reset_key, + struct sw_flow_mask *mask) +{ + memset(match, 0, sizeof(*match)); + match->key = key; + match->mask = mask; + + if (reset_key) + memset(key, 0, sizeof(*key)); + + if (mask) { + memset(&mask->key, 0, sizeof(mask->key)); + mask->range.start = mask->range.end = 0; + } +} + +static int validate_geneve_opts(struct sw_flow_key *key) +{ + struct geneve_opt *option; + int opts_len = key->tun_opts_len; + bool crit_opt = false; + + option = (struct geneve_opt *)TUN_METADATA_OPTS(key, key->tun_opts_len); + while (opts_len > 0) { + int len; + + if (opts_len < sizeof(*option)) + return -EINVAL; + + len = sizeof(*option) + option->length * 4; + if (len > opts_len) + return -EINVAL; + + crit_opt |= !!(option->type & GENEVE_CRIT_OPT_TYPE); + + option = (struct geneve_opt *)((u8 *)option + len); + opts_len -= len; + } + + if (crit_opt) + __set_bit(IP_TUNNEL_CRIT_OPT_BIT, key->tun_key.tun_flags); + + return 0; +} + +static int validate_and_copy_set_tun(const struct nlattr *attr, + struct sw_flow_actions **sfa, bool log) +{ + IP_TUNNEL_DECLARE_FLAGS(dst_opt_type) = { }; + struct sw_flow_match match; + struct sw_flow_key key; + struct metadata_dst *tun_dst; + struct ip_tunnel_info *tun_info; + struct ovs_tunnel_info *ovs_tun; + struct nlattr *a; + int err = 0, start, opts_type; + + ovs_match_init(&match, &key, true, NULL); + opts_type = ip_tun_from_nlattr(nla_data(attr), &match, false, log); + if (opts_type < 0) + return opts_type; + + if (key.tun_opts_len) { + switch (opts_type) { + case OVS_TUNNEL_KEY_ATTR_GENEVE_OPTS: + err = validate_geneve_opts(&key); + if (err < 0) + return err; + + __set_bit(IP_TUNNEL_GENEVE_OPT_BIT, dst_opt_type); + break; + case OVS_TUNNEL_KEY_ATTR_VXLAN_OPTS: + __set_bit(IP_TUNNEL_VXLAN_OPT_BIT, dst_opt_type); + break; + case OVS_TUNNEL_KEY_ATTR_ERSPAN_OPTS: + __set_bit(IP_TUNNEL_ERSPAN_OPT_BIT, dst_opt_type); + break; + } + } + + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET, log); + if (start < 0) + return start; + + tun_dst = metadata_dst_alloc(key.tun_opts_len, METADATA_IP_TUNNEL, + GFP_KERNEL); + + if (!tun_dst) + return -ENOMEM; + + err = dst_cache_init(&tun_dst->u.tun_info.dst_cache, GFP_KERNEL); + if (err) { + dst_release((struct dst_entry *)tun_dst); + return err; + } + + a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL, + sizeof(*ovs_tun), log); + if (IS_ERR(a)) { + dst_release((struct dst_entry *)tun_dst); + return PTR_ERR(a); + } + + ovs_tun = nla_data(a); + ovs_tun->tun_dst = tun_dst; + + tun_info = &tun_dst->u.tun_info; + tun_info->mode = IP_TUNNEL_INFO_TX; + if (key.tun_proto == AF_INET6) + tun_info->mode |= IP_TUNNEL_INFO_IPV6; + else if (key.tun_proto == AF_INET && key.tun_key.u.ipv4.dst == 0) + tun_info->mode |= IP_TUNNEL_INFO_BRIDGE; + tun_info->key = key.tun_key; + + /* We need to store the options in the action itself since + * everything else will go away after flow setup. We can append + * it to tun_info and then point there. + */ + ip_tunnel_info_opts_set(tun_info, + TUN_METADATA_OPTS(&key, key.tun_opts_len), + key.tun_opts_len, dst_opt_type); + add_nested_action_end(*sfa, start); + + return err; +} + +static bool validate_push_nsh(const struct nlattr *attr, bool log) +{ + struct sw_flow_match match; + struct sw_flow_key key; + + ovs_match_init(&match, &key, true, NULL); + return !nsh_key_put_from_nlattr(attr, &match, false, true, log); +} + +/* Return false if there are any non-masked bits set. + * Mask follows data immediately, before any netlink padding. + */ +static bool validate_masked(u8 *data, int len) +{ + u8 *mask = data + len; + + while (len--) + if (*data++ & ~*mask++) + return false; + + return true; +} + +static int validate_set(const struct nlattr *a, + const struct sw_flow_key *flow_key, + struct sw_flow_actions **sfa, bool *skip_copy, + u8 mac_proto, __be16 eth_type, bool masked, bool log) +{ + const struct nlattr *ovs_key = nla_data(a); + int key_type = nla_type(ovs_key); + size_t key_len; + + /* There can be only one key in a action */ + if (!nla_ok(ovs_key, nla_len(a)) || + nla_total_size(nla_len(ovs_key)) != nla_len(a)) + return -EINVAL; + + key_len = nla_len(ovs_key); + if (masked) + key_len /= 2; + + if (key_type > OVS_KEY_ATTR_MAX || + !check_attr_len(key_len, ovs_key_lens[key_type].len)) + return -EINVAL; + + if (masked && !validate_masked(nla_data(ovs_key), key_len)) + return -EINVAL; + + switch (key_type) { + case OVS_KEY_ATTR_PRIORITY: + case OVS_KEY_ATTR_SKB_MARK: + case OVS_KEY_ATTR_CT_MARK: + case OVS_KEY_ATTR_CT_LABELS: + break; + + case OVS_KEY_ATTR_ETHERNET: + if (mac_proto != MAC_PROTO_ETHERNET) + return -EINVAL; + break; + + case OVS_KEY_ATTR_TUNNEL: { + int err; + + if (masked) + return -EINVAL; /* Masked tunnel set not supported. */ + + *skip_copy = true; + err = validate_and_copy_set_tun(a, sfa, log); + if (err) + return err; + break; + } + case OVS_KEY_ATTR_IPV4: { + const struct ovs_key_ipv4 *ipv4_key; + + if (eth_type != htons(ETH_P_IP)) + return -EINVAL; + + ipv4_key = nla_data(ovs_key); + + if (masked) { + const struct ovs_key_ipv4 *mask = ipv4_key + 1; + + /* Non-writeable fields. */ + if (mask->ipv4_proto || mask->ipv4_frag) + return -EINVAL; + } else { + if (ipv4_key->ipv4_proto != flow_key->ip.proto) + return -EINVAL; + + if (ipv4_key->ipv4_frag != flow_key->ip.frag) + return -EINVAL; + } + break; + } + case OVS_KEY_ATTR_IPV6: { + const struct ovs_key_ipv6 *ipv6_key; + + if (eth_type != htons(ETH_P_IPV6)) + return -EINVAL; + + ipv6_key = nla_data(ovs_key); + + if (masked) { + const struct ovs_key_ipv6 *mask = ipv6_key + 1; + + /* Non-writeable fields. */ + if (mask->ipv6_proto || mask->ipv6_frag) + return -EINVAL; + + /* Invalid bits in the flow label mask? */ + if (ntohl(mask->ipv6_label) & 0xFFF00000) + return -EINVAL; + } else { + if (ipv6_key->ipv6_proto != flow_key->ip.proto) + return -EINVAL; + + if (ipv6_key->ipv6_frag != flow_key->ip.frag) + return -EINVAL; + } + if (ntohl(ipv6_key->ipv6_label) & 0xFFF00000) + return -EINVAL; + + break; + } + case OVS_KEY_ATTR_TCP: + if ((eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6)) || + flow_key->ip.proto != IPPROTO_TCP) + return -EINVAL; + + break; + + case OVS_KEY_ATTR_UDP: + if ((eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6)) || + flow_key->ip.proto != IPPROTO_UDP) + return -EINVAL; + + break; + + case OVS_KEY_ATTR_MPLS: + if (!eth_p_mpls(eth_type)) + return -EINVAL; + break; + + case OVS_KEY_ATTR_SCTP: + if ((eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6)) || + flow_key->ip.proto != IPPROTO_SCTP) + return -EINVAL; + + break; + + default: + return -EINVAL; + } + + /* Convert non-masked non-tunnel set actions to masked set actions. */ + if (!masked && key_type != OVS_KEY_ATTR_TUNNEL) { + int start, len = key_len * 2; + struct nlattr *at; + + *skip_copy = true; + + start = add_nested_action_start(sfa, + OVS_ACTION_ATTR_SET_TO_MASKED, + log); + if (start < 0) + return start; + + at = __add_action(sfa, key_type, NULL, len, log); + if (IS_ERR(at)) + return PTR_ERR(at); + + memcpy(nla_data(at), nla_data(ovs_key), key_len); /* Key. */ + memset(nla_data(at) + key_len, 0xff, key_len); /* Mask. */ + /* Clear non-writeable bits from otherwise writeable fields. */ + if (key_type == OVS_KEY_ATTR_IPV6) { + struct ovs_key_ipv6 *mask = nla_data(at) + key_len; + + mask->ipv6_label &= htonl(0x000FFFFF); + } + add_nested_action_end(*sfa, start); + } + + return 0; +} + +static int validate_userspace(const struct nlattr *attr) +{ + static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { + [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, + [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC }, + [OVS_USERSPACE_ATTR_EGRESS_TUN_PORT] = {.type = NLA_U32 }, + }; + struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; + int error; + + error = nla_parse_deprecated_strict(a, OVS_USERSPACE_ATTR_MAX, + nla_data(attr), nla_len(attr), + userspace_policy, NULL); + if (error) + return error; + + if (!a[OVS_USERSPACE_ATTR_PID] || + !nla_get_u32(a[OVS_USERSPACE_ATTR_PID])) + return -EINVAL; + + return 0; +} + +static const struct nla_policy cpl_policy[OVS_CHECK_PKT_LEN_ATTR_MAX + 1] = { + [OVS_CHECK_PKT_LEN_ATTR_PKT_LEN] = {.type = NLA_U16 }, + [OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER] = {.type = NLA_NESTED }, + [OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL] = {.type = NLA_NESTED }, +}; + +static int validate_and_copy_check_pkt_len(struct net *net, + const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, + bool log, bool last, u32 depth) +{ + const struct nlattr *acts_if_greater, *acts_if_lesser_eq; + struct nlattr *a[OVS_CHECK_PKT_LEN_ATTR_MAX + 1]; + struct check_pkt_len_arg arg; + int nested_acts_start; + int start, err; + + err = nla_parse_deprecated_strict(a, OVS_CHECK_PKT_LEN_ATTR_MAX, + nla_data(attr), nla_len(attr), + cpl_policy, NULL); + if (err) + return err; + + if (!a[OVS_CHECK_PKT_LEN_ATTR_PKT_LEN] || + !nla_get_u16(a[OVS_CHECK_PKT_LEN_ATTR_PKT_LEN])) + return -EINVAL; + + acts_if_lesser_eq = a[OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL]; + acts_if_greater = a[OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER]; + + /* Both the nested action should be present. */ + if (!acts_if_greater || !acts_if_lesser_eq) + return -EINVAL; + + /* validation done, copy the nested actions. */ + start = add_nested_action_start(sfa, OVS_ACTION_ATTR_CHECK_PKT_LEN, + log); + if (start < 0) + return start; + + arg.pkt_len = nla_get_u16(a[OVS_CHECK_PKT_LEN_ATTR_PKT_LEN]); + arg.exec_for_lesser_equal = + last || !actions_may_change_flow(acts_if_lesser_eq); + arg.exec_for_greater = + last || !actions_may_change_flow(acts_if_greater); + + err = ovs_nla_add_action(sfa, OVS_CHECK_PKT_LEN_ATTR_ARG, &arg, + sizeof(arg), log); + if (err) + return err; + + nested_acts_start = add_nested_action_start(sfa, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL, log); + if (nested_acts_start < 0) + return nested_acts_start; + + err = __ovs_nla_copy_actions(net, acts_if_lesser_eq, key, sfa, + eth_type, vlan_tci, mpls_label_count, log, + depth + 1); + + if (err) + return err; + + add_nested_action_end(*sfa, nested_acts_start); + + nested_acts_start = add_nested_action_start(sfa, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER, log); + if (nested_acts_start < 0) + return nested_acts_start; + + err = __ovs_nla_copy_actions(net, acts_if_greater, key, sfa, + eth_type, vlan_tci, mpls_label_count, log, + depth + 1); + + if (err) + return err; + + add_nested_action_end(*sfa, nested_acts_start); + add_nested_action_end(*sfa, start); + return 0; +} + +static int validate_psample(const struct nlattr *attr) +{ + static const struct nla_policy policy[OVS_PSAMPLE_ATTR_MAX + 1] = { + [OVS_PSAMPLE_ATTR_GROUP] = { .type = NLA_U32 }, + [OVS_PSAMPLE_ATTR_COOKIE] = { + .type = NLA_BINARY, + .len = OVS_PSAMPLE_COOKIE_MAX_SIZE, + }, + }; + struct nlattr *a[OVS_PSAMPLE_ATTR_MAX + 1]; + int err; + + if (!IS_ENABLED(CONFIG_PSAMPLE)) + return -EOPNOTSUPP; + + err = nla_parse_nested(a, OVS_PSAMPLE_ATTR_MAX, attr, policy, NULL); + if (err) + return err; + + return a[OVS_PSAMPLE_ATTR_GROUP] ? 0 : -EINVAL; +} + +static int copy_action(const struct nlattr *from, + struct sw_flow_actions **sfa, bool log) +{ + int totlen = NLA_ALIGN(from->nla_len); + struct nlattr *to; + + to = reserve_sfa_size(sfa, from->nla_len, log); + if (IS_ERR(to)) + return PTR_ERR(to); + + memcpy(to, from, totlen); + return 0; +} + +static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, + __be16 eth_type, __be16 vlan_tci, + u32 mpls_label_count, bool log, + u32 depth) +{ + u8 mac_proto = ovs_key_mac_proto(key); + const struct nlattr *a; + int rem, err; + + if (depth > OVS_COPY_ACTIONS_MAX_DEPTH) + return -EOVERFLOW; + + nla_for_each_nested(a, attr, rem) { + /* Expected argument lengths, (u32)-1 for variable length. */ + static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { + [OVS_ACTION_ATTR_OUTPUT] = sizeof(u32), + [OVS_ACTION_ATTR_RECIRC] = sizeof(u32), + [OVS_ACTION_ATTR_USERSPACE] = (u32)-1, + [OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls), + [OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16), + [OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan), + [OVS_ACTION_ATTR_POP_VLAN] = 0, + [OVS_ACTION_ATTR_SET] = (u32)-1, + [OVS_ACTION_ATTR_SET_MASKED] = (u32)-1, + [OVS_ACTION_ATTR_SAMPLE] = (u32)-1, + [OVS_ACTION_ATTR_HASH] = sizeof(struct ovs_action_hash), + [OVS_ACTION_ATTR_CT] = (u32)-1, + [OVS_ACTION_ATTR_CT_CLEAR] = 0, + [OVS_ACTION_ATTR_TRUNC] = sizeof(struct ovs_action_trunc), + [OVS_ACTION_ATTR_PUSH_ETH] = sizeof(struct ovs_action_push_eth), + [OVS_ACTION_ATTR_POP_ETH] = 0, + [OVS_ACTION_ATTR_PUSH_NSH] = (u32)-1, + [OVS_ACTION_ATTR_POP_NSH] = 0, + [OVS_ACTION_ATTR_METER] = sizeof(u32), + [OVS_ACTION_ATTR_CLONE] = (u32)-1, + [OVS_ACTION_ATTR_CHECK_PKT_LEN] = (u32)-1, + [OVS_ACTION_ATTR_ADD_MPLS] = sizeof(struct ovs_action_add_mpls), + [OVS_ACTION_ATTR_DEC_TTL] = (u32)-1, + [OVS_ACTION_ATTR_DROP] = sizeof(u32), + [OVS_ACTION_ATTR_PSAMPLE] = (u32)-1, + }; + const struct ovs_action_push_vlan *vlan; + int type = nla_type(a); + bool skip_copy; + + if (type > OVS_ACTION_ATTR_MAX || + (action_lens[type] != nla_len(a) && + action_lens[type] != (u32)-1)) + return -EINVAL; + + skip_copy = false; + switch (type) { + case OVS_ACTION_ATTR_UNSPEC: + return -EINVAL; + + case OVS_ACTION_ATTR_USERSPACE: + err = validate_userspace(a); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_OUTPUT: + if (nla_get_u32(a) >= DP_MAX_PORTS) + return -EINVAL; + break; + + case OVS_ACTION_ATTR_TRUNC: { + const struct ovs_action_trunc *trunc = nla_data(a); + + if (trunc->max_len < ETH_HLEN) + return -EINVAL; + break; + } + + case OVS_ACTION_ATTR_HASH: { + const struct ovs_action_hash *act_hash = nla_data(a); + + switch (act_hash->hash_alg) { + case OVS_HASH_ALG_L4: + fallthrough; + case OVS_HASH_ALG_SYM_L4: + break; + default: + return -EINVAL; + } + + break; + } + + case OVS_ACTION_ATTR_POP_VLAN: + if (mac_proto != MAC_PROTO_ETHERNET) + return -EINVAL; + vlan_tci = htons(0); + break; + + case OVS_ACTION_ATTR_PUSH_VLAN: + if (mac_proto != MAC_PROTO_ETHERNET) + return -EINVAL; + vlan = nla_data(a); + if (!eth_type_vlan(vlan->vlan_tpid)) + return -EINVAL; + if (!(vlan->vlan_tci & htons(VLAN_CFI_MASK))) + return -EINVAL; + vlan_tci = vlan->vlan_tci; + break; + + case OVS_ACTION_ATTR_RECIRC: + break; + + case OVS_ACTION_ATTR_ADD_MPLS: { + const struct ovs_action_add_mpls *mpls = nla_data(a); + + if (!eth_p_mpls(mpls->mpls_ethertype)) + return -EINVAL; + + if (mpls->tun_flags & OVS_MPLS_L3_TUNNEL_FLAG_MASK) { + if (vlan_tci & htons(VLAN_CFI_MASK) || + (eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6) && + eth_type != htons(ETH_P_ARP) && + eth_type != htons(ETH_P_RARP) && + !eth_p_mpls(eth_type))) + return -EINVAL; + mpls_label_count++; + } else { + if (mac_proto == MAC_PROTO_ETHERNET) { + mpls_label_count = 1; + mac_proto = MAC_PROTO_NONE; + } else { + mpls_label_count++; + } + } + eth_type = mpls->mpls_ethertype; + break; + } + + case OVS_ACTION_ATTR_PUSH_MPLS: { + const struct ovs_action_push_mpls *mpls = nla_data(a); + + if (!eth_p_mpls(mpls->mpls_ethertype)) + return -EINVAL; + /* Prohibit push MPLS other than to a white list + * for packets that have a known tag order. + */ + if (vlan_tci & htons(VLAN_CFI_MASK) || + (eth_type != htons(ETH_P_IP) && + eth_type != htons(ETH_P_IPV6) && + eth_type != htons(ETH_P_ARP) && + eth_type != htons(ETH_P_RARP) && + !eth_p_mpls(eth_type))) + return -EINVAL; + eth_type = mpls->mpls_ethertype; + mpls_label_count++; + break; + } + + case OVS_ACTION_ATTR_POP_MPLS: { + __be16 proto; + if (vlan_tci & htons(VLAN_CFI_MASK) || + !eth_p_mpls(eth_type)) + return -EINVAL; + + /* Disallow subsequent L2.5+ set actions and mpls_pop + * actions once the last MPLS label in the packet is + * popped as there is no check here to ensure that + * the new eth type is valid and thus set actions could + * write off the end of the packet or otherwise corrupt + * it. + * + * Support for these actions is planned using packet + * recirculation. + */ + proto = nla_get_be16(a); + + if (proto == htons(ETH_P_TEB) && + mac_proto != MAC_PROTO_NONE) + return -EINVAL; + + mpls_label_count--; + + if (!eth_p_mpls(proto) || !mpls_label_count) + eth_type = htons(0); + else + eth_type = proto; + + break; + } + + case OVS_ACTION_ATTR_SET: + err = validate_set(a, key, sfa, + &skip_copy, mac_proto, eth_type, + false, log); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SET_MASKED: + err = validate_set(a, key, sfa, + &skip_copy, mac_proto, eth_type, + true, log); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SAMPLE: { + bool last = nla_is_last(a, rem); + + err = validate_and_copy_sample(net, a, key, sfa, + eth_type, vlan_tci, + mpls_label_count, + log, last, depth); + if (err) + return err; + skip_copy = true; + break; + } + + case OVS_ACTION_ATTR_CT: + err = ovs_ct_copy_action(net, a, key, sfa, log); + if (err) + return err; + skip_copy = true; + break; + + case OVS_ACTION_ATTR_CT_CLEAR: + break; + + case OVS_ACTION_ATTR_PUSH_ETH: + /* Disallow pushing an Ethernet header if one + * is already present */ + if (mac_proto != MAC_PROTO_NONE) + return -EINVAL; + mac_proto = MAC_PROTO_ETHERNET; + break; + + case OVS_ACTION_ATTR_POP_ETH: + if (mac_proto != MAC_PROTO_ETHERNET) + return -EINVAL; + if (vlan_tci & htons(VLAN_CFI_MASK)) + return -EINVAL; + mac_proto = MAC_PROTO_NONE; + break; + + case OVS_ACTION_ATTR_PUSH_NSH: + if (mac_proto != MAC_PROTO_ETHERNET) { + u8 next_proto; + + next_proto = tun_p_from_eth_p(eth_type); + if (!next_proto) + return -EINVAL; + } + mac_proto = MAC_PROTO_NONE; + if (!validate_push_nsh(nla_data(a), log)) + return -EINVAL; + break; + + case OVS_ACTION_ATTR_POP_NSH: { + __be16 inner_proto; + + if (eth_type != htons(ETH_P_NSH)) + return -EINVAL; + inner_proto = tun_p_to_eth_p(key->nsh.base.np); + if (!inner_proto) + return -EINVAL; + if (key->nsh.base.np == TUN_P_ETHERNET) + mac_proto = MAC_PROTO_ETHERNET; + else + mac_proto = MAC_PROTO_NONE; + break; + } + + case OVS_ACTION_ATTR_METER: + /* Non-existent meters are simply ignored. */ + break; + + case OVS_ACTION_ATTR_CLONE: { + bool last = nla_is_last(a, rem); + + err = validate_and_copy_clone(net, a, key, sfa, + eth_type, vlan_tci, + mpls_label_count, + log, last, depth); + if (err) + return err; + skip_copy = true; + break; + } + + case OVS_ACTION_ATTR_CHECK_PKT_LEN: { + bool last = nla_is_last(a, rem); + + err = validate_and_copy_check_pkt_len(net, a, key, sfa, + eth_type, + vlan_tci, + mpls_label_count, + log, last, + depth); + if (err) + return err; + skip_copy = true; + break; + } + + case OVS_ACTION_ATTR_DEC_TTL: + err = validate_and_copy_dec_ttl(net, a, key, sfa, + eth_type, vlan_tci, + mpls_label_count, log, + depth); + if (err) + return err; + skip_copy = true; + break; + + case OVS_ACTION_ATTR_DROP: + if (!nla_is_last(a, rem)) + return -EINVAL; + break; + + case OVS_ACTION_ATTR_PSAMPLE: + err = validate_psample(a); + if (err) + return err; + break; + + default: + OVS_NLERR(log, "Unknown Action type %d", type); + return -EINVAL; + } + if (!skip_copy) { + err = copy_action(a, sfa, log); + if (err) + return err; + } + } + + if (rem > 0) + return -EINVAL; + + return 0; +} + +/* 'key' must be the masked key. */ +int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, bool log) +{ + int err; + u32 mpls_label_count = 0; + + *sfa = nla_alloc_flow_actions(nla_len(attr)); + if (IS_ERR(*sfa)) + return PTR_ERR(*sfa); + + if (eth_p_mpls(key->eth.type)) + mpls_label_count = hweight_long(key->mpls.num_labels_mask); + + (*sfa)->orig_len = nla_len(attr); + err = __ovs_nla_copy_actions(net, attr, key, sfa, key->eth.type, + key->eth.vlan.tci, mpls_label_count, log, + 0); + if (err) + ovs_nla_free_flow_actions(*sfa); + + return err; +} + +static int sample_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) +{ + struct nlattr *start, *ac_start = NULL, *sample_arg; + int err = 0, rem = nla_len(attr); + const struct sample_arg *arg; + struct nlattr *actions; + + start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_SAMPLE); + if (!start) + return -EMSGSIZE; + + sample_arg = nla_data(attr); + arg = nla_data(sample_arg); + actions = nla_next(sample_arg, &rem); + + if (nla_put_u32(skb, OVS_SAMPLE_ATTR_PROBABILITY, arg->probability)) { + err = -EMSGSIZE; + goto out; + } + + ac_start = nla_nest_start_noflag(skb, OVS_SAMPLE_ATTR_ACTIONS); + if (!ac_start) { + err = -EMSGSIZE; + goto out; + } + + err = ovs_nla_put_actions(actions, rem, skb); + +out: + if (err) { + nla_nest_cancel(skb, ac_start); + nla_nest_cancel(skb, start); + } else { + nla_nest_end(skb, ac_start); + nla_nest_end(skb, start); + } + + return err; +} + +static int clone_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) +{ + struct nlattr *start; + int err = 0, rem = nla_len(attr); + + start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_CLONE); + if (!start) + return -EMSGSIZE; + + /* Skipping the OVS_CLONE_ATTR_EXEC that is always the first attribute. */ + attr = nla_next(nla_data(attr), &rem); + err = ovs_nla_put_actions(attr, rem, skb); + + if (err) + nla_nest_cancel(skb, start); + else + nla_nest_end(skb, start); + + return err; +} + +static int check_pkt_len_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) +{ + struct nlattr *start, *ac_start = NULL; + const struct check_pkt_len_arg *arg; + const struct nlattr *a, *cpl_arg; + int err = 0, rem = nla_len(attr); + + start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_CHECK_PKT_LEN); + if (!start) + return -EMSGSIZE; + + /* The first nested attribute in 'attr' is always + * 'OVS_CHECK_PKT_LEN_ATTR_ARG'. + */ + cpl_arg = nla_data(attr); + arg = nla_data(cpl_arg); + + if (nla_put_u16(skb, OVS_CHECK_PKT_LEN_ATTR_PKT_LEN, arg->pkt_len)) { + err = -EMSGSIZE; + goto out; + } + + /* Second nested attribute in 'attr' is always + * 'OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL'. + */ + a = nla_next(cpl_arg, &rem); + ac_start = nla_nest_start_noflag(skb, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_LESS_EQUAL); + if (!ac_start) { + err = -EMSGSIZE; + goto out; + } + + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) { + nla_nest_cancel(skb, ac_start); + goto out; + } else { + nla_nest_end(skb, ac_start); + } + + /* Third nested attribute in 'attr' is always + * OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER. + */ + a = nla_next(a, &rem); + ac_start = nla_nest_start_noflag(skb, + OVS_CHECK_PKT_LEN_ATTR_ACTIONS_IF_GREATER); + if (!ac_start) { + err = -EMSGSIZE; + goto out; + } + + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) { + nla_nest_cancel(skb, ac_start); + goto out; + } else { + nla_nest_end(skb, ac_start); + } + + nla_nest_end(skb, start); + return 0; + +out: + nla_nest_cancel(skb, start); + return err; +} + +static int dec_ttl_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) +{ + struct nlattr *start, *action_start; + const struct nlattr *a; + int err = 0, rem; + + start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_DEC_TTL); + if (!start) + return -EMSGSIZE; + + nla_for_each_attr(a, nla_data(attr), nla_len(attr), rem) { + switch (nla_type(a)) { + case OVS_DEC_TTL_ATTR_ACTION: + + action_start = nla_nest_start_noflag(skb, OVS_DEC_TTL_ATTR_ACTION); + if (!action_start) { + err = -EMSGSIZE; + goto out; + } + + err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); + if (err) + goto out; + + nla_nest_end(skb, action_start); + break; + + default: + /* Ignore all other option to be future compatible */ + break; + } + } + + nla_nest_end(skb, start); + return 0; + +out: + nla_nest_cancel(skb, start); + return err; +} + +static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb) +{ + const struct nlattr *ovs_key = nla_data(a); + int key_type = nla_type(ovs_key); + struct nlattr *start; + int err; + + switch (key_type) { + case OVS_KEY_ATTR_TUNNEL_INFO: { + struct ovs_tunnel_info *ovs_tun = nla_data(ovs_key); + struct ip_tunnel_info *tun_info = &ovs_tun->tun_dst->u.tun_info; + + start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_SET); + if (!start) + return -EMSGSIZE; + + err = ip_tun_to_nlattr(skb, &tun_info->key, + ip_tunnel_info_opts(tun_info), + tun_info->options_len, + ip_tunnel_info_af(tun_info), tun_info->mode); + if (err) + return err; + nla_nest_end(skb, start); + break; + } + default: + if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key)) + return -EMSGSIZE; + break; + } + + return 0; +} + +static int masked_set_action_to_set_action_attr(const struct nlattr *a, + struct sk_buff *skb) +{ + const struct nlattr *ovs_key = nla_data(a); + struct nlattr *nla; + size_t key_len = nla_len(ovs_key) / 2; + + /* Revert the conversion we did from a non-masked set action to + * masked set action. + */ + nla = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_SET); + if (!nla) + return -EMSGSIZE; + + if (nla_put(skb, nla_type(ovs_key), key_len, nla_data(ovs_key))) + return -EMSGSIZE; + + nla_nest_end(skb, nla); + return 0; +} + +int ovs_nla_put_actions(const struct nlattr *attr, int len, struct sk_buff *skb) +{ + const struct nlattr *a; + int rem, err; + + nla_for_each_attr(a, attr, len, rem) { + int type = nla_type(a); + + switch (type) { + case OVS_ACTION_ATTR_SET: + err = set_action_to_attr(a, skb); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SET_TO_MASKED: + err = masked_set_action_to_set_action_attr(a, skb); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_SAMPLE: + err = sample_action_to_attr(a, skb); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_CT: + err = ovs_ct_action_to_attr(nla_data(a), skb); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_CLONE: + err = clone_action_to_attr(a, skb); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_CHECK_PKT_LEN: + err = check_pkt_len_action_to_attr(a, skb); + if (err) + return err; + break; + + case OVS_ACTION_ATTR_DEC_TTL: + err = dec_ttl_action_to_attr(a, skb); + if (err) + return err; + break; + + default: + if (nla_put(skb, type, nla_len(a), nla_data(a))) + return -EMSGSIZE; + break; + } + } + + return 0; +} diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h new file mode 100644 index 000000000000..ff8cdecbe346 --- /dev/null +++ b/net/openvswitch/flow_netlink.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + */ + + +#ifndef FLOW_NETLINK_H +#define FLOW_NETLINK_H 1 + +#include <linux/kernel.h> +#include <linux/netlink.h> +#include <linux/openvswitch.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/if_ether.h> +#include <linux/in6.h> +#include <linux/jiffies.h> +#include <linux/time.h> + +#include <net/inet_ecn.h> +#include <net/ip_tunnels.h> + +#include "flow.h" + +size_t ovs_tun_key_attr_size(void); +size_t ovs_key_attr_size(void); + +void ovs_match_init(struct sw_flow_match *match, + struct sw_flow_key *key, bool reset_key, + struct sw_flow_mask *mask); + +int ovs_nla_put_key(const struct sw_flow_key *, const struct sw_flow_key *, + int attr, bool is_mask, struct sk_buff *); +int parse_flow_nlattrs(const struct nlattr *attr, const struct nlattr *a[], + u64 *attrsp, bool log); +int ovs_nla_get_flow_metadata(struct net *net, + const struct nlattr *a[OVS_KEY_ATTR_MAX + 1], + u64 attrs, struct sw_flow_key *key, bool log); + +int ovs_nla_put_identifier(const struct sw_flow *flow, struct sk_buff *skb); +int ovs_nla_put_masked_key(const struct sw_flow *flow, struct sk_buff *skb); +int ovs_nla_put_mask(const struct sw_flow *flow, struct sk_buff *skb); + +int ovs_nla_get_match(struct net *, struct sw_flow_match *, + const struct nlattr *key, const struct nlattr *mask, + bool log); + +int ovs_nla_put_tunnel_info(struct sk_buff *skb, + struct ip_tunnel_info *tun_info); + +bool ovs_nla_get_ufid(struct sw_flow_id *, const struct nlattr *, bool log); +int ovs_nla_get_identifier(struct sw_flow_id *sfid, const struct nlattr *ufid, + const struct sw_flow_key *key, bool log); +u32 ovs_nla_get_ufid_flags(const struct nlattr *attr); + +int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, + const struct sw_flow_key *key, + struct sw_flow_actions **sfa, bool log); +int ovs_nla_add_action(struct sw_flow_actions **sfa, int attrtype, + void *data, int len, bool log); +int ovs_nla_put_actions(const struct nlattr *attr, + int len, struct sk_buff *skb); + +void ovs_nla_free_flow_actions(struct sw_flow_actions *); +void ovs_nla_free_flow_actions_rcu(struct sw_flow_actions *); + +int nsh_hdr_from_nlattr(const struct nlattr *attr, struct nshhdr *nh, + size_t size); + +#endif /* flow_netlink.h */ diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c new file mode 100644 index 000000000000..ffc72a741a50 --- /dev/null +++ b/net/openvswitch/flow_table.c @@ -0,0 +1,1219 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2007-2014 Nicira, Inc. + */ + +#include "flow.h" +#include "datapath.h" +#include "flow_netlink.h" +#include <linux/uaccess.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <net/llc_pdu.h> +#include <linux/kernel.h> +#include <linux/jhash.h> +#include <linux/jiffies.h> +#include <linux/llc.h> +#include <linux/module.h> +#include <linux/in.h> +#include <linux/rcupdate.h> +#include <linux/cpumask.h> +#include <linux/if_arp.h> +#include <linux/ip.h> +#include <linux/ipv6.h> +#include <linux/sctp.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <linux/icmp.h> +#include <linux/icmpv6.h> +#include <linux/rculist.h> +#include <linux/sort.h> +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/ndisc.h> + +#define TBL_MIN_BUCKETS 1024 +#define MASK_ARRAY_SIZE_MIN 16 +#define REHASH_INTERVAL (10 * 60 * HZ) + +#define MC_DEFAULT_HASH_ENTRIES 256 +#define MC_HASH_SHIFT 8 +#define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT) + +static struct kmem_cache *flow_cache; +struct kmem_cache *flow_stats_cache __read_mostly; + +static u16 range_n_bytes(const struct sw_flow_key_range *range) +{ + return range->end - range->start; +} + +void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, + bool full, const struct sw_flow_mask *mask) +{ + int start = full ? 0 : mask->range.start; + int len = full ? sizeof *dst : range_n_bytes(&mask->range); + const long *m = (const long *)((const u8 *)&mask->key + start); + const long *s = (const long *)((const u8 *)src + start); + long *d = (long *)((u8 *)dst + start); + int i; + + /* If 'full' is true then all of 'dst' is fully initialized. Otherwise, + * if 'full' is false the memory outside of the 'mask->range' is left + * uninitialized. This can be used as an optimization when further + * operations on 'dst' only use contents within 'mask->range'. + */ + for (i = 0; i < len; i += sizeof(long)) + *d++ = *s++ & *m++; +} + +struct sw_flow *ovs_flow_alloc(void) +{ + struct sw_flow *flow; + struct sw_flow_stats *stats; + + flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL); + if (!flow) + return ERR_PTR(-ENOMEM); + + flow->stats_last_writer = -1; + flow->cpu_used_mask = (struct cpumask *)&flow->stats[nr_cpu_ids]; + + /* Initialize the default stat node. */ + stats = kmem_cache_alloc_node(flow_stats_cache, + GFP_KERNEL | __GFP_ZERO, + node_online(0) ? 0 : NUMA_NO_NODE); + if (!stats) + goto err; + + spin_lock_init(&stats->lock); + + RCU_INIT_POINTER(flow->stats[0], stats); + + cpumask_set_cpu(0, flow->cpu_used_mask); + + return flow; +err: + kmem_cache_free(flow_cache, flow); + return ERR_PTR(-ENOMEM); +} + +int ovs_flow_tbl_count(const struct flow_table *table) +{ + return table->count; +} + +static void flow_free(struct sw_flow *flow) +{ + unsigned int cpu; + + if (ovs_identifier_is_key(&flow->id)) + kfree(flow->id.unmasked_key); + if (flow->sf_acts) + ovs_nla_free_flow_actions((struct sw_flow_actions __force *) + flow->sf_acts); + + for_each_cpu(cpu, flow->cpu_used_mask) { + if (flow->stats[cpu]) + kmem_cache_free(flow_stats_cache, + (struct sw_flow_stats __force *)flow->stats[cpu]); + } + + kmem_cache_free(flow_cache, flow); +} + +static void rcu_free_flow_callback(struct rcu_head *rcu) +{ + struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); + + flow_free(flow); +} + +void ovs_flow_free(struct sw_flow *flow, bool deferred) +{ + if (!flow) + return; + + if (deferred) + call_rcu(&flow->rcu, rcu_free_flow_callback); + else + flow_free(flow); +} + +static void __table_instance_destroy(struct table_instance *ti) +{ + kvfree(ti->buckets); + kfree(ti); +} + +static struct table_instance *table_instance_alloc(int new_size) +{ + struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL); + int i; + + if (!ti) + return NULL; + + ti->buckets = kvmalloc_array(new_size, sizeof(struct hlist_head), + GFP_KERNEL); + if (!ti->buckets) { + kfree(ti); + return NULL; + } + + for (i = 0; i < new_size; i++) + INIT_HLIST_HEAD(&ti->buckets[i]); + + ti->n_buckets = new_size; + ti->node_ver = 0; + get_random_bytes(&ti->hash_seed, sizeof(u32)); + + return ti; +} + +static void __mask_array_destroy(struct mask_array *ma) +{ + free_percpu(ma->masks_usage_stats); + kfree(ma); +} + +static void mask_array_rcu_cb(struct rcu_head *rcu) +{ + struct mask_array *ma = container_of(rcu, struct mask_array, rcu); + + __mask_array_destroy(ma); +} + +static void tbl_mask_array_reset_counters(struct mask_array *ma) +{ + int i, cpu; + + /* As the per CPU counters are not atomic we can not go ahead and + * reset them from another CPU. To be able to still have an approximate + * zero based counter we store the value at reset, and subtract it + * later when processing. + */ + for (i = 0; i < ma->max; i++) { + ma->masks_usage_zero_cntr[i] = 0; + + for_each_possible_cpu(cpu) { + struct mask_array_stats *stats; + unsigned int start; + u64 counter; + + stats = per_cpu_ptr(ma->masks_usage_stats, cpu); + do { + start = u64_stats_fetch_begin(&stats->syncp); + counter = stats->usage_cntrs[i]; + } while (u64_stats_fetch_retry(&stats->syncp, start)); + + ma->masks_usage_zero_cntr[i] += counter; + } + } +} + +static struct mask_array *tbl_mask_array_alloc(int size) +{ + struct mask_array *new; + + size = max(MASK_ARRAY_SIZE_MIN, size); + new = kzalloc(struct_size(new, masks, size) + + sizeof(u64) * size, GFP_KERNEL); + if (!new) + return NULL; + + new->masks_usage_zero_cntr = (u64 *)((u8 *)new + + struct_size(new, masks, size)); + + new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) + + sizeof(u64) * size, + __alignof__(u64)); + if (!new->masks_usage_stats) { + kfree(new); + return NULL; + } + + new->count = 0; + new->max = size; + + return new; +} + +static int tbl_mask_array_realloc(struct flow_table *tbl, int size) +{ + struct mask_array *old; + struct mask_array *new; + + new = tbl_mask_array_alloc(size); + if (!new) + return -ENOMEM; + + old = ovsl_dereference(tbl->mask_array); + if (old) { + int i; + + for (i = 0; i < old->max; i++) { + if (ovsl_dereference(old->masks[i])) + new->masks[new->count++] = old->masks[i]; + } + call_rcu(&old->rcu, mask_array_rcu_cb); + } + + rcu_assign_pointer(tbl->mask_array, new); + + return 0; +} + +static int tbl_mask_array_add_mask(struct flow_table *tbl, + struct sw_flow_mask *new) +{ + struct mask_array *ma = ovsl_dereference(tbl->mask_array); + int err, ma_count = READ_ONCE(ma->count); + + if (ma_count >= ma->max) { + err = tbl_mask_array_realloc(tbl, ma->max + + MASK_ARRAY_SIZE_MIN); + if (err) + return err; + + ma = ovsl_dereference(tbl->mask_array); + } else { + /* On every add or delete we need to reset the counters so + * every new mask gets a fair chance of being prioritized. + */ + tbl_mask_array_reset_counters(ma); + } + + BUG_ON(ovsl_dereference(ma->masks[ma_count])); + + rcu_assign_pointer(ma->masks[ma_count], new); + WRITE_ONCE(ma->count, ma_count + 1); + + return 0; +} + +static void tbl_mask_array_del_mask(struct flow_table *tbl, + struct sw_flow_mask *mask) +{ + struct mask_array *ma = ovsl_dereference(tbl->mask_array); + int i, ma_count = READ_ONCE(ma->count); + + /* Remove the deleted mask pointers from the array */ + for (i = 0; i < ma_count; i++) { + if (mask == ovsl_dereference(ma->masks[i])) + goto found; + } + + BUG(); + return; + +found: + WRITE_ONCE(ma->count, ma_count - 1); + + rcu_assign_pointer(ma->masks[i], ma->masks[ma_count - 1]); + RCU_INIT_POINTER(ma->masks[ma_count - 1], NULL); + + kfree_rcu(mask, rcu); + + /* Shrink the mask array if necessary. */ + if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) && + ma_count <= (ma->max / 3)) + tbl_mask_array_realloc(tbl, ma->max / 2); + else + tbl_mask_array_reset_counters(ma); + +} + +/* Remove 'mask' from the mask list, if it is not needed any more. */ +static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) +{ + if (mask) { + /* ovs-lock is required to protect mask-refcount and + * mask list. + */ + ASSERT_OVSL(); + BUG_ON(!mask->ref_count); + mask->ref_count--; + + if (!mask->ref_count) + tbl_mask_array_del_mask(tbl, mask); + } +} + +static void __mask_cache_destroy(struct mask_cache *mc) +{ + free_percpu(mc->mask_cache); + kfree(mc); +} + +static void mask_cache_rcu_cb(struct rcu_head *rcu) +{ + struct mask_cache *mc = container_of(rcu, struct mask_cache, rcu); + + __mask_cache_destroy(mc); +} + +static struct mask_cache *tbl_mask_cache_alloc(u32 size) +{ + struct mask_cache_entry __percpu *cache = NULL; + struct mask_cache *new; + + /* Only allow size to be 0, or a power of 2, and does not exceed + * percpu allocation size. + */ + if ((!is_power_of_2(size) && size != 0) || + (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) + return NULL; + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return NULL; + + new->cache_size = size; + if (new->cache_size > 0) { + cache = __alloc_percpu(array_size(sizeof(struct mask_cache_entry), + new->cache_size), + __alignof__(struct mask_cache_entry)); + if (!cache) { + kfree(new); + return NULL; + } + } + + new->mask_cache = cache; + return new; +} +int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size) +{ + struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache); + struct mask_cache *new; + + if (size == mc->cache_size) + return 0; + + if ((!is_power_of_2(size) && size != 0) || + (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) + return -EINVAL; + + new = tbl_mask_cache_alloc(size); + if (!new) + return -ENOMEM; + + rcu_assign_pointer(table->mask_cache, new); + call_rcu(&mc->rcu, mask_cache_rcu_cb); + + return 0; +} + +int ovs_flow_tbl_init(struct flow_table *table) +{ + struct table_instance *ti, *ufid_ti; + struct mask_cache *mc; + struct mask_array *ma; + + mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES); + if (!mc) + return -ENOMEM; + + ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN); + if (!ma) + goto free_mask_cache; + + ti = table_instance_alloc(TBL_MIN_BUCKETS); + if (!ti) + goto free_mask_array; + + ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); + if (!ufid_ti) + goto free_ti; + + rcu_assign_pointer(table->ti, ti); + rcu_assign_pointer(table->ufid_ti, ufid_ti); + rcu_assign_pointer(table->mask_array, ma); + rcu_assign_pointer(table->mask_cache, mc); + table->last_rehash = jiffies; + table->count = 0; + table->ufid_count = 0; + return 0; + +free_ti: + __table_instance_destroy(ti); +free_mask_array: + __mask_array_destroy(ma); +free_mask_cache: + __mask_cache_destroy(mc); + return -ENOMEM; +} + +static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) +{ + struct table_instance *ti; + + ti = container_of(rcu, struct table_instance, rcu); + __table_instance_destroy(ti); +} + +static void table_instance_flow_free(struct flow_table *table, + struct table_instance *ti, + struct table_instance *ufid_ti, + struct sw_flow *flow) +{ + hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); + table->count--; + + if (ovs_identifier_is_ufid(&flow->id)) { + hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); + table->ufid_count--; + } + + flow_mask_remove(table, flow->mask); +} + +/* Must be called with OVS mutex held. */ +void table_instance_flow_flush(struct flow_table *table, + struct table_instance *ti, + struct table_instance *ufid_ti) +{ + int i; + + for (i = 0; i < ti->n_buckets; i++) { + struct hlist_head *head = &ti->buckets[i]; + struct hlist_node *n; + struct sw_flow *flow; + + hlist_for_each_entry_safe(flow, n, head, + flow_table.node[ti->node_ver]) { + + table_instance_flow_free(table, ti, ufid_ti, + flow); + ovs_flow_free(flow, true); + } + } + + if (WARN_ON(table->count != 0 || + table->ufid_count != 0)) { + table->count = 0; + table->ufid_count = 0; + } +} + +static void table_instance_destroy(struct table_instance *ti, + struct table_instance *ufid_ti) +{ + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); + call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb); +} + +/* No need for locking this function is called from RCU callback or + * error path. + */ +void ovs_flow_tbl_destroy(struct flow_table *table) +{ + struct table_instance *ti = rcu_dereference_raw(table->ti); + struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); + struct mask_cache *mc = rcu_dereference_raw(table->mask_cache); + struct mask_array *ma = rcu_dereference_raw(table->mask_array); + + call_rcu(&mc->rcu, mask_cache_rcu_cb); + call_rcu(&ma->rcu, mask_array_rcu_cb); + table_instance_destroy(ti, ufid_ti); +} + +struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, + u32 *bucket, u32 *last) +{ + struct sw_flow *flow; + struct hlist_head *head; + int ver; + int i; + + ver = ti->node_ver; + while (*bucket < ti->n_buckets) { + i = 0; + head = &ti->buckets[*bucket]; + hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) { + if (i < *last) { + i++; + continue; + } + *last = i + 1; + return flow; + } + (*bucket)++; + *last = 0; + } + + return NULL; +} + +static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash) +{ + hash = jhash_1word(hash, ti->hash_seed); + return &ti->buckets[hash & (ti->n_buckets - 1)]; +} + +static void table_instance_insert(struct table_instance *ti, + struct sw_flow *flow) +{ + struct hlist_head *head; + + head = find_bucket(ti, flow->flow_table.hash); + hlist_add_head_rcu(&flow->flow_table.node[ti->node_ver], head); +} + +static void ufid_table_instance_insert(struct table_instance *ti, + struct sw_flow *flow) +{ + struct hlist_head *head; + + head = find_bucket(ti, flow->ufid_table.hash); + hlist_add_head_rcu(&flow->ufid_table.node[ti->node_ver], head); +} + +static void flow_table_copy_flows(struct table_instance *old, + struct table_instance *new, bool ufid) +{ + int old_ver; + int i; + + old_ver = old->node_ver; + new->node_ver = !old_ver; + + /* Insert in new table. */ + for (i = 0; i < old->n_buckets; i++) { + struct sw_flow *flow; + struct hlist_head *head = &old->buckets[i]; + + if (ufid) + hlist_for_each_entry_rcu(flow, head, + ufid_table.node[old_ver], + lockdep_ovsl_is_held()) + ufid_table_instance_insert(new, flow); + else + hlist_for_each_entry_rcu(flow, head, + flow_table.node[old_ver], + lockdep_ovsl_is_held()) + table_instance_insert(new, flow); + } +} + +static struct table_instance *table_instance_rehash(struct table_instance *ti, + int n_buckets, bool ufid) +{ + struct table_instance *new_ti; + + new_ti = table_instance_alloc(n_buckets); + if (!new_ti) + return NULL; + + flow_table_copy_flows(ti, new_ti, ufid); + + return new_ti; +} + +int ovs_flow_tbl_flush(struct flow_table *flow_table) +{ + struct table_instance *old_ti, *new_ti; + struct table_instance *old_ufid_ti, *new_ufid_ti; + + new_ti = table_instance_alloc(TBL_MIN_BUCKETS); + if (!new_ti) + return -ENOMEM; + new_ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); + if (!new_ufid_ti) + goto err_free_ti; + + old_ti = ovsl_dereference(flow_table->ti); + old_ufid_ti = ovsl_dereference(flow_table->ufid_ti); + + rcu_assign_pointer(flow_table->ti, new_ti); + rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti); + flow_table->last_rehash = jiffies; + + table_instance_flow_flush(flow_table, old_ti, old_ufid_ti); + table_instance_destroy(old_ti, old_ufid_ti); + return 0; + +err_free_ti: + __table_instance_destroy(new_ti); + return -ENOMEM; +} + +static u32 flow_hash(const struct sw_flow_key *key, + const struct sw_flow_key_range *range) +{ + const u32 *hash_key = (const u32 *)((const u8 *)key + range->start); + + /* Make sure number of hash bytes are multiple of u32. */ + int hash_u32s = range_n_bytes(range) >> 2; + + return jhash2(hash_key, hash_u32s, 0); +} + +static int flow_key_start(const struct sw_flow_key *key) +{ + if (key->tun_proto) + return 0; + else + return rounddown(offsetof(struct sw_flow_key, phy), + sizeof(long)); +} + +static bool cmp_key(const struct sw_flow_key *key1, + const struct sw_flow_key *key2, + int key_start, int key_end) +{ + const long *cp1 = (const long *)((const u8 *)key1 + key_start); + const long *cp2 = (const long *)((const u8 *)key2 + key_start); + int i; + + for (i = key_start; i < key_end; i += sizeof(long)) + if (*cp1++ ^ *cp2++) + return false; + + return true; +} + +static bool flow_cmp_masked_key(const struct sw_flow *flow, + const struct sw_flow_key *key, + const struct sw_flow_key_range *range) +{ + return cmp_key(&flow->key, key, range->start, range->end); +} + +static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, + const struct sw_flow_match *match) +{ + struct sw_flow_key *key = match->key; + int key_start = flow_key_start(key); + int key_end = match->range.end; + + BUG_ON(ovs_identifier_is_ufid(&flow->id)); + return cmp_key(flow->id.unmasked_key, key, key_start, key_end); +} + +static struct sw_flow *masked_flow_lookup(struct table_instance *ti, + const struct sw_flow_key *unmasked, + const struct sw_flow_mask *mask, + u32 *n_mask_hit) +{ + struct sw_flow *flow; + struct hlist_head *head; + u32 hash; + struct sw_flow_key masked_key; + + ovs_flow_mask_key(&masked_key, unmasked, false, mask); + hash = flow_hash(&masked_key, &mask->range); + head = find_bucket(ti, hash); + (*n_mask_hit)++; + + hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver], + lockdep_ovsl_is_held()) { + if (flow->mask == mask && flow->flow_table.hash == hash && + flow_cmp_masked_key(flow, &masked_key, &mask->range)) + return flow; + } + return NULL; +} + +/* Flow lookup does full lookup on flow table. It starts with + * mask from index passed in *index. + * This function MUST be called with BH disabled due to the use + * of CPU specific variables. + */ +static struct sw_flow *flow_lookup(struct flow_table *tbl, + struct table_instance *ti, + struct mask_array *ma, + const struct sw_flow_key *key, + u32 *n_mask_hit, + u32 *n_cache_hit, + u32 *index) +{ + struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats); + struct sw_flow *flow; + struct sw_flow_mask *mask; + int i; + + if (likely(*index < ma->max)) { + mask = rcu_dereference_ovsl(ma->masks[*index]); + if (mask) { + flow = masked_flow_lookup(ti, key, mask, n_mask_hit); + if (flow) { + u64_stats_update_begin(&stats->syncp); + stats->usage_cntrs[*index]++; + u64_stats_update_end(&stats->syncp); + (*n_cache_hit)++; + return flow; + } + } + } + + for (i = 0; i < ma->max; i++) { + + if (i == *index) + continue; + + mask = rcu_dereference_ovsl(ma->masks[i]); + if (unlikely(!mask)) + break; + + flow = masked_flow_lookup(ti, key, mask, n_mask_hit); + if (flow) { /* Found */ + *index = i; + u64_stats_update_begin(&stats->syncp); + stats->usage_cntrs[*index]++; + u64_stats_update_end(&stats->syncp); + return flow; + } + } + + return NULL; +} + +/* + * mask_cache maps flow to probable mask. This cache is not tightly + * coupled cache, It means updates to mask list can result in inconsistent + * cache entry in mask cache. + * This is per cpu cache and is divided in MC_HASH_SEGS segments. + * In case of a hash collision the entry is hashed in next segment. + * */ +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, + const struct sw_flow_key *key, + u32 skb_hash, + u32 *n_mask_hit, + u32 *n_cache_hit) +{ + struct mask_cache *mc = rcu_dereference(tbl->mask_cache); + struct mask_array *ma = rcu_dereference(tbl->mask_array); + struct table_instance *ti = rcu_dereference(tbl->ti); + struct mask_cache_entry *entries, *ce; + struct sw_flow *flow; + u32 hash; + int seg; + + *n_mask_hit = 0; + *n_cache_hit = 0; + if (unlikely(!skb_hash || mc->cache_size == 0)) { + u32 mask_index = 0; + u32 cache = 0; + + return flow_lookup(tbl, ti, ma, key, n_mask_hit, &cache, + &mask_index); + } + + /* Pre and post recirulation flows usually have the same skb_hash + * value. To avoid hash collisions, rehash the 'skb_hash' with + * 'recirc_id'. */ + if (key->recirc_id) + skb_hash = jhash_1word(skb_hash, key->recirc_id); + + ce = NULL; + hash = skb_hash; + entries = this_cpu_ptr(mc->mask_cache); + + /* Find the cache entry 'ce' to operate on. */ + for (seg = 0; seg < MC_HASH_SEGS; seg++) { + int index = hash & (mc->cache_size - 1); + struct mask_cache_entry *e; + + e = &entries[index]; + if (e->skb_hash == skb_hash) { + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, + n_cache_hit, &e->mask_index); + if (!flow) + e->skb_hash = 0; + return flow; + } + + if (!ce || e->skb_hash < ce->skb_hash) + ce = e; /* A better replacement cache candidate. */ + + hash >>= MC_HASH_SHIFT; + } + + /* Cache miss, do full lookup. */ + flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit, + &ce->mask_index); + if (flow) + ce->skb_hash = skb_hash; + + *n_cache_hit = 0; + return flow; +} + +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, + const struct sw_flow_key *key) +{ + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); + u32 __always_unused n_mask_hit; + u32 __always_unused n_cache_hit; + struct sw_flow *flow; + u32 index = 0; + + /* This function gets called trough the netlink interface and therefore + * is preemptible. However, flow_lookup() function needs to be called + * with BH disabled due to CPU specific variables. + */ + local_bh_disable(); + flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index); + local_bh_enable(); + return flow; +} + +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, + const struct sw_flow_match *match) +{ + struct mask_array *ma = ovsl_dereference(tbl->mask_array); + int i; + + /* Always called under ovs-mutex. */ + for (i = 0; i < ma->max; i++) { + struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 __always_unused n_mask_hit; + struct sw_flow_mask *mask; + struct sw_flow *flow; + + mask = ovsl_dereference(ma->masks[i]); + if (!mask) + continue; + + flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit); + if (flow && ovs_identifier_is_key(&flow->id) && + ovs_flow_cmp_unmasked_key(flow, match)) { + return flow; + } + } + + return NULL; +} + +static u32 ufid_hash(const struct sw_flow_id *sfid) +{ + return jhash(sfid->ufid, sfid->ufid_len, 0); +} + +static bool ovs_flow_cmp_ufid(const struct sw_flow *flow, + const struct sw_flow_id *sfid) +{ + if (flow->id.ufid_len != sfid->ufid_len) + return false; + + return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len); +} + +bool ovs_flow_cmp(const struct sw_flow *flow, + const struct sw_flow_match *match) +{ + if (ovs_identifier_is_ufid(&flow->id)) + return flow_cmp_masked_key(flow, match->key, &match->range); + + return ovs_flow_cmp_unmasked_key(flow, match); +} + +struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl, + const struct sw_flow_id *ufid) +{ + struct table_instance *ti = rcu_dereference_ovsl(tbl->ufid_ti); + struct sw_flow *flow; + struct hlist_head *head; + u32 hash; + + hash = ufid_hash(ufid); + head = find_bucket(ti, hash); + hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver], + lockdep_ovsl_is_held()) { + if (flow->ufid_table.hash == hash && + ovs_flow_cmp_ufid(flow, ufid)) + return flow; + } + return NULL; +} + +int ovs_flow_tbl_num_masks(const struct flow_table *table) +{ + struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); + return READ_ONCE(ma->count); +} + +u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table) +{ + struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache); + + return READ_ONCE(mc->cache_size); +} + +static struct table_instance *table_instance_expand(struct table_instance *ti, + bool ufid) +{ + return table_instance_rehash(ti, ti->n_buckets * 2, ufid); +} + +/* Must be called with OVS mutex held. */ +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) +{ + struct table_instance *ti = ovsl_dereference(table->ti); + struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti); + + BUG_ON(table->count == 0); + table_instance_flow_free(table, ti, ufid_ti, flow); +} + +static struct sw_flow_mask *mask_alloc(void) +{ + struct sw_flow_mask *mask; + + mask = kmalloc(sizeof(*mask), GFP_KERNEL); + if (mask) + mask->ref_count = 1; + + return mask; +} + +static bool mask_equal(const struct sw_flow_mask *a, + const struct sw_flow_mask *b) +{ + const u8 *a_ = (const u8 *)&a->key + a->range.start; + const u8 *b_ = (const u8 *)&b->key + b->range.start; + + return (a->range.end == b->range.end) + && (a->range.start == b->range.start) + && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0); +} + +static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl, + const struct sw_flow_mask *mask) +{ + struct mask_array *ma; + int i; + + ma = ovsl_dereference(tbl->mask_array); + for (i = 0; i < ma->max; i++) { + struct sw_flow_mask *t; + t = ovsl_dereference(ma->masks[i]); + + if (t && mask_equal(mask, t)) + return t; + } + + return NULL; +} + +/* Add 'mask' into the mask list, if it is not already there. */ +static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, + const struct sw_flow_mask *new) +{ + struct sw_flow_mask *mask; + + mask = flow_mask_find(tbl, new); + if (!mask) { + /* Allocate a new mask if none exists. */ + mask = mask_alloc(); + if (!mask) + return -ENOMEM; + mask->key = new->key; + mask->range = new->range; + + /* Add mask to mask-list. */ + if (tbl_mask_array_add_mask(tbl, mask)) { + kfree(mask); + return -ENOMEM; + } + } else { + BUG_ON(!mask->ref_count); + mask->ref_count++; + } + + flow->mask = mask; + return 0; +} + +/* Must be called with OVS mutex held. */ +static void flow_key_insert(struct flow_table *table, struct sw_flow *flow) +{ + struct table_instance *new_ti = NULL; + struct table_instance *ti; + + flow->flow_table.hash = flow_hash(&flow->key, &flow->mask->range); + ti = ovsl_dereference(table->ti); + table_instance_insert(ti, flow); + table->count++; + + /* Expand table, if necessary, to make room. */ + if (table->count > ti->n_buckets) + new_ti = table_instance_expand(ti, false); + else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL)) + new_ti = table_instance_rehash(ti, ti->n_buckets, false); + + if (new_ti) { + rcu_assign_pointer(table->ti, new_ti); + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); + table->last_rehash = jiffies; + } +} + +/* Must be called with OVS mutex held. */ +static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow) +{ + struct table_instance *ti; + + flow->ufid_table.hash = ufid_hash(&flow->id); + ti = ovsl_dereference(table->ufid_ti); + ufid_table_instance_insert(ti, flow); + table->ufid_count++; + + /* Expand table, if necessary, to make room. */ + if (table->ufid_count > ti->n_buckets) { + struct table_instance *new_ti; + + new_ti = table_instance_expand(ti, true); + if (new_ti) { + rcu_assign_pointer(table->ufid_ti, new_ti); + call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); + } + } +} + +/* Must be called with OVS mutex held. */ +int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, + const struct sw_flow_mask *mask) +{ + int err; + + err = flow_mask_insert(table, flow, mask); + if (err) + return err; + flow_key_insert(table, flow); + if (ovs_identifier_is_ufid(&flow->id)) + flow_ufid_insert(table, flow); + + return 0; +} + +static int compare_mask_and_count(const void *a, const void *b) +{ + const struct mask_count *mc_a = a; + const struct mask_count *mc_b = b; + + return (s64)mc_b->counter - (s64)mc_a->counter; +} + +/* Must be called with OVS mutex held. */ +void ovs_flow_masks_rebalance(struct flow_table *table) +{ + struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); + struct mask_count *masks_and_count; + struct mask_array *new; + int masks_entries = 0; + int i; + + /* Build array of all current entries with use counters. */ + masks_and_count = kmalloc_array(ma->max, sizeof(*masks_and_count), + GFP_KERNEL); + if (!masks_and_count) + return; + + for (i = 0; i < ma->max; i++) { + struct sw_flow_mask *mask; + int cpu; + + mask = rcu_dereference_ovsl(ma->masks[i]); + if (unlikely(!mask)) + break; + + masks_and_count[i].index = i; + masks_and_count[i].counter = 0; + + for_each_possible_cpu(cpu) { + struct mask_array_stats *stats; + unsigned int start; + u64 counter; + + stats = per_cpu_ptr(ma->masks_usage_stats, cpu); + do { + start = u64_stats_fetch_begin(&stats->syncp); + counter = stats->usage_cntrs[i]; + } while (u64_stats_fetch_retry(&stats->syncp, start)); + + masks_and_count[i].counter += counter; + } + + /* Subtract the zero count value. */ + masks_and_count[i].counter -= ma->masks_usage_zero_cntr[i]; + + /* Rather than calling tbl_mask_array_reset_counters() + * below when no change is needed, do it inline here. + */ + ma->masks_usage_zero_cntr[i] += masks_and_count[i].counter; + } + + if (i == 0) + goto free_mask_entries; + + /* Sort the entries */ + masks_entries = i; + sort(masks_and_count, masks_entries, sizeof(*masks_and_count), + compare_mask_and_count, NULL); + + /* If the order is the same, nothing to do... */ + for (i = 0; i < masks_entries; i++) { + if (i != masks_and_count[i].index) + break; + } + if (i == masks_entries) + goto free_mask_entries; + + /* Rebuilt the new list in order of usage. */ + new = tbl_mask_array_alloc(ma->max); + if (!new) + goto free_mask_entries; + + for (i = 0; i < masks_entries; i++) { + int index = masks_and_count[i].index; + + if (ovsl_dereference(ma->masks[index])) + new->masks[new->count++] = ma->masks[index]; + } + + rcu_assign_pointer(table->mask_array, new); + call_rcu(&ma->rcu, mask_array_rcu_cb); + +free_mask_entries: + kfree(masks_and_count); +} + +/* Initializes the flow module. + * Returns zero if successful or a negative error code. */ +int ovs_flow_init(void) +{ + BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long)); + BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); + + flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) + + (nr_cpu_ids + * sizeof(struct sw_flow_stats *)) + + cpumask_size(), + 0, 0, NULL); + if (flow_cache == NULL) + return -ENOMEM; + + flow_stats_cache + = kmem_cache_create("sw_flow_stats", sizeof(struct sw_flow_stats), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (flow_stats_cache == NULL) { + kmem_cache_destroy(flow_cache); + flow_cache = NULL; + return -ENOMEM; + } + + return 0; +} + +/* Uninitializes the flow module. */ +void ovs_flow_exit(void) +{ + kmem_cache_destroy(flow_stats_cache); + kmem_cache_destroy(flow_cache); +} diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h new file mode 100644 index 000000000000..f524dc3e4862 --- /dev/null +++ b/net/openvswitch/flow_table.h @@ -0,0 +1,115 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2007-2013 Nicira, Inc. + */ + +#ifndef FLOW_TABLE_H +#define FLOW_TABLE_H 1 + +#include <linux/kernel.h> +#include <linux/netlink.h> +#include <linux/openvswitch.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/if_ether.h> +#include <linux/in6.h> +#include <linux/jiffies.h> +#include <linux/time.h> + +#include <net/inet_ecn.h> +#include <net/ip_tunnels.h> + +#include "flow.h" + +struct mask_cache_entry { + u32 skb_hash; + u32 mask_index; +}; + +struct mask_cache { + struct rcu_head rcu; + u32 cache_size; /* Must be ^2 value. */ + struct mask_cache_entry __percpu *mask_cache; +}; + +struct mask_count { + int index; + u64 counter; +}; + +struct mask_array_stats { + struct u64_stats_sync syncp; + u64 usage_cntrs[]; +}; + +struct mask_array { + struct rcu_head rcu; + int count, max; + struct mask_array_stats __percpu *masks_usage_stats; + u64 *masks_usage_zero_cntr; + struct sw_flow_mask __rcu *masks[] __counted_by(max); +}; + +struct table_instance { + struct hlist_head *buckets; + unsigned int n_buckets; + struct rcu_head rcu; + int node_ver; + u32 hash_seed; +}; + +struct flow_table { + struct table_instance __rcu *ti; + struct table_instance __rcu *ufid_ti; + struct mask_cache __rcu *mask_cache; + struct mask_array __rcu *mask_array; + unsigned long last_rehash; + unsigned int count; + unsigned int ufid_count; +}; + +extern struct kmem_cache *flow_stats_cache; + +int ovs_flow_init(void); +void ovs_flow_exit(void); + +struct sw_flow *ovs_flow_alloc(void); +void ovs_flow_free(struct sw_flow *, bool deferred); + +int ovs_flow_tbl_init(struct flow_table *); +int ovs_flow_tbl_count(const struct flow_table *table); +void ovs_flow_tbl_destroy(struct flow_table *table); +int ovs_flow_tbl_flush(struct flow_table *flow_table); + +int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, + const struct sw_flow_mask *mask); +void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow); +int ovs_flow_tbl_num_masks(const struct flow_table *table); +u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table); +int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size); +struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *table, + u32 *bucket, u32 *idx); +struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *, + const struct sw_flow_key *, + u32 skb_hash, + u32 *n_mask_hit, + u32 *n_cache_hit); +struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *, + const struct sw_flow_key *); +struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, + const struct sw_flow_match *match); +struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *, + const struct sw_flow_id *); + +bool ovs_flow_cmp(const struct sw_flow *, const struct sw_flow_match *); + +void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, + bool full, const struct sw_flow_mask *mask); + +void ovs_flow_masks_rebalance(struct flow_table *table); +void table_instance_flow_flush(struct flow_table *table, + struct table_instance *ti, + struct table_instance *ufid_ti); + +#endif /* flow_table.h */ diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c new file mode 100644 index 000000000000..cc08e0403909 --- /dev/null +++ b/net/openvswitch/meter.c @@ -0,0 +1,766 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2017 Nicira, Inc. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/if.h> +#include <linux/skbuff.h> +#include <linux/ip.h> +#include <linux/kernel.h> +#include <linux/openvswitch.h> +#include <linux/netlink.h> +#include <linux/rculist.h> + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "datapath.h" +#include "meter.h" + +static const struct nla_policy meter_policy[OVS_METER_ATTR_MAX + 1] = { + [OVS_METER_ATTR_ID] = { .type = NLA_U32, }, + [OVS_METER_ATTR_KBPS] = { .type = NLA_FLAG }, + [OVS_METER_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, + [OVS_METER_ATTR_BANDS] = { .type = NLA_NESTED }, + [OVS_METER_ATTR_USED] = { .type = NLA_U64 }, + [OVS_METER_ATTR_CLEAR] = { .type = NLA_FLAG }, + [OVS_METER_ATTR_MAX_METERS] = { .type = NLA_U32 }, + [OVS_METER_ATTR_MAX_BANDS] = { .type = NLA_U32 }, +}; + +static const struct nla_policy band_policy[OVS_BAND_ATTR_MAX + 1] = { + [OVS_BAND_ATTR_TYPE] = { .type = NLA_U32, }, + [OVS_BAND_ATTR_RATE] = { .type = NLA_U32, }, + [OVS_BAND_ATTR_BURST] = { .type = NLA_U32, }, + [OVS_BAND_ATTR_STATS] = { .len = sizeof(struct ovs_flow_stats) }, +}; + +static u32 meter_hash(struct dp_meter_instance *ti, u32 id) +{ + return id % ti->n_meters; +} + +static void ovs_meter_free(struct dp_meter *meter) +{ + if (!meter) + return; + + kfree_rcu(meter, rcu); +} + +/* Call with ovs_mutex or RCU read lock. */ +static struct dp_meter *lookup_meter(const struct dp_meter_table *tbl, + u32 meter_id) +{ + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 hash = meter_hash(ti, meter_id); + struct dp_meter *meter; + + meter = rcu_dereference_ovsl(ti->dp_meters[hash]); + if (meter && likely(meter->id == meter_id)) + return meter; + + return NULL; +} + +static struct dp_meter_instance *dp_meter_instance_alloc(const u32 size) +{ + struct dp_meter_instance *ti; + + ti = kvzalloc(struct_size(ti, dp_meters, size), GFP_KERNEL); + if (!ti) + return NULL; + + ti->n_meters = size; + + return ti; +} + +static void dp_meter_instance_free(struct dp_meter_instance *ti) +{ + kvfree(ti); +} + +static void dp_meter_instance_free_rcu(struct rcu_head *rcu) +{ + struct dp_meter_instance *ti; + + ti = container_of(rcu, struct dp_meter_instance, rcu); + kvfree(ti); +} + +static int +dp_meter_instance_realloc(struct dp_meter_table *tbl, u32 size) +{ + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + int n_meters = min(size, ti->n_meters); + struct dp_meter_instance *new_ti; + int i; + + new_ti = dp_meter_instance_alloc(size); + if (!new_ti) + return -ENOMEM; + + for (i = 0; i < n_meters; i++) + if (rcu_dereference_ovsl(ti->dp_meters[i])) + new_ti->dp_meters[i] = ti->dp_meters[i]; + + rcu_assign_pointer(tbl->ti, new_ti); + call_rcu(&ti->rcu, dp_meter_instance_free_rcu); + + return 0; +} + +static void dp_meter_instance_insert(struct dp_meter_instance *ti, + struct dp_meter *meter) +{ + u32 hash; + + hash = meter_hash(ti, meter->id); + rcu_assign_pointer(ti->dp_meters[hash], meter); +} + +static void dp_meter_instance_remove(struct dp_meter_instance *ti, + struct dp_meter *meter) +{ + u32 hash; + + hash = meter_hash(ti, meter->id); + RCU_INIT_POINTER(ti->dp_meters[hash], NULL); +} + +static int attach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) +{ + struct dp_meter_instance *ti = rcu_dereference_ovsl(tbl->ti); + u32 hash = meter_hash(ti, meter->id); + int err; + + /* In generally, slots selected should be empty, because + * OvS uses id-pool to fetch a available id. + */ + if (unlikely(rcu_dereference_ovsl(ti->dp_meters[hash]))) + return -EBUSY; + + dp_meter_instance_insert(ti, meter); + + /* That function is thread-safe. */ + tbl->count++; + if (tbl->count >= tbl->max_meters_allowed) { + err = -EFBIG; + goto attach_err; + } + + if (tbl->count >= ti->n_meters && + dp_meter_instance_realloc(tbl, ti->n_meters * 2)) { + err = -ENOMEM; + goto attach_err; + } + + return 0; + +attach_err: + dp_meter_instance_remove(ti, meter); + tbl->count--; + return err; +} + +static int detach_meter(struct dp_meter_table *tbl, struct dp_meter *meter) +{ + struct dp_meter_instance *ti; + + ASSERT_OVSL(); + if (!meter) + return 0; + + ti = rcu_dereference_ovsl(tbl->ti); + dp_meter_instance_remove(ti, meter); + + tbl->count--; + + /* Shrink the meter array if necessary. */ + if (ti->n_meters > DP_METER_ARRAY_SIZE_MIN && + tbl->count <= (ti->n_meters / 4)) { + int half_size = ti->n_meters / 2; + int i; + + /* Avoid hash collision, don't move slots to other place. + * Make sure there are no references of meters in array + * which will be released. + */ + for (i = half_size; i < ti->n_meters; i++) + if (rcu_dereference_ovsl(ti->dp_meters[i])) + goto out; + + if (dp_meter_instance_realloc(tbl, half_size)) + goto shrink_err; + } + +out: + return 0; + +shrink_err: + dp_meter_instance_insert(ti, meter); + tbl->count++; + return -ENOMEM; +} + +static struct sk_buff * +ovs_meter_cmd_reply_start(struct genl_info *info, u8 cmd, + struct ovs_header **ovs_reply_header) +{ + struct sk_buff *skb; + struct ovs_header *ovs_header = genl_info_userhdr(info); + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + + *ovs_reply_header = genlmsg_put(skb, info->snd_portid, + info->snd_seq, + &dp_meter_genl_family, 0, cmd); + if (!*ovs_reply_header) { + nlmsg_free(skb); + return ERR_PTR(-EMSGSIZE); + } + (*ovs_reply_header)->dp_ifindex = ovs_header->dp_ifindex; + + return skb; +} + +static int ovs_meter_cmd_reply_stats(struct sk_buff *reply, u32 meter_id, + struct dp_meter *meter) +{ + struct nlattr *nla; + struct dp_meter_band *band; + u16 i; + + if (nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id)) + goto error; + + if (nla_put(reply, OVS_METER_ATTR_STATS, + sizeof(struct ovs_flow_stats), &meter->stats)) + goto error; + + if (nla_put_u64_64bit(reply, OVS_METER_ATTR_USED, meter->used, + OVS_METER_ATTR_PAD)) + goto error; + + nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); + if (!nla) + goto error; + + band = meter->bands; + + for (i = 0; i < meter->n_bands; ++i, ++band) { + struct nlattr *band_nla; + + band_nla = nla_nest_start_noflag(reply, OVS_BAND_ATTR_UNSPEC); + if (!band_nla || nla_put(reply, OVS_BAND_ATTR_STATS, + sizeof(struct ovs_flow_stats), + &band->stats)) + goto error; + nla_nest_end(reply, band_nla); + } + nla_nest_end(reply, nla); + + return 0; +error: + return -EMSGSIZE; +} + +static int ovs_meter_cmd_features(struct sk_buff *skb, struct genl_info *info) +{ + struct ovs_header *ovs_header = genl_info_userhdr(info); + struct ovs_header *ovs_reply_header; + struct nlattr *nla, *band_nla; + struct sk_buff *reply; + struct datapath *dp; + int err = -EMSGSIZE; + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_FEATURES, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_METERS, + dp->meter_tbl.max_meters_allowed)) + goto exit_unlock; + + ovs_unlock(); + + if (nla_put_u32(reply, OVS_METER_ATTR_MAX_BANDS, DP_MAX_BANDS)) + goto nla_put_failure; + + nla = nla_nest_start_noflag(reply, OVS_METER_ATTR_BANDS); + if (!nla) + goto nla_put_failure; + + band_nla = nla_nest_start_noflag(reply, OVS_BAND_ATTR_UNSPEC); + if (!band_nla) + goto nla_put_failure; + /* Currently only DROP band type is supported. */ + if (nla_put_u32(reply, OVS_BAND_ATTR_TYPE, OVS_METER_BAND_TYPE_DROP)) + goto nla_put_failure; + nla_nest_end(reply, band_nla); + nla_nest_end(reply, nla); + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_unlock: + ovs_unlock(); +nla_put_failure: + nlmsg_free(reply); + return err; +} + +static struct dp_meter *dp_meter_create(struct nlattr **a) +{ + struct nlattr *nla; + int rem; + u16 n_bands = 0; + struct dp_meter *meter; + struct dp_meter_band *band; + int err; + + /* Validate attributes, count the bands. */ + if (!a[OVS_METER_ATTR_BANDS]) + return ERR_PTR(-EINVAL); + + nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) + if (++n_bands > DP_MAX_BANDS) + return ERR_PTR(-EINVAL); + + /* Allocate and set up the meter before locking anything. */ + meter = kzalloc(struct_size(meter, bands, n_bands), GFP_KERNEL_ACCOUNT); + if (!meter) + return ERR_PTR(-ENOMEM); + + meter->id = nla_get_u32(a[OVS_METER_ATTR_ID]); + meter->used = div_u64(ktime_get_ns(), 1000 * 1000); + meter->kbps = a[OVS_METER_ATTR_KBPS] ? 1 : 0; + meter->keep_stats = !a[OVS_METER_ATTR_CLEAR]; + spin_lock_init(&meter->lock); + if (meter->keep_stats && a[OVS_METER_ATTR_STATS]) { + meter->stats = *(struct ovs_flow_stats *) + nla_data(a[OVS_METER_ATTR_STATS]); + } + meter->n_bands = n_bands; + + /* Set up meter bands. */ + band = meter->bands; + nla_for_each_nested(nla, a[OVS_METER_ATTR_BANDS], rem) { + struct nlattr *attr[OVS_BAND_ATTR_MAX + 1]; + u32 band_max_delta_t; + + err = nla_parse_deprecated((struct nlattr **)&attr, + OVS_BAND_ATTR_MAX, nla_data(nla), + nla_len(nla), band_policy, NULL); + if (err) + goto exit_free_meter; + + if (!attr[OVS_BAND_ATTR_TYPE] || + !attr[OVS_BAND_ATTR_RATE] || + !attr[OVS_BAND_ATTR_BURST]) { + err = -EINVAL; + goto exit_free_meter; + } + + band->type = nla_get_u32(attr[OVS_BAND_ATTR_TYPE]); + band->rate = nla_get_u32(attr[OVS_BAND_ATTR_RATE]); + if (band->rate == 0) { + err = -EINVAL; + goto exit_free_meter; + } + + band->burst_size = nla_get_u32(attr[OVS_BAND_ATTR_BURST]); + /* Figure out max delta_t that is enough to fill any bucket. + * Keep max_delta_t size to the bucket units: + * pkts => 1/1000 packets, kilobits => bits. + * + * Start with a full bucket. + */ + band->bucket = band->burst_size * 1000ULL; + band_max_delta_t = div_u64(band->bucket, band->rate); + if (band_max_delta_t > meter->max_delta_t) + meter->max_delta_t = band_max_delta_t; + band++; + } + + return meter; + +exit_free_meter: + kfree(meter); + return ERR_PTR(err); +} + +static int ovs_meter_cmd_set(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr **a = info->attrs; + struct dp_meter *meter, *old_meter; + struct sk_buff *reply; + struct ovs_header *ovs_reply_header; + struct ovs_header *ovs_header = genl_info_userhdr(info); + struct dp_meter_table *meter_tbl; + struct datapath *dp; + int err; + u32 meter_id; + bool failed; + + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; + + meter = dp_meter_create(a); + if (IS_ERR(meter)) + return PTR_ERR(meter); + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_SET, + &ovs_reply_header); + if (IS_ERR(reply)) { + err = PTR_ERR(reply); + goto exit_free_meter; + } + + ovs_lock(); + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + meter_tbl = &dp->meter_tbl; + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + + old_meter = lookup_meter(meter_tbl, meter_id); + err = detach_meter(meter_tbl, old_meter); + if (err) + goto exit_unlock; + + err = attach_meter(meter_tbl, meter); + if (err) + goto exit_free_old_meter; + + ovs_unlock(); + + /* Build response with the meter_id and stats from + * the old meter, if any. + */ + failed = nla_put_u32(reply, OVS_METER_ATTR_ID, meter_id); + WARN_ON(failed); + if (old_meter) { + spin_lock_bh(&old_meter->lock); + if (old_meter->keep_stats) { + err = ovs_meter_cmd_reply_stats(reply, meter_id, + old_meter); + WARN_ON(err); + } + spin_unlock_bh(&old_meter->lock); + ovs_meter_free(old_meter); + } + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_free_old_meter: + ovs_meter_free(old_meter); +exit_unlock: + ovs_unlock(); + nlmsg_free(reply); +exit_free_meter: + kfree(meter); + return err; +} + +static int ovs_meter_cmd_get(struct sk_buff *skb, struct genl_info *info) +{ + struct ovs_header *ovs_header = genl_info_userhdr(info); + struct ovs_header *ovs_reply_header; + struct nlattr **a = info->attrs; + struct dp_meter *meter; + struct sk_buff *reply; + struct datapath *dp; + u32 meter_id; + int err; + + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; + + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_GET, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + ovs_lock(); + + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + /* Locate meter, copy stats. */ + meter = lookup_meter(&dp->meter_tbl, meter_id); + if (!meter) { + err = -ENOENT; + goto exit_unlock; + } + + spin_lock_bh(&meter->lock); + err = ovs_meter_cmd_reply_stats(reply, meter_id, meter); + spin_unlock_bh(&meter->lock); + if (err) + goto exit_unlock; + + ovs_unlock(); + + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_unlock: + ovs_unlock(); + nlmsg_free(reply); + return err; +} + +static int ovs_meter_cmd_del(struct sk_buff *skb, struct genl_info *info) +{ + struct ovs_header *ovs_header = genl_info_userhdr(info); + struct ovs_header *ovs_reply_header; + struct nlattr **a = info->attrs; + struct dp_meter *old_meter; + struct sk_buff *reply; + struct datapath *dp; + u32 meter_id; + int err; + + if (!a[OVS_METER_ATTR_ID]) + return -EINVAL; + + reply = ovs_meter_cmd_reply_start(info, OVS_METER_CMD_DEL, + &ovs_reply_header); + if (IS_ERR(reply)) + return PTR_ERR(reply); + + ovs_lock(); + + dp = get_dp(sock_net(skb->sk), ovs_header->dp_ifindex); + if (!dp) { + err = -ENODEV; + goto exit_unlock; + } + + meter_id = nla_get_u32(a[OVS_METER_ATTR_ID]); + old_meter = lookup_meter(&dp->meter_tbl, meter_id); + if (old_meter) { + spin_lock_bh(&old_meter->lock); + err = ovs_meter_cmd_reply_stats(reply, meter_id, old_meter); + WARN_ON(err); + spin_unlock_bh(&old_meter->lock); + + err = detach_meter(&dp->meter_tbl, old_meter); + if (err) + goto exit_unlock; + } + + ovs_unlock(); + ovs_meter_free(old_meter); + genlmsg_end(reply, ovs_reply_header); + return genlmsg_reply(reply, info); + +exit_unlock: + ovs_unlock(); + nlmsg_free(reply); + return err; +} + +/* Meter action execution. + * + * Return true 'meter_id' drop band is triggered. The 'skb' should be + * dropped by the caller'. + */ +bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, u32 meter_id) +{ + long long int now_ms = div_u64(ktime_get_ns(), 1000 * 1000); + long long int long_delta_ms; + struct dp_meter_band *band; + struct dp_meter *meter; + int i, band_exceeded_max = -1; + u32 band_exceeded_rate = 0; + u32 delta_ms; + u32 cost; + + meter = lookup_meter(&dp->meter_tbl, meter_id); + /* Do not drop the packet when there is no meter. */ + if (!meter) + return false; + + /* Lock the meter while using it. */ + spin_lock(&meter->lock); + + long_delta_ms = (now_ms - meter->used); /* ms */ + if (long_delta_ms < 0) { + /* This condition means that we have several threads fighting + * for a meter lock, and the one who received the packets a + * bit later wins. Assuming that all racing threads received + * packets at the same time to avoid overflow. + */ + long_delta_ms = 0; + } + + /* Make sure delta_ms will not be too large, so that bucket will not + * wrap around below. + */ + delta_ms = (long_delta_ms > (long long int)meter->max_delta_t) + ? meter->max_delta_t : (u32)long_delta_ms; + + /* Update meter statistics. + */ + meter->used = now_ms; + meter->stats.n_packets += 1; + meter->stats.n_bytes += skb->len; + + /* Bucket rate is either in kilobits per second, or in packets per + * second. We maintain the bucket in the units of either bits or + * 1/1000th of a packet, correspondingly. + * Then, when rate is multiplied with milliseconds, we get the + * bucket units: + * msec * kbps = bits, and + * msec * packets/sec = 1/1000 packets. + * + * 'cost' is the number of bucket units in this packet. + */ + cost = (meter->kbps) ? skb->len * 8 : 1000; + + /* Update all bands and find the one hit with the highest rate. */ + for (i = 0; i < meter->n_bands; ++i) { + long long int max_bucket_size; + + band = &meter->bands[i]; + max_bucket_size = band->burst_size * 1000LL; + + band->bucket += delta_ms * band->rate; + if (band->bucket > max_bucket_size) + band->bucket = max_bucket_size; + + if (band->bucket >= cost) { + band->bucket -= cost; + } else if (band->rate > band_exceeded_rate) { + band_exceeded_rate = band->rate; + band_exceeded_max = i; + } + } + + if (band_exceeded_max >= 0) { + /* Update band statistics. */ + band = &meter->bands[band_exceeded_max]; + band->stats.n_packets += 1; + band->stats.n_bytes += skb->len; + + /* Drop band triggered, let the caller drop the 'skb'. */ + if (band->type == OVS_METER_BAND_TYPE_DROP) { + spin_unlock(&meter->lock); + return true; + } + } + + spin_unlock(&meter->lock); + return false; +} + +static const struct genl_small_ops dp_meter_genl_ops[] = { + { .cmd = OVS_METER_CMD_FEATURES, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = 0, /* OK for unprivileged users. */ + .doit = ovs_meter_cmd_features + }, + { .cmd = OVS_METER_CMD_SET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ + .doit = ovs_meter_cmd_set, + }, + { .cmd = OVS_METER_CMD_GET, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = 0, /* OK for unprivileged users. */ + .doit = ovs_meter_cmd_get, + }, + { .cmd = OVS_METER_CMD_DEL, + .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ + .doit = ovs_meter_cmd_del + }, +}; + +static const struct genl_multicast_group ovs_meter_multicast_group = { + .name = OVS_METER_MCGROUP, +}; + +struct genl_family dp_meter_genl_family __ro_after_init = { + .hdrsize = sizeof(struct ovs_header), + .name = OVS_METER_FAMILY, + .version = OVS_METER_VERSION, + .maxattr = OVS_METER_ATTR_MAX, + .policy = meter_policy, + .netnsok = true, + .parallel_ops = true, + .small_ops = dp_meter_genl_ops, + .n_small_ops = ARRAY_SIZE(dp_meter_genl_ops), + .resv_start_op = OVS_METER_CMD_GET + 1, + .mcgrps = &ovs_meter_multicast_group, + .n_mcgrps = 1, + .module = THIS_MODULE, +}; + +int ovs_meters_init(struct datapath *dp) +{ + struct dp_meter_table *tbl = &dp->meter_tbl; + struct dp_meter_instance *ti; + unsigned long free_mem_bytes; + + ti = dp_meter_instance_alloc(DP_METER_ARRAY_SIZE_MIN); + if (!ti) + return -ENOMEM; + + /* Allow meters in a datapath to use ~3.12% of physical memory. */ + free_mem_bytes = nr_free_buffer_pages() * (PAGE_SIZE >> 5); + tbl->max_meters_allowed = min(free_mem_bytes / sizeof(struct dp_meter), + DP_METER_NUM_MAX); + if (!tbl->max_meters_allowed) + goto out_err; + + rcu_assign_pointer(tbl->ti, ti); + tbl->count = 0; + + return 0; + +out_err: + dp_meter_instance_free(ti); + return -ENOMEM; +} + +void ovs_meters_exit(struct datapath *dp) +{ + struct dp_meter_table *tbl = &dp->meter_tbl; + struct dp_meter_instance *ti = rcu_dereference_raw(tbl->ti); + int i; + + for (i = 0; i < ti->n_meters; i++) + ovs_meter_free(rcu_dereference_raw(ti->dp_meters[i])); + + dp_meter_instance_free(ti); +} diff --git a/net/openvswitch/meter.h b/net/openvswitch/meter.h new file mode 100644 index 000000000000..8bbf983cd244 --- /dev/null +++ b/net/openvswitch/meter.h @@ -0,0 +1,62 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2017 Nicira, Inc. + */ + +#ifndef METER_H +#define METER_H 1 + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/netlink.h> +#include <linux/openvswitch.h> +#include <linux/skbuff.h> +#include <linux/bits.h> + +#include "flow.h" +struct datapath; + +#define DP_MAX_BANDS 1 +#define DP_METER_ARRAY_SIZE_MIN BIT_ULL(10) +#define DP_METER_NUM_MAX (200000UL) + +struct dp_meter_band { + u32 type; + u32 rate; + u32 burst_size; + u64 bucket; /* 1/1000 packets, or in bits */ + struct ovs_flow_stats stats; +}; + +struct dp_meter { + spinlock_t lock; /* Per meter lock */ + struct rcu_head rcu; + u32 id; + u16 kbps:1, keep_stats:1; + u16 n_bands; + u32 max_delta_t; + u64 used; + struct ovs_flow_stats stats; + struct dp_meter_band bands[] __counted_by(n_bands); +}; + +struct dp_meter_instance { + struct rcu_head rcu; + u32 n_meters; + struct dp_meter __rcu *dp_meters[] __counted_by(n_meters); +}; + +struct dp_meter_table { + struct dp_meter_instance __rcu *ti; + u32 count; + u32 max_meters_allowed; +}; + +extern struct genl_family dp_meter_genl_family; +int ovs_meters_init(struct datapath *dp); +void ovs_meters_exit(struct datapath *dp); +bool ovs_meter_execute(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, u32 meter_id); + +#endif /* meter.h */ diff --git a/net/openvswitch/openvswitch_trace.c b/net/openvswitch/openvswitch_trace.c new file mode 100644 index 000000000000..62c5f7d6f023 --- /dev/null +++ b/net/openvswitch/openvswitch_trace.c @@ -0,0 +1,10 @@ +// SPDX-License-Identifier: GPL-2.0 +/* bug in tracepoint.h, it should include this */ +#include <linux/module.h> + +/* sparse isn't too happy with all macros... */ +#ifndef __CHECKER__ +#define CREATE_TRACE_POINTS +#include "openvswitch_trace.h" + +#endif diff --git a/net/openvswitch/openvswitch_trace.h b/net/openvswitch/openvswitch_trace.h new file mode 100644 index 000000000000..74d75aaebef4 --- /dev/null +++ b/net/openvswitch/openvswitch_trace.h @@ -0,0 +1,158 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM openvswitch + +#if !defined(_TRACE_OPENVSWITCH_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OPENVSWITCH_H + +#include <linux/tracepoint.h> + +#include "datapath.h" + +TRACE_EVENT(ovs_do_execute_action, + + TP_PROTO(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, const struct nlattr *a, int rem), + + TP_ARGS(dp, skb, key, a, rem), + + TP_STRUCT__entry( + __field( void *, dpaddr ) + __string( dp_name, ovs_dp_name(dp) ) + __string( dev_name, skb->dev->name ) + __field( void *, skbaddr ) + __field( unsigned int, len ) + __field( unsigned int, data_len ) + __field( unsigned int, truesize ) + __field( u8, nr_frags ) + __field( u16, gso_size ) + __field( u16, gso_type ) + __field( u32, ovs_flow_hash ) + __field( u32, recirc_id ) + __field( void *, keyaddr ) + __field( u16, key_eth_type ) + __field( u8, key_ct_state ) + __field( u8, key_ct_orig_proto ) + __field( u16, key_ct_zone ) + __field( unsigned int, flow_key_valid ) + __field( u8, action_type ) + __field( unsigned int, action_len ) + __field( void *, action_data ) + __field( u8, is_last ) + ), + + TP_fast_assign( + __entry->dpaddr = dp; + __assign_str(dp_name); + __assign_str(dev_name); + __entry->skbaddr = skb; + __entry->len = skb->len; + __entry->data_len = skb->data_len; + __entry->truesize = skb->truesize; + __entry->nr_frags = skb_shinfo(skb)->nr_frags; + __entry->gso_size = skb_shinfo(skb)->gso_size; + __entry->gso_type = skb_shinfo(skb)->gso_type; + __entry->ovs_flow_hash = key->ovs_flow_hash; + __entry->recirc_id = key->recirc_id; + __entry->keyaddr = key; + __entry->key_eth_type = key->eth.type; + __entry->key_ct_state = key->ct_state; + __entry->key_ct_orig_proto = key->ct_orig_proto; + __entry->key_ct_zone = key->ct_zone; + __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID); + __entry->action_type = nla_type(a); + __entry->action_len = nla_len(a); + __entry->action_data = nla_data(a); + __entry->is_last = nla_is_last(a, rem); + ), + + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x ct_Zone=%04x flow_key_valid=%d action_type=%u action_len=%u action_data=%p is_last=%d", + __entry->dpaddr, __get_str(dp_name), __get_str(dev_name), + __entry->skbaddr, __entry->len, __entry->data_len, + __entry->truesize, __entry->nr_frags, __entry->gso_size, + __entry->gso_type, __entry->ovs_flow_hash, + __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type, + __entry->key_ct_state, __entry->key_ct_orig_proto, + __entry->key_ct_zone, + __entry->flow_key_valid, + __entry->action_type, __entry->action_len, + __entry->action_data, __entry->is_last) +); + +TRACE_EVENT(ovs_dp_upcall, + + TP_PROTO(struct datapath *dp, struct sk_buff *skb, + const struct sw_flow_key *key, + const struct dp_upcall_info *upcall_info), + + TP_ARGS(dp, skb, key, upcall_info), + + TP_STRUCT__entry( + __field( void *, dpaddr ) + __string( dp_name, ovs_dp_name(dp) ) + __string( dev_name, skb->dev->name ) + __field( void *, skbaddr ) + __field( unsigned int, len ) + __field( unsigned int, data_len ) + __field( unsigned int, truesize ) + __field( u8, nr_frags ) + __field( u16, gso_size ) + __field( u16, gso_type ) + __field( u32, ovs_flow_hash ) + __field( u32, recirc_id ) + __field( const void *, keyaddr ) + __field( u16, key_eth_type ) + __field( u8, key_ct_state ) + __field( u8, key_ct_orig_proto ) + __field( u16, key_ct_zone ) + __field( unsigned int, flow_key_valid ) + __field( u8, upcall_cmd ) + __field( u32, upcall_port ) + __field( u16, upcall_mru ) + ), + + TP_fast_assign( + __entry->dpaddr = dp; + __assign_str(dp_name); + __assign_str(dev_name); + __entry->skbaddr = skb; + __entry->len = skb->len; + __entry->data_len = skb->data_len; + __entry->truesize = skb->truesize; + __entry->nr_frags = skb_shinfo(skb)->nr_frags; + __entry->gso_size = skb_shinfo(skb)->gso_size; + __entry->gso_type = skb_shinfo(skb)->gso_type; + __entry->ovs_flow_hash = key->ovs_flow_hash; + __entry->recirc_id = key->recirc_id; + __entry->keyaddr = key; + __entry->key_eth_type = key->eth.type; + __entry->key_ct_state = key->ct_state; + __entry->key_ct_orig_proto = key->ct_orig_proto; + __entry->key_ct_zone = key->ct_zone; + __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID); + __entry->upcall_cmd = upcall_info->cmd; + __entry->upcall_port = upcall_info->portid; + __entry->upcall_mru = upcall_info->mru; + ), + + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u truesize=%u nr_frags=%d gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p eth_type=0x%04x ct_state=%02x ct_orig_proto=%02x ct_zone=%04x flow_key_valid=%d upcall_cmd=%u upcall_port=%u upcall_mru=%u", + __entry->dpaddr, __get_str(dp_name), __get_str(dev_name), + __entry->skbaddr, __entry->len, __entry->data_len, + __entry->truesize, __entry->nr_frags, __entry->gso_size, + __entry->gso_type, __entry->ovs_flow_hash, + __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type, + __entry->key_ct_state, __entry->key_ct_orig_proto, + __entry->key_ct_zone, + __entry->flow_key_valid, + __entry->upcall_cmd, __entry->upcall_port, + __entry->upcall_mru) +); + +#endif /* _TRACE_OPENVSWITCH_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE openvswitch_trace +#include <trace/define_trace.h> diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c new file mode 100644 index 000000000000..b10e1602c6b1 --- /dev/null +++ b/net/openvswitch/vport-geneve.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Copyright (c) 2014 Nicira, Inc. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/in.h> +#include <linux/ip.h> +#include <linux/net.h> +#include <linux/rculist.h> +#include <linux/udp.h> +#include <linux/if_vlan.h> +#include <linux/module.h> + +#include <net/geneve.h> +#include <net/icmp.h> +#include <net/ip.h> +#include <net/route.h> +#include <net/udp.h> +#include <net/xfrm.h> + +#include "datapath.h" +#include "vport.h" +#include "vport-netdev.h" + +static struct vport_ops ovs_geneve_vport_ops; +/** + * struct geneve_port - Keeps track of open UDP ports + * @dst_port: destination port. + */ +struct geneve_port { + u16 dst_port; +}; + +static inline struct geneve_port *geneve_vport(const struct vport *vport) +{ + return vport_priv(vport); +} + +static int geneve_get_options(const struct vport *vport, + struct sk_buff *skb) +{ + struct geneve_port *geneve_port = geneve_vport(vport); + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, geneve_port->dst_port)) + return -EMSGSIZE; + return 0; +} + +static struct vport *geneve_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct geneve_port *geneve_port; + struct net_device *dev; + struct vport *vport; + struct nlattr *a; + u16 dst_port; + int err; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + dst_port = nla_get_u16(a); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(sizeof(struct geneve_port), + &ovs_geneve_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + geneve_port = geneve_vport(vport); + geneve_port->dst_port = dst_port; + + rtnl_lock(); + dev = geneve_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port); + if (IS_ERR(dev)) { + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_CAST(dev); + } + + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); + if (err < 0) { + rtnl_delete_link(dev, 0, NULL); + rtnl_unlock(); + ovs_vport_free(vport); + goto error; + } + + rtnl_unlock(); + return vport; +error: + return ERR_PTR(err); +} + +static struct vport *geneve_create(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = geneve_tnl_create(parms); + if (IS_ERR(vport)) + return vport; + + return ovs_netdev_link(vport, parms->name); +} + +static struct vport_ops ovs_geneve_vport_ops = { + .type = OVS_VPORT_TYPE_GENEVE, + .create = geneve_create, + .destroy = ovs_netdev_tunnel_destroy, + .get_options = geneve_get_options, + .send = dev_queue_xmit, +}; + +static int __init ovs_geneve_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_geneve_vport_ops); +} + +static void __exit ovs_geneve_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_geneve_vport_ops); +} + +module_init(ovs_geneve_tnl_init); +module_exit(ovs_geneve_tnl_exit); + +MODULE_DESCRIPTION("OVS: Geneve switching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-5"); diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index 493e9775dcda..4014c9b5eb79 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -1,22 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2007-2013 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * Copyright (c) 2007-2014 Nicira, Inc. */ -#ifdef CONFIG_OPENVSWITCH_GRE #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/if.h> @@ -25,13 +11,12 @@ #include <linux/if_tunnel.h> #include <linux/if_vlan.h> #include <linux/in.h> -#include <linux/if_vlan.h> -#include <linux/in.h> #include <linux/in_route.h> #include <linux/inetdevice.h> #include <linux/jhash.h> #include <linux/list.h> #include <linux/kernel.h> +#include <linux/module.h> #include <linux/workqueue.h> #include <linux/rculist.h> #include <net/route.h> @@ -47,229 +32,72 @@ #include "datapath.h" #include "vport.h" +#include "vport-netdev.h" -/* Returns the least-significant 32 bits of a __be64. */ -static __be32 be64_get_low32(__be64 x) -{ -#ifdef __BIG_ENDIAN - return (__force __be32)x; -#else - return (__force __be32)((__force u64)x >> 32); -#endif -} - -static __be16 filter_tnl_flags(__be16 flags) -{ - return flags & (TUNNEL_CSUM | TUNNEL_KEY); -} - -static struct sk_buff *__build_header(struct sk_buff *skb, - int tunnel_hlen) -{ - const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key; - struct tnl_ptk_info tpi; - - skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM)); - if (IS_ERR(skb)) - return NULL; - - tpi.flags = filter_tnl_flags(tun_key->tun_flags); - tpi.proto = htons(ETH_P_TEB); - tpi.key = be64_get_low32(tun_key->tun_id); - tpi.seq = 0; - gre_build_header(skb, &tpi, tunnel_hlen); +static struct vport_ops ovs_gre_vport_ops; - return skb; -} - -static __be64 key_to_tunnel_id(__be32 key, __be32 seq) +static struct vport *gre_tnl_create(const struct vport_parms *parms) { -#ifdef __BIG_ENDIAN - return (__force __be64)((__force u64)seq << 32 | (__force u32)key); -#else - return (__force __be64)((__force u64)key << 32 | (__force u32)seq); -#endif -} - -/* Called with rcu_read_lock and BH disabled. */ -static int gre_rcv(struct sk_buff *skb, - const struct tnl_ptk_info *tpi) -{ - struct ovs_key_ipv4_tunnel tun_key; - struct ovs_net *ovs_net; + struct net *net = ovs_dp_get_net(parms->dp); + struct net_device *dev; struct vport *vport; - __be64 key; - - ovs_net = net_generic(dev_net(skb->dev), ovs_net_id); - vport = rcu_dereference(ovs_net->vport_net.gre_vport); - if (unlikely(!vport)) - return PACKET_REJECT; - - key = key_to_tunnel_id(tpi->key, tpi->seq); - ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key, - filter_tnl_flags(tpi->flags)); - - ovs_vport_receive(vport, skb, &tun_key); - return PACKET_RCVD; -} - -static int gre_tnl_send(struct vport *vport, struct sk_buff *skb) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct flowi4 fl; - struct rtable *rt; - int min_headroom; - int tunnel_hlen; - __be16 df; int err; - if (unlikely(!OVS_CB(skb)->tun_key)) { - err = -EINVAL; - goto error; - } - - /* Route lookup */ - memset(&fl, 0, sizeof(fl)); - fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst; - fl.saddr = OVS_CB(skb)->tun_key->ipv4_src; - fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos); - fl.flowi4_mark = skb->mark; - fl.flowi4_proto = IPPROTO_GRE; - - rt = ip_route_output_key(net, &fl); - if (IS_ERR(rt)) - return PTR_ERR(rt); - - tunnel_hlen = ip_gre_calc_hlen(OVS_CB(skb)->tun_key->tun_flags); - - min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len - + tunnel_hlen + sizeof(struct iphdr) - + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); - if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) { - int head_delta = SKB_DATA_ALIGN(min_headroom - - skb_headroom(skb) + - 16); - err = pskb_expand_head(skb, max_t(int, head_delta, 0), - 0, GFP_ATOMIC); - if (unlikely(err)) - goto err_free_rt; - } - - if (vlan_tx_tag_present(skb)) { - if (unlikely(!__vlan_put_tag(skb, - skb->vlan_proto, - vlan_tx_tag_get(skb)))) { - err = -ENOMEM; - goto err_free_rt; - } - skb->vlan_tci = 0; + vport = ovs_vport_alloc(0, &ovs_gre_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + rtnl_lock(); + dev = gretap_fb_dev_create(net, parms->name, NET_NAME_USER); + if (IS_ERR(dev)) { + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_CAST(dev); } - /* Push Tunnel header. */ - skb = __build_header(skb, tunnel_hlen); - if (unlikely(!skb)) { - err = 0; - goto err_free_rt; + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); + if (err < 0) { + rtnl_delete_link(dev, 0, NULL); + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_PTR(err); } - df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ? - htons(IP_DF) : 0; - - skb->local_df = 1; - - return iptunnel_xmit(net, rt, skb, fl.saddr, - OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE, - OVS_CB(skb)->tun_key->ipv4_tos, - OVS_CB(skb)->tun_key->ipv4_ttl, df); -err_free_rt: - ip_rt_put(rt); -error: - return err; -} - -static struct gre_cisco_protocol gre_protocol = { - .handler = gre_rcv, - .priority = 1, -}; - -static int gre_ports; -static int gre_init(void) -{ - int err; - - gre_ports++; - if (gre_ports > 1) - return 0; - - err = gre_cisco_register(&gre_protocol); - if (err) - pr_warn("cannot register gre protocol handler\n"); - - return err; -} - -static void gre_exit(void) -{ - gre_ports--; - if (gre_ports > 0) - return; - - gre_cisco_unregister(&gre_protocol); -} - -static const char *gre_get_name(const struct vport *vport) -{ - return vport_priv(vport); + rtnl_unlock(); + return vport; } static struct vport *gre_create(const struct vport_parms *parms) { - struct net *net = ovs_dp_get_net(parms->dp); - struct ovs_net *ovs_net; struct vport *vport; - int err; - err = gre_init(); - if (err) - return ERR_PTR(err); - - ovs_net = net_generic(net, ovs_net_id); - if (ovsl_dereference(ovs_net->vport_net.gre_vport)) { - vport = ERR_PTR(-EEXIST); - goto error; - } - - vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms); + vport = gre_tnl_create(parms); if (IS_ERR(vport)) - goto error; + return vport; - strncpy(vport_priv(vport), parms->name, IFNAMSIZ); - rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport); - return vport; - -error: - gre_exit(); - return vport; + return ovs_netdev_link(vport, parms->name); } -static void gre_tnl_destroy(struct vport *vport) -{ - struct net *net = ovs_dp_get_net(vport->dp); - struct ovs_net *ovs_net; +static struct vport_ops ovs_gre_vport_ops = { + .type = OVS_VPORT_TYPE_GRE, + .create = gre_create, + .send = dev_queue_xmit, + .destroy = ovs_netdev_tunnel_destroy, +}; - ovs_net = net_generic(net, ovs_net_id); +static int __init ovs_gre_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_gre_vport_ops); +} - rcu_assign_pointer(ovs_net->vport_net.gre_vport, NULL); - ovs_vport_deferred_free(vport); - gre_exit(); +static void __exit ovs_gre_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_gre_vport_ops); } -const struct vport_ops ovs_gre_vport_ops = { - .type = OVS_VPORT_TYPE_GRE, - .create = gre_create, - .destroy = gre_tnl_destroy, - .get_name = gre_get_name, - .send = gre_tnl_send, -}; +module_init(ovs_gre_tnl_init); +module_exit(ovs_gre_tnl_exit); -#endif /* OPENVSWITCH_GRE */ +MODULE_DESCRIPTION("OVS: GRE switching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-3"); diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 98d3edbbc235..125d310871e9 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -1,22 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2012 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA */ -#include <linux/hardirq.h> #include <linux/if_vlan.h> #include <linux/kernel.h> #include <linux/netdevice.h> @@ -26,6 +12,7 @@ #include <net/dst.h> #include <net/xfrm.h> +#include <net/rtnetlink.h> #include "datapath.h" #include "vport-internal_dev.h" @@ -35,41 +22,32 @@ struct internal_dev { struct vport *vport; }; +static struct vport_ops ovs_internal_vport_ops; + static struct internal_dev *internal_dev_priv(struct net_device *netdev) { return netdev_priv(netdev); } -/* This function is only called by the kernel network layer.*/ -static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netdev, - struct rtnl_link_stats64 *stats) -{ - struct vport *vport = ovs_internal_dev_get_vport(netdev); - struct ovs_vport_stats vport_stats; - - ovs_vport_get_stats(vport, &vport_stats); - - /* The tx and rx stats need to be swapped because the - * switch and host OS have opposite perspectives. */ - stats->rx_packets = vport_stats.tx_packets; - stats->tx_packets = vport_stats.rx_packets; - stats->rx_bytes = vport_stats.tx_bytes; - stats->tx_bytes = vport_stats.rx_bytes; - stats->rx_errors = vport_stats.tx_errors; - stats->tx_errors = vport_stats.rx_errors; - stats->rx_dropped = vport_stats.tx_dropped; - stats->tx_dropped = vport_stats.rx_dropped; - - return stats; -} - /* Called with rcu_read_lock_bh. */ -static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) +static netdev_tx_t +internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev) { + int len, err; + + /* store len value because skb can be freed inside ovs_vport_receive() */ + len = skb->len; + rcu_read_lock(); - ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); + err = ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL); rcu_read_unlock(); - return 0; + + if (likely(!err)) + dev_sw_netstats_tx_add(netdev, 1, len); + else + netdev->stats.tx_errors++; + + return NETDEV_TX_OK; } static int internal_dev_open(struct net_device *netdev) @@ -87,7 +65,7 @@ static int internal_dev_stop(struct net_device *netdev) static void internal_dev_getinfo(struct net_device *netdev, struct ethtool_drvinfo *info) { - strlcpy(info->driver, "openvswitch", sizeof(info->driver)); + strscpy(info->driver, "openvswitch", sizeof(info->driver)); } static const struct ethtool_ops internal_dev_ethtool_ops = { @@ -95,21 +73,11 @@ static const struct ethtool_ops internal_dev_ethtool_ops = { .get_link = ethtool_op_get_link, }; -static int internal_dev_change_mtu(struct net_device *netdev, int new_mtu) -{ - if (new_mtu < 68) - return -EINVAL; - - netdev->mtu = new_mtu; - return 0; -} - static void internal_dev_destructor(struct net_device *dev) { struct vport *vport = ovs_internal_dev_get_vport(dev); ovs_vport_free(vport); - free_netdev(dev); } static const struct net_device_ops internal_dev_netdev_ops = { @@ -117,76 +85,87 @@ static const struct net_device_ops internal_dev_netdev_ops = { .ndo_stop = internal_dev_stop, .ndo_start_xmit = internal_dev_xmit, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = internal_dev_change_mtu, - .ndo_get_stats64 = internal_dev_get_stats, +}; + +static struct rtnl_link_ops internal_dev_link_ops __read_mostly = { + .kind = "openvswitch", }; static void do_setup(struct net_device *netdev) { ether_setup(netdev); + netdev->max_mtu = ETH_MAX_MTU; + netdev->netdev_ops = &internal_dev_netdev_ops; netdev->priv_flags &= ~IFF_TX_SKB_SHARING; - netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE; - netdev->destructor = internal_dev_destructor; - SET_ETHTOOL_OPS(netdev, &internal_dev_ethtool_ops); - netdev->tx_queue_len = 0; - - netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | - NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_TSO; + netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | + IFF_NO_QUEUE; + netdev->lltx = true; + netdev->needs_free_netdev = true; + netdev->priv_destructor = NULL; + netdev->ethtool_ops = &internal_dev_ethtool_ops; + netdev->rtnl_link_ops = &internal_dev_link_ops; + + netdev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | + NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | + NETIF_F_GSO_ENCAP_ALL; netdev->vlan_features = netdev->features; - netdev->features |= NETIF_F_HW_VLAN_CTAG_TX; - netdev->hw_features = netdev->features & ~NETIF_F_LLTX; + netdev->hw_enc_features = netdev->features; + netdev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; + netdev->hw_features = netdev->features; + eth_hw_addr_random(netdev); } static struct vport *internal_dev_create(const struct vport_parms *parms) { struct vport *vport; - struct netdev_vport *netdev_vport; struct internal_dev *internal_dev; + struct net_device *dev; int err; - vport = ovs_vport_alloc(sizeof(struct netdev_vport), - &ovs_internal_vport_ops, parms); + vport = ovs_vport_alloc(0, &ovs_internal_vport_ops, parms); if (IS_ERR(vport)) { err = PTR_ERR(vport); goto error; } - netdev_vport = netdev_vport_priv(vport); - - netdev_vport->dev = alloc_netdev(sizeof(struct internal_dev), - parms->name, do_setup); - if (!netdev_vport->dev) { + dev = alloc_netdev(sizeof(struct internal_dev), + parms->name, NET_NAME_USER, do_setup); + vport->dev = dev; + if (!vport->dev) { err = -ENOMEM; goto error_free_vport; } + dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; - dev_net_set(netdev_vport->dev, ovs_dp_get_net(vport->dp)); - internal_dev = internal_dev_priv(netdev_vport->dev); + dev_net_set(vport->dev, ovs_dp_get_net(vport->dp)); + dev->ifindex = parms->desired_ifindex; + internal_dev = internal_dev_priv(vport->dev); internal_dev->vport = vport; /* Restrict bridge port to current netns. */ if (vport->port_no == OVSP_LOCAL) - netdev_vport->dev->features |= NETIF_F_NETNS_LOCAL; + vport->dev->netns_immutable = true; rtnl_lock(); - err = register_netdevice(netdev_vport->dev); + err = register_netdevice(vport->dev); if (err) - goto error_free_netdev; + goto error_unlock; + vport->dev->priv_destructor = internal_dev_destructor; - dev_set_promiscuity(netdev_vport->dev, 1); + dev_set_promiscuity(vport->dev, 1); rtnl_unlock(); - netif_start_queue(netdev_vport->dev); + netif_start_queue(vport->dev); return vport; -error_free_netdev: +error_unlock: rtnl_unlock(); - free_netdev(netdev_vport->dev); + free_netdev(dev); error_free_vport: ovs_vport_free(vport); error: @@ -195,44 +174,41 @@ error: static void internal_dev_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - - netif_stop_queue(netdev_vport->dev); + netif_stop_queue(vport->dev); rtnl_lock(); - dev_set_promiscuity(netdev_vport->dev, -1); + dev_set_promiscuity(vport->dev, -1); /* unregister_netdevice() waits for an RCU grace period. */ - unregister_netdevice(netdev_vport->dev); - + unregister_netdevice(vport->dev); rtnl_unlock(); } -static int internal_dev_recv(struct vport *vport, struct sk_buff *skb) +static int internal_dev_recv(struct sk_buff *skb) { - struct net_device *netdev = netdev_vport_priv(vport)->dev; - int len; + struct net_device *netdev = skb->dev; - len = skb->len; + if (unlikely(!(netdev->flags & IFF_UP))) { + kfree_skb(skb); + netdev->stats.rx_dropped++; + return NETDEV_TX_OK; + } skb_dst_drop(skb); - nf_reset(skb); - secpath_reset(skb); + nf_reset_ct(skb); - skb->dev = netdev; skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, netdev); skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + dev_sw_netstats_rx_add(netdev, skb->len); netif_rx(skb); - - return len; + return NETDEV_TX_OK; } -const struct vport_ops ovs_internal_vport_ops = { +static struct vport_ops ovs_internal_vport_ops = { .type = OVS_VPORT_TYPE_INTERNAL, .create = internal_dev_create, .destroy = internal_dev_destroy, - .get_name = ovs_netdev_get_name, .send = internal_dev_recv, }; @@ -248,3 +224,24 @@ struct vport *ovs_internal_dev_get_vport(struct net_device *netdev) return internal_dev_priv(netdev)->vport; } + +int ovs_internal_dev_rtnl_link_register(void) +{ + int err; + + err = rtnl_link_register(&internal_dev_link_ops); + if (err < 0) + return err; + + err = ovs_vport_ops_register(&ovs_internal_vport_ops); + if (err < 0) + rtnl_link_unregister(&internal_dev_link_ops); + + return err; +} + +void ovs_internal_dev_rtnl_link_unregister(void) +{ + ovs_vport_ops_unregister(&ovs_internal_vport_ops); + rtnl_link_unregister(&internal_dev_link_ops); +} diff --git a/net/openvswitch/vport-internal_dev.h b/net/openvswitch/vport-internal_dev.h index 9a7d30ecc6a2..0112d1b09d4b 100644 --- a/net/openvswitch/vport-internal_dev.h +++ b/net/openvswitch/vport-internal_dev.h @@ -1,19 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2007-2011 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA */ #ifndef VPORT_INTERNAL_DEV_H @@ -24,5 +11,7 @@ int ovs_is_internal_dev(const struct net_device *); struct vport *ovs_internal_dev_get_vport(struct net_device *); +int ovs_internal_dev_rtnl_link_register(void); +void ovs_internal_dev_rtnl_link_unregister(void); #endif /* vport-internal_dev.h */ diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 5982f3f62835..91a11067e458 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2012 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt @@ -25,16 +12,25 @@ #include <linux/llc.h> #include <linux/rtnetlink.h> #include <linux/skbuff.h> +#include <linux/openvswitch.h> +#include <linux/export.h> -#include <net/llc.h> +#include <net/ip_tunnels.h> +#include <net/rtnetlink.h> #include "datapath.h" +#include "vport.h" #include "vport-internal_dev.h" #include "vport-netdev.h" +static struct vport_ops ovs_netdev_vport_ops; + /* Must be called with rcu_read_lock. */ -static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) +static void netdev_port_receive(struct sk_buff *skb) { + struct vport *vport; + + vport = ovs_netdev_get_vport(skb->dev); if (unlikely(!vport)) goto error; @@ -48,12 +44,11 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb) if (unlikely(!skb)) return; - skb_push(skb, ETH_HLEN); - ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN); + if (skb->dev->type == ARPHRD_ETHER) + skb_push_rcsum(skb, ETH_HLEN); - ovs_vport_receive(vport, skb, NULL); + ovs_vport_receive(vport, skb, skb_tunnel_info(skb)); return; - error: kfree_skb(skb); } @@ -62,144 +57,160 @@ error: static rx_handler_result_t netdev_frame_hook(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; - struct vport *vport; if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return RX_HANDLER_PASS; - vport = ovs_netdev_get_vport(skb->dev); - - netdev_port_receive(vport, skb); - + netdev_port_receive(skb); return RX_HANDLER_CONSUMED; } -static struct vport *netdev_create(const struct vport_parms *parms) +static struct net_device *get_dpdev(const struct datapath *dp) { - struct vport *vport; - struct netdev_vport *netdev_vport; - int err; + struct vport *local; - vport = ovs_vport_alloc(sizeof(struct netdev_vport), - &ovs_netdev_vport_ops, parms); - if (IS_ERR(vport)) { - err = PTR_ERR(vport); - goto error; - } + local = ovs_vport_ovsl(dp, OVSP_LOCAL); + return local->dev; +} - netdev_vport = netdev_vport_priv(vport); +struct vport *ovs_netdev_link(struct vport *vport, const char *name) +{ + int err; - netdev_vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), parms->name); - if (!netdev_vport->dev) { + vport->dev = dev_get_by_name(ovs_dp_get_net(vport->dp), name); + if (!vport->dev) { err = -ENODEV; goto error_free_vport; } - - if (netdev_vport->dev->flags & IFF_LOOPBACK || - netdev_vport->dev->type != ARPHRD_ETHER || - ovs_is_internal_dev(netdev_vport->dev)) { + /* Ensure that the device exists and that the provided + * name is not one of its aliases. + */ + if (strcmp(name, ovs_vport_name(vport))) { + err = -ENODEV; + goto error_put; + } + netdev_tracker_alloc(vport->dev, &vport->dev_tracker, GFP_KERNEL); + if (vport->dev->flags & IFF_LOOPBACK || + (vport->dev->type != ARPHRD_ETHER && + vport->dev->type != ARPHRD_NONE) || + ovs_is_internal_dev(vport->dev)) { err = -EINVAL; goto error_put; } rtnl_lock(); - err = netdev_rx_handler_register(netdev_vport->dev, netdev_frame_hook, - vport); + err = netdev_master_upper_dev_link(vport->dev, + get_dpdev(vport->dp), + NULL, NULL, NULL); if (err) goto error_unlock; - dev_set_promiscuity(netdev_vport->dev, 1); - netdev_vport->dev->priv_flags |= IFF_OVS_DATAPATH; + err = netdev_rx_handler_register(vport->dev, netdev_frame_hook, + vport); + if (err) + goto error_master_upper_dev_unlink; + + dev_disable_lro(vport->dev); + dev_set_promiscuity(vport->dev, 1); + vport->dev->priv_flags |= IFF_OVS_DATAPATH; rtnl_unlock(); return vport; +error_master_upper_dev_unlink: + netdev_upper_dev_unlink(vport->dev, get_dpdev(vport->dp)); error_unlock: rtnl_unlock(); error_put: - dev_put(netdev_vport->dev); + netdev_put(vport->dev, &vport->dev_tracker); error_free_vport: ovs_vport_free(vport); -error: return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(ovs_netdev_link); -static void free_port_rcu(struct rcu_head *rcu) +static struct vport *netdev_create(const struct vport_parms *parms) { - struct netdev_vport *netdev_vport = container_of(rcu, - struct netdev_vport, rcu); + struct vport *vport; + + vport = ovs_vport_alloc(0, &ovs_netdev_vport_ops, parms); + if (IS_ERR(vport)) + return vport; - dev_put(netdev_vport->dev); - ovs_vport_free(vport_from_priv(netdev_vport)); + return ovs_netdev_link(vport, parms->name); } -static void netdev_destroy(struct vport *vport) +static void vport_netdev_free(struct rcu_head *rcu) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); + struct vport *vport = container_of(rcu, struct vport, rcu); - rtnl_lock(); - netdev_vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; - netdev_rx_handler_unregister(netdev_vport->dev); - dev_set_promiscuity(netdev_vport->dev, -1); - rtnl_unlock(); - - call_rcu(&netdev_vport->rcu, free_port_rcu); + netdev_put(vport->dev, &vport->dev_tracker); + ovs_vport_free(vport); } -const char *ovs_netdev_get_name(const struct vport *vport) +void ovs_netdev_detach_dev(struct vport *vport) { - const struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - return netdev_vport->dev->name; + ASSERT_RTNL(); + vport->dev->priv_flags &= ~IFF_OVS_DATAPATH; + netdev_rx_handler_unregister(vport->dev); + netdev_upper_dev_unlink(vport->dev, + netdev_master_upper_dev_get(vport->dev)); + dev_set_promiscuity(vport->dev, -1); } -static unsigned int packet_length(const struct sk_buff *skb) +static void netdev_destroy(struct vport *vport) { - unsigned int length = skb->len - ETH_HLEN; - - if (skb->protocol == htons(ETH_P_8021Q)) - length -= VLAN_HLEN; + rtnl_lock(); + if (netif_is_ovs_port(vport->dev)) + ovs_netdev_detach_dev(vport); + rtnl_unlock(); - return length; + call_rcu(&vport->rcu, vport_netdev_free); } -static int netdev_send(struct vport *vport, struct sk_buff *skb) +void ovs_netdev_tunnel_destroy(struct vport *vport) { - struct netdev_vport *netdev_vport = netdev_vport_priv(vport); - int mtu = netdev_vport->dev->mtu; - int len; - - if (unlikely(packet_length(skb) > mtu && !skb_is_gso(skb))) { - net_warn_ratelimited("%s: dropped over-mtu packet: %d > %d\n", - netdev_vport->dev->name, - packet_length(skb), mtu); - goto drop; - } - - skb->dev = netdev_vport->dev; - len = skb->len; - dev_queue_xmit(skb); + rtnl_lock(); + if (netif_is_ovs_port(vport->dev)) + ovs_netdev_detach_dev(vport); - return len; + /* We can be invoked by both explicit vport deletion and + * underlying netdev deregistration; delete the link only + * if it's not already shutting down. + */ + if (vport->dev->reg_state == NETREG_REGISTERED) + rtnl_delete_link(vport->dev, 0, NULL); + netdev_put(vport->dev, &vport->dev_tracker); + vport->dev = NULL; + rtnl_unlock(); -drop: - kfree_skb(skb); - return 0; + call_rcu(&vport->rcu, vport_netdev_free); } +EXPORT_SYMBOL_GPL(ovs_netdev_tunnel_destroy); /* Returns null if this device is not attached to a datapath. */ struct vport *ovs_netdev_get_vport(struct net_device *dev) { - if (likely(dev->priv_flags & IFF_OVS_DATAPATH)) + if (likely(netif_is_ovs_port(dev))) return (struct vport *) rcu_dereference_rtnl(dev->rx_handler_data); else return NULL; } -const struct vport_ops ovs_netdev_vport_ops = { +static struct vport_ops ovs_netdev_vport_ops = { .type = OVS_VPORT_TYPE_NETDEV, .create = netdev_create, .destroy = netdev_destroy, - .get_name = ovs_netdev_get_name, - .send = netdev_send, + .send = dev_queue_xmit, }; + +int __init ovs_netdev_init(void) +{ + return ovs_vport_ops_register(&ovs_netdev_vport_ops); +} + +void ovs_netdev_exit(void) +{ + ovs_vport_ops_unregister(&ovs_netdev_vport_ops); +} diff --git a/net/openvswitch/vport-netdev.h b/net/openvswitch/vport-netdev.h index dd298b5c5cdb..c5d83a43bfc4 100644 --- a/net/openvswitch/vport-netdev.h +++ b/net/openvswitch/vport-netdev.h @@ -1,19 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2007-2011 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA */ #ifndef VPORT_NETDEV_H @@ -26,18 +13,11 @@ struct vport *ovs_netdev_get_vport(struct net_device *dev); -struct netdev_vport { - struct rcu_head rcu; +struct vport *ovs_netdev_link(struct vport *vport, const char *name); +void ovs_netdev_detach_dev(struct vport *); - struct net_device *dev; -}; - -static inline struct netdev_vport * -netdev_vport_priv(const struct vport *vport) -{ - return vport_priv(vport); -} - -const char *ovs_netdev_get_name(const struct vport *); +int __init ovs_netdev_init(void); +void ovs_netdev_exit(void); +void ovs_netdev_tunnel_destroy(struct vport *vport); #endif /* vport_netdev.h */ diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c new file mode 100644 index 000000000000..0b881b043bcf --- /dev/null +++ b/net/openvswitch/vport-vxlan.c @@ -0,0 +1,169 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2014 Nicira, Inc. + * Copyright (c) 2013 Cisco Systems, Inc. + */ + +#include <linux/kernel.h> +#include <linux/skbuff.h> +#include <linux/openvswitch.h> +#include <linux/module.h> +#include <net/udp.h> +#include <net/ip_tunnels.h> +#include <net/rtnetlink.h> +#include <net/vxlan.h> + +#include "datapath.h" +#include "vport.h" +#include "vport-netdev.h" + +static struct vport_ops ovs_vxlan_netdev_vport_ops; + +static int vxlan_get_options(const struct vport *vport, struct sk_buff *skb) +{ + struct vxlan_dev *vxlan = netdev_priv(vport->dev); + __be16 dst_port = vxlan->cfg.dst_port; + + if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, ntohs(dst_port))) + return -EMSGSIZE; + + if (vxlan->cfg.flags & VXLAN_F_GBP) { + struct nlattr *exts; + + exts = nla_nest_start_noflag(skb, OVS_TUNNEL_ATTR_EXTENSION); + if (!exts) + return -EMSGSIZE; + + if (vxlan->cfg.flags & VXLAN_F_GBP && + nla_put_flag(skb, OVS_VXLAN_EXT_GBP)) + return -EMSGSIZE; + + nla_nest_end(skb, exts); + } + + return 0; +} + +static const struct nla_policy exts_policy[OVS_VXLAN_EXT_MAX + 1] = { + [OVS_VXLAN_EXT_GBP] = { .type = NLA_FLAG, }, +}; + +static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr, + struct vxlan_config *conf) +{ + struct nlattr *exts[OVS_VXLAN_EXT_MAX + 1]; + int err; + + if (nla_len(attr) < sizeof(struct nlattr)) + return -EINVAL; + + err = nla_parse_nested_deprecated(exts, OVS_VXLAN_EXT_MAX, attr, + exts_policy, NULL); + if (err < 0) + return err; + + if (exts[OVS_VXLAN_EXT_GBP]) + conf->flags |= VXLAN_F_GBP; + + return 0; +} + +static struct vport *vxlan_tnl_create(const struct vport_parms *parms) +{ + struct net *net = ovs_dp_get_net(parms->dp); + struct nlattr *options = parms->options; + struct net_device *dev; + struct vport *vport; + struct nlattr *a; + int err; + struct vxlan_config conf = { + .no_share = true, + .flags = VXLAN_F_COLLECT_METADATA | VXLAN_F_UDP_ZERO_CSUM6_RX, + /* Don't restrict the packets that can be sent by MTU */ + .mtu = IP_MAX_MTU, + }; + + if (!options) { + err = -EINVAL; + goto error; + } + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_DST_PORT); + if (a && nla_len(a) == sizeof(u16)) { + conf.dst_port = htons(nla_get_u16(a)); + } else { + /* Require destination port from userspace. */ + err = -EINVAL; + goto error; + } + + vport = ovs_vport_alloc(0, &ovs_vxlan_netdev_vport_ops, parms); + if (IS_ERR(vport)) + return vport; + + a = nla_find_nested(options, OVS_TUNNEL_ATTR_EXTENSION); + if (a) { + err = vxlan_configure_exts(vport, a, &conf); + if (err) { + ovs_vport_free(vport); + goto error; + } + } + + rtnl_lock(); + dev = vxlan_dev_create(net, parms->name, NET_NAME_USER, &conf); + if (IS_ERR(dev)) { + rtnl_unlock(); + ovs_vport_free(vport); + return ERR_CAST(dev); + } + + err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); + if (err < 0) { + rtnl_delete_link(dev, 0, NULL); + rtnl_unlock(); + ovs_vport_free(vport); + goto error; + } + + rtnl_unlock(); + return vport; +error: + return ERR_PTR(err); +} + +static struct vport *vxlan_create(const struct vport_parms *parms) +{ + struct vport *vport; + + vport = vxlan_tnl_create(parms); + if (IS_ERR(vport)) + return vport; + + return ovs_netdev_link(vport, parms->name); +} + +static struct vport_ops ovs_vxlan_netdev_vport_ops = { + .type = OVS_VPORT_TYPE_VXLAN, + .create = vxlan_create, + .destroy = ovs_netdev_tunnel_destroy, + .get_options = vxlan_get_options, + .send = dev_queue_xmit, +}; + +static int __init ovs_vxlan_tnl_init(void) +{ + return ovs_vport_ops_register(&ovs_vxlan_netdev_vport_ops); +} + +static void __exit ovs_vxlan_tnl_exit(void) +{ + ovs_vport_ops_unregister(&ovs_vxlan_netdev_vport_ops); +} + +module_init(ovs_vxlan_tnl_init); +module_exit(ovs_vxlan_tnl_exit); + +MODULE_DESCRIPTION("OVS: VXLAN switching port"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("vport-type-4"); diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c index d4c7fa04ce08..6bbbc16ab778 100644 --- a/net/openvswitch/vport.c +++ b/net/openvswitch/vport.c @@ -1,19 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (c) 2007-2012 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA + * Copyright (c) 2007-2014 Nicira, Inc. */ #include <linux/etherdevice.h> @@ -28,21 +15,13 @@ #include <linux/rtnetlink.h> #include <linux/compat.h> #include <net/net_namespace.h> +#include <linux/module.h> #include "datapath.h" #include "vport.h" #include "vport-internal_dev.h" -/* List of statically compiled vport implementations. Don't forget to also - * add yours to the list at the bottom of vport.h. */ -static const struct vport_ops *vport_ops_list[] = { - &ovs_netdev_vport_ops, - &ovs_internal_vport_ops, - -#ifdef CONFIG_OPENVSWITCH_GRE - &ovs_gre_vport_ops, -#endif -}; +static LIST_HEAD(vport_ops_list); /* Protected by RCU read lock for reading, ovs_mutex for writing. */ static struct hlist_head *dev_table; @@ -55,7 +34,7 @@ static struct hlist_head *dev_table; */ int ovs_vport_init(void) { - dev_table = kzalloc(VPORT_HASH_BUCKETS * sizeof(struct hlist_head), + dev_table = kcalloc(VPORT_HASH_BUCKETS, sizeof(struct hlist_head), GFP_KERNEL); if (!dev_table) return -ENOMEM; @@ -73,26 +52,54 @@ void ovs_vport_exit(void) kfree(dev_table); } -static struct hlist_head *hash_bucket(struct net *net, const char *name) +static struct hlist_head *hash_bucket(const struct net *net, const char *name) { unsigned int hash = jhash(name, strlen(name), (unsigned long) net); return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)]; } +int __ovs_vport_ops_register(struct vport_ops *ops) +{ + int err = -EEXIST; + struct vport_ops *o; + + ovs_lock(); + list_for_each_entry(o, &vport_ops_list, list) + if (ops->type == o->type) + goto errout; + + list_add_tail(&ops->list, &vport_ops_list); + err = 0; +errout: + ovs_unlock(); + return err; +} +EXPORT_SYMBOL_GPL(__ovs_vport_ops_register); + +void ovs_vport_ops_unregister(struct vport_ops *ops) +{ + ovs_lock(); + list_del(&ops->list); + ovs_unlock(); +} +EXPORT_SYMBOL_GPL(ovs_vport_ops_unregister); + /** * ovs_vport_locate - find a port that has already been created * + * @net: network namespace * @name: name of port to find * * Must be called with ovs or RCU read lock. */ -struct vport *ovs_vport_locate(struct net *net, const char *name) +struct vport *ovs_vport_locate(const struct net *net, const char *name) { struct hlist_head *bucket = hash_bucket(net, name); struct vport *vport; - hlist_for_each_entry_rcu(vport, bucket, hash_node) - if (!strcmp(name, vport->ops->get_name(vport)) && + hlist_for_each_entry_rcu(vport, bucket, hash_node, + lockdep_ovsl_is_held()) + if (!strcmp(name, ovs_vport_name(vport)) && net_eq(ovs_dp_get_net(vport->dp), net)) return vport; @@ -104,17 +111,20 @@ struct vport *ovs_vport_locate(struct net *net, const char *name) * * @priv_size: Size of private data area to allocate. * @ops: vport device ops + * @parms: information about new vport. * * Allocate and initialize a new vport defined by @ops. The vport will contain * a private data area of size @priv_size that can be accessed using - * vport_priv(). vports that are no longer needed should be released with + * vport_priv(). Some parameters of the vport will be initialized from @parms. + * @vports that are no longer needed should be released with * vport_free(). */ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, - const struct vport_parms *parms) + const struct vport_parms *parms) { struct vport *vport; size_t alloc_size; + int err; alloc_size = sizeof(struct vport); if (priv_size) { @@ -126,22 +136,31 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, if (!vport) return ERR_PTR(-ENOMEM); + vport->upcall_stats = netdev_alloc_pcpu_stats(struct vport_upcall_stats_percpu); + if (!vport->upcall_stats) { + err = -ENOMEM; + goto err_kfree_vport; + } + vport->dp = parms->dp; vport->port_no = parms->port_no; - vport->upcall_portid = parms->upcall_portid; vport->ops = ops; INIT_HLIST_NODE(&vport->dp_hash_node); - vport->percpu_stats = alloc_percpu(struct pcpu_tstats); - if (!vport->percpu_stats) { - kfree(vport); - return ERR_PTR(-ENOMEM); + if (ovs_vport_set_upcall_portids(vport, parms->upcall_portids)) { + err = -EINVAL; + goto err_free_percpu; } - spin_lock_init(&vport->stats_lock); - return vport; + +err_free_percpu: + free_percpu(vport->upcall_stats); +err_kfree_vport: + kfree(vport); + return ERR_PTR(err); } +EXPORT_SYMBOL_GPL(ovs_vport_alloc); /** * ovs_vport_free - uninitialize and free vport @@ -155,9 +174,25 @@ struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *ops, */ void ovs_vport_free(struct vport *vport) { - free_percpu(vport->percpu_stats); + /* vport is freed from RCU callback or error path, Therefore + * it is safe to use raw dereference. + */ + kfree(rcu_dereference_raw(vport->upcall_portids)); + free_percpu(vport->upcall_stats); kfree(vport); } +EXPORT_SYMBOL_GPL(ovs_vport_free); + +static struct vport_ops *ovs_vport_lookup(const struct vport_parms *parms) +{ + struct vport_ops *ops; + + list_for_each_entry(ops, &vport_ops_list, list) + if (ops->type == parms->type) + return ops; + + return NULL; +} /** * ovs_vport_add - add vport device (for kernel callers) @@ -169,38 +204,47 @@ void ovs_vport_free(struct vport *vport) */ struct vport *ovs_vport_add(const struct vport_parms *parms) { + struct vport_ops *ops; struct vport *vport; - int err = 0; - int i; - for (i = 0; i < ARRAY_SIZE(vport_ops_list); i++) { - if (vport_ops_list[i]->type == parms->type) { - struct hlist_head *bucket; + ops = ovs_vport_lookup(parms); + if (ops) { + struct hlist_head *bucket; - vport = vport_ops_list[i]->create(parms); - if (IS_ERR(vport)) { - err = PTR_ERR(vport); - goto out; - } + if (!try_module_get(ops->owner)) + return ERR_PTR(-EAFNOSUPPORT); - bucket = hash_bucket(ovs_dp_get_net(vport->dp), - vport->ops->get_name(vport)); - hlist_add_head_rcu(&vport->hash_node, bucket); + vport = ops->create(parms); + if (IS_ERR(vport)) { + module_put(ops->owner); return vport; } - } - err = -EAFNOSUPPORT; + bucket = hash_bucket(ovs_dp_get_net(vport->dp), + ovs_vport_name(vport)); + hlist_add_head_rcu(&vport->hash_node, bucket); + return vport; + } -out: - return ERR_PTR(err); + /* Unlock to attempt module load and return -EAGAIN if load + * was successful as we need to restart the port addition + * workflow. + */ + ovs_unlock(); + request_module("vport-type-%d", parms->type); + ovs_lock(); + + if (!ovs_vport_lookup(parms)) + return ERR_PTR(-EAFNOSUPPORT); + else + return ERR_PTR(-EAGAIN); } /** * ovs_vport_set_options - modify existing vport device (for kernel callers) * * @vport: vport to modify. - * @port: New configuration. + * @options: New configuration. * * Modifies an existing device with the specified configuration (which is * dependent on device type). ovs_mutex must be held. @@ -217,15 +261,13 @@ int ovs_vport_set_options(struct vport *vport, struct nlattr *options) * * @vport: vport to delete. * - * Detaches @vport from its datapath and destroys it. It is possible to fail - * for reasons such as lack of memory. ovs_mutex must be held. + * Detaches @vport from its datapath and destroys it. ovs_mutex must + * be held. */ void ovs_vport_del(struct vport *vport) { - ASSERT_OVSL(); - hlist_del_rcu(&vport->hash_node); - + module_put(vport->ops->owner); vport->ops->destroy(vport); } @@ -241,45 +283,69 @@ void ovs_vport_del(struct vport *vport) */ void ovs_vport_get_stats(struct vport *vport, struct ovs_vport_stats *stats) { - int i; - - memset(stats, 0, sizeof(*stats)); - - /* We potentially have 2 sources of stats that need to be combined: - * those we have collected (split into err_stats and percpu_stats) from - * set_stats() and device error stats from netdev->get_stats() (for - * errors that happen downstream and therefore aren't reported through - * our vport_record_error() function). - * Stats from first source are reported by ovs (OVS_VPORT_ATTR_STATS). - * netdev-stats can be directly read over netlink-ioctl. - */ - - spin_lock_bh(&vport->stats_lock); + const struct rtnl_link_stats64 *dev_stats; + struct rtnl_link_stats64 temp; + + dev_stats = dev_get_stats(vport->dev, &temp); + stats->rx_errors = dev_stats->rx_errors; + stats->tx_errors = dev_stats->tx_errors; + stats->tx_dropped = dev_stats->tx_dropped; + stats->rx_dropped = dev_stats->rx_dropped; + + stats->rx_bytes = dev_stats->rx_bytes; + stats->rx_packets = dev_stats->rx_packets; + stats->tx_bytes = dev_stats->tx_bytes; + stats->tx_packets = dev_stats->tx_packets; +} - stats->rx_errors = vport->err_stats.rx_errors; - stats->tx_errors = vport->err_stats.tx_errors; - stats->tx_dropped = vport->err_stats.tx_dropped; - stats->rx_dropped = vport->err_stats.rx_dropped; +/** + * ovs_vport_get_upcall_stats - retrieve upcall stats + * + * @vport: vport from which to retrieve the stats. + * @skb: sk_buff where upcall stats should be appended. + * + * Retrieves upcall stats for the given device. + * + * Must be called with ovs_mutex or rcu_read_lock. + */ +int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb) +{ + struct nlattr *nla; + int i; - spin_unlock_bh(&vport->stats_lock); + __u64 tx_success = 0; + __u64 tx_fail = 0; for_each_possible_cpu(i) { - const struct pcpu_tstats *percpu_stats; - struct pcpu_tstats local_stats; + const struct vport_upcall_stats_percpu *stats; unsigned int start; - percpu_stats = per_cpu_ptr(vport->percpu_stats, i); - + stats = per_cpu_ptr(vport->upcall_stats, i); do { - start = u64_stats_fetch_begin_bh(&percpu_stats->syncp); - local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_bh(&percpu_stats->syncp, start)); - - stats->rx_bytes += local_stats.rx_bytes; - stats->rx_packets += local_stats.rx_packets; - stats->tx_bytes += local_stats.tx_bytes; - stats->tx_packets += local_stats.tx_packets; + start = u64_stats_fetch_begin(&stats->syncp); + tx_success += u64_stats_read(&stats->n_success); + tx_fail += u64_stats_read(&stats->n_fail); + } while (u64_stats_fetch_retry(&stats->syncp, start)); + } + + nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_UPCALL_STATS); + if (!nla) + return -EMSGSIZE; + + if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_SUCCESS, tx_success, + OVS_VPORT_ATTR_PAD)) { + nla_nest_cancel(skb, nla); + return -EMSGSIZE; } + + if (nla_put_u64_64bit(skb, OVS_VPORT_UPCALL_ATTR_FAIL, tx_fail, + OVS_VPORT_ATTR_PAD)) { + nla_nest_cancel(skb, nla); + return -EMSGSIZE; + } + nla_nest_end(skb, nla); + + return 0; } /** @@ -306,7 +372,7 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) if (!vport->ops->get_options) return 0; - nla = nla_nest_start(skb, OVS_VPORT_ATTR_OPTIONS); + nla = nla_nest_start_noflag(skb, OVS_VPORT_ATTR_OPTIONS); if (!nla) return -EMSGSIZE; @@ -321,105 +387,196 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb) } /** - * ovs_vport_receive - pass up received packet to the datapath for processing + * ovs_vport_set_upcall_portids - set upcall portids of @vport. * - * @vport: vport that received the packet - * @skb: skb that was received + * @vport: vport to modify. + * @ids: new configuration, an array of port ids. * - * Must be called with rcu_read_lock. The packet cannot be shared and - * skb->data should point to the Ethernet header. + * Sets the vport's upcall_portids to @ids. + * + * Returns 0 if successful, -EINVAL if @ids is zero length or cannot be parsed + * as an array of U32. + * + * Must be called with ovs_mutex. */ -void ovs_vport_receive(struct vport *vport, struct sk_buff *skb, - struct ovs_key_ipv4_tunnel *tun_key) +int ovs_vport_set_upcall_portids(struct vport *vport, const struct nlattr *ids) { - struct pcpu_tstats *stats; + struct vport_portids *old, *vport_portids; + + if (!nla_len(ids) || nla_len(ids) % sizeof(u32)) + return -EINVAL; - stats = this_cpu_ptr(vport->percpu_stats); - u64_stats_update_begin(&stats->syncp); - stats->rx_packets++; - stats->rx_bytes += skb->len; - u64_stats_update_end(&stats->syncp); + old = ovsl_dereference(vport->upcall_portids); + + vport_portids = kmalloc(sizeof(*vport_portids) + nla_len(ids), + GFP_KERNEL); + if (!vport_portids) + return -ENOMEM; - OVS_CB(skb)->tun_key = tun_key; - ovs_dp_process_received_packet(vport, skb); + vport_portids->n_ids = nla_len(ids) / sizeof(u32); + vport_portids->rn_ids = reciprocal_value(vport_portids->n_ids); + nla_memcpy(vport_portids->ids, ids, nla_len(ids)); + + rcu_assign_pointer(vport->upcall_portids, vport_portids); + + if (old) + kfree_rcu(old, rcu); + return 0; } /** - * ovs_vport_send - send a packet on a device + * ovs_vport_get_upcall_portids - get the upcall_portids of @vport. * - * @vport: vport on which to send the packet - * @skb: skb to send + * @vport: vport from which to retrieve the portids. + * @skb: sk_buff where portids should be appended. * - * Sends the given packet and returns the length of data sent. Either ovs - * lock or rcu_read_lock must be held. + * Retrieves the configuration of the given vport, appending the + * %OVS_VPORT_ATTR_UPCALL_PID attribute which is the array of upcall + * portids to @skb. + * + * Returns 0 if successful, -EMSGSIZE if @skb has insufficient room. + * If an error occurs, @skb is left unmodified. Must be called with + * ovs_mutex or rcu_read_lock. */ -int ovs_vport_send(struct vport *vport, struct sk_buff *skb) +int ovs_vport_get_upcall_portids(const struct vport *vport, + struct sk_buff *skb) { - int sent = vport->ops->send(vport, skb); - - if (likely(sent > 0)) { - struct pcpu_tstats *stats; + struct vport_portids *ids; - stats = this_cpu_ptr(vport->percpu_stats); - - u64_stats_update_begin(&stats->syncp); - stats->tx_packets++; - stats->tx_bytes += sent; - u64_stats_update_end(&stats->syncp); - } else if (sent < 0) { - ovs_vport_record_error(vport, VPORT_E_TX_ERROR); - kfree_skb(skb); - } else - ovs_vport_record_error(vport, VPORT_E_TX_DROPPED); + ids = rcu_dereference_ovsl(vport->upcall_portids); - return sent; + if (vport->dp->user_features & OVS_DP_F_VPORT_PIDS) + return nla_put(skb, OVS_VPORT_ATTR_UPCALL_PID, + ids->n_ids * sizeof(u32), (void *)ids->ids); + else + return nla_put_u32(skb, OVS_VPORT_ATTR_UPCALL_PID, ids->ids[0]); } /** - * ovs_vport_record_error - indicate device error to generic stats layer + * ovs_vport_find_upcall_portid - find the upcall portid to send upcall. * - * @vport: vport that encountered the error - * @err_type: one of enum vport_err_type types to indicate the error type + * @vport: vport from which the missed packet is received. + * @skb: skb that the missed packet was received. * - * If using the vport generic stats layer indicate that an error of the given - * type has occurred. + * Uses the skb_get_hash() to select the upcall portid to send the + * upcall. + * + * Returns the portid of the target socket. Must be called with rcu_read_lock. */ -void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type) +u32 ovs_vport_find_upcall_portid(const struct vport *vport, + struct sk_buff *skb) { - spin_lock(&vport->stats_lock); + struct vport_portids *ids; + u32 ids_index; + u32 hash; - switch (err_type) { - case VPORT_E_RX_DROPPED: - vport->err_stats.rx_dropped++; - break; + ids = rcu_dereference(vport->upcall_portids); - case VPORT_E_RX_ERROR: - vport->err_stats.rx_errors++; - break; + /* If there is only one portid, select it in the fast-path. */ + if (ids->n_ids == 1) + return ids->ids[0]; - case VPORT_E_TX_DROPPED: - vport->err_stats.tx_dropped++; - break; + hash = skb_get_hash(skb); + ids_index = hash - ids->n_ids * reciprocal_divide(hash, ids->rn_ids); + return ids->ids[ids_index]; +} - case VPORT_E_TX_ERROR: - vport->err_stats.tx_errors++; - break; +/** + * ovs_vport_receive - pass up received packet to the datapath for processing + * + * @vport: vport that received the packet + * @skb: skb that was received + * @tun_info: tunnel (if any) that carried packet + * + * Must be called with rcu_read_lock. The packet cannot be shared and + * skb->data should point to the Ethernet header. + */ +int ovs_vport_receive(struct vport *vport, struct sk_buff *skb, + const struct ip_tunnel_info *tun_info) +{ + struct sw_flow_key key; + int error; + + OVS_CB(skb)->input_vport = vport; + OVS_CB(skb)->mru = 0; + OVS_CB(skb)->cutlen = 0; + OVS_CB(skb)->probability = 0; + OVS_CB(skb)->upcall_pid = 0; + if (unlikely(dev_net(skb->dev) != ovs_dp_get_net(vport->dp))) { + u32 mark; + + mark = skb->mark; + skb_scrub_packet(skb, true); + skb->mark = mark; + tun_info = NULL; } - spin_unlock(&vport->stats_lock); + /* Extract flow from 'skb' into 'key'. */ + error = ovs_flow_key_extract(tun_info, skb, &key); + if (unlikely(error)) { + kfree_skb(skb); + return error; + } + ovs_dp_process_packet(skb, &key); + return 0; } -static void free_vport_rcu(struct rcu_head *rcu) +static int packet_length(const struct sk_buff *skb, + struct net_device *dev) { - struct vport *vport = container_of(rcu, struct vport, rcu); + int length = skb->len - dev->hard_header_len; - ovs_vport_free(vport); + if (!skb_vlan_tag_present(skb) && + eth_type_vlan(skb->protocol)) + length -= VLAN_HLEN; + + /* Don't subtract for multiple VLAN tags. Most (all?) drivers allow + * (ETH_LEN + VLAN_HLEN) in addition to the mtu value, but almost none + * account for 802.1ad. e.g. is_skb_forwardable(). + */ + + return length > 0 ? length : 0; } -void ovs_vport_deferred_free(struct vport *vport) +void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) { - if (!vport) - return; + int mtu = vport->dev->mtu; + + switch (vport->dev->type) { + case ARPHRD_NONE: + if (mac_proto == MAC_PROTO_ETHERNET) { + skb_reset_network_header(skb); + skb_reset_mac_len(skb); + skb->protocol = htons(ETH_P_TEB); + } else if (mac_proto != MAC_PROTO_NONE) { + WARN_ON_ONCE(1); + goto drop; + } + break; + case ARPHRD_ETHER: + if (mac_proto != MAC_PROTO_ETHERNET) + goto drop; + break; + default: + goto drop; + } + + if (unlikely(packet_length(skb, vport->dev) > mtu && + !skb_is_gso(skb))) { + vport->dev->stats.tx_errors++; + if (vport->dev->flags & IFF_UP) + net_warn_ratelimited("%s: dropped over-mtu packet: " + "%d > %d\n", vport->dev->name, + packet_length(skb, vport->dev), + mtu); + goto drop; + } + + skb->dev = vport->dev; + skb_clear_tstamp(skb); + vport->ops->send(skb); + return; - call_rcu(&vport->rcu, free_vport_rcu); +drop: + kfree_skb(skb); } diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 376045c42f8b..9f67b9dd49f9 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -1,19 +1,6 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 2007-2012 Nicira, Inc. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA - * 02110-1301, USA */ #ifndef VPORT_H @@ -23,6 +10,7 @@ #include <linux/list.h> #include <linux/netlink.h> #include <linux/openvswitch.h> +#include <linux/reciprocal_div.h> #include <linux/skbuff.h> #include <linux/spinlock.h> #include <linux/u64_stats_sync.h> @@ -32,12 +20,7 @@ struct vport; struct vport_parms; -/* The following definitions are for users of the vport subsytem: */ - -/* The following definitions are for users of the vport subsytem: */ -struct vport_net { - struct vport __rcu *gre_vport; -}; +/* The following definitions are for users of the vport subsystem: */ int ovs_vport_init(void); void ovs_vport_exit(void); @@ -45,52 +28,63 @@ void ovs_vport_exit(void); struct vport *ovs_vport_add(const struct vport_parms *); void ovs_vport_del(struct vport *); -struct vport *ovs_vport_locate(struct net *net, const char *name); +struct vport *ovs_vport_locate(const struct net *net, const char *name); void ovs_vport_get_stats(struct vport *, struct ovs_vport_stats *); +int ovs_vport_get_upcall_stats(struct vport *vport, struct sk_buff *skb); + int ovs_vport_set_options(struct vport *, struct nlattr *options); int ovs_vport_get_options(const struct vport *, struct sk_buff *); -int ovs_vport_send(struct vport *, struct sk_buff *); +int ovs_vport_set_upcall_portids(struct vport *, const struct nlattr *pids); +int ovs_vport_get_upcall_portids(const struct vport *, struct sk_buff *); +u32 ovs_vport_find_upcall_portid(const struct vport *, struct sk_buff *); -/* The following definitions are for implementers of vport devices: */ - -struct vport_err_stats { - u64 rx_dropped; - u64 rx_errors; - u64 tx_dropped; - u64 tx_errors; +/** + * struct vport_portids - array of netlink portids of a vport. + * must be protected by rcu. + * @rn_ids: The reciprocal value of @n_ids. + * @rcu: RCU callback head for deferred destruction. + * @n_ids: Size of @ids array. + * @ids: Array storing the Netlink socket pids to be used for packets received + * on this port that miss the flow table. + */ +struct vport_portids { + struct reciprocal_value rn_ids; + struct rcu_head rcu; + u32 n_ids; + u32 ids[]; }; /** * struct vport - one port within a datapath - * @rcu: RCU callback head for deferred destruction. + * @dev: Pointer to net_device. + * @dev_tracker: refcount tracker for @dev reference * @dp: Datapath to which this port belongs. - * @upcall_portid: The Netlink port to use for packets received on this port that - * miss the flow table. + * @upcall_portids: RCU protected 'struct vport_portids'. * @port_no: Index into @dp's @ports array. * @hash_node: Element in @dev_table hash table in vport.c. * @dp_hash_node: Element in @datapath->ports hash table in datapath.c. * @ops: Class structure. - * @percpu_stats: Points to per-CPU statistics used and maintained by vport - * @stats_lock: Protects @err_stats; - * @err_stats: Points to error statistics used and maintained by vport + * @upcall_stats: Upcall stats of every ports. + * @detach_list: list used for detaching vport in net-exit call. + * @rcu: RCU callback head for deferred destruction. */ struct vport { - struct rcu_head rcu; + struct net_device *dev; + netdevice_tracker dev_tracker; struct datapath *dp; - u32 upcall_portid; + struct vport_portids __rcu *upcall_portids; u16 port_no; struct hlist_node hash_node; struct hlist_node dp_hash_node; const struct vport_ops *ops; + struct vport_upcall_stats_percpu __percpu *upcall_stats; - struct pcpu_tstats __percpu *percpu_stats; - - spinlock_t stats_lock; - struct vport_err_stats err_stats; + struct list_head detach_list; + struct rcu_head rcu; }; /** @@ -100,18 +94,22 @@ struct vport { * @type: New vport's type. * @options: %OVS_VPORT_ATTR_OPTIONS attribute from Netlink message, %NULL if * none was supplied. + * @desired_ifindex: New vport's ifindex. * @dp: New vport's datapath. * @port_no: New vport's port number. + * @upcall_portids: %OVS_VPORT_ATTR_UPCALL_PID attribute from Netlink message, + * %NULL if none was supplied. */ struct vport_parms { const char *name; enum ovs_vport_type type; + int desired_ifindex; struct nlattr *options; /* For ovs_vport_alloc(). */ struct datapath *dp; u16 port_no; - u32 upcall_portid; + struct nlattr *upcall_portids; }; /** @@ -127,9 +125,10 @@ struct vport_parms { * @get_options: Appends vport-specific attributes for the configuration of an * existing vport to a &struct sk_buff. May be %NULL for a vport that does not * have any configuration. - * @get_name: Get the device's name. - * @send: Send a packet on the device. Returns the length of the packet sent, + * @send: Send a packet on the device. * zero for dropped packets or negative for error. + * @owner: Module that implements this vport type. + * @list: List entry in the global list of vport types. */ struct vport_ops { enum ovs_vport_type type; @@ -141,23 +140,27 @@ struct vport_ops { int (*set_options)(struct vport *, struct nlattr *); int (*get_options)(const struct vport *, struct sk_buff *); - /* Called with rcu_read_lock or ovs_mutex. */ - const char *(*get_name)(const struct vport *); - - int (*send)(struct vport *, struct sk_buff *); + int (*send)(struct sk_buff *skb); + struct module *owner; + struct list_head list; }; -enum vport_err_type { - VPORT_E_RX_DROPPED, - VPORT_E_RX_ERROR, - VPORT_E_TX_DROPPED, - VPORT_E_TX_ERROR, +/** + * struct vport_upcall_stats_percpu - per-cpu packet upcall statistics for + * a given vport. + * @syncp: Synchronization point for 64bit counters. + * @n_success: Number of packets that upcall to userspace succeed. + * @n_fail: Number of packets that upcall to userspace failed. + */ +struct vport_upcall_stats_percpu { + struct u64_stats_sync syncp; + u64_stats_t n_success; + u64_stats_t n_fail; }; struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *, const struct vport_parms *); void ovs_vport_free(struct vport *); -void ovs_vport_deferred_free(struct vport *vport); #define VPORT_ALIGN 8 @@ -166,13 +169,15 @@ void ovs_vport_deferred_free(struct vport *vport); * * @vport: vport to access * + * Returns: A void pointer to a private data allocated in the @vport. + * * If a nonzero size was passed in priv_size of vport_alloc() a private data * area was allocated on creation. This allows that area to be accessed and * used for any purpose needed by the vport implementer. */ static inline void *vport_priv(const struct vport *vport) { - return (u8 *)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN); + return (u8 *)(uintptr_t)vport + ALIGN(sizeof(struct vport), VPORT_ALIGN); } /** @@ -180,31 +185,34 @@ static inline void *vport_priv(const struct vport *vport) * * @priv: Start of private data area. * + * Returns: A reference to a vport structure that contains @priv. + * * It is sometimes useful to translate from a pointer to the private data * area to the vport, such as in the case where the private data pointer is * the result of a hash table lookup. @priv must point to the start of the * private data area. */ -static inline struct vport *vport_from_priv(const void *priv) +static inline struct vport *vport_from_priv(void *priv) { - return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); + return (struct vport *)((u8 *)priv - ALIGN(sizeof(struct vport), VPORT_ALIGN)); } -void ovs_vport_receive(struct vport *, struct sk_buff *, - struct ovs_key_ipv4_tunnel *); -void ovs_vport_record_error(struct vport *, enum vport_err_type err_type); +int ovs_vport_receive(struct vport *, struct sk_buff *, + const struct ip_tunnel_info *); -/* List of statically compiled vport implementations. Don't forget to also - * add yours to the list at the top of vport.c. */ -extern const struct vport_ops ovs_netdev_vport_ops; -extern const struct vport_ops ovs_internal_vport_ops; -extern const struct vport_ops ovs_gre_vport_ops; - -static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb, - const void *start, unsigned int len) +static inline const char *ovs_vport_name(struct vport *vport) { - if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->csum = csum_add(skb->csum, csum_partial(start, len, 0)); + return vport->dev->name; } +int __ovs_vport_ops_register(struct vport_ops *ops); +#define ovs_vport_ops_register(ops) \ + ({ \ + (ops)->owner = THIS_MODULE; \ + __ovs_vport_ops_register(ops); \ + }) + +void ovs_vport_ops_unregister(struct vport_ops *ops); +void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto); + #endif /* vport.h */ |
