summaryrefslogtreecommitdiff
path: root/net/openvswitch/conntrack.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/openvswitch/conntrack.c')
-rw-r--r--net/openvswitch/conntrack.c722
1 files changed, 265 insertions, 457 deletions
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index cd94f925495a..a0811e1fba65 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -1,14 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2015 Nicira, Inc.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
*/
#include <linux/module.h>
@@ -17,6 +9,7 @@
#include <linux/udp.h>
#include <linux/sctp.h>
#include <linux/static_key.h>
+#include <linux/string_helpers.h>
#include <net/ip.h>
#include <net/genetlink.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -24,17 +17,19 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_labels.h>
#include <net/netfilter/nf_conntrack_seqadj.h>
+#include <net/netfilter/nf_conntrack_timeout.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/ipv6_frag.h>
-#ifdef CONFIG_NF_NAT_NEEDED
-#include <linux/netfilter/nf_nat.h>
-#include <net/netfilter/nf_nat_core.h>
-#include <net/netfilter/nf_nat_l3proto.h>
+#if IS_ENABLED(CONFIG_NF_NAT)
+#include <net/netfilter/nf_nat.h>
#endif
+#include <net/netfilter/nf_conntrack_act_ct.h>
+
#include "datapath.h"
+#include "drop.h"
#include "conntrack.h"
#include "flow.h"
#include "flow_netlink.h"
@@ -75,7 +70,9 @@ struct ovs_conntrack_info {
u32 eventmask; /* Mask of 1 << IPCT_*. */
struct md_mark mark;
struct md_labels labels;
-#ifdef CONFIG_NF_NAT_NEEDED
+ char timeout[CTNL_TIMEOUT_NAME_MAX];
+ struct nf_ct_timeout *nf_ct_timeout;
+#if IS_ENABLED(CONFIG_NF_NAT)
struct nf_nat_range2 range; /* Only present for SRC NAT and DST NAT. */
#endif
};
@@ -157,7 +154,7 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
static u32 ovs_ct_get_mark(const struct nf_conn *ct)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
- return ct ? ct->mark : 0;
+ return ct ? READ_ONCE(ct->mark) : 0;
#else
return 0;
#endif
@@ -171,8 +168,13 @@ static u32 ovs_ct_get_mark(const struct nf_conn *ct)
static void ovs_ct_get_labels(const struct nf_conn *ct,
struct ovs_key_ct_labels *labels)
{
- struct nf_conn_labels *cl = ct ? nf_ct_labels_find(ct) : NULL;
+ struct nf_conn_labels *cl = NULL;
+ if (ct) {
+ if (ct->master && !nf_ct_is_confirmed(ct))
+ ct = ct->master;
+ cl = nf_ct_labels_find(ct);
+ }
if (cl)
memcpy(labels, cl->bits, OVS_CT_LABELS_LEN);
else
@@ -278,15 +280,13 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
/* This is called to initialize CT key fields possibly coming in from the local
* stack.
*/
-void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
+void ovs_ct_fill_key(const struct sk_buff *skb,
+ struct sw_flow_key *key,
+ bool post_ct)
{
- ovs_ct_update_key(skb, NULL, key, false, false);
+ ovs_ct_update_key(skb, NULL, key, post_ct, false);
}
-#define IN6_ADDR_INITIALIZER(ADDR) \
- { (ADDR).s6_addr32[0], (ADDR).s6_addr32[1], \
- (ADDR).s6_addr32[2], (ADDR).s6_addr32[3] }
-
int ovs_ct_put_key(const struct sw_flow_key *swkey,
const struct sw_flow_key *output, struct sk_buff *skb)
{
@@ -308,24 +308,30 @@ int ovs_ct_put_key(const struct sw_flow_key *swkey,
if (swkey->ct_orig_proto) {
if (swkey->eth.type == htons(ETH_P_IP)) {
- struct ovs_key_ct_tuple_ipv4 orig = {
- output->ipv4.ct_orig.src,
- output->ipv4.ct_orig.dst,
- output->ct.orig_tp.src,
- output->ct.orig_tp.dst,
- output->ct_orig_proto,
- };
+ struct ovs_key_ct_tuple_ipv4 orig;
+
+ memset(&orig, 0, sizeof(orig));
+ orig.ipv4_src = output->ipv4.ct_orig.src;
+ orig.ipv4_dst = output->ipv4.ct_orig.dst;
+ orig.src_port = output->ct.orig_tp.src;
+ orig.dst_port = output->ct.orig_tp.dst;
+ orig.ipv4_proto = output->ct_orig_proto;
+
if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV4,
sizeof(orig), &orig))
return -EMSGSIZE;
} else if (swkey->eth.type == htons(ETH_P_IPV6)) {
- struct ovs_key_ct_tuple_ipv6 orig = {
- IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.src),
- IN6_ADDR_INITIALIZER(output->ipv6.ct_orig.dst),
- output->ct.orig_tp.src,
- output->ct.orig_tp.dst,
- output->ct_orig_proto,
- };
+ struct ovs_key_ct_tuple_ipv6 orig;
+
+ memset(&orig, 0, sizeof(orig));
+ memcpy(orig.ipv6_src, output->ipv6.ct_orig.src.s6_addr32,
+ sizeof(orig.ipv6_src));
+ memcpy(orig.ipv6_dst, output->ipv6.ct_orig.dst.s6_addr32,
+ sizeof(orig.ipv6_dst));
+ orig.src_port = output->ct.orig_tp.src;
+ orig.dst_port = output->ct.orig_tp.dst;
+ orig.ipv6_proto = output->ct_orig_proto;
+
if (nla_put(skb, OVS_KEY_ATTR_CT_ORIG_TUPLE_IPV6,
sizeof(orig), &orig))
return -EMSGSIZE;
@@ -341,9 +347,9 @@ static int ovs_ct_set_mark(struct nf_conn *ct, struct sw_flow_key *key,
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
u32 new_mark;
- new_mark = ct_mark | (ct->mark & ~(mask));
- if (ct->mark != new_mark) {
- ct->mark = new_mark;
+ new_mark = ct_mark | (READ_ONCE(ct->mark) & ~(mask));
+ if (READ_ONCE(ct->mark) != new_mark) {
+ WRITE_ONCE(ct->mark, new_mark);
if (nf_ct_is_confirmed(ct))
nf_conntrack_event_cache(IPCT_MARK, ct);
key->ct.mark = new_mark;
@@ -435,150 +441,26 @@ static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key,
return 0;
}
-/* 'skb' should already be pulled to nh_ofs. */
-static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
+static int ovs_ct_handle_fragments(struct net *net, struct sw_flow_key *key,
+ u16 zone, int family, struct sk_buff *skb)
{
- const struct nf_conntrack_helper *helper;
- const struct nf_conn_help *help;
- enum ip_conntrack_info ctinfo;
- unsigned int protoff;
- struct nf_conn *ct;
+ struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
int err;
- ct = nf_ct_get(skb, &ctinfo);
- if (!ct || ctinfo == IP_CT_RELATED_REPLY)
- return NF_ACCEPT;
-
- help = nfct_help(ct);
- if (!help)
- return NF_ACCEPT;
-
- helper = rcu_dereference(help->helper);
- if (!helper)
- return NF_ACCEPT;
-
- switch (proto) {
- case NFPROTO_IPV4:
- protoff = ip_hdrlen(skb);
- break;
- case NFPROTO_IPV6: {
- u8 nexthdr = ipv6_hdr(skb)->nexthdr;
- __be16 frag_off;
- int ofs;
-
- ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr,
- &frag_off);
- if (ofs < 0 || (frag_off & htons(~0x7)) != 0) {
- pr_debug("proto header not found\n");
- return NF_ACCEPT;
- }
- protoff = ofs;
- break;
- }
- default:
- WARN_ONCE(1, "helper invoked on non-IP family!");
- return NF_DROP;
- }
-
- err = helper->help(skb, protoff, ct, ctinfo);
- if (err != NF_ACCEPT)
+ err = nf_ct_handle_fragments(net, skb, zone, family, &key->ip.proto, &ovs_cb.mru);
+ if (err)
return err;
- /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
- * FTP with NAT) adusting the TCP payload size when mangling IP
- * addresses and/or port numbers in the text-based control connection.
+ /* The key extracted from the fragment that completed this datagram
+ * likely didn't have an L4 header, so regenerate it.
*/
- if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
- !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
- return NF_DROP;
- return NF_ACCEPT;
-}
-
-/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
- * value if 'skb' is freed.
- */
-static int handle_fragments(struct net *net, struct sw_flow_key *key,
- u16 zone, struct sk_buff *skb)
-{
- struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
- int err;
-
- if (key->eth.type == htons(ETH_P_IP)) {
- enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
-
- memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
- err = ip_defrag(net, skb, user);
- if (err)
- return err;
-
- ovs_cb.mru = IPCB(skb)->frag_max_size;
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
- } else if (key->eth.type == htons(ETH_P_IPV6)) {
- enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
-
- memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
- err = nf_ct_frag6_gather(net, skb, user);
- if (err) {
- if (err != -EINPROGRESS)
- kfree_skb(skb);
- return err;
- }
-
- key->ip.proto = ipv6_hdr(skb)->nexthdr;
- ovs_cb.mru = IP6CB(skb)->frag_max_size;
-#endif
- } else {
- kfree_skb(skb);
- return -EPFNOSUPPORT;
- }
-
+ ovs_flow_key_update_l3l4(skb, key);
key->ip.frag = OVS_FRAG_TYPE_NONE;
- skb_clear_hash(skb);
- skb->ignore_df = 1;
*OVS_CB(skb) = ovs_cb;
return 0;
}
-static struct nf_conntrack_expect *
-ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
- u16 proto, const struct sk_buff *skb)
-{
- struct nf_conntrack_tuple tuple;
- struct nf_conntrack_expect *exp;
-
- if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple))
- return NULL;
-
- exp = __nf_ct_expect_find(net, zone, &tuple);
- if (exp) {
- struct nf_conntrack_tuple_hash *h;
-
- /* Delete existing conntrack entry, if it clashes with the
- * expectation. This can happen since conntrack ALGs do not
- * check for clashes between (new) expectations and existing
- * conntrack entries. nf_conntrack_in() will check the
- * expectations only if a conntrack entry can not be found,
- * which can lead to OVS finding the expectation (here) in the
- * init direction, but which will not be removed by the
- * nf_conntrack_in() call, if a matching conntrack entry is
- * found instead. In this case all init direction packets
- * would be reported as new related packets, while reply
- * direction packets would be reported as un-related
- * established packets.
- */
- h = nf_conntrack_find_get(net, zone, &tuple);
- if (h) {
- struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
-
- nf_ct_delete(ct, 0, 0);
- nf_conntrack_put(&ct->ct_general);
- }
- }
-
- return exp;
-}
-
/* This replicates logic from nf_conntrack_core.c that is not exported. */
static enum ip_conntrack_info
ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
@@ -622,7 +504,7 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
if (natted) {
struct nf_conntrack_tuple inverse;
- if (!nf_ct_invert_tuplepr(&inverse, &tuple)) {
+ if (!nf_ct_invert_tuple(&inverse, &tuple)) {
pr_debug("ovs_ct_find_existing: Inversion failed!\n");
return NULL;
}
@@ -705,6 +587,14 @@ static bool skb_nfct_cached(struct net *net,
if (help && rcu_access_pointer(help->helper) != info->helper)
return false;
}
+ if (info->nf_ct_timeout) {
+ struct nf_conn_timeout *timeout_ext;
+
+ timeout_ext = nf_ct_timeout_find(ct);
+ if (!timeout_ext || info->nf_ct_timeout !=
+ rcu_dereference(timeout_ext->timeout))
+ return false;
+ }
/* Force conntrack entry direction to the current packet? */
if (info->force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
/* Delete the conntrack entry if confirmed, else just release
@@ -713,7 +603,7 @@ static bool skb_nfct_cached(struct net *net,
if (nf_ct_is_confirmed(ct))
nf_ct_delete(ct, 0, 0);
- nf_conntrack_put(&ct->ct_general);
+ nf_ct_put(ct);
nf_ct_set(skb, NULL, 0);
return false;
}
@@ -721,90 +611,7 @@ static bool skb_nfct_cached(struct net *net,
return ct_executed;
}
-#ifdef CONFIG_NF_NAT_NEEDED
-/* Modelled after nf_nat_ipv[46]_fn().
- * range is only used for new, uninitialized NAT state.
- * Returns either NF_ACCEPT or NF_DROP.
- */
-static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
- enum ip_conntrack_info ctinfo,
- const struct nf_nat_range2 *range,
- enum nf_nat_manip_type maniptype)
-{
- int hooknum, nh_off, err = NF_ACCEPT;
-
- nh_off = skb_network_offset(skb);
- skb_pull_rcsum(skb, nh_off);
-
- /* See HOOK2MANIP(). */
- if (maniptype == NF_NAT_MANIP_SRC)
- hooknum = NF_INET_LOCAL_IN; /* Source NAT */
- else
- hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
-
- switch (ctinfo) {
- case IP_CT_RELATED:
- case IP_CT_RELATED_REPLY:
- if (IS_ENABLED(CONFIG_NF_NAT_IPV4) &&
- skb->protocol == htons(ETH_P_IP) &&
- ip_hdr(skb)->protocol == IPPROTO_ICMP) {
- if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
- hooknum))
- err = NF_DROP;
- goto push;
- } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) &&
- skb->protocol == htons(ETH_P_IPV6)) {
- __be16 frag_off;
- u8 nexthdr = ipv6_hdr(skb)->nexthdr;
- int hdrlen = ipv6_skip_exthdr(skb,
- sizeof(struct ipv6hdr),
- &nexthdr, &frag_off);
-
- if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
- if (!nf_nat_icmpv6_reply_translation(skb, ct,
- ctinfo,
- hooknum,
- hdrlen))
- err = NF_DROP;
- goto push;
- }
- }
- /* Non-ICMP, fall thru to initialize if needed. */
- /* fall through */
- case IP_CT_NEW:
- /* Seen it before? This can happen for loopback, retrans,
- * or local packets.
- */
- if (!nf_nat_initialized(ct, maniptype)) {
- /* Initialize according to the NAT action. */
- err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
- /* Action is set up to establish a new
- * mapping.
- */
- ? nf_nat_setup_info(ct, range, maniptype)
- : nf_nat_alloc_null_binding(ct, hooknum);
- if (err != NF_ACCEPT)
- goto push;
- }
- break;
-
- case IP_CT_ESTABLISHED:
- case IP_CT_ESTABLISHED_REPLY:
- break;
-
- default:
- err = NF_DROP;
- goto push;
- }
-
- err = nf_nat_packet(ct, ctinfo, hooknum, skb);
-push:
- skb_push(skb, nh_off);
- skb_postpush_rcsum(skb, skb->data, nh_off);
-
- return err;
-}
-
+#if IS_ENABLED(CONFIG_NF_NAT)
static void ovs_nat_update_key(struct sw_flow_key *key,
const struct sk_buff *skb,
enum nf_nat_manip_type maniptype)
@@ -862,48 +669,27 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
struct sk_buff *skb, struct nf_conn *ct,
enum ip_conntrack_info ctinfo)
{
- enum nf_nat_manip_type maniptype;
- int err;
+ int err, action = 0;
- /* Add NAT extension if not confirmed yet. */
- if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
- return NF_ACCEPT; /* Can't NAT. */
+ if (!(info->nat & OVS_CT_NAT))
+ return NF_ACCEPT;
+ if (info->nat & OVS_CT_SRC_NAT)
+ action |= BIT(NF_NAT_MANIP_SRC);
+ if (info->nat & OVS_CT_DST_NAT)
+ action |= BIT(NF_NAT_MANIP_DST);
- /* Determine NAT type.
- * Check if the NAT type can be deduced from the tracked connection.
- * Make sure new expected connections (IP_CT_RELATED) are NATted only
- * when committing.
- */
- if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW &&
- ct->status & IPS_NAT_MASK &&
- (ctinfo != IP_CT_RELATED || info->commit)) {
- /* NAT an established or related connection like before. */
- if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
- /* This is the REPLY direction for a connection
- * for which NAT was applied in the forward
- * direction. Do the reverse NAT.
- */
- maniptype = ct->status & IPS_SRC_NAT
- ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
- else
- maniptype = ct->status & IPS_SRC_NAT
- ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
- } else if (info->nat & OVS_CT_SRC_NAT) {
- maniptype = NF_NAT_MANIP_SRC;
- } else if (info->nat & OVS_CT_DST_NAT) {
- maniptype = NF_NAT_MANIP_DST;
- } else {
- return NF_ACCEPT; /* Connection is not NATed. */
- }
- err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
+ err = nf_ct_nat(skb, ct, ctinfo, &action, &info->range, info->commit);
+ if (err != NF_ACCEPT)
+ return err;
- /* Mark NAT done if successful and update the flow key. */
- if (err == NF_ACCEPT)
- ovs_nat_update_key(key, skb, maniptype);
+ if (action & BIT(NF_NAT_MANIP_SRC))
+ ovs_nat_update_key(key, skb, NF_NAT_MANIP_SRC);
+ if (action & BIT(NF_NAT_MANIP_DST))
+ ovs_nat_update_key(key, skb, NF_NAT_MANIP_DST);
return err;
}
-#else /* !CONFIG_NF_NAT_NEEDED */
+#else /* !CONFIG_NF_NAT */
static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
struct sk_buff *skb, struct nf_conn *ct,
@@ -913,6 +699,22 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
}
#endif
+static int verdict_to_errno(unsigned int verdict)
+{
+ switch (verdict & NF_VERDICT_MASK) {
+ case NF_ACCEPT:
+ return 0;
+ case NF_DROP:
+ return -EINVAL;
+ case NF_STOLEN:
+ return -EINPROGRESS;
+ default:
+ break;
+ }
+
+ return -EINVAL;
+}
+
/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
* not done already. Update key with new CT state after passing the packet
* through conntrack.
@@ -943,15 +745,15 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
/* Associate skb with specified zone. */
if (tmpl) {
- if (skb_nfct(skb))
- nf_conntrack_put(skb_nfct(skb));
+ ct = nf_ct_get(skb, &ctinfo);
+ nf_ct_put(ct);
nf_conntrack_get(&tmpl->ct_general);
nf_ct_set(skb, tmpl, IP_CT_NEW);
}
err = nf_conntrack_in(skb, &state);
if (err != NF_ACCEPT)
- return -ENOENT;
+ return verdict_to_errno(err);
/* Clear CT state NAT flags to mark that we have not yet done
* NAT after the nf_conntrack_in() call. We can actually clear
@@ -965,6 +767,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
ct = nf_ct_get(skb, &ctinfo);
if (ct) {
+ bool add_helper = false;
+
/* Packets starting a new connection must be NATted before the
* helper, so that the helper knows about the NAT. We enforce
* this by delaying both NAT and helper calls for unconfirmed
@@ -976,15 +780,19 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
* the key->ct_state.
*/
if (info->nat && !(key->ct_state & OVS_CS_F_NAT_MASK) &&
- (nf_ct_is_confirmed(ct) || info->commit) &&
- ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
- return -EINVAL;
+ (nf_ct_is_confirmed(ct) || info->commit)) {
+ int err = ovs_ct_nat(net, key, info, skb, ct, ctinfo);
+
+ err = verdict_to_errno(err);
+ if (err)
+ return err;
}
/* Userspace may decide to perform a ct lookup without a helper
- * specified followed by a (recirculate and) commit with one.
- * Therefore, for unconfirmed connections which we will commit,
- * we need to attach the helper here.
+ * specified followed by a (recirculate and) commit with one,
+ * or attach a helper in a later commit. Therefore, for
+ * connections which we will commit, we may need to attach
+ * the helper here.
*/
if (!nf_ct_is_confirmed(ct) && info->commit &&
info->helper && !nfct_help(ct)) {
@@ -992,17 +800,39 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
GFP_ATOMIC);
if (err)
return err;
+ add_helper = true;
+
+ /* helper installed, add seqadj if NAT is required */
+ if (info->nat && !nfct_seqadj(ct)) {
+ if (!nfct_seqadj_ext_add(ct))
+ return -EINVAL;
+ }
}
/* Call the helper only if:
- * - nf_conntrack_in() was executed above ("!cached") for a
- * confirmed connection, or
+ * - nf_conntrack_in() was executed above ("!cached") or a
+ * helper was just attached ("add_helper") for a confirmed
+ * connection, or
* - When committing an unconfirmed connection.
*/
- if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
- ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
- return -EINVAL;
+ if ((nf_ct_is_confirmed(ct) ? !cached || add_helper :
+ info->commit)) {
+ int err = nf_ct_helper(skb, ct, ctinfo, info->family);
+
+ err = verdict_to_errno(err);
+ if (err)
+ return err;
}
+
+ if (nf_ct_protonum(ct) == IPPROTO_TCP &&
+ nf_ct_is_confirmed(ct) && nf_conntrack_tcp_established(ct)) {
+ /* Be liberal for tcp packets so that out-of-window
+ * packets are not marked invalid.
+ */
+ nf_ct_set_tcp_be_liberal(ct);
+ }
+
+ nf_conn_act_ct_ext_fill(skb, ct, ctinfo);
}
return 0;
@@ -1013,36 +843,16 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
struct sk_buff *skb)
{
- struct nf_conntrack_expect *exp;
-
- /* If we pass an expected packet through nf_conntrack_in() the
- * expectation is typically removed, but the packet could still be
- * lost in upcall processing. To prevent this from happening we
- * perform an explicit expectation lookup. Expected connections are
- * always new, and will be passed through conntrack only when they are
- * committed, as it is OK to remove the expectation at that time.
- */
- exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
- if (exp) {
- u8 state;
-
- /* NOTE: New connections are NATted and Helped only when
- * committed, so we are not calling into NAT here.
- */
- state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
- __ovs_ct_update_key(key, state, &info->zone, exp->master);
- } else {
- struct nf_conn *ct;
- int err;
+ struct nf_conn *ct;
+ int err;
- err = __ovs_ct_lookup(net, key, info, skb);
- if (err)
- return err;
+ err = __ovs_ct_lookup(net, key, info, skb);
+ if (err)
+ return err;
- ct = (struct nf_conn *)skb_nfct(skb);
- if (ct)
- nf_ct_deliver_cached_events(ct);
- }
+ ct = (struct nf_conn *)skb_nfct(skb);
+ if (ct)
+ nf_ct_deliver_cached_events(ct);
return 0;
}
@@ -1118,8 +928,8 @@ static u32 ct_limit_get(const struct ovs_ct_limit_info *info, u16 zone)
}
static int ovs_ct_check_limit(struct net *net,
- const struct ovs_conntrack_info *info,
- const struct nf_conntrack_tuple *tuple)
+ const struct sk_buff *skb,
+ const struct ovs_conntrack_info *info)
{
struct ovs_net *ovs_net = net_generic(net, ovs_net_id);
const struct ovs_ct_limit_info *ct_limit_info = ovs_net->ct_limit_info;
@@ -1132,8 +942,9 @@ static int ovs_ct_check_limit(struct net *net,
if (per_zone_limit == OVS_CT_LIMIT_UNLIMITED)
return 0;
- connections = nf_conncount_count(net, ct_limit_info->data,
- &conncount_key, tuple, &info->zone);
+ connections = nf_conncount_count_skb(net, skb, info->family,
+ ct_limit_info->data,
+ &conncount_key);
if (connections > per_zone_limit)
return -ENOMEM;
@@ -1162,8 +973,7 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
if (static_branch_unlikely(&ovs_ct_limit_enabled)) {
if (!nf_ct_is_confirmed(ct)) {
- err = ovs_ct_check_limit(net, info,
- &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ err = ovs_ct_check_limit(net, skb, info);
if (err) {
net_warn_ratelimited("openvswitch: zone: %u "
"exceeds conntrack limit\n",
@@ -1203,6 +1013,8 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
&info->labels.mask);
if (err)
return err;
+
+ nf_conn_act_ct_ext_add(skb, ct, ctinfo);
} else if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
labels_nonzero(&info->labels.mask)) {
err = ovs_ct_set_labels(ct, key, &info->labels.value,
@@ -1213,40 +1025,9 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
/* This will take care of sending queued events even if the connection
* is already confirmed.
*/
- if (nf_conntrack_confirm(skb) != NF_ACCEPT)
- return -EINVAL;
+ err = nf_conntrack_confirm(skb);
- return 0;
-}
-
-/* Trim the skb to the length specified by the IP/IPv6 header,
- * removing any trailing lower-layer padding. This prepares the skb
- * for higher-layer processing that assumes skb->len excludes padding
- * (such as nf_ip_checksum). The caller needs to pull the skb to the
- * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
- */
-static int ovs_skb_network_trim(struct sk_buff *skb)
-{
- unsigned int len;
- int err;
-
- switch (skb->protocol) {
- case htons(ETH_P_IP):
- len = ntohs(ip_hdr(skb)->tot_len);
- break;
- case htons(ETH_P_IPV6):
- len = sizeof(struct ipv6hdr)
- + ntohs(ipv6_hdr(skb)->payload_len);
- break;
- default:
- len = skb->len;
- }
-
- err = pskb_trim_rcsum(skb, len);
- if (err)
- kfree_skb(skb);
-
- return err;
+ return verdict_to_errno(err);
}
/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
@@ -1263,12 +1044,15 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
nh_ofs = skb_network_offset(skb);
skb_pull_rcsum(skb, nh_ofs);
- err = ovs_skb_network_trim(skb);
- if (err)
+ err = nf_ct_skb_network_trim(skb, info->family);
+ if (err) {
+ kfree_skb(skb);
return err;
+ }
if (key->ip.frag != OVS_FRAG_TYPE_NONE) {
- err = handle_fragments(net, key, info->zone.id, skb);
+ err = ovs_ct_handle_fragments(net, key, info->zone.id,
+ info->family, skb);
if (err)
return err;
}
@@ -1278,53 +1062,33 @@ int ovs_ct_execute(struct net *net, struct sk_buff *skb,
else
err = ovs_ct_lookup(net, key, info, skb);
- skb_push(skb, nh_ofs);
- skb_postpush_rcsum(skb, skb->data, nh_ofs);
+ /* conntrack core returned NF_STOLEN */
+ if (err == -EINPROGRESS)
+ return err;
+
+ skb_push_rcsum(skb, nh_ofs);
if (err)
- kfree_skb(skb);
+ ovs_kfree_skb_reason(skb, OVS_DROP_CONNTRACK);
return err;
}
int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key)
{
- if (skb_nfct(skb)) {
- nf_conntrack_put(skb_nfct(skb));
- nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
- ovs_ct_fill_key(skb, key);
- }
-
- return 0;
-}
-
-static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
- const struct sw_flow_key *key, bool log)
-{
- struct nf_conntrack_helper *helper;
- struct nf_conn_help *help;
-
- helper = nf_conntrack_helper_try_module_get(name, info->family,
- key->ip.proto);
- if (!helper) {
- OVS_NLERR(log, "Unknown helper \"%s\"", name);
- return -EINVAL;
- }
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
- help = nf_ct_helper_ext_add(info->ct, GFP_KERNEL);
- if (!help) {
- nf_conntrack_helper_put(helper);
- return -ENOMEM;
- }
+ ct = nf_ct_get(skb, &ctinfo);
- rcu_assign_pointer(help->helper, helper);
- info->helper = helper;
+ nf_ct_put(ct);
+ nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
- if (info->nat)
- request_module("ip_nat_%s", name);
+ if (key)
+ ovs_ct_fill_key(skb, key, false);
return 0;
}
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
static int parse_nat(const struct nlattr *attr,
struct ovs_conntrack_info *info, bool log)
{
@@ -1461,12 +1225,14 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
.maxlen = sizeof(struct md_labels) },
[OVS_CT_ATTR_HELPER] = { .minlen = 1,
.maxlen = NF_CT_HELPER_NAME_LEN },
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
/* NAT length is checked when parsing the nested attributes. */
[OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX },
#endif
[OVS_CT_ATTR_EVENTMASK] = { .minlen = sizeof(u32),
.maxlen = sizeof(u32) },
+ [OVS_CT_ATTR_TIMEOUT] = { .minlen = 1,
+ .maxlen = CTNL_TIMEOUT_NAME_MAX },
};
static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
@@ -1499,7 +1265,7 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
switch (type) {
case OVS_CT_ATTR_FORCE_COMMIT:
info->force = true;
- /* fall through. */
+ fallthrough;
case OVS_CT_ATTR_COMMIT:
info->commit = true;
break;
@@ -1534,12 +1300,12 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
#endif
case OVS_CT_ATTR_HELPER:
*helper = nla_data(a);
- if (!memchr(*helper, '\0', nla_len(a))) {
+ if (!string_is_terminated(*helper, nla_len(a))) {
OVS_NLERR(log, "Invalid conntrack helper");
return -EINVAL;
}
break;
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
case OVS_CT_ATTR_NAT: {
int err = parse_nat(a, info, log);
@@ -1552,6 +1318,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
info->have_eventmask = true;
info->eventmask = nla_get_u32(a);
break;
+#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
+ case OVS_CT_ATTR_TIMEOUT:
+ memcpy(info->timeout, nla_data(a), nla_len(a));
+ if (!string_is_terminated(info->timeout, nla_len(a))) {
+ OVS_NLERR(log, "Invalid conntrack timeout");
+ return -EINVAL;
+ }
+ break;
+#endif
default:
OVS_NLERR(log, "Unknown conntrack attr (%d)",
@@ -1633,10 +1408,26 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
OVS_NLERR(log, "Failed to allocate conntrack template");
return -ENOMEM;
}
+
+ if (ct_info.timeout[0]) {
+ if (nf_ct_set_timeout(net, ct_info.ct, family, key->ip.proto,
+ ct_info.timeout))
+ OVS_NLERR(log,
+ "Failed to associated timeout policy '%s'",
+ ct_info.timeout);
+ else
+ ct_info.nf_ct_timeout = rcu_dereference(
+ nf_ct_timeout_find(ct_info.ct)->timeout);
+
+ }
+
if (helper) {
- err = ovs_ct_add_helper(&ct_info, helper, key, log);
- if (err)
+ err = nf_ct_add_helper(ct_info.ct, helper, ct_info.family,
+ key->ip.proto, ct_info.nat, &ct_info.helper);
+ if (err) {
+ OVS_NLERR(log, "Failed to add %s helper %d", helper, err);
goto err_free_ct;
+ }
}
err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info,
@@ -1644,21 +1435,21 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
if (err)
goto err_free_ct;
- __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status);
- nf_conntrack_get(&ct_info.ct->ct_general);
+ if (ct_info.commit)
+ __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status);
return 0;
err_free_ct:
__ovs_ct_free_action(&ct_info);
return err;
}
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
struct sk_buff *skb)
{
struct nlattr *start;
- start = nla_nest_start(skb, OVS_CT_ATTR_NAT);
+ start = nla_nest_start_noflag(skb, OVS_CT_ATTR_NAT);
if (!start)
return false;
@@ -1673,7 +1464,7 @@ static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
}
if (info->range.flags & NF_NAT_RANGE_MAP_IPS) {
- if (IS_ENABLED(CONFIG_NF_NAT_IPV4) &&
+ if (IS_ENABLED(CONFIG_NF_NAT) &&
info->family == NFPROTO_IPV4) {
if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN,
info->range.min_addr.ip) ||
@@ -1682,7 +1473,7 @@ static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
(nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX,
info->range.max_addr.ip))))
return false;
- } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) &&
+ } else if (IS_ENABLED(CONFIG_IPV6) &&
info->family == NFPROTO_IPV6) {
if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN,
&info->range.min_addr.in6) ||
@@ -1725,7 +1516,7 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
{
struct nlattr *start;
- start = nla_nest_start(skb, OVS_ACTION_ATTR_CT);
+ start = nla_nest_start_noflag(skb, OVS_ACTION_ATTR_CT);
if (!start)
return -EMSGSIZE;
@@ -1753,8 +1544,12 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
if (ct_info->have_eventmask &&
nla_put_u32(skb, OVS_CT_ATTR_EVENTMASK, ct_info->eventmask))
return -EMSGSIZE;
+ if (ct_info->timeout[0]) {
+ if (nla_put_string(skb, OVS_CT_ATTR_TIMEOUT, ct_info->timeout))
+ return -EMSGSIZE;
+ }
-#ifdef CONFIG_NF_NAT_NEEDED
+#if IS_ENABLED(CONFIG_NF_NAT)
if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb))
return -EMSGSIZE;
#endif
@@ -1772,10 +1567,18 @@ void ovs_ct_free_action(const struct nlattr *a)
static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
{
- if (ct_info->helper)
+ if (ct_info->helper) {
+#if IS_ENABLED(CONFIG_NF_NAT)
+ if (ct_info->nat)
+ nf_nat_helper_put(ct_info->helper);
+#endif
nf_conntrack_helper_put(ct_info->helper);
- if (ct_info->ct)
+ }
+ if (ct_info->ct) {
+ if (ct_info->timeout[0])
+ nf_ct_destroy_timeout(ct_info->ct);
nf_ct_tmpl_free(ct_info->ct);
+ }
}
#if IS_ENABLED(CONFIG_NETFILTER_CONNCOUNT)
@@ -1800,8 +1603,7 @@ static int ovs_ct_limit_init(struct net *net, struct ovs_net *ovs_net)
for (i = 0; i < CT_LIMIT_HASH_BUCKETS; i++)
INIT_HLIST_HEAD(&ovs_net->ct_limit_info->limits[i]);
- ovs_net->ct_limit_info->data =
- nf_conncount_init(net, NFPROTO_INET, sizeof(u32));
+ ovs_net->ct_limit_info->data = nf_conncount_init(net, sizeof(u32));
if (IS_ERR(ovs_net->ct_limit_info->data)) {
err = PTR_ERR(ovs_net->ct_limit_info->data);
@@ -1818,23 +1620,24 @@ static void ovs_ct_limit_exit(struct net *net, struct ovs_net *ovs_net)
const struct ovs_ct_limit_info *info = ovs_net->ct_limit_info;
int i;
- nf_conncount_destroy(net, NFPROTO_INET, info->data);
+ nf_conncount_destroy(net, info->data);
for (i = 0; i < CT_LIMIT_HASH_BUCKETS; ++i) {
struct hlist_head *head = &info->limits[i];
struct ovs_ct_limit *ct_limit;
+ struct hlist_node *next;
- hlist_for_each_entry_rcu(ct_limit, head, hlist_node)
+ hlist_for_each_entry_safe(ct_limit, next, head, hlist_node)
kfree_rcu(ct_limit, rcu);
}
- kfree(ovs_net->ct_limit_info->limits);
- kfree(ovs_net->ct_limit_info);
+ kfree(info->limits);
+ kfree(info);
}
static struct sk_buff *
ovs_ct_limit_cmd_reply_start(struct genl_info *info, u8 cmd,
struct ovs_header **ovs_reply_header)
{
- struct ovs_header *ovs_header = info->userhdr;
+ struct ovs_header *ovs_header = genl_info_userhdr(info);
struct sk_buff *skb;
skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
@@ -1885,7 +1688,8 @@ static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit,
} else {
struct ovs_ct_limit *ct_limit;
- ct_limit = kmalloc(sizeof(*ct_limit), GFP_KERNEL);
+ ct_limit = kmalloc(sizeof(*ct_limit),
+ GFP_KERNEL_ACCOUNT);
if (!ct_limit)
return -ENOMEM;
@@ -1945,16 +1749,12 @@ static int ovs_ct_limit_del_zone_limit(struct nlattr *nla_zone_limit,
static int ovs_ct_limit_get_default_limit(struct ovs_ct_limit_info *info,
struct sk_buff *reply)
{
- struct ovs_zone_limit zone_limit;
- int err;
+ struct ovs_zone_limit zone_limit = {
+ .zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE,
+ .limit = info->default_limit,
+ };
- zone_limit.zone_id = OVS_ZONE_LIMIT_DEFAULT_ZONE;
- zone_limit.limit = info->default_limit;
- err = nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
- if (err)
- return err;
-
- return 0;
+ return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
}
static int __ovs_ct_limit_get_zone_limit(struct net *net,
@@ -1970,8 +1770,8 @@ static int __ovs_ct_limit_get_zone_limit(struct net *net,
zone_limit.limit = limit;
nf_ct_zone_init(&ct_zone, zone_id, NF_CT_DEFAULT_ZONE_DIR, 0);
- zone_limit.count = nf_conncount_count(net, data, &conncount_key, NULL,
- &ct_zone);
+ zone_limit.count = nf_conncount_count_skb(net, NULL, 0, data,
+ &conncount_key);
return nla_put_nohdr(reply, sizeof(zone_limit), &zone_limit);
}
@@ -2128,7 +1928,11 @@ static int ovs_ct_limit_cmd_get(struct sk_buff *skb, struct genl_info *info)
if (IS_ERR(reply))
return PTR_ERR(reply);
- nla_reply = nla_nest_start(reply, OVS_CT_LIMIT_ATTR_ZONE_LIMIT);
+ nla_reply = nla_nest_start_noflag(reply, OVS_CT_LIMIT_ATTR_ZONE_LIMIT);
+ if (!nla_reply) {
+ err = -EMSGSIZE;
+ goto exit_err;
+ }
if (a[OVS_CT_LIMIT_ATTR_ZONE_LIMIT]) {
err = ovs_ct_limit_get_zone_limit(
@@ -2152,22 +1956,24 @@ exit_err:
return err;
}
-static struct genl_ops ct_limit_genl_ops[] = {
+static const struct genl_small_ops ct_limit_genl_ops[] = {
{ .cmd = OVS_CT_LIMIT_CMD_SET,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
- * privilege. */
- .policy = ct_limit_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+ * privilege.
+ */
.doit = ovs_ct_limit_cmd_set,
},
{ .cmd = OVS_CT_LIMIT_CMD_DEL,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN
- * privilege. */
- .policy = ct_limit_policy,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN
+ * privilege.
+ */
.doit = ovs_ct_limit_cmd_del,
},
{ .cmd = OVS_CT_LIMIT_CMD_GET,
+ .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
.flags = 0, /* OK for unprivileged users. */
- .policy = ct_limit_policy,
.doit = ovs_ct_limit_cmd_get,
},
};
@@ -2181,10 +1987,12 @@ struct genl_family dp_ct_limit_genl_family __ro_after_init = {
.name = OVS_CT_LIMIT_FAMILY,
.version = OVS_CT_LIMIT_VERSION,
.maxattr = OVS_CT_LIMIT_ATTR_MAX,
+ .policy = ct_limit_policy,
.netnsok = true,
.parallel_ops = true,
- .ops = ct_limit_genl_ops,
- .n_ops = ARRAY_SIZE(ct_limit_genl_ops),
+ .small_ops = ct_limit_genl_ops,
+ .n_small_ops = ARRAY_SIZE(ct_limit_genl_ops),
+ .resv_start_op = OVS_CT_LIMIT_CMD_GET + 1,
.mcgrps = &ovs_ct_limit_multicast_group,
.n_mcgrps = 1,
.module = THIS_MODULE,