summaryrefslogtreecommitdiff
path: root/net/core/skbuff.c
diff options
context:
space:
mode:
authorDavid S. Miller <davem@davemloft.net>2018-12-19 11:21:45 -0800
committerDavid S. Miller <davem@davemloft.net>2018-12-19 11:21:45 -0800
commit4a54877ee767fe70a6966352c788fc5f405aa3c6 (patch)
tree6c37280ad8ba6e1d028962bfb2e3ace1f590d3fc /net/core/skbuff.c
parent8239d5790481bf2e4e09e202bfd4a8a5ea8ca3a1 (diff)
parent4165079ba328dd47262a2183049d3591f0a750b1 (diff)
Merge branch 'sk_buff-add-extension-infrastructure'
Florian Westphal says: ==================== sk_buff: add extension infrastructure TL;DR: - objdiff shows no change if CONFIG_XFRM=n && BR_NETFILTER=n - small size reduction when one or both options are set - no changes in ipsec performance Changes since v1: - Allocate entire extension space from a kmem_cache. - Avoid atomic_dec_and_test operation on skb_ext_put() for refcnt == 1 case. (similar to kfree_skbmem() fclone_ref use). This adds an optional extension infrastructure, with ispec (xfrm) and bridge netfilter as first users. The third (future) user is Multipath TCP which is still out-of-tree. MPTCP needs to map logical mptcp sequence numbers to the tcp sequence numbers used by individual subflows. This DSS mapping is read/written from tcp option space on receive and written to tcp option space on transmitted tcp packets that are part of and MPTCP connection. Extending skb_shared_info or adding a private data field to skb fclones doesn't work for incoming skb, so a different DSS propagation method would be required for the receive side. mptcp has same requirements as secpath/bridge netfilter: 1. extension memory is released when the sk_buff is free'd. 2. data is shared after cloning an skb (clone inherits extension) 3. adding extension to an skb will COW the extension buffer if needed. Two new members are added to sk_buff: 1. 'active_extensions' byte (filling a hole), telling which extensions are available for this skb. This has two purposes. a) avoids the need to initialize the pointer. b) allows to "delete" an extension by clearing its bit value in ->active_extensions. While it would be possible to store the active_extensions byte in the extension struct instead of sk_buff, there is one problem with this: When an extension has to be disabled, we can always clear the bit in skb->active_extensions. But in case it would be stored in the extension buffer itself, we might have to COW it first, if we are dealing with a cloned skb. On kmalloc failure we would be unable to turn an extension off. 2. extension pointer, located at the end of the sk_buff. If the active_extensions byte is 0, the pointer is undefined, it is not initialized on skb allocation. This adds extra code to skb clone and free paths (to deal with refcount/free of extension area) but this replaces similar code that manages skb->nf_bridge and skb->sp structs in the followup patches of the series. It is possible to add support for extensions that are not preseved on clones/copies: 1. define a bitmask of all extensions that need copy/cow on clone 2. change __skb_ext_copy() to check ->active_extensions & SKB_EXT_PRESERVE_ON_CLONE 3. set clone->active_extensions to 0 if test is false. This isn't done here because all extensions that get added here need the copy/cow semantics. Last patch converts skb->sp, secpath information gets stored as new SKB_EXT_SEC_PATH, so the 'sp' pointer is removed from skbuff. Extra code added to skb clone and free paths (to deal with refcount/free of extension area) replaces the existing code that does the same for skb->nf_bridge and skb->secpath. I don't see any other in-tree users that could benefit from this infrastructure, it doesn't make sense to add an extension just for the sake of a single flag bit (like skb->nf_trace). Adding a new extension is a good fit if all of the following are true: 1. Data is related to the skb/packet aggregate 2. Data should be freed when the skb is free'd 3. Data is not going to be relevant/needed in normal case (udp, tcp, forwarding workloads, ...) 4. There are no fancy action(s) needed on clone/free, such as callbacks into kernel modules. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/core/skbuff.c')
-rw-r--r--net/core/skbuff.c201
1 files changed, 194 insertions, 7 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 40552547c69a..cb0bf4215745 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -79,6 +79,9 @@
struct kmem_cache *skbuff_head_cache __ro_after_init;
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
+#ifdef CONFIG_SKB_EXTENSIONS
+static struct kmem_cache *skbuff_ext_cache __ro_after_init;
+#endif
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);
@@ -606,7 +609,6 @@ fastpath:
void skb_release_head_state(struct sk_buff *skb)
{
skb_dst_drop(skb);
- secpath_reset(skb);
if (skb->destructor) {
WARN_ON(in_irq());
skb->destructor(skb);
@@ -614,9 +616,7 @@ void skb_release_head_state(struct sk_buff *skb)
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
nf_conntrack_put(skb_nfct(skb));
#endif
-#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
- nf_bridge_put(skb->nf_bridge);
-#endif
+ skb_ext_put(skb);
}
/* Free everything but the sk_buff shell. */
@@ -796,9 +796,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->dev = old->dev;
memcpy(new->cb, old->cb, sizeof(old->cb));
skb_dst_copy(new, old);
-#ifdef CONFIG_XFRM
- new->sp = secpath_get(old->sp);
-#endif
+ __skb_ext_copy(new, old);
__nf_copy(new, old, false);
/* Note : this field could be in headers_start/headers_end section
@@ -3902,6 +3900,46 @@ done:
}
EXPORT_SYMBOL_GPL(skb_gro_receive);
+#ifdef CONFIG_SKB_EXTENSIONS
+#define SKB_EXT_ALIGN_VALUE 8
+#define SKB_EXT_CHUNKSIZEOF(x) (ALIGN((sizeof(x)), SKB_EXT_ALIGN_VALUE) / SKB_EXT_ALIGN_VALUE)
+
+static const u8 skb_ext_type_len[] = {
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ [SKB_EXT_BRIDGE_NF] = SKB_EXT_CHUNKSIZEOF(struct nf_bridge_info),
+#endif
+#ifdef CONFIG_XFRM
+ [SKB_EXT_SEC_PATH] = SKB_EXT_CHUNKSIZEOF(struct sec_path),
+#endif
+};
+
+static __always_inline unsigned int skb_ext_total_length(void)
+{
+ return SKB_EXT_CHUNKSIZEOF(struct skb_ext) +
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ skb_ext_type_len[SKB_EXT_BRIDGE_NF] +
+#endif
+#ifdef CONFIG_XFRM
+ skb_ext_type_len[SKB_EXT_SEC_PATH] +
+#endif
+ 0;
+}
+
+static void skb_extensions_init(void)
+{
+ BUILD_BUG_ON(SKB_EXT_NUM >= 8);
+ BUILD_BUG_ON(skb_ext_total_length() > 255);
+
+ skbuff_ext_cache = kmem_cache_create("skbuff_ext_cache",
+ SKB_EXT_ALIGN_VALUE * skb_ext_total_length(),
+ 0,
+ SLAB_HWCACHE_ALIGN|SLAB_PANIC,
+ NULL);
+}
+#else
+static void skb_extensions_init(void) {}
+#endif
+
void __init skb_init(void)
{
skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache",
@@ -3916,6 +3954,7 @@ void __init skb_init(void)
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
+ skb_extensions_init();
}
static int
@@ -5554,3 +5593,151 @@ void skb_condense(struct sk_buff *skb)
*/
skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
}
+
+#ifdef CONFIG_SKB_EXTENSIONS
+static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
+{
+ return (void *)ext + (ext->offset[id] * SKB_EXT_ALIGN_VALUE);
+}
+
+static struct skb_ext *skb_ext_alloc(void)
+{
+ struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
+
+ if (new) {
+ memset(new->offset, 0, sizeof(new->offset));
+ refcount_set(&new->refcnt, 1);
+ }
+
+ return new;
+}
+
+static struct skb_ext *skb_ext_maybe_cow(struct skb_ext *old,
+ unsigned int old_active)
+{
+ struct skb_ext *new;
+
+ if (refcount_read(&old->refcnt) == 1)
+ return old;
+
+ new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC);
+ if (!new)
+ return NULL;
+
+ memcpy(new, old, old->chunks * SKB_EXT_ALIGN_VALUE);
+ refcount_set(&new->refcnt, 1);
+
+#ifdef CONFIG_XFRM
+ if (old_active & (1 << SKB_EXT_SEC_PATH)) {
+ struct sec_path *sp = skb_ext_get_ptr(old, SKB_EXT_SEC_PATH);
+ unsigned int i;
+
+ for (i = 0; i < sp->len; i++)
+ xfrm_state_hold(sp->xvec[i]);
+ }
+#endif
+ __skb_ext_put(old);
+ return new;
+}
+
+/**
+ * skb_ext_add - allocate space for given extension, COW if needed
+ * @skb: buffer
+ * @id: extension to allocate space for
+ *
+ * Allocates enough space for the given extension.
+ * If the extension is already present, a pointer to that extension
+ * is returned.
+ *
+ * If the skb was cloned, COW applies and the returned memory can be
+ * modified without changing the extension space of clones buffers.
+ *
+ * Returns pointer to the extension or NULL on allocation failure.
+ */
+void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
+{
+ struct skb_ext *new, *old = NULL;
+ unsigned int newlen, newoff;
+
+ if (skb->active_extensions) {
+ old = skb->extensions;
+
+ new = skb_ext_maybe_cow(old, skb->active_extensions);
+ if (!new)
+ return NULL;
+
+ if (__skb_ext_exist(old, id)) {
+ if (old != new)
+ skb->extensions = new;
+ goto set_active;
+ }
+
+ newoff = old->chunks;
+ } else {
+ newoff = SKB_EXT_CHUNKSIZEOF(*new);
+
+ new = skb_ext_alloc();
+ if (!new)
+ return NULL;
+ }
+
+ newlen = newoff + skb_ext_type_len[id];
+ new->chunks = newlen;
+ new->offset[id] = newoff;
+ skb->extensions = new;
+set_active:
+ skb->active_extensions |= 1 << id;
+ return skb_ext_get_ptr(new, id);
+}
+EXPORT_SYMBOL(skb_ext_add);
+
+#ifdef CONFIG_XFRM
+static void skb_ext_put_sp(struct sec_path *sp)
+{
+ unsigned int i;
+
+ for (i = 0; i < sp->len; i++)
+ xfrm_state_put(sp->xvec[i]);
+}
+#endif
+
+void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
+{
+ struct skb_ext *ext = skb->extensions;
+
+ skb->active_extensions &= ~(1 << id);
+ if (skb->active_extensions == 0) {
+ skb->extensions = NULL;
+ __skb_ext_put(ext);
+#ifdef CONFIG_XFRM
+ } else if (id == SKB_EXT_SEC_PATH &&
+ refcount_read(&ext->refcnt) == 1) {
+ struct sec_path *sp = skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH);
+
+ skb_ext_put_sp(sp);
+ sp->len = 0;
+#endif
+ }
+}
+EXPORT_SYMBOL(__skb_ext_del);
+
+void __skb_ext_put(struct skb_ext *ext)
+{
+ /* If this is last clone, nothing can increment
+ * it after check passes. Avoids one atomic op.
+ */
+ if (refcount_read(&ext->refcnt) == 1)
+ goto free_now;
+
+ if (!refcount_dec_and_test(&ext->refcnt))
+ return;
+free_now:
+#ifdef CONFIG_XFRM
+ if (__skb_ext_exist(ext, SKB_EXT_SEC_PATH))
+ skb_ext_put_sp(skb_ext_get_ptr(ext, SKB_EXT_SEC_PATH));
+#endif
+
+ kmem_cache_free(skbuff_ext_cache, ext);
+}
+EXPORT_SYMBOL(__skb_ext_put);
+#endif /* CONFIG_SKB_EXTENSIONS */