Diffstat (limited to 'net/core/skbuff.c')
 net/core/skbuff.c | 237 +++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 170 insertions(+), 67 deletions(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index a31ff4d83ecc..eb7d33b41e71 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -79,15 +79,44 @@
#include <linux/capability.h>
#include <linux/user_namespace.h>
#include <linux/indirect_call_wrapper.h>
+#include <linux/textsearch.h>
#include "dev.h"
#include "sock_destructor.h"
-struct kmem_cache *skbuff_head_cache __ro_after_init;
+struct kmem_cache *skbuff_cache __ro_after_init;
static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
+
+/* skb_small_head_cache and related code is only supported
+ * for CONFIG_SLAB and CONFIG_SLUB.
+ * As soon as SLOB is removed from the kernel, we can clean this up.
+ */
+#if !defined(CONFIG_SLOB)
+# define HAVE_SKB_SMALL_HEAD_CACHE 1
+#endif
+
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+static struct kmem_cache *skb_small_head_cache __ro_after_init;
+
+#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
+
+/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
+ * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
+ * size, and we can differentiate heads from skb_small_head_cache
+ * vs system slabs by looking at their size (skb_end_offset()).
+ */
+#define SKB_SMALL_HEAD_CACHE_SIZE \
+ (is_power_of_2(SKB_SMALL_HEAD_SIZE) ? \
+ (SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) : \
+ SKB_SMALL_HEAD_SIZE)
+
+#define SKB_SMALL_HEAD_HEADROOM \
+ SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
+#endif /* HAVE_SKB_SMALL_HEAD_CACHE */
+
int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
EXPORT_SYMBOL(sysctl_max_skb_frags);
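As a sanity check on the sizing rule above, here is a standalone sketch of the power-of-two bump; it is not kernel code, and the 704-byte input is an illustrative stand-in for SKB_HEAD_ALIGN(MAX_TCP_HEADER), which varies with the kernel configuration.

#include <stdbool.h>
#include <stdio.h>

#define L1_CACHE_BYTES 64	/* typical x86-64 value; config dependent */

static bool is_power_of_2(unsigned long n)
{
	return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
	unsigned long small_head_size = 704;	/* stand-in for SKB_HEAD_ALIGN(MAX_TCP_HEADER) */
	unsigned long cache_size = is_power_of_2(small_head_size) ?
				   small_head_size + L1_CACHE_BYTES :
				   small_head_size;

	/* 704 is not a power of two, so it is used as-is; a power-of-two
	 * input would be bumped by one cache line to stay unique.
	 */
	printf("small head cache size: %lu\n", cache_size);
	return 0;
}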
@@ -256,7 +285,7 @@ static struct sk_buff *napi_skb_cache_get(void)
struct sk_buff *skb;
if (unlikely(!nc->skb_count)) {
- nc->skb_count = kmem_cache_alloc_bulk(skbuff_head_cache,
+ nc->skb_count = kmem_cache_alloc_bulk(skbuff_cache,
GFP_ATOMIC,
NAPI_SKB_CACHE_BULK,
nc->skb_cache);
@@ -265,7 +294,7 @@ static struct sk_buff *napi_skb_cache_get(void)
}
skb = nc->skb_cache[--nc->skb_count];
- kasan_unpoison_object_data(skbuff_head_cache, skb);
+ kasan_unpoison_object_data(skbuff_cache, skb);
return skb;
}
@@ -323,7 +352,7 @@ struct sk_buff *slab_build_skb(void *data)
struct sk_buff *skb;
unsigned int size;
- skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+ skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
if (unlikely(!skb))
return NULL;
@@ -374,7 +403,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
struct sk_buff *skb;
- skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+ skb = kmem_cache_alloc(skbuff_cache, GFP_ATOMIC);
if (unlikely(!skb))
return NULL;
@@ -386,8 +415,6 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
/* build_skb() is a wrapper over __build_skb() that specifically
 * takes care of skb->head and skb->pfmemalloc
- * This means that if @frag_size is not zero, then @data must be backed
- * by a page fragment, not kmalloc() or vmalloc()
*/
struct sk_buff *build_skb(void *data, unsigned int frag_size)
{
@@ -406,7 +433,7 @@ EXPORT_SYMBOL(build_skb);
* build_skb_around - build a network buffer around provided skb
* @skb: sk_buff provided by caller, must be memset cleared
* @data: data buffer provided by caller
- * @frag_size: size of data, or 0 if head was kmalloced
+ * @frag_size: size of data
*/
struct sk_buff *build_skb_around(struct sk_buff *skb,
void *data, unsigned int frag_size)
@@ -428,7 +455,7 @@ EXPORT_SYMBOL(build_skb_around);
/**
* __napi_build_skb - build a network buffer
* @data: data buffer provided by caller
- * @frag_size: size of data, or 0 if head was kmalloced
+ * @frag_size: size of data
*
* Version of __build_skb() that uses NAPI percpu caches to obtain
* skbuff_head instead of inplace allocation.
@@ -452,7 +479,7 @@ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
/**
* napi_build_skb - build a network buffer
* @data: data buffer provided by caller
- * @frag_size: size of data, or 0 if head was kmalloced
+ * @frag_size: size of data
*
* Version of __napi_build_skb() that takes care of skb->head_frag
* and skb->pfmemalloc when the data is a page or page fragment.
@@ -479,17 +506,37 @@ EXPORT_SYMBOL(napi_build_skb);
* may be used. Otherwise, the packet data may be discarded until enough
* memory is free
*/
-static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
+static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
bool *pfmemalloc)
{
- void *obj;
bool ret_pfmemalloc = false;
+ unsigned int obj_size;
+ void *obj;
+
+ obj_size = SKB_HEAD_ALIGN(*size);
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+ if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
+ !(flags & KMALLOC_NOT_NORMAL_BITS)) {
+ /* skb_small_head_cache has non power of two size,
+ * likely forcing SLUB to use order-3 pages.
+ * We deliberately attempt a NOMEMALLOC allocation only.
+ */
+ obj = kmem_cache_alloc_node(skb_small_head_cache,
+ flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ node);
+ if (obj) {
+ *size = SKB_SMALL_HEAD_CACHE_SIZE;
+ goto out;
+ }
+ }
+#endif
+ *size = obj_size = kmalloc_size_roundup(obj_size);
/*
* Try a regular allocation, when that fails and we're not entitled
* to the reserves, fail.
*/
- obj = kmalloc_node_track_caller(size,
+ obj = kmalloc_node_track_caller(obj_size,
flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
node);
if (obj || !(gfp_pfmemalloc_allowed(flags)))
@@ -497,7 +544,7 @@ static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
/* Try again but now we are using pfmemalloc reserves */
ret_pfmemalloc = true;
- obj = kmalloc_node_track_caller(size, flags, node);
+ obj = kmalloc_node_track_caller(obj_size, flags, node);
out:
if (pfmemalloc)
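With this change, size becomes an in/out parameter: callers pass the head size they need and read back the size actually obtained, so the rounding logic lives in one place. A sketch of the contract from a caller's perspective (the caller shown here is hypothetical; kmalloc_reserve() is static to this file):

unsigned int size = 128;	/* head bytes requested (illustrative) */
bool pfmemalloc;
u8 *data = kmalloc_reserve(&size, GFP_ATOMIC, NUMA_NO_NODE, &pfmemalloc);

if (data) {
	/* size now holds either SKB_SMALL_HEAD_CACHE_SIZE or the
	 * kmalloc_size_roundup() result; the usable room is
	 * SKB_WITH_OVERHEAD(size), with no re-rounding at the call site.
	 */
}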
@@ -534,12 +581,11 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
{
struct kmem_cache *cache;
struct sk_buff *skb;
- unsigned int osize;
bool pfmemalloc;
u8 *data;
cache = (flags & SKB_ALLOC_FCLONE)
- ? skbuff_fclone_cache : skbuff_head_cache;
+ ? skbuff_fclone_cache : skbuff_cache;
if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
gfp_mask |= __GFP_MEMALLOC;
@@ -559,18 +605,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
* aligned memory blocks, unless SLUB/SLAB debug is enabled.
* Both skb->head and skb_shared_info are cache line aligned.
*/
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- osize = kmalloc_size_roundup(size);
- data = kmalloc_reserve(osize, gfp_mask, node, &pfmemalloc);
+ data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
if (unlikely(!data))
goto nodata;
/* kmalloc_size_roundup() might give us more room than requested.
* Put skb_shared_info exactly at the end of allocated zone,
* to allow max possible filling before reallocation.
*/
- size = SKB_WITH_OVERHEAD(osize);
- prefetchw(data + size);
+ prefetchw(data + SKB_WITH_OVERHEAD(size));
/*
* Only clear those fields we need to clear, not those that we will
@@ -578,7 +620,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
* the tail pointer in struct sk_buff!
*/
memset(skb, 0, offsetof(struct sk_buff, tail));
- __build_skb_around(skb, data, osize);
+ __build_skb_around(skb, data, size);
skb->pfmemalloc = pfmemalloc;
if (flags & SKB_ALLOC_FCLONE) {
@@ -633,8 +675,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
goto skb_success;
}
- len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- len = SKB_DATA_ALIGN(len);
+ len = SKB_HEAD_ALIGN(len);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
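Here and below, the repeated two-line alignment sequence collapses into SKB_HEAD_ALIGN(), presumably introduced in a companion <linux/skbuff.h> change as:

#define SKB_HEAD_ALIGN(X) (SKB_DATA_ALIGN(X) + \
			   SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))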
@@ -733,8 +774,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
} else {
- len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- len = SKB_DATA_ALIGN(len);
+ len = SKB_HEAD_ALIGN(len);
data = page_frag_alloc(&nc->page, len, gfp_mask);
pfmemalloc = nc->page.pfmemalloc;
@@ -810,6 +850,16 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data)
return page_pool_return_skb_page(virt_to_page(data));
}
+static void skb_kfree_head(void *head, unsigned int end_offset)
+{
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+ if (end_offset == SKB_SMALL_HEAD_HEADROOM)
+ kmem_cache_free(skb_small_head_cache, head);
+ else
+#endif
+ kfree(head);
+}
+
static void skb_free_head(struct sk_buff *skb)
{
unsigned char *head = skb->head;
@@ -819,7 +869,7 @@ static void skb_free_head(struct sk_buff *skb)
return;
skb_free_frag(head);
} else {
- kfree(head);
+ skb_kfree_head(head, skb_end_offset(skb));
}
}
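The end-offset test is unambiguous because kmalloc() buckets are power-of-two sized while SKB_SMALL_HEAD_CACHE_SIZE is deliberately not, so no kmalloc-backed head can report SKB_SMALL_HEAD_HEADROOM. A hypothetical helper (not part of the patch) stating the rule:

/* Hypothetical, for illustration only. */
static inline bool skb_head_is_small_cache(const struct sk_buff *skb)
{
	return skb_end_offset(skb) == SKB_SMALL_HEAD_HEADROOM;
}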
@@ -871,7 +921,7 @@ static void kfree_skbmem(struct sk_buff *skb)
switch (skb->fclone) {
case SKB_FCLONE_UNAVAILABLE:
- kmem_cache_free(skbuff_head_cache, skb);
+ kmem_cache_free(skbuff_cache, skb);
return;
case SKB_FCLONE_ORIG:
@@ -932,6 +982,21 @@ void __kfree_skb(struct sk_buff *skb)
}
EXPORT_SYMBOL(__kfree_skb);
+static __always_inline
+bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+{
+ if (unlikely(!skb_unref(skb)))
+ return false;
+
+ DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);
+
+ if (reason == SKB_CONSUMED)
+ trace_consume_skb(skb, __builtin_return_address(0));
+ else
+ trace_kfree_skb(skb, __builtin_return_address(0), reason);
+ return true;
+}
+
/**
* kfree_skb_reason - free an sk_buff with special reason
* @skb: buffer to free
@@ -944,28 +1009,58 @@ EXPORT_SYMBOL(__kfree_skb);
void __fix_address
kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
{
- if (unlikely(!skb_unref(skb)))
+ if (__kfree_skb_reason(skb, reason))
+ __kfree_skb(skb);
+}
+EXPORT_SYMBOL(kfree_skb_reason);
+
+#define KFREE_SKB_BULK_SIZE 16
+
+struct skb_free_array {
+ unsigned int skb_count;
+ void *skb_array[KFREE_SKB_BULK_SIZE];
+};
+
+static void kfree_skb_add_bulk(struct sk_buff *skb,
+ struct skb_free_array *sa,
+ enum skb_drop_reason reason)
+{
+ /* if SKB is a clone, don't handle this case */
+ if (unlikely(skb->fclone != SKB_FCLONE_UNAVAILABLE)) {
+ __kfree_skb(skb);
return;
+ }
- DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX);
+ skb_release_all(skb, reason);
+ sa->skb_array[sa->skb_count++] = skb;
- if (reason == SKB_CONSUMED)
- trace_consume_skb(skb);
- else
- trace_kfree_skb(skb, __builtin_return_address(0), reason);
- __kfree_skb(skb);
+ if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
+ kmem_cache_free_bulk(skbuff_cache, KFREE_SKB_BULK_SIZE,
+ sa->skb_array);
+ sa->skb_count = 0;
+ }
}
-EXPORT_SYMBOL(kfree_skb_reason);
-void kfree_skb_list_reason(struct sk_buff *segs,
- enum skb_drop_reason reason)
+void __fix_address
+kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
{
+ struct skb_free_array sa;
+
+ sa.skb_count = 0;
+
while (segs) {
struct sk_buff *next = segs->next;
- kfree_skb_reason(segs, reason);
+ if (__kfree_skb_reason(segs, reason)) {
+ skb_poison_list(segs);
+ kfree_skb_add_bulk(segs, &sa, reason);
+ }
+
segs = next;
}
+
+ if (sa.skb_count)
+ kmem_cache_free_bulk(skbuff_cache, sa.skb_count, sa.skb_array);
}
EXPORT_SYMBOL(kfree_skb_list_reason);
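List freeing now batches struct sk_buff heads through kmem_cache_free_bulk() in groups of up to KFREE_SKB_BULK_SIZE (16) rather than one kmem_cache_free() per skb. Usage is unchanged for callers; a hedged sketch (the queue-building helper is hypothetical):

struct sk_buff *segs = my_detach_queue();	/* my_detach_queue() is hypothetical */

kfree_skb_list_reason(segs, SKB_DROP_REASON_NOT_SPECIFIED);
/* Clones still take the slow __kfree_skb() path; all other heads are
 * poisoned, accumulated, and freed in bulk batches of 16.
 */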
@@ -1094,7 +1189,7 @@ void consume_skb(struct sk_buff *skb)
if (!skb_unref(skb))
return;
- trace_consume_skb(skb);
+ trace_consume_skb(skb, __builtin_return_address(0));
__kfree_skb(skb);
}
EXPORT_SYMBOL(consume_skb);
@@ -1109,7 +1204,7 @@ EXPORT_SYMBOL(consume_skb);
*/
void __consume_stateless_skb(struct sk_buff *skb)
{
- trace_consume_skb(skb);
+ trace_consume_skb(skb, __builtin_return_address(0));
skb_release_data(skb, SKB_CONSUMED);
kfree_skbmem(skb);
}
@@ -1119,15 +1214,15 @@ static void napi_skb_cache_put(struct sk_buff *skb)
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
u32 i;
- kasan_poison_object_data(skbuff_head_cache, skb);
+ kasan_poison_object_data(skbuff_cache, skb);
nc->skb_cache[nc->skb_count++] = skb;
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
- kasan_unpoison_object_data(skbuff_head_cache,
+ kasan_unpoison_object_data(skbuff_cache,
nc->skb_cache[i]);
- kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_HALF,
+ kmem_cache_free_bulk(skbuff_cache, NAPI_SKB_CACHE_HALF,
nc->skb_cache + NAPI_SKB_CACHE_HALF);
nc->skb_count = NAPI_SKB_CACHE_HALF;
}
@@ -1165,7 +1260,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
/* if reaching here SKB is ready to free */
- trace_consume_skb(skb);
+ trace_consume_skb(skb, __builtin_return_address(0));
/* if SKB is a clone, don't handle this case */
if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
@@ -1311,14 +1406,18 @@ EXPORT_SYMBOL_GPL(skb_morph);
int mm_account_pinned_pages(struct mmpin *mmp, size_t size)
{
- unsigned long max_pg, num_pg, new_pg, old_pg;
+ unsigned long max_pg, num_pg, new_pg, old_pg, rlim;
struct user_struct *user;
if (capable(CAP_IPC_LOCK) || !size)
return 0;
+ rlim = rlimit(RLIMIT_MEMLOCK);
+ if (rlim == RLIM_INFINITY)
+ return 0;
+
num_pg = (size >> PAGE_SHIFT) + 2; /* worst case */
- max_pg = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ max_pg = rlim >> PAGE_SHIFT;
user = mmp->user ? : current_user();
old_pg = atomic_long_read(&user->locked_vm);
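The early return matters because this accounting exists only to enforce RLIMIT_MEMLOCK: when the limit is RLIM_INFINITY, the atomic cmpxchg loop on user->locked_vm is pure overhead on a shared cache line. For example (call site shaped like this file's msg_zerocopy_alloc(); the surrounding variables are assumptions):

/* With "ulimit -l unlimited", this now returns 0 immediately and the
 * cmpxchg retry loop on user->locked_vm is never entered.
 */
if (mm_account_pinned_pages(&uarg->mmp, size))
	return NULL;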
@@ -1711,7 +1810,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- n = kmem_cache_alloc(skbuff_head_cache, gfp_mask);
+ n = kmem_cache_alloc(skbuff_cache, gfp_mask);
if (!n)
return NULL;
@@ -1893,10 +1992,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- size = kmalloc_size_roundup(size);
- data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
+ data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
goto nodata;
size = SKB_WITH_OVERHEAD(size);
@@ -1959,7 +2055,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
return 0;
nofrags:
- kfree(data);
+ skb_kfree_head(data, size);
nodata:
return -ENOMEM;
}
@@ -4584,7 +4680,7 @@ static void skb_extensions_init(void) {}
void __init skb_init(void)
{
- skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache",
+ skbuff_cache = kmem_cache_create_usercopy("skbuff_head_cache",
sizeof(struct sk_buff),
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
@@ -4596,6 +4692,19 @@ void __init skb_init(void)
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+ /* usercopy should only access the first SKB_SMALL_HEAD_HEADROOM bytes.
+ * struct skb_shared_info is located at the end of skb->head,
+ * and should not be copied to/from user.
+ */
+ skb_small_head_cache = kmem_cache_create_usercopy("skbuff_small_head",
+ SKB_SMALL_HEAD_CACHE_SIZE,
+ 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+ 0,
+ SKB_SMALL_HEAD_HEADROOM,
+ NULL);
+#endif
skb_extensions_init();
}
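For reference, the parameter layout of kmem_cache_create_usercopy() as invoked above:

/* kmem_cache_create_usercopy(name, size, align, flags,
 *			      useroffset, usersize, ctor)
 *
 * useroffset = 0 and usersize = SKB_SMALL_HEAD_HEADROOM confine
 * hardened-usercopy windows to the head bytes; the trailing
 * struct skb_shared_info stays out of reach of copy_to_user()
 * and copy_from_user().
 */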
@@ -5450,7 +5559,7 @@ void kfree_skb_partial(struct sk_buff *skb, bool head_stolen)
{
if (head_stolen) {
skb_release_head_state(skb);
- kmem_cache_free(skbuff_head_cache, skb);
+ kmem_cache_free(skbuff_cache, skb);
} else {
__kfree_skb(skb);
}
@@ -6244,10 +6353,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- size = kmalloc_size_roundup(size);
- data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
+ data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
return -ENOMEM;
size = SKB_WITH_OVERHEAD(size);
@@ -6263,7 +6369,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
if (skb_cloned(skb)) {
/* drop the old head gracefully */
if (skb_orphan_frags(skb, gfp_mask)) {
- kfree(data);
+ skb_kfree_head(data, size);
return -ENOMEM;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -6363,10 +6469,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
if (skb_pfmemalloc(skb))
gfp_mask |= __GFP_MEMALLOC;
- size = SKB_DATA_ALIGN(size);
- size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
- size = kmalloc_size_roundup(size);
- data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
+ data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
if (!data)
return -ENOMEM;
size = SKB_WITH_OVERHEAD(size);
@@ -6374,7 +6477,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
memcpy((struct skb_shared_info *)(data + size),
skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
if (skb_orphan_frags(skb, gfp_mask)) {
- kfree(data);
+ skb_kfree_head(data, size);
return -ENOMEM;
}
shinfo = (struct skb_shared_info *)(data + size);
@@ -6410,7 +6513,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
/* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
if (skb_has_frag_list(skb))
kfree_skb_list(skb_shinfo(skb)->frag_list);
- kfree(data);
+ skb_kfree_head(data, size);
return -ENOMEM;
}
skb_release_data(skb, SKB_CONSUMED);