summaryrefslogtreecommitdiff
path: root/net/core/skbuff.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/core/skbuff.c')
-rw-r--r--net/core/skbuff.c826
1 files changed, 516 insertions, 310 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b99127712e67..85fc82f72d26 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -51,6 +51,7 @@
#endif
#include <linux/string.h>
#include <linux/skbuff.h>
+#include <linux/skbuff_ref.h>
#include <linux/splice.h>
#include <linux/cache.h>
#include <linux/rtnetlink.h>
@@ -63,11 +64,13 @@
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <linux/iov_iter.h>
+#include <linux/crc32.h>
#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
+#include <net/gro.h>
#include <net/gso.h>
#include <net/hotdata.h>
#include <net/ip6_checksum.h>
@@ -87,13 +90,17 @@
#include <linux/textsearch.h>
#include "dev.h"
+#include "devmem.h"
+#include "netmem_priv.h"
#include "sock_destructor.h"
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
-#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
+#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN)
+#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \
+ GRO_MAX_HEAD_PAD))
/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
* This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
@@ -108,9 +115,6 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#define SKB_SMALL_HEAD_HEADROOM \
SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
-int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
-EXPORT_SYMBOL(sysctl_max_skb_frags);
-
/* kcm_write_msgs() relies on casting paged frags to bio_vec to use
* iov_iter_bvec(). These static asserts ensure the cast is valid is long as the
* netmem is a page.
@@ -221,98 +225,31 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
#define NAPI_SKB_CACHE_BULK 16
#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
-#if PAGE_SIZE == SZ_4K
-
-#define NAPI_HAS_SMALL_PAGE_FRAG 1
-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc)
-
-/* specialized page frag allocator using a single order 0 page
- * and slicing it into 1K sized fragment. Constrained to systems
- * with a very limited amount of 1K fragments fitting a single
- * page - to avoid excessive truesize underestimation
- */
-
-struct page_frag_1k {
- void *va;
- u16 offset;
- bool pfmemalloc;
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
-{
- struct page *page;
- int offset;
-
- offset = nc->offset - SZ_1K;
- if (likely(offset >= 0))
- goto use_frag;
-
- page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
- if (!page)
- return NULL;
-
- nc->va = page_address(page);
- nc->pfmemalloc = page_is_pfmemalloc(page);
- offset = PAGE_SIZE - SZ_1K;
- page_ref_add(page, offset / SZ_1K);
-
-use_frag:
- nc->offset = offset;
- return nc->va + offset;
-}
-#else
-
-/* the small page is actually unused in this build; add dummy helpers
- * to please the compiler and avoid later preprocessor's conditionals
- */
-#define NAPI_HAS_SMALL_PAGE_FRAG 0
-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false
-
-struct page_frag_1k {
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
-{
- return NULL;
-}
-
-#endif
-
struct napi_alloc_cache {
+ local_lock_t bh_lock;
struct page_frag_cache page;
- struct page_frag_1k page_small;
unsigned int skb_count;
void *skb_cache[NAPI_SKB_CACHE_SIZE];
};
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
-
-/* Double check that napi_get_frags() allocates skbs with
- * skb->head being backed by slab, not a page fragment.
- * This is to make sure bug fixed in 3226b158e67c
- * ("net: avoid 32 x truesize under-estimation for tiny skbs")
- * does not accidentally come back.
- */
-void napi_get_frags_check(struct napi_struct *napi)
-{
- struct sk_buff *skb;
-
- local_bh_disable();
- skb = napi_get_frags(napi);
- WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
- napi_free_frags(napi);
- local_bh_enable();
-}
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
+ .bh_lock = INIT_LOCAL_LOCK(bh_lock),
+};
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ void *data;
fragsz = SKB_DATA_ALIGN(fragsz);
- return __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
- align_mask);
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+ data = __page_frag_alloc_align(&nc->page, fragsz,
+ GFP_ATOMIC | __GFP_NOWARN, align_mask);
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+ return data;
+
}
EXPORT_SYMBOL(__napi_alloc_frag_align);
@@ -320,19 +257,16 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
void *data;
- fragsz = SKB_DATA_ALIGN(fragsz);
if (in_hardirq() || irqs_disabled()) {
struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
- data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC,
+ fragsz = SKB_DATA_ALIGN(fragsz);
+ data = __page_frag_alloc_align(nc, fragsz,
+ GFP_ATOMIC | __GFP_NOWARN,
align_mask);
} else {
- struct napi_alloc_cache *nc;
-
local_bh_disable();
- nc = this_cpu_ptr(&napi_alloc_cache);
- data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
- align_mask);
+ data = __napi_alloc_frag_align(fragsz, align_mask);
local_bh_enable();
}
return data;
@@ -344,21 +278,87 @@ static struct sk_buff *napi_skb_cache_get(void)
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
if (unlikely(!nc->skb_count)) {
nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
- GFP_ATOMIC,
+ GFP_ATOMIC | __GFP_NOWARN,
NAPI_SKB_CACHE_BULK,
nc->skb_cache);
- if (unlikely(!nc->skb_count))
+ if (unlikely(!nc->skb_count)) {
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
return NULL;
+ }
}
skb = nc->skb_cache[--nc->skb_count];
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));
return skb;
}
+/**
+ * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
+ * @skbs: pointer to an at least @n-sized array to fill with skb pointers
+ * @n: number of entries to provide
+ *
+ * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes
+ * the pointers into the provided array @skbs. If there are less entries
+ * available, tries to replenish the cache and bulk-allocates the diff from
+ * the MM layer if needed.
+ * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are
+ * ready for {,__}build_skb_around() and don't have any data buffers attached.
+ * Must be called *only* from the BH context.
+ *
+ * Return: number of successfully allocated skbs (@n if no actual allocation
+ * needed or kmem_cache_alloc_bulk() didn't fail).
+ */
+u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ u32 bulk, total = n;
+
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+
+ if (nc->skb_count >= n)
+ goto get;
+
+ /* No enough cached skbs. Try refilling the cache first */
+ bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK);
+ nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN, bulk,
+ &nc->skb_cache[nc->skb_count]);
+ if (likely(nc->skb_count >= n))
+ goto get;
+
+ /* Still not enough. Bulk-allocate the missing part directly, zeroed */
+ n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN,
+ n - nc->skb_count, &skbs[nc->skb_count]);
+ if (likely(nc->skb_count >= n))
+ goto get;
+
+ /* kmem_cache didn't allocate the number we need, limit the output */
+ total -= n - nc->skb_count;
+ n = nc->skb_count;
+
+get:
+ for (u32 base = nc->skb_count - n, i = 0; i < n; i++) {
+ u32 cache_size = kmem_cache_size(net_hotdata.skbuff_cache);
+
+ skbs[i] = nc->skb_cache[base + i];
+
+ kasan_mempool_unpoison_object(skbs[i], cache_size);
+ memset(skbs[i], 0, offsetof(struct sk_buff, tail));
+ }
+
+ nc->skb_count -= n;
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+
+ return total;
+}
+EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk);
+
static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
unsigned int size)
{
@@ -412,7 +412,8 @@ struct sk_buff *slab_build_skb(void *data)
struct sk_buff *skb;
unsigned int size;
- skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
+ skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN);
if (unlikely(!skb))
return NULL;
@@ -463,7 +464,8 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
struct sk_buff *skb;
- skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
+ skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN);
if (unlikely(!skb))
return NULL;
@@ -726,7 +728,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
/* If requested length is either too small or too big,
* we use kmalloc() for skb->head allocation.
*/
- if (len <= SKB_WITH_OVERHEAD(1024) ||
+ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
@@ -743,12 +745,16 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc(nc, len, gfp_mask);
- pfmemalloc = nc->pfmemalloc;
+ pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc(nc, len, gfp_mask);
- pfmemalloc = nc->pfmemalloc;
+ pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
+
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
local_bh_enable();
}
@@ -775,10 +781,9 @@ skb_fail:
EXPORT_SYMBOL(__netdev_alloc_skb);
/**
- * __napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
+ * napi_alloc_skb - allocate skbuff for rx in a specific NAPI instance
* @napi: napi instance this buffer was allocated for
* @len: length to allocate
- * @gfp_mask: get_free_pages mask, passed to alloc_skb and alloc_pages
*
* Allocate a new sk_buff for use in NAPI receive. This buffer will
* attempt to allocate the head from a special reserved region used
@@ -787,9 +792,9 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
*
* %NULL is returned if there is no free memory.
*/
-struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
- gfp_t gfp_mask)
+struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
{
+ gfp_t gfp_mask = GFP_ATOMIC | __GFP_NOWARN;
struct napi_alloc_cache *nc;
struct sk_buff *skb;
bool pfmemalloc;
@@ -800,10 +805,8 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
/* If requested length is either too small or too big,
* we use kmalloc() for skb->head allocation.
- * When the small frag allocator is available, prefer it over kmalloc
- * for small fragments
*/
- if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
+ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
@@ -813,32 +816,17 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
goto skb_success;
}
- nc = this_cpu_ptr(&napi_alloc_cache);
+ len = SKB_HEAD_ALIGN(len);
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
- /* we are artificially inflating the allocation size, but
- * that is not as bad as it may look like, as:
- * - 'len' less than GRO_MAX_HEAD makes little sense
- * - On most systems, larger 'len' values lead to fragment
- * size above 512 bytes
- * - kmalloc would use the kmalloc-1k slab for such values
- * - Builds with smaller GRO_MAX_HEAD will very likely do
- * little networking, as that implies no WiFi and no
- * tunnels support, and 32 bits arches.
- */
- len = SZ_1K;
-
- data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
- pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
- } else {
- len = SKB_HEAD_ALIGN(len);
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+ nc = this_cpu_ptr(&napi_alloc_cache);
- data = page_frag_alloc(&nc->page, len, gfp_mask);
- pfmemalloc = nc->page.pfmemalloc;
- }
+ data = page_frag_alloc(&nc->page, len, gfp_mask);
+ pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
if (unlikely(!data))
return NULL;
@@ -860,7 +848,7 @@ skb_success:
skb_fail:
return skb;
}
-EXPORT_SYMBOL(__napi_alloc_skb);
+EXPORT_SYMBOL(napi_alloc_skb);
void skb_add_rx_frag_netmem(struct sk_buff *skb, int i, netmem_ref netmem,
int off, int size, unsigned int truesize)
@@ -907,11 +895,6 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
-static bool is_pp_page(struct page *page)
-{
- return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
-}
-
int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
unsigned int headroom)
{
@@ -995,7 +978,7 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
EXPORT_SYMBOL(skb_pp_cow_data);
int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
- struct bpf_prog *prog)
+ const struct bpf_prog *prog)
{
if (!prog->aux->xdp_has_frags)
return -EINVAL;
@@ -1005,56 +988,25 @@ int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
EXPORT_SYMBOL(skb_cow_data_for_xdp);
#if IS_ENABLED(CONFIG_PAGE_POOL)
-bool napi_pp_put_page(struct page *page, bool napi_safe)
+bool napi_pp_put_page(netmem_ref netmem)
{
- bool allow_direct = false;
- struct page_pool *pp;
-
- page = compound_head(page);
+ netmem = netmem_compound_head(netmem);
- /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
- * in order to preserve any existing bits, such as bit 0 for the
- * head page of compound page and bit 1 for pfmemalloc page, so
- * mask those bits for freeing side when doing below checking,
- * and page_is_pfmemalloc() is checked in __page_pool_put_page()
- * to avoid recycling the pfmemalloc page.
- */
- if (unlikely(!is_pp_page(page)))
+ if (unlikely(!netmem_is_pp(netmem)))
return false;
- pp = page->pp;
-
- /* Allow direct recycle if we have reasons to believe that we are
- * in the same context as the consumer would run, so there's
- * no possible race.
- * __page_pool_put_page() makes sure we're not in hardirq context
- * and interrupts are enabled prior to accessing the cache.
- */
- if (napi_safe || in_softirq()) {
- const struct napi_struct *napi = READ_ONCE(pp->p.napi);
- unsigned int cpuid = smp_processor_id();
-
- allow_direct = napi && READ_ONCE(napi->list_owner) == cpuid;
- allow_direct |= READ_ONCE(pp->cpuid) == cpuid;
- }
-
- /* Driver set this to memory recycling info. Reset it on recycle.
- * This will *not* work for NIC using a split-page memory model.
- * The page will be returned to the pool here regardless of the
- * 'flipped' fragment being in use or not.
- */
- page_pool_put_full_page(pp, page, allow_direct);
+ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);
return true;
}
EXPORT_SYMBOL(napi_pp_put_page);
#endif
-static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
+static bool skb_pp_recycle(struct sk_buff *skb, void *data)
{
if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle)
return false;
- return napi_pp_put_page(virt_to_page(data), napi_safe);
+ return napi_pp_put_page(page_to_netmem(virt_to_page(data)));
}
/**
@@ -1070,7 +1022,7 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data, bool napi_safe)
static int skb_pp_frag_ref(struct sk_buff *skb)
{
struct skb_shared_info *shinfo;
- struct page *head_page;
+ netmem_ref head_netmem;
int i;
if (!skb->pp_recycle)
@@ -1079,11 +1031,11 @@ static int skb_pp_frag_ref(struct sk_buff *skb)
shinfo = skb_shinfo(skb);
for (i = 0; i < shinfo->nr_frags; i++) {
- head_page = compound_head(skb_frag_page(&shinfo->frags[i]));
- if (likely(is_pp_page(head_page)))
- page_pool_ref_page(head_page);
+ head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
+ if (likely(netmem_is_pp(head_netmem)))
+ page_pool_ref_netmem(head_netmem);
else
- page_ref_inc(head_page);
+ page_ref_inc(netmem_to_page(head_netmem));
}
return 0;
}
@@ -1096,12 +1048,12 @@ static void skb_kfree_head(void *head, unsigned int end_offset)
kfree(head);
}
-static void skb_free_head(struct sk_buff *skb, bool napi_safe)
+static void skb_free_head(struct sk_buff *skb)
{
unsigned char *head = skb->head;
if (skb->head_frag) {
- if (skb_pp_recycle(skb, head, napi_safe))
+ if (skb_pp_recycle(skb, head))
return;
skb_free_frag(head);
} else {
@@ -1109,8 +1061,7 @@ static void skb_free_head(struct sk_buff *skb, bool napi_safe)
}
}
-static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
- bool napi_safe)
+static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
int i;
@@ -1127,13 +1078,13 @@ static void skb_release_data(struct sk_buff *skb, enum skb_drop_reason reason,
}
for (i = 0; i < shinfo->nr_frags; i++)
- napi_frag_unref(&shinfo->frags[i], skb->pp_recycle, napi_safe);
+ __skb_frag_unref(&shinfo->frags[i], skb->pp_recycle);
free_head:
if (shinfo->frag_list)
kfree_skb_list_reason(shinfo->frag_list, reason);
- skb_free_head(skb, napi_safe);
+ skb_free_head(skb);
exit:
/* When we clone an SKB we copy the reycling bit. The pp_recycle
* bit is only set on the head though, so in order to avoid races
@@ -1194,12 +1145,11 @@ void skb_release_head_state(struct sk_buff *skb)
}
/* Free everything but the sk_buff shell. */
-static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
- bool napi_safe)
+static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason)
{
skb_release_head_state(skb);
if (likely(skb->head))
- skb_release_data(skb, reason, napi_safe);
+ skb_release_data(skb, reason);
}
/**
@@ -1213,13 +1163,14 @@ static void skb_release_all(struct sk_buff *skb, enum skb_drop_reason reason,
void __kfree_skb(struct sk_buff *skb)
{
- skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED, false);
+ skb_release_all(skb, SKB_DROP_REASON_NOT_SPECIFIED);
kfree_skbmem(skb);
}
EXPORT_SYMBOL(__kfree_skb);
static __always_inline
-bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+bool __sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb,
+ enum skb_drop_reason reason)
{
if (unlikely(!skb_unref(skb)))
return false;
@@ -1232,26 +1183,27 @@ bool __kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
if (reason == SKB_CONSUMED)
trace_consume_skb(skb, __builtin_return_address(0));
else
- trace_kfree_skb(skb, __builtin_return_address(0), reason);
+ trace_kfree_skb(skb, __builtin_return_address(0), reason, sk);
return true;
}
/**
- * kfree_skb_reason - free an sk_buff with special reason
+ * sk_skb_reason_drop - free an sk_buff with special reason
+ * @sk: the socket to receive @skb, or NULL if not applicable
* @skb: buffer to free
* @reason: reason why this skb is dropped
*
- * Drop a reference to the buffer and free it if the usage count has
- * hit zero. Meanwhile, pass the drop reason to 'kfree_skb'
- * tracepoint.
+ * Drop a reference to the buffer and free it if the usage count has hit
+ * zero. Meanwhile, pass the receiving socket and drop reason to
+ * 'kfree_skb' tracepoint.
*/
void __fix_address
-kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason)
+sk_skb_reason_drop(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
{
- if (__kfree_skb_reason(skb, reason))
+ if (__sk_skb_reason_drop(sk, skb, reason))
__kfree_skb(skb);
}
-EXPORT_SYMBOL(kfree_skb_reason);
+EXPORT_SYMBOL(sk_skb_reason_drop);
#define KFREE_SKB_BULK_SIZE 16
@@ -1270,7 +1222,7 @@ static void kfree_skb_add_bulk(struct sk_buff *skb,
return;
}
- skb_release_all(skb, reason, false);
+ skb_release_all(skb, reason);
sa->skb_array[sa->skb_count++] = skb;
if (unlikely(sa->skb_count == KFREE_SKB_BULK_SIZE)) {
@@ -1290,7 +1242,7 @@ kfree_skb_list_reason(struct sk_buff *segs, enum skb_drop_reason reason)
while (segs) {
struct sk_buff *next = segs->next;
- if (__kfree_skb_reason(segs, reason)) {
+ if (__sk_skb_reason_drop(NULL, segs, reason)) {
skb_poison_list(segs);
kfree_skb_add_bulk(segs, &sa, reason);
}
@@ -1331,22 +1283,28 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
has_trans = skb_transport_header_was_set(skb);
printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
- "mac=(%d,%d) net=(%d,%d) trans=%d\n"
+ "mac=(%d,%d) mac_len=%u net=(%d,%d) trans=%d\n"
"shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
- "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
- "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
+ "csum(0x%x start=%u offset=%u ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
+ "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n"
+ "priority=0x%x mark=0x%x alloc_cpu=%u vlan_all=0x%x\n"
+ "encapsulation=%d inner(proto=0x%04x, mac=%u, net=%u, trans=%u)\n",
level, skb->len, headroom, skb_headlen(skb), tailroom,
has_mac ? skb->mac_header : -1,
has_mac ? skb_mac_header_len(skb) : -1,
+ skb->mac_len,
skb->network_header,
has_trans ? skb_network_header_len(skb) : -1,
has_trans ? skb->transport_header : -1,
sh->tx_flags, sh->nr_frags,
sh->gso_size, sh->gso_type, sh->gso_segs,
- skb->csum, skb->ip_summed, skb->csum_complete_sw,
- skb->csum_valid, skb->csum_level,
+ skb->csum, skb->csum_start, skb->csum_offset, skb->ip_summed,
+ skb->csum_complete_sw, skb->csum_valid, skb->csum_level,
skb->hash, skb->sw_hash, skb->l4_hash,
- ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);
+ ntohs(skb->protocol), skb->pkt_type, skb->skb_iif,
+ skb->priority, skb->mark, skb->alloc_cpu, skb->vlan_all,
+ skb->encapsulation, skb->inner_protocol, skb->inner_mac_header,
+ skb->inner_network_header, skb->inner_transport_header);
if (dev)
printk("%sdev name=%s feat=%pNF\n",
@@ -1375,6 +1333,14 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
struct page *p;
u8 *vaddr;
+ if (skb_frag_is_net_iov(frag)) {
+ printk("%sskb frag %d: not readable\n", level, i);
+ len -= skb_frag_size(frag);
+ if (!len)
+ break;
+ continue;
+ }
+
skb_frag_foreach_page(frag, skb_frag_off(frag),
skb_frag_size(frag), p, p_off, p_len,
copied) {
@@ -1444,7 +1410,7 @@ EXPORT_SYMBOL(consume_skb);
void __consume_stateless_skb(struct sk_buff *skb)
{
trace_consume_skb(skb, __builtin_return_address(0));
- skb_release_data(skb, SKB_CONSUMED, false);
+ skb_release_data(skb, SKB_CONSUMED);
kfree_skbmem(skb);
}
@@ -1456,6 +1422,7 @@ static void napi_skb_cache_put(struct sk_buff *skb)
if (!kasan_mempool_poison_object(skb))
return;
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
nc->skb_cache[nc->skb_count++] = skb;
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
@@ -1467,11 +1434,12 @@ static void napi_skb_cache_put(struct sk_buff *skb)
nc->skb_cache + NAPI_SKB_CACHE_HALF);
nc->skb_count = NAPI_SKB_CACHE_HALF;
}
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
}
void __napi_kfree_skb(struct sk_buff *skb, enum skb_drop_reason reason)
{
- skb_release_all(skb, reason, true);
+ skb_release_all(skb, reason);
napi_skb_cache_put(skb);
}
@@ -1509,7 +1477,7 @@ void napi_consume_skb(struct sk_buff *skb, int budget)
return;
}
- skb_release_all(skb, SKB_CONSUMED, !!budget);
+ skb_release_all(skb, SKB_CONSUMED);
napi_skb_cache_put(skb);
}
EXPORT_SYMBOL(napi_consume_skb);
@@ -1640,7 +1608,7 @@ EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
*/
struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src)
{
- skb_release_all(dst, SKB_CONSUMED, false);
+ skb_release_all(dst, SKB_CONSUMED);
return __skb_clone(dst, src);
}
EXPORT_SYMBOL_GPL(skb_morph);
@@ -1688,7 +1656,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp)
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
-static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
+static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size,
+ bool devmem)
{
struct ubuf_info_msgzc *uarg;
struct sk_buff *skb;
@@ -1703,12 +1672,12 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
uarg = (void *)skb->cb;
uarg->mmp.user = NULL;
- if (mm_account_pinned_pages(&uarg->mmp, size)) {
+ if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) {
kfree_skb(skb);
return NULL;
}
- uarg->ubuf.callback = msg_zerocopy_callback;
+ uarg->ubuf.ops = &msg_zerocopy_ubuf_ops;
uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
uarg->len = 1;
uarg->bytelen = size;
@@ -1726,7 +1695,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
}
struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
- struct ubuf_info *uarg)
+ struct ubuf_info *uarg, bool devmem)
{
if (uarg) {
struct ubuf_info_msgzc *uarg_zc;
@@ -1734,7 +1703,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
u32 bytelen, next;
/* there might be non MSG_ZEROCOPY users */
- if (uarg->callback != msg_zerocopy_callback)
+ if (uarg->ops != &msg_zerocopy_ubuf_ops)
return NULL;
/* realloc only when socket is locked (TCP, UDP cork),
@@ -1756,7 +1725,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
next = (u32)atomic_read(&sk->sk_zckey);
if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
- if (mm_account_pinned_pages(&uarg_zc->mmp, size))
+ if (likely(!devmem) &&
+ mm_account_pinned_pages(&uarg_zc->mmp, size))
return NULL;
uarg_zc->len++;
uarg_zc->bytelen = bytelen;
@@ -1771,7 +1741,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
}
new_alloc:
- return msg_zerocopy_alloc(sk, size);
+ return msg_zerocopy_alloc(sk, size, devmem);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
@@ -1845,8 +1815,8 @@ release:
sock_put(sk);
}
-void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
- bool success)
+static void msg_zerocopy_complete(struct sk_buff *skb, struct ubuf_info *uarg,
+ bool success)
{
struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg);
@@ -1855,7 +1825,6 @@ void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg,
if (refcount_dec_and_test(&uarg->refcnt))
__msg_zerocopy_callback(uarg_zc);
}
-EXPORT_SYMBOL_GPL(msg_zerocopy_callback);
void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
{
@@ -1865,24 +1834,39 @@ void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref)
uarg_to_msgzc(uarg)->len--;
if (have_uref)
- msg_zerocopy_callback(NULL, uarg, true);
+ msg_zerocopy_complete(NULL, uarg, true);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_put_abort);
+const struct ubuf_info_ops msg_zerocopy_ubuf_ops = {
+ .complete = msg_zerocopy_complete,
+};
+EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);
+
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
- struct ubuf_info *uarg)
+ struct ubuf_info *uarg,
+ struct net_devmem_dmabuf_binding *binding)
{
- struct ubuf_info *orig_uarg = skb_zcopy(skb);
int err, orig_len = skb->len;
- /* An skb can only point to one uarg. This edge case happens when
- * TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
- */
- if (orig_uarg && uarg != orig_uarg)
- return -EEXIST;
+ if (uarg->ops->link_skb) {
+ err = uarg->ops->link_skb(skb, uarg);
+ if (err)
+ return err;
+ } else {
+ struct ubuf_info *orig_uarg = skb_zcopy(skb);
- err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
+ /* An skb can only point to one uarg. This edge case happens
+ * when TCP appends to an skb, but zerocopy_realloc triggered
+ * a new alloc.
+ */
+ if (orig_uarg && uarg != orig_uarg)
+ return -EEXIST;
+ }
+
+ err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len,
+ binding);
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
struct sock *save_sk = skb->sk;
@@ -1954,6 +1938,9 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
return -EINVAL;
+ if (!skb_frags_readable(skb))
+ return -EFAULT;
+
if (!num_frags)
goto release;
@@ -2123,11 +2110,20 @@ static inline int skb_alloc_rx_flag(const struct sk_buff *skb)
struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
{
- int headerlen = skb_headroom(skb);
- unsigned int size = skb_end_offset(skb) + skb->data_len;
- struct sk_buff *n = __alloc_skb(size, gfp_mask,
- skb_alloc_rx_flag(skb), NUMA_NO_NODE);
+ struct sk_buff *n;
+ unsigned int size;
+ int headerlen;
+
+ if (!skb_frags_readable(skb))
+ return NULL;
+ if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
+ return NULL;
+
+ headerlen = skb_headroom(skb);
+ size = skb_end_offset(skb) + skb->data_len;
+ n = __alloc_skb(size, gfp_mask,
+ skb_alloc_rx_flag(skb), NUMA_NO_NODE);
if (!n)
return NULL;
@@ -2272,9 +2268,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb, SKB_CONSUMED, false);
+ skb_release_data(skb, SKB_CONSUMED);
} else {
- skb_free_head(skb, false);
+ skb_free_head(skb);
}
off = (data + nhead) - skb->head;
@@ -2455,12 +2451,20 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
/*
* Allocate the copy buffer
*/
- struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom,
- gfp_mask, skb_alloc_rx_flag(skb),
- NUMA_NO_NODE);
- int oldheadroom = skb_headroom(skb);
int head_copy_len, head_copy_off;
+ struct sk_buff *n;
+ int oldheadroom;
+
+ if (!skb_frags_readable(skb))
+ return NULL;
+
+ if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
+ return NULL;
+ oldheadroom = skb_headroom(skb);
+ n = __alloc_skb(newheadroom + skb->len + newtailroom,
+ gfp_mask, skb_alloc_rx_flag(skb),
+ NUMA_NO_NODE);
if (!n)
return NULL;
@@ -2798,6 +2802,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
*/
int i, k, eat = (skb->tail + delta) - skb->end;
+ if (!skb_frags_readable(skb))
+ return NULL;
+
if (eat > 0 || skb_cloned(skb)) {
if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
GFP_ATOMIC))
@@ -2951,6 +2958,9 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
to += copy;
}
+ if (!skb_frags_readable(skb))
+ goto fault;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
skb_frag_t *f = &skb_shinfo(skb)->frags[i];
@@ -3139,9 +3149,15 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
/*
* then map the fragments
*/
+ if (!skb_frags_readable(skb))
+ return false;
+
for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
+ if (WARN_ON_ONCE(!skb_frag_page(f)))
+ return false;
+
if (__splice_segment(skb_frag_page(f),
skb_frag_off(f), skb_frag_size(f),
offset, len, spd, false, sk, pipe))
@@ -3217,7 +3233,7 @@ static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)
typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
- int len, sendmsg_func sendmsg)
+ int len, sendmsg_func sendmsg, int flags)
{
unsigned int orig_len = len;
struct sk_buff *head = skb;
@@ -3235,7 +3251,7 @@ do_frag_list:
kv.iov_base = skb->data + offset;
kv.iov_len = slen;
memset(&msg, 0, sizeof(msg));
- msg.msg_flags = MSG_DONTWAIT;
+ msg.msg_flags = MSG_DONTWAIT | flags;
iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
@@ -3272,7 +3288,8 @@ do_frag_list:
while (slen) {
struct bio_vec bvec;
struct msghdr msg = {
- .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT,
+ .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT |
+ flags,
};
bvec_set_page(&bvec, skb_frag_page(frag), slen,
@@ -3318,14 +3335,21 @@ error:
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
int len)
{
- return __skb_send_sock(sk, skb, offset, len, sendmsg_locked);
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0);
}
EXPORT_SYMBOL_GPL(skb_send_sock_locked);
+int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb,
+ int offset, int len, int flags)
+{
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags);
+}
+EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags);
+
/* Send skb data on a socket. Socket must be unlocked. */
int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
{
- return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked);
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0);
}
/**
@@ -3359,6 +3383,9 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
from += copy;
}
+ if (!skb_frags_readable(skb))
+ goto fault;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
int end;
@@ -3418,8 +3445,7 @@ fault:
EXPORT_SYMBOL(skb_store_bits);
/* Checksum skb data. */
-__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
- __wsum csum, const struct skb_checksum_ops *ops)
+__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
@@ -3430,14 +3456,16 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
if (copy > 0) {
if (copy > len)
copy = len;
- csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
- skb->data + offset, copy, csum);
+ csum = csum_partial(skb->data + offset, copy, csum);
if ((len -= copy) == 0)
return csum;
offset += copy;
pos = copy;
}
+ if (WARN_ON_ONCE(!skb_frags_readable(skb)))
+ return 0;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
@@ -3458,13 +3486,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
skb_frag_off(frag) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
- csum2 = INDIRECT_CALL_1(ops->update,
- csum_partial_ext,
- vaddr + p_off, p_len, 0);
+ csum2 = csum_partial(vaddr + p_off, p_len, 0);
kunmap_atomic(vaddr);
- csum = INDIRECT_CALL_1(ops->combine,
- csum_block_add_ext, csum,
- csum2, pos, p_len);
+ csum = csum_block_add(csum, csum2, pos);
pos += p_len;
}
@@ -3485,10 +3509,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
__wsum csum2;
if (copy > len)
copy = len;
- csum2 = __skb_checksum(frag_iter, offset - start,
- copy, 0, ops);
- csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
- csum, csum2, pos, copy);
+ csum2 = skb_checksum(frag_iter, offset - start, copy,
+ 0);
+ csum = csum_block_add(csum, csum2, pos);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -3500,18 +3523,6 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
return csum;
}
-EXPORT_SYMBOL(__skb_checksum);
-
-__wsum skb_checksum(const struct sk_buff *skb, int offset,
- int len, __wsum csum)
-{
- const struct skb_checksum_ops ops = {
- .update = csum_partial_ext,
- .combine = csum_block_add_ext,
- };
-
- return __skb_checksum(skb, offset, len, csum, &ops);
-}
EXPORT_SYMBOL(skb_checksum);
/* Both of above in one bottle. */
@@ -3538,6 +3549,9 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
pos = copy;
}
+ if (!skb_frags_readable(skb))
+ return 0;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
@@ -3601,6 +3615,78 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);
+#ifdef CONFIG_NET_CRC32C
+u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+
+ if (copy > 0) {
+ copy = min(copy, len);
+ crc = crc32c(crc, skb->data + offset, copy);
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+
+ if (WARN_ON_ONCE(!skb_frags_readable(skb)))
+ return 0;
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ copy = end - offset;
+ if (copy > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ copy = min(copy, len);
+ skb_frag_foreach_page(frag,
+ skb_frag_off(frag) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ crc = crc32c(crc, vaddr + p_off, p_len);
+ kunmap_atomic(vaddr);
+ }
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ copy = end - offset;
+ if (copy > 0) {
+ copy = min(copy, len);
+ crc = skb_crc32c(frag_iter, offset - start, copy, crc);
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+ start = end;
+ }
+ BUG_ON(len);
+
+ return crc;
+}
+EXPORT_SYMBOL(skb_crc32c);
+#endif /* CONFIG_NET_CRC32C */
+
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
__sum16 sum;
@@ -3660,32 +3746,6 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(__skb_checksum_complete);
-static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
-{
- net_warn_ratelimited(
- "%s: attempt to compute crc32c without libcrc32c.ko\n",
- __func__);
- return 0;
-}
-
-static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
- int offset, int len)
-{
- net_warn_ratelimited(
- "%s: attempt to compute crc32c without libcrc32c.ko\n",
- __func__);
- return 0;
-}
-
-static const struct skb_checksum_ops default_crc32c_ops = {
- .update = warn_crc32c_csum_update,
- .combine = warn_crc32c_csum_combine,
-};
-
-const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
- &default_crc32c_ops;
-EXPORT_SYMBOL(crc32c_csum_stub);
-
/**
* skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
* @from: source buffer
@@ -4029,6 +4089,7 @@ static inline void skb_split_inside_header(struct sk_buff *skb,
skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
+ skb1->unreadable = skb->unreadable;
skb_shinfo(skb)->nr_frags = 0;
skb1->data_len = skb->data_len;
skb1->len += skb1->data_len;
@@ -4076,6 +4137,8 @@ static inline void skb_split_no_header(struct sk_buff *skb,
pos += size;
}
skb_shinfo(skb1)->nr_frags = k;
+
+ skb1->unreadable = skb->unreadable;
}
/**
@@ -4139,6 +4202,9 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
if (skb_zcopy(tgt) || skb_zcopy(skb))
return 0;
+ DEBUG_NET_WARN_ON_ONCE(tgt->pp_recycle != skb->pp_recycle);
+ DEBUG_NET_WARN_ON_ONCE(skb_cmp_decrypted(tgt, skb));
+
todo = shiftlen;
from = 0;
to = skb_shinfo(tgt)->nr_frags;
@@ -4147,8 +4213,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
/* Actual merge is delayed until the point when we know we can
* commit all, so that we don't have to undo partial changes
*/
- if (!to ||
- !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
+ if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
skb_frag_off(fragfrom))) {
merge = -1;
} else {
@@ -4311,6 +4376,9 @@ next_skb:
return block_limit - abs_offset;
}
+ if (!skb_frags_readable(st->cur_skb))
+ return 0;
+
if (st->frag_idx == 0 && !st->frag_data)
st->stepped_offset += skb_headlen(st->cur_skb);
@@ -4387,6 +4455,41 @@ void skb_abort_seq_read(struct skb_seq_state *st)
}
EXPORT_SYMBOL(skb_abort_seq_read);
+/**
+ * skb_copy_seq_read() - copy from a skb_seq_state to a buffer
+ * @st: source skb_seq_state
+ * @offset: offset in source
+ * @to: destination buffer
+ * @len: number of bytes to copy
+ *
+ * Copy @len bytes from @offset bytes into the source @st to the destination
+ * buffer @to. `offset` should increase (or be unchanged) with each subsequent
+ * call to this function. If offset needs to decrease from the previous use `st`
+ * should be reset first.
+ *
+ * Return: 0 on success or -EINVAL if the copy ended early
+ */
+int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len)
+{
+ const u8 *data;
+ u32 sqlen;
+
+ for (;;) {
+ sqlen = skb_seq_read(offset, &data, st);
+ if (sqlen == 0)
+ return -EINVAL;
+ if (sqlen >= len) {
+ memcpy(to, data, len);
+ return 0;
+ }
+ memcpy(to, data, sqlen);
+ to += sqlen;
+ offset += sqlen;
+ len -= sqlen;
+ }
+}
+EXPORT_SYMBOL(skb_copy_seq_read);
+
#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
@@ -5139,7 +5242,7 @@ EXPORT_SYMBOL_GPL(skb_to_sgvec);
* 3. sg_unmark_end
* 4. skb_to_sgvec(payload2)
*
- * When mapping mutilple payload conditionally, skb_to_sgvec_nomark
+ * When mapping multiple payload conditionally, skb_to_sgvec_nomark
* is more preferable.
*/
int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
@@ -5404,7 +5507,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
{
bool ret;
- if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly))
+ if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data)))
return true;
read_lock_bh(&sk->sk_callback_lock);
@@ -5437,6 +5540,54 @@ err:
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ int tstype)
+{
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP;
+ case SCM_TSTAMP_SND:
+ return skb_shinfo(skb)->tx_flags & (hwtstamps ? SKBTX_HW_TSTAMP_NOBPF :
+ SKBTX_SW_TSTAMP);
+ case SCM_TSTAMP_ACK:
+ return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK;
+ case SCM_TSTAMP_COMPLETION:
+ return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP;
+ }
+
+ return false;
+}
+
+static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk,
+ int tstype)
+{
+ int op;
+
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ op = BPF_SOCK_OPS_TSTAMP_SCHED_CB;
+ break;
+ case SCM_TSTAMP_SND:
+ if (hwtstamps) {
+ op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB;
+ *skb_hwtstamps(skb) = *hwtstamps;
+ } else {
+ op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB;
+ }
+ break;
+ case SCM_TSTAMP_ACK:
+ op = BPF_SOCK_OPS_TSTAMP_ACK_CB;
+ break;
+ default:
+ return;
+ }
+
+ bpf_skops_tx_timestamping(sk, skb, op);
+}
+
void __skb_tstamp_tx(struct sk_buff *orig_skb,
const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
@@ -5449,6 +5600,13 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!sk)
return;
+ if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF)
+ skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps,
+ sk, tstype);
+
+ if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype))
+ return;
+
tsflags = READ_ONCE(sk->sk_tsflags);
if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
@@ -5923,7 +6081,10 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
if (to->pp_recycle != from->pp_recycle)
return false;
- if (len <= skb_tailroom(to)) {
+ if (skb_frags_readable(from) != skb_frags_readable(to))
+ return false;
+
+ if (len <= skb_tailroom(to) && skb_frags_readable(from)) {
if (len)
BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
*delta_truesize = 0;
@@ -5997,7 +6158,7 @@ EXPORT_SYMBOL(skb_try_coalesce);
* @skb: buffer to clean
* @xnet: packet is crossing netns
*
- * skb_scrub_packet can be used after encapsulating or decapsulting a packet
+ * skb_scrub_packet can be used after encapsulating or decapsulating a packet
* into/from a tunnel. Some information have to be cleared during these
* operations.
* skb_scrub_packet can also be used to clean a skb before injecting it in
@@ -6018,11 +6179,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
skb->offload_fwd_mark = 0;
skb->offload_l3_fwd_mark = 0;
#endif
+ ipvs_reset(skb);
if (!xnet)
return;
- ipvs_reset(skb);
skb->mark = 0;
skb_clear_tstamp(skb);
}
@@ -6100,6 +6261,9 @@ int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len)
if (!pskb_may_pull(skb, write_len))
return -ENOMEM;
+ if (!skb_frags_readable(skb))
+ return -EFAULT;
+
if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
return 0;
@@ -6219,7 +6383,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
return err;
skb->protocol = skb->vlan_proto;
- skb->mac_len += VLAN_HLEN;
+ skb->network_header -= VLAN_HLEN;
skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
}
@@ -6575,12 +6739,12 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
- skb_release_data(skb, SKB_CONSUMED, false);
+ skb_release_data(skb, SKB_CONSUMED);
} else {
/* we can reuse existing recount- all we did was
* relocate values
*/
- skb_free_head(skb, false);
+ skb_free_head(skb);
}
skb->head = data;
@@ -6715,7 +6879,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
skb_kfree_head(data, size);
return -ENOMEM;
}
- skb_release_data(skb, SKB_CONSUMED, false);
+ skb_release_data(skb, SKB_CONSUMED);
skb->head = data;
skb->head_frag = 0;
@@ -6779,7 +6943,7 @@ void skb_condense(struct sk_buff *skb)
{
if (skb->data_len) {
if (skb->data_len > skb->end - skb->tail ||
- skb_cloned(skb))
+ skb_cloned(skb) || !skb_frags_readable(skb))
return;
/* Nice, we can free page frag(s) right now */
@@ -6995,6 +7159,19 @@ free_now:
EXPORT_SYMBOL(__skb_ext_put);
#endif /* CONFIG_SKB_EXTENSIONS */
+static void kfree_skb_napi_cache(struct sk_buff *skb)
+{
+ /* if SKB is a clone, don't handle this case */
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
+ __kfree_skb(skb);
+ return;
+ }
+
+ local_bh_disable();
+ __napi_kfree_skb(skb, SKB_CONSUMED);
+ local_bh_enable();
+}
+
/**
* skb_attempt_defer_free - queue skb for remote freeing
* @skb: buffer
@@ -7010,10 +7187,10 @@ void skb_attempt_defer_free(struct sk_buff *skb)
unsigned int defer_max;
bool kick;
- if (WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
- !cpu_online(cpu) ||
- cpu == raw_smp_processor_id()) {
-nodefer: __kfree_skb(skb);
+ if (cpu == raw_smp_processor_id() ||
+ WARN_ON_ONCE(cpu >= nr_cpu_ids) ||
+ !cpu_online(cpu)) {
+nodefer: kfree_skb_napi_cache(skb);
return;
}
@@ -7021,7 +7198,7 @@ nodefer: __kfree_skb(skb);
DEBUG_NET_WARN_ON_ONCE(skb->destructor);
sd = &per_cpu(softnet_data, cpu);
- defer_max = READ_ONCE(sysctl_skb_defer_max);
+ defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
if (READ_ONCE(sd->defer_count) >= defer_max)
goto nodefer;
@@ -7039,8 +7216,8 @@ nodefer: __kfree_skb(skb);
/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
* if we are unlucky enough (this seems very unlikely).
*/
- if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
- smp_call_function_single_async(cpu, &sd->defer_csd);
+ if (unlikely(kick))
+ kick_defer_list_purge(sd, cpu);
}
static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
@@ -7073,7 +7250,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
ssize_t maxsize, gfp_t gfp)
{
- size_t frag_limit = READ_ONCE(sysctl_max_skb_frags);
+ size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);
struct page *pages[8], **ppages = pages;
ssize_t spliced = 0, ret = 0;
unsigned int i;
@@ -7169,3 +7346,32 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
return false;
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);
+
+void get_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_get_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+ get_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(get_netmem);
+
+void put_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_put_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+
+ put_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(put_netmem);