summaryrefslogtreecommitdiff
path: root/net/core/skbuff.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/core/skbuff.c')
-rw-r--r--net/core/skbuff.c759
1 files changed, 486 insertions, 273 deletions
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 83f8cd8aa2d1..a00808f7be6a 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -64,11 +64,13 @@
#include <linux/mpls.h>
#include <linux/kcov.h>
#include <linux/iov_iter.h>
+#include <linux/crc32.h>
#include <net/protocol.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/checksum.h>
+#include <net/gro.h>
#include <net/gso.h>
#include <net/hotdata.h>
#include <net/ip6_checksum.h>
@@ -77,7 +79,9 @@
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool/helpers.h>
+#include <net/psp/types.h>
#include <net/dropreason.h>
+#include <net/xdp_sock.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
@@ -88,13 +92,17 @@
#include <linux/textsearch.h>
#include "dev.h"
+#include "devmem.h"
+#include "netmem_priv.h"
#include "sock_destructor.h"
#ifdef CONFIG_SKB_EXTENSIONS
static struct kmem_cache *skbuff_ext_cache __ro_after_init;
#endif
-#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
+#define GRO_MAX_HEAD_PAD (GRO_MAX_HEAD + NET_SKB_PAD + NET_IP_ALIGN)
+#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(max(MAX_TCP_HEADER, \
+ GRO_MAX_HEAD_PAD))
/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
* This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
@@ -215,71 +223,13 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr)
skb_panic(skb, sz, addr, __func__);
}
-#define NAPI_SKB_CACHE_SIZE 64
-#define NAPI_SKB_CACHE_BULK 16
-#define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2)
-
-#if PAGE_SIZE == SZ_4K
-
-#define NAPI_HAS_SMALL_PAGE_FRAG 1
-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc)
-
-/* specialized page frag allocator using a single order 0 page
- * and slicing it into 1K sized fragment. Constrained to systems
- * with a very limited amount of 1K fragments fitting a single
- * page - to avoid excessive truesize underestimation
- */
-
-struct page_frag_1k {
- void *va;
- u16 offset;
- bool pfmemalloc;
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp)
-{
- struct page *page;
- int offset;
-
- offset = nc->offset - SZ_1K;
- if (likely(offset >= 0))
- goto use_frag;
-
- page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
- if (!page)
- return NULL;
-
- nc->va = page_address(page);
- nc->pfmemalloc = page_is_pfmemalloc(page);
- offset = PAGE_SIZE - SZ_1K;
- page_ref_add(page, offset / SZ_1K);
-
-use_frag:
- nc->offset = offset;
- return nc->va + offset;
-}
-#else
-
-/* the small page is actually unused in this build; add dummy helpers
- * to please the compiler and avoid later preprocessor's conditionals
- */
-#define NAPI_HAS_SMALL_PAGE_FRAG 0
-#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false
-
-struct page_frag_1k {
-};
-
-static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
-{
- return NULL;
-}
-
-#endif
+#define NAPI_SKB_CACHE_SIZE 128
+#define NAPI_SKB_CACHE_BULK 32
+#define NAPI_SKB_CACHE_FREE 32
struct napi_alloc_cache {
local_lock_t bh_lock;
struct page_frag_cache page;
- struct page_frag_1k page_small;
unsigned int skb_count;
void *skb_cache[NAPI_SKB_CACHE_SIZE];
};
@@ -289,23 +239,6 @@ static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache) = {
.bh_lock = INIT_LOCAL_LOCK(bh_lock),
};
-/* Double check that napi_get_frags() allocates skbs with
- * skb->head being backed by slab, not a page fragment.
- * This is to make sure bug fixed in 3226b158e67c
- * ("net: avoid 32 x truesize under-estimation for tiny skbs")
- * does not accidentally come back.
- */
-void napi_get_frags_check(struct napi_struct *napi)
-{
- struct sk_buff *skb;
-
- local_bh_disable();
- skb = napi_get_frags(napi);
- WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag);
- napi_free_frags(napi);
- local_bh_enable();
-}
-
void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
@@ -314,8 +247,8 @@ void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
fragsz = SKB_DATA_ALIGN(fragsz);
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
- data = __page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC,
- align_mask);
+ data = __page_frag_alloc_align(&nc->page, fragsz,
+ GFP_ATOMIC | __GFP_NOWARN, align_mask);
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
return data;
@@ -330,7 +263,8 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
fragsz = SKB_DATA_ALIGN(fragsz);
- data = __page_frag_alloc_align(nc, fragsz, GFP_ATOMIC,
+ data = __page_frag_alloc_align(nc, fragsz,
+ GFP_ATOMIC | __GFP_NOWARN,
align_mask);
} else {
local_bh_disable();
@@ -341,17 +275,23 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
}
EXPORT_SYMBOL(__netdev_alloc_frag_align);
-static struct sk_buff *napi_skb_cache_get(void)
+/* Cache kmem_cache_size(net_hotdata.skbuff_cache) to help the compiler
+ * remove dead code (and skbuff_cache_size) when CONFIG_KASAN is unset.
+ */
+static u32 skbuff_cache_size __read_mostly;
+
+static struct sk_buff *napi_skb_cache_get(bool alloc)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
if (unlikely(!nc->skb_count)) {
- nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
- GFP_ATOMIC,
- NAPI_SKB_CACHE_BULK,
- nc->skb_cache);
+ if (alloc)
+ nc->skb_count = kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN,
+ NAPI_SKB_CACHE_BULK,
+ nc->skb_cache);
if (unlikely(!nc->skb_count)) {
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
return NULL;
@@ -359,12 +299,74 @@ static struct sk_buff *napi_skb_cache_get(void)
}
skb = nc->skb_cache[--nc->skb_count];
+ if (nc->skb_count)
+ prefetch(nc->skb_cache[nc->skb_count - 1]);
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
- kasan_mempool_unpoison_object(skb, kmem_cache_size(net_hotdata.skbuff_cache));
+ kasan_mempool_unpoison_object(skb, skbuff_cache_size);
return skb;
}
+/**
+ * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
+ * @skbs: pointer to an at least @n-sized array to fill with skb pointers
+ * @n: number of entries to provide
+ *
+ * Tries to obtain @n &sk_buff entries from the NAPI percpu cache and writes
+ * the pointers into the provided array @skbs. If there are less entries
+ * available, tries to replenish the cache and bulk-allocates the diff from
+ * the MM layer if needed.
+ * The heads are being zeroed with either memset() or %__GFP_ZERO, so they are
+ * ready for {,__}build_skb_around() and don't have any data buffers attached.
+ * Must be called *only* from the BH context.
+ *
+ * Return: number of successfully allocated skbs (@n if no actual allocation
+ * needed or kmem_cache_alloc_bulk() didn't fail).
+ */
+u32 napi_skb_cache_get_bulk(void **skbs, u32 n)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ u32 bulk, total = n;
+
+ local_lock_nested_bh(&napi_alloc_cache.bh_lock);
+
+ if (nc->skb_count >= n)
+ goto get;
+
+ /* No enough cached skbs. Try refilling the cache first */
+ bulk = min(NAPI_SKB_CACHE_SIZE - nc->skb_count, NAPI_SKB_CACHE_BULK);
+ nc->skb_count += kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN, bulk,
+ &nc->skb_cache[nc->skb_count]);
+ if (likely(nc->skb_count >= n))
+ goto get;
+
+ /* Still not enough. Bulk-allocate the missing part directly, zeroed */
+ n -= kmem_cache_alloc_bulk(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_ZERO | __GFP_NOWARN,
+ n - nc->skb_count, &skbs[nc->skb_count]);
+ if (likely(nc->skb_count >= n))
+ goto get;
+
+ /* kmem_cache didn't allocate the number we need, limit the output */
+ total -= n - nc->skb_count;
+ n = nc->skb_count;
+
+get:
+ for (u32 base = nc->skb_count - n, i = 0; i < n; i++) {
+ skbs[i] = nc->skb_cache[base + i];
+
+ kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size);
+ memset(skbs[i], 0, offsetof(struct sk_buff, tail));
+ }
+
+ nc->skb_count -= n;
+ local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
+
+ return total;
+}
+EXPORT_SYMBOL_GPL(napi_skb_cache_get_bulk);
+
static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
unsigned int size)
{
@@ -390,8 +392,7 @@ static inline void __finalize_skb_around(struct sk_buff *skb, void *data,
skb_set_kcov_handle(skb, kcov_common_handle());
}
-static inline void *__slab_build_skb(struct sk_buff *skb, void *data,
- unsigned int *size)
+static inline void *__slab_build_skb(void *data, unsigned int *size)
{
void *resized;
@@ -418,12 +419,13 @@ struct sk_buff *slab_build_skb(void *data)
struct sk_buff *skb;
unsigned int size;
- skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
+ skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN);
if (unlikely(!skb))
return NULL;
memset(skb, 0, offsetof(struct sk_buff, tail));
- data = __slab_build_skb(skb, data, &size);
+ data = __slab_build_skb(data, &size);
__finalize_skb_around(skb, data, size);
return skb;
@@ -440,7 +442,7 @@ static void __build_skb_around(struct sk_buff *skb, void *data,
* using slab buffer should use slab_build_skb() instead.
*/
if (WARN_ONCE(size == 0, "Use slab_build_skb() instead"))
- data = __slab_build_skb(skb, data, &size);
+ data = __slab_build_skb(data, &size);
__finalize_skb_around(skb, data, size);
}
@@ -469,7 +471,8 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
{
struct sk_buff *skb;
- skb = kmem_cache_alloc(net_hotdata.skbuff_cache, GFP_ATOMIC);
+ skb = kmem_cache_alloc(net_hotdata.skbuff_cache,
+ GFP_ATOMIC | __GFP_NOWARN);
if (unlikely(!skb))
return NULL;
@@ -530,7 +533,7 @@ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
{
struct sk_buff *skb;
- skb = napi_skb_cache_get();
+ skb = napi_skb_cache_get(true);
if (unlikely(!skb))
return NULL;
@@ -645,25 +648,38 @@ out:
struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
int flags, int node)
{
+ struct sk_buff *skb = NULL;
struct kmem_cache *cache;
- struct sk_buff *skb;
bool pfmemalloc;
u8 *data;
- cache = (flags & SKB_ALLOC_FCLONE)
- ? net_hotdata.skbuff_fclone_cache : net_hotdata.skbuff_cache;
-
if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
gfp_mask |= __GFP_MEMALLOC;
- /* Get the HEAD */
- if ((flags & (SKB_ALLOC_FCLONE | SKB_ALLOC_NAPI)) == SKB_ALLOC_NAPI &&
- likely(node == NUMA_NO_NODE || node == numa_mem_id()))
- skb = napi_skb_cache_get();
- else
+ if (flags & SKB_ALLOC_FCLONE) {
+ cache = net_hotdata.skbuff_fclone_cache;
+ goto fallback;
+ }
+ cache = net_hotdata.skbuff_cache;
+ if (unlikely(node != NUMA_NO_NODE && node != numa_mem_id()))
+ goto fallback;
+
+ if (flags & SKB_ALLOC_NAPI) {
+ skb = napi_skb_cache_get(true);
+ if (unlikely(!skb))
+ return NULL;
+ } else if (!in_hardirq() && !irqs_disabled()) {
+ local_bh_disable();
+ skb = napi_skb_cache_get(false);
+ local_bh_enable();
+ }
+
+ if (!skb) {
+fallback:
skb = kmem_cache_alloc_node(cache, gfp_mask & ~GFP_DMA, node);
- if (unlikely(!skb))
- return NULL;
+ if (unlikely(!skb))
+ return NULL;
+ }
prefetchw(skb);
/* We do our best to align skb_shared_info on a separate cache
@@ -732,7 +748,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
/* If requested length is either too small or too big,
* we use kmalloc() for skb->head allocation.
*/
- if (len <= SKB_WITH_OVERHEAD(1024) ||
+ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
@@ -749,14 +765,14 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
if (in_hardirq() || irqs_disabled()) {
nc = this_cpu_ptr(&netdev_alloc_cache);
data = page_frag_alloc(nc, len, gfp_mask);
- pfmemalloc = nc->pfmemalloc;
+ pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
} else {
local_bh_disable();
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
nc = this_cpu_ptr(&napi_alloc_cache.page);
data = page_frag_alloc(nc, len, gfp_mask);
- pfmemalloc = nc->pfmemalloc;
+ pfmemalloc = page_frag_cache_is_pfmemalloc(nc);
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
local_bh_enable();
@@ -809,10 +825,8 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
/* If requested length is either too small or too big,
* we use kmalloc() for skb->head allocation.
- * When the small frag allocator is available, prefer it over kmalloc
- * for small fragments
*/
- if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) ||
+ if (len <= SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE) ||
len > SKB_WITH_OVERHEAD(PAGE_SIZE) ||
(gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI,
@@ -822,32 +836,16 @@ struct sk_buff *napi_alloc_skb(struct napi_struct *napi, unsigned int len)
goto skb_success;
}
+ len = SKB_HEAD_ALIGN(len);
+
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
local_lock_nested_bh(&napi_alloc_cache.bh_lock);
nc = this_cpu_ptr(&napi_alloc_cache);
- if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) {
- /* we are artificially inflating the allocation size, but
- * that is not as bad as it may look like, as:
- * - 'len' less than GRO_MAX_HEAD makes little sense
- * - On most systems, larger 'len' values lead to fragment
- * size above 512 bytes
- * - kmalloc would use the kmalloc-1k slab for such values
- * - Builds with smaller GRO_MAX_HEAD will very likely do
- * little networking, as that implies no WiFi and no
- * tunnels support, and 32 bits arches.
- */
- len = SZ_1K;
-
- data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
- pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
- } else {
- len = SKB_HEAD_ALIGN(len);
- data = page_frag_alloc(&nc->page, len, gfp_mask);
- pfmemalloc = nc->page.pfmemalloc;
- }
+ data = page_frag_alloc(&nc->page, len, gfp_mask);
+ pfmemalloc = page_frag_cache_is_pfmemalloc(&nc->page);
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
if (unlikely(!data))
@@ -917,11 +915,6 @@ static void skb_clone_fraglist(struct sk_buff *skb)
skb_get(list);
}
-static bool is_pp_page(struct page *page)
-{
- return (page->pp_magic & ~0x3UL) == PP_SIGNATURE;
-}
-
int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
unsigned int headroom)
{
@@ -1005,7 +998,7 @@ int skb_pp_cow_data(struct page_pool *pool, struct sk_buff **pskb,
EXPORT_SYMBOL(skb_pp_cow_data);
int skb_cow_data_for_xdp(struct page_pool *pool, struct sk_buff **pskb,
- struct bpf_prog *prog)
+ const struct bpf_prog *prog)
{
if (!prog->aux->xdp_has_frags)
return -EINVAL;
@@ -1017,21 +1010,12 @@ EXPORT_SYMBOL(skb_cow_data_for_xdp);
#if IS_ENABLED(CONFIG_PAGE_POOL)
bool napi_pp_put_page(netmem_ref netmem)
{
- struct page *page = netmem_to_page(netmem);
-
- page = compound_head(page);
+ netmem = netmem_compound_head(netmem);
- /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation
- * in order to preserve any existing bits, such as bit 0 for the
- * head page of compound page and bit 1 for pfmemalloc page, so
- * mask those bits for freeing side when doing below checking,
- * and page_is_pfmemalloc() is checked in __page_pool_put_page()
- * to avoid recycling the pfmemalloc page.
- */
- if (unlikely(!is_pp_page(page)))
+ if (unlikely(!netmem_is_pp(netmem)))
return false;
- page_pool_put_full_netmem(page->pp, page_to_netmem(page), false);
+ page_pool_put_full_netmem(netmem_get_pp(netmem), netmem, false);
return true;
}
@@ -1058,7 +1042,7 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data)
static int skb_pp_frag_ref(struct sk_buff *skb)
{
struct skb_shared_info *shinfo;
- struct page *head_page;
+ netmem_ref head_netmem;
int i;
if (!skb->pp_recycle)
@@ -1067,11 +1051,11 @@ static int skb_pp_frag_ref(struct sk_buff *skb)
shinfo = skb_shinfo(skb);
for (i = 0; i < shinfo->nr_frags; i++) {
- head_page = compound_head(skb_frag_page(&shinfo->frags[i]));
- if (likely(is_pp_page(head_page)))
- page_pool_ref_page(head_page);
+ head_netmem = netmem_compound_head(shinfo->frags[i].netmem);
+ if (likely(netmem_is_pp(head_netmem)))
+ page_pool_ref_netmem(head_netmem);
else
- page_ref_inc(head_page);
+ page_ref_inc(netmem_to_page(head_netmem));
}
return 0;
}
@@ -1172,12 +1156,22 @@ void skb_release_head_state(struct sk_buff *skb)
skb_dst_drop(skb);
if (skb->destructor) {
DEBUG_NET_WARN_ON_ONCE(in_hardirq());
- skb->destructor(skb);
- }
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- nf_conntrack_put(skb_nfct(skb));
+#ifdef CONFIG_INET
+ INDIRECT_CALL_4(skb->destructor,
+ tcp_wfree, __sock_wfree, sock_wfree,
+ xsk_destruct_skb,
+ skb);
+#else
+ INDIRECT_CALL_2(skb->destructor,
+ sock_wfree, xsk_destruct_skb,
+ skb);
+
#endif
- skb_ext_put(skb);
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ }
+ nf_reset_ct(skb);
+ skb_ext_reset(skb);
}
/* Free everything but the sk_buff shell. */
@@ -1369,6 +1363,14 @@ void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
struct page *p;
u8 *vaddr;
+ if (skb_frag_is_net_iov(frag)) {
+ printk("%sskb frag %d: not readable\n", level, i);
+ len -= skb_frag_size(frag);
+ if (!len)
+ break;
+ continue;
+ }
+
skb_frag_foreach_page(frag, skb_frag_off(frag),
skb_frag_size(frag), p, p_off, p_len,
copied) {
@@ -1445,7 +1447,6 @@ void __consume_stateless_skb(struct sk_buff *skb)
static void napi_skb_cache_put(struct sk_buff *skb)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- u32 i;
if (!kasan_mempool_poison_object(skb))
return;
@@ -1454,13 +1455,16 @@ static void napi_skb_cache_put(struct sk_buff *skb)
nc->skb_cache[nc->skb_count++] = skb;
if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
- for (i = NAPI_SKB_CACHE_HALF; i < NAPI_SKB_CACHE_SIZE; i++)
+ u32 i, remaining = NAPI_SKB_CACHE_SIZE - NAPI_SKB_CACHE_FREE;
+
+ for (i = remaining; i < NAPI_SKB_CACHE_SIZE; i++)
kasan_mempool_unpoison_object(nc->skb_cache[i],
- kmem_cache_size(net_hotdata.skbuff_cache));
+ skbuff_cache_size);
- kmem_cache_free_bulk(net_hotdata.skbuff_cache, NAPI_SKB_CACHE_HALF,
- nc->skb_cache + NAPI_SKB_CACHE_HALF);
- nc->skb_count = NAPI_SKB_CACHE_HALF;
+ kmem_cache_free_bulk(net_hotdata.skbuff_cache,
+ NAPI_SKB_CACHE_FREE,
+ nc->skb_cache + remaining);
+ nc->skb_count = remaining;
}
local_unlock_nested_bh(&napi_alloc_cache.bh_lock);
}
@@ -1486,13 +1490,18 @@ void napi_skb_free_stolen_head(struct sk_buff *skb)
void napi_consume_skb(struct sk_buff *skb, int budget)
{
/* Zero budget indicate non-NAPI context called us, like netpoll */
- if (unlikely(!budget)) {
+ if (unlikely(!budget || !skb)) {
dev_consume_skb_any(skb);
return;
}
DEBUG_NET_WARN_ON_ONCE(!in_softirq());
+ if (skb->alloc_cpu != smp_processor_id() && !skb_shared(skb)) {
+ skb_release_head_state(skb);
+ return skb_attempt_defer_free(skb);
+ }
+
if (!skb_unref(skb))
return;
@@ -1684,7 +1693,8 @@ void mm_unaccount_pinned_pages(struct mmpin *mmp)
}
EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages);
-static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
+static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size,
+ bool devmem)
{
struct ubuf_info_msgzc *uarg;
struct sk_buff *skb;
@@ -1699,7 +1709,7 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size)
uarg = (void *)skb->cb;
uarg->mmp.user = NULL;
- if (mm_account_pinned_pages(&uarg->mmp, size)) {
+ if (likely(!devmem) && mm_account_pinned_pages(&uarg->mmp, size)) {
kfree_skb(skb);
return NULL;
}
@@ -1722,7 +1732,7 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg)
}
struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
- struct ubuf_info *uarg)
+ struct ubuf_info *uarg, bool devmem)
{
if (uarg) {
struct ubuf_info_msgzc *uarg_zc;
@@ -1752,7 +1762,8 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
next = (u32)atomic_read(&sk->sk_zckey);
if ((u32)(uarg_zc->id + uarg_zc->len) == next) {
- if (mm_account_pinned_pages(&uarg_zc->mmp, size))
+ if (likely(!devmem) &&
+ mm_account_pinned_pages(&uarg_zc->mmp, size))
return NULL;
uarg_zc->len++;
uarg_zc->bytelen = bytelen;
@@ -1767,7 +1778,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size,
}
new_alloc:
- return msg_zerocopy_alloc(sk, size);
+ return msg_zerocopy_alloc(sk, size, devmem);
}
EXPORT_SYMBOL_GPL(msg_zerocopy_realloc);
@@ -1871,7 +1882,8 @@ EXPORT_SYMBOL_GPL(msg_zerocopy_ubuf_ops);
int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
- struct ubuf_info *uarg)
+ struct ubuf_info *uarg,
+ struct net_devmem_dmabuf_binding *binding)
{
int err, orig_len = skb->len;
@@ -1890,7 +1902,8 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
return -EEXIST;
}
- err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len);
+ err = __zerocopy_sg_from_iter(msg, sk, skb, &msg->msg_iter, len,
+ binding);
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
struct sock *save_sk = skb->sk;
@@ -1962,6 +1975,9 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
return -EINVAL;
+ if (!skb_frags_readable(skb))
+ return -EFAULT;
+
if (!num_frags)
goto release;
@@ -2135,6 +2151,9 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
unsigned int size;
int headerlen;
+ if (!skb_frags_readable(skb))
+ return NULL;
+
if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
return NULL;
@@ -2236,6 +2255,10 @@ EXPORT_SYMBOL(__pskb_copy_fclone);
*
* All the pointers pointing into skb header may change and must be
* reloaded after call to this function.
+ *
+ * Note: If you skb_push() the start of the buffer after reallocating the
+ * header, call skb_postpush_data_move() first to move the metadata out of
+ * the way before writing to &sk_buff->data.
*/
int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
@@ -2307,8 +2330,6 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
skb->nohdr = 0;
atomic_set(&skb_shinfo(skb)->dataref, 1);
- skb_metadata_clear(skb);
-
/* It is not generally safe to change skb->truesize.
* For the moment, we really care of rx path, or
* when skb is orphaned (not attached to a socket).
@@ -2473,6 +2494,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
struct sk_buff *n;
int oldheadroom;
+ if (!skb_frags_readable(skb))
+ return NULL;
+
if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
return NULL;
@@ -2817,6 +2841,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
*/
int i, k, eat = (skb->tail + delta) - skb->end;
+ if (!skb_frags_readable(skb))
+ return NULL;
+
if (eat > 0 || skb_cloned(skb)) {
if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
GFP_ATOMIC))
@@ -2970,6 +2997,9 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
to += copy;
}
+ if (!skb_frags_readable(skb))
+ goto fault;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
skb_frag_t *f = &skb_shinfo(skb)->frags[i];
@@ -3069,10 +3099,8 @@ static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
/*
* Fill page/offset/length into spd, if it can hold more pages.
*/
-static bool spd_fill_page(struct splice_pipe_desc *spd,
- struct pipe_inode_info *pipe, struct page *page,
- unsigned int *len, unsigned int offset,
- bool linear,
+static bool spd_fill_page(struct splice_pipe_desc *spd, struct page *page,
+ unsigned int *len, unsigned int offset, bool linear,
struct sock *sk)
{
if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
@@ -3100,8 +3128,7 @@ static bool __splice_segment(struct page *page, unsigned int poff,
unsigned int plen, unsigned int *off,
unsigned int *len,
struct splice_pipe_desc *spd, bool linear,
- struct sock *sk,
- struct pipe_inode_info *pipe)
+ struct sock *sk)
{
if (!*len)
return true;
@@ -3120,13 +3147,14 @@ static bool __splice_segment(struct page *page, unsigned int poff,
do {
unsigned int flen = min(*len, plen);
- if (spd_fill_page(spd, pipe, page, &flen, poff,
- linear, sk))
+ if (spd_fill_page(spd, page, &flen, poff, linear, sk))
return true;
poff += flen;
plen -= flen;
*len -= flen;
- } while (*len && plen);
+ if (!*len)
+ return true;
+ } while (plen);
return false;
}
@@ -3139,8 +3167,8 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
unsigned int *offset, unsigned int *len,
struct splice_pipe_desc *spd, struct sock *sk)
{
- int seg;
struct sk_buff *iter;
+ int seg;
/* map the linear part :
* If skb->head_frag is set, this 'linear' part is backed by a
@@ -3152,18 +3180,24 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
skb_headlen(skb),
offset, len, spd,
skb_head_is_locked(skb),
- sk, pipe))
+ sk))
return true;
/*
* then map the fragments
*/
+ if (!skb_frags_readable(skb))
+ return false;
+
for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
+ if (WARN_ON_ONCE(!skb_frag_page(f)))
+ return false;
+
if (__splice_segment(skb_frag_page(f),
skb_frag_off(f), skb_frag_size(f),
- offset, len, spd, false, sk, pipe))
+ offset, len, spd, false, sk))
return true;
}
@@ -3236,8 +3270,9 @@ static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg)
typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg);
static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
- int len, sendmsg_func sendmsg)
+ int len, sendmsg_func sendmsg, int flags)
{
+ int more_hint = sk_is_tcp(sk) ? MSG_MORE : 0;
unsigned int orig_len = len;
struct sk_buff *head = skb;
unsigned short fragidx;
@@ -3254,7 +3289,9 @@ do_frag_list:
kv.iov_base = skb->data + offset;
kv.iov_len = slen;
memset(&msg, 0, sizeof(msg));
- msg.msg_flags = MSG_DONTWAIT;
+ msg.msg_flags = MSG_DONTWAIT | flags;
+ if (slen < len)
+ msg.msg_flags |= more_hint;
iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &kv, 1, slen);
ret = INDIRECT_CALL_2(sendmsg, sendmsg_locked,
@@ -3291,9 +3328,12 @@ do_frag_list:
while (slen) {
struct bio_vec bvec;
struct msghdr msg = {
- .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT,
+ .msg_flags = MSG_SPLICE_PAGES | MSG_DONTWAIT |
+ flags,
};
+ if (slen < len)
+ msg.msg_flags |= more_hint;
bvec_set_page(&bvec, skb_frag_page(frag), slen,
skb_frag_off(frag) + offset);
iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1,
@@ -3337,14 +3377,21 @@ error:
int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
int len)
{
- return __skb_send_sock(sk, skb, offset, len, sendmsg_locked);
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, 0);
}
EXPORT_SYMBOL_GPL(skb_send_sock_locked);
+int skb_send_sock_locked_with_flags(struct sock *sk, struct sk_buff *skb,
+ int offset, int len, int flags)
+{
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_locked, flags);
+}
+EXPORT_SYMBOL_GPL(skb_send_sock_locked_with_flags);
+
/* Send skb data on a socket. Socket must be unlocked. */
int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
{
- return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked);
+ return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, 0);
}
/**
@@ -3378,6 +3425,9 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
from += copy;
}
+ if (!skb_frags_readable(skb))
+ goto fault;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
int end;
@@ -3437,8 +3487,7 @@ fault:
EXPORT_SYMBOL(skb_store_bits);
/* Checksum skb data. */
-__wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
- __wsum csum, const struct skb_checksum_ops *ops)
+__wsum skb_checksum(const struct sk_buff *skb, int offset, int len, __wsum csum)
{
int start = skb_headlen(skb);
int i, copy = start - offset;
@@ -3449,14 +3498,16 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
if (copy > 0) {
if (copy > len)
copy = len;
- csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
- skb->data + offset, copy, csum);
+ csum = csum_partial(skb->data + offset, copy, csum);
if ((len -= copy) == 0)
return csum;
offset += copy;
pos = copy;
}
+ if (WARN_ON_ONCE(!skb_frags_readable(skb)))
+ return 0;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
@@ -3477,13 +3528,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
skb_frag_off(frag) + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
- csum2 = INDIRECT_CALL_1(ops->update,
- csum_partial_ext,
- vaddr + p_off, p_len, 0);
+ csum2 = csum_partial(vaddr + p_off, p_len, 0);
kunmap_atomic(vaddr);
- csum = INDIRECT_CALL_1(ops->combine,
- csum_block_add_ext, csum,
- csum2, pos, p_len);
+ csum = csum_block_add(csum, csum2, pos);
pos += p_len;
}
@@ -3504,10 +3551,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
__wsum csum2;
if (copy > len)
copy = len;
- csum2 = __skb_checksum(frag_iter, offset - start,
- copy, 0, ops);
- csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
- csum, csum2, pos, copy);
+ csum2 = skb_checksum(frag_iter, offset - start, copy,
+ 0);
+ csum = csum_block_add(csum, csum2, pos);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -3519,18 +3565,6 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
return csum;
}
-EXPORT_SYMBOL(__skb_checksum);
-
-__wsum skb_checksum(const struct sk_buff *skb, int offset,
- int len, __wsum csum)
-{
- const struct skb_checksum_ops ops = {
- .update = csum_partial_ext,
- .combine = csum_block_add_ext,
- };
-
- return __skb_checksum(skb, offset, len, csum, &ops);
-}
EXPORT_SYMBOL(skb_checksum);
/* Both of above in one bottle. */
@@ -3557,6 +3591,9 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
pos = copy;
}
+ if (!skb_frags_readable(skb))
+ return 0;
+
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
@@ -3620,6 +3657,78 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
}
EXPORT_SYMBOL(skb_copy_and_csum_bits);
+#ifdef CONFIG_NET_CRC32C
+u32 skb_crc32c(const struct sk_buff *skb, int offset, int len, u32 crc)
+{
+ int start = skb_headlen(skb);
+ int i, copy = start - offset;
+ struct sk_buff *frag_iter;
+
+ if (copy > 0) {
+ copy = min(copy, len);
+ crc = crc32c(crc, skb->data + offset, copy);
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+
+ if (WARN_ON_ONCE(!skb_frags_readable(skb)))
+ return 0;
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ WARN_ON(start > offset + len);
+
+ end = start + skb_frag_size(frag);
+ copy = end - offset;
+ if (copy > 0) {
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ copy = min(copy, len);
+ skb_frag_foreach_page(frag,
+ skb_frag_off(frag) + offset - start,
+ copy, p, p_off, p_len, copied) {
+ vaddr = kmap_atomic(p);
+ crc = crc32c(crc, vaddr + p_off, p_len);
+ kunmap_atomic(vaddr);
+ }
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+ start = end;
+ }
+
+ skb_walk_frags(skb, frag_iter) {
+ int end;
+
+ WARN_ON(start > offset + len);
+
+ end = start + frag_iter->len;
+ copy = end - offset;
+ if (copy > 0) {
+ copy = min(copy, len);
+ crc = skb_crc32c(frag_iter, offset - start, copy, crc);
+ len -= copy;
+ if (len == 0)
+ return crc;
+ offset += copy;
+ }
+ start = end;
+ }
+ BUG_ON(len);
+
+ return crc;
+}
+EXPORT_SYMBOL(skb_crc32c);
+#endif /* CONFIG_NET_CRC32C */
+
__sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len)
{
__sum16 sum;
@@ -3679,32 +3788,6 @@ __sum16 __skb_checksum_complete(struct sk_buff *skb)
}
EXPORT_SYMBOL(__skb_checksum_complete);
-static __wsum warn_crc32c_csum_update(const void *buff, int len, __wsum sum)
-{
- net_warn_ratelimited(
- "%s: attempt to compute crc32c without libcrc32c.ko\n",
- __func__);
- return 0;
-}
-
-static __wsum warn_crc32c_csum_combine(__wsum csum, __wsum csum2,
- int offset, int len)
-{
- net_warn_ratelimited(
- "%s: attempt to compute crc32c without libcrc32c.ko\n",
- __func__);
- return 0;
-}
-
-static const struct skb_checksum_ops default_crc32c_ops = {
- .update = warn_crc32c_csum_update,
- .combine = warn_crc32c_csum_combine,
-};
-
-const struct skb_checksum_ops *crc32c_csum_stub __read_mostly =
- &default_crc32c_ops;
-EXPORT_SYMBOL(crc32c_csum_stub);
-
/**
* skb_zerocopy_headlen - Calculate headroom needed for skb_zerocopy()
* @from: source buffer
@@ -4048,6 +4131,7 @@ static inline void skb_split_inside_header(struct sk_buff *skb,
skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];
skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
+ skb1->unreadable = skb->unreadable;
skb_shinfo(skb)->nr_frags = 0;
skb1->data_len = skb->data_len;
skb1->len += skb1->data_len;
@@ -4095,6 +4179,8 @@ static inline void skb_split_no_header(struct sk_buff *skb,
pos += size;
}
skb_shinfo(skb1)->nr_frags = k;
+
+ skb1->unreadable = skb->unreadable;
}
/**
@@ -4169,8 +4255,7 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
/* Actual merge is delayed until the point when we know we can
* commit all, so that we don't have to undo partial changes
*/
- if (!to ||
- !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
+ if (!skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
skb_frag_off(fragfrom))) {
merge = -1;
} else {
@@ -4333,6 +4418,9 @@ next_skb:
return block_limit - abs_offset;
}
+ if (!skb_frags_readable(st->cur_skb))
+ return 0;
+
if (st->frag_idx == 0 && !st->frag_data)
st->stepped_offset += skb_headlen(st->cur_skb);
@@ -4409,6 +4497,41 @@ void skb_abort_seq_read(struct skb_seq_state *st)
}
EXPORT_SYMBOL(skb_abort_seq_read);
+/**
+ * skb_copy_seq_read() - copy from a skb_seq_state to a buffer
+ * @st: source skb_seq_state
+ * @offset: offset in source
+ * @to: destination buffer
+ * @len: number of bytes to copy
+ *
+ * Copy @len bytes from @offset bytes into the source @st to the destination
+ * buffer @to. `offset` should increase (or be unchanged) with each subsequent
+ * call to this function. If offset needs to decrease from the previous use `st`
+ * should be reset first.
+ *
+ * Return: 0 on success or -EINVAL if the copy ended early
+ */
+int skb_copy_seq_read(struct skb_seq_state *st, int offset, void *to, int len)
+{
+ const u8 *data;
+ u32 sqlen;
+
+ for (;;) {
+ sqlen = skb_seq_read(offset, &data, st);
+ if (sqlen == 0)
+ return -EINVAL;
+ if (sqlen >= len) {
+ memcpy(to, data, len);
+ return 0;
+ }
+ memcpy(to, data, sqlen);
+ to += sqlen;
+ offset += sqlen;
+ len -= sqlen;
+ }
+}
+EXPORT_SYMBOL(skb_copy_seq_read);
+
#define TS_SKB_CB(state) ((struct skb_seq_state *) &((state)->cb))
static unsigned int skb_ts_get_next_block(unsigned int offset, const u8 **text,
@@ -4979,6 +5102,9 @@ static const u8 skb_ext_type_len[] = {
#if IS_ENABLED(CONFIG_MCTP_FLOWS)
[SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow),
#endif
+#if IS_ENABLED(CONFIG_INET_PSP)
+ [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext),
+#endif
};
static __always_inline unsigned int skb_ext_total_length(void)
@@ -5029,6 +5155,8 @@ void __init skb_init(void)
offsetof(struct sk_buff, cb),
sizeof_field(struct sk_buff, cb),
NULL);
+ skbuff_cache_size = kmem_cache_size(net_hotdata.skbuff_cache);
+
net_hotdata.skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
sizeof(struct sk_buff_fclones),
0,
@@ -5161,7 +5289,7 @@ EXPORT_SYMBOL_GPL(skb_to_sgvec);
* 3. sg_unmark_end
* 4. skb_to_sgvec(payload2)
*
- * When mapping mutilple payload conditionally, skb_to_sgvec_nomark
+ * When mapping multiple payload conditionally, skb_to_sgvec_nomark
* is more preferable.
*/
int skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
@@ -5426,7 +5554,7 @@ static bool skb_may_tx_timestamp(struct sock *sk, bool tsonly)
{
bool ret;
- if (likely(READ_ONCE(sysctl_tstamp_allow_data) || tsonly))
+ if (likely(tsonly || READ_ONCE(sock_net(sk)->core.sysctl_tstamp_allow_data)))
return true;
read_lock_bh(&sk->sk_callback_lock);
@@ -5459,6 +5587,54 @@ err:
}
EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
+static bool skb_tstamp_tx_report_so_timestamping(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ int tstype)
+{
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ return skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP;
+ case SCM_TSTAMP_SND:
+ return skb_shinfo(skb)->tx_flags & (hwtstamps ? SKBTX_HW_TSTAMP_NOBPF :
+ SKBTX_SW_TSTAMP);
+ case SCM_TSTAMP_ACK:
+ return TCP_SKB_CB(skb)->txstamp_ack & TSTAMP_ACK_SK;
+ case SCM_TSTAMP_COMPLETION:
+ return skb_shinfo(skb)->tx_flags & SKBTX_COMPLETION_TSTAMP;
+ }
+
+ return false;
+}
+
+static void skb_tstamp_tx_report_bpf_timestamping(struct sk_buff *skb,
+ struct skb_shared_hwtstamps *hwtstamps,
+ struct sock *sk,
+ int tstype)
+{
+ int op;
+
+ switch (tstype) {
+ case SCM_TSTAMP_SCHED:
+ op = BPF_SOCK_OPS_TSTAMP_SCHED_CB;
+ break;
+ case SCM_TSTAMP_SND:
+ if (hwtstamps) {
+ op = BPF_SOCK_OPS_TSTAMP_SND_HW_CB;
+ *skb_hwtstamps(skb) = *hwtstamps;
+ } else {
+ op = BPF_SOCK_OPS_TSTAMP_SND_SW_CB;
+ }
+ break;
+ case SCM_TSTAMP_ACK:
+ op = BPF_SOCK_OPS_TSTAMP_ACK_CB;
+ break;
+ default:
+ return;
+ }
+
+ bpf_skops_tx_timestamping(sk, skb, op);
+}
+
void __skb_tstamp_tx(struct sk_buff *orig_skb,
const struct sk_buff *ack_skb,
struct skb_shared_hwtstamps *hwtstamps,
@@ -5471,6 +5647,13 @@ void __skb_tstamp_tx(struct sk_buff *orig_skb,
if (!sk)
return;
+ if (skb_shinfo(orig_skb)->tx_flags & SKBTX_BPF)
+ skb_tstamp_tx_report_bpf_timestamping(orig_skb, hwtstamps,
+ sk, tstype);
+
+ if (!skb_tstamp_tx_report_so_timestamping(orig_skb, hwtstamps, tstype))
+ return;
+
tsflags = READ_ONCE(sk->sk_tsflags);
if (!hwtstamps && !(tsflags & SOF_TIMESTAMPING_OPT_TX_SWHW) &&
skb_shinfo(orig_skb)->tx_flags & SKBTX_IN_PROGRESS)
@@ -5945,7 +6128,10 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
if (to->pp_recycle != from->pp_recycle)
return false;
- if (len <= skb_tailroom(to)) {
+ if (skb_frags_readable(from) != skb_frags_readable(to))
+ return false;
+
+ if (len <= skb_tailroom(to) && skb_frags_readable(from)) {
if (len)
BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
*delta_truesize = 0;
@@ -6019,7 +6205,7 @@ EXPORT_SYMBOL(skb_try_coalesce);
* @skb: buffer to clean
* @xnet: packet is crossing netns
*
- * skb_scrub_packet can be used after encapsulating or decapsulting a packet
+ * skb_scrub_packet can be used after encapsulating or decapsulating a packet
* into/from a tunnel. Some information have to be cleared during these
* operations.
* skb_scrub_packet can also be used to clean a skb before injecting it in
@@ -6040,11 +6226,11 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
skb->offload_fwd_mark = 0;
skb->offload_l3_fwd_mark = 0;
#endif
+ ipvs_reset(skb);
if (!xnet)
return;
- ipvs_reset(skb);
skb->mark = 0;
skb_clear_tstamp(skb);
}
@@ -6241,7 +6427,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
return err;
skb->protocol = skb->vlan_proto;
- skb->mac_len += VLAN_HLEN;
+ skb->network_header -= VLAN_HLEN;
skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
}
@@ -6528,7 +6714,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
return NULL;
while (data_len) {
- if (nr_frags == MAX_SKB_FRAGS - 1)
+ if (nr_frags == MAX_SKB_FRAGS)
goto failure;
while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order))
order--;
@@ -6624,8 +6810,7 @@ static int pskb_carve(struct sk_buff *skb, const u32 off, gfp_t gfp);
/* carve out the first eat bytes from skb's frag_list. May recurse into
* pskb_carve()
*/
-static int pskb_carve_frag_list(struct sk_buff *skb,
- struct skb_shared_info *shinfo, int eat,
+static int pskb_carve_frag_list(struct skb_shared_info *shinfo, int eat,
gfp_t gfp_mask)
{
struct sk_buff *list = shinfo->frag_list;
@@ -6730,7 +6915,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
skb_clone_fraglist(skb);
/* split line is in frag list */
- if (k == 0 && pskb_carve_frag_list(skb, shinfo, off - pos, gfp_mask)) {
+ if (k == 0 && pskb_carve_frag_list(shinfo, off - pos, gfp_mask)) {
/* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
if (skb_has_frag_list(skb))
kfree_skb_list(skb_shinfo(skb)->frag_list);
@@ -6801,7 +6986,7 @@ void skb_condense(struct sk_buff *skb)
{
if (skb->data_len) {
if (skb->data_len > skb->end - skb->tail ||
- skb_cloned(skb))
+ skb_cloned(skb) || !skb_frags_readable(skb))
return;
/* Nice, we can free page frag(s) right now */
@@ -6904,6 +7089,7 @@ void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
skb->active_extensions = 1 << id;
return skb_ext_get_ptr(ext, id);
}
+EXPORT_SYMBOL_NS_GPL(__skb_ext_set, "NETDEV_INTERNAL");
/**
* skb_ext_add - allocate space for given extension, COW if needed
@@ -7040,8 +7226,9 @@ static void kfree_skb_napi_cache(struct sk_buff *skb)
*/
void skb_attempt_defer_free(struct sk_buff *skb)
{
+ struct skb_defer_node *sdn;
+ unsigned long defer_count;
int cpu = skb->alloc_cpu;
- struct softnet_data *sd;
unsigned int defer_max;
bool kick;
@@ -7054,28 +7241,26 @@ nodefer: kfree_skb_napi_cache(skb);
DEBUG_NET_WARN_ON_ONCE(skb_dst(skb));
DEBUG_NET_WARN_ON_ONCE(skb->destructor);
+ DEBUG_NET_WARN_ON_ONCE(skb_nfct(skb));
+
+ sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id();
- sd = &per_cpu(softnet_data, cpu);
defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
- if (READ_ONCE(sd->defer_count) >= defer_max)
+ defer_count = atomic_long_inc_return(&sdn->defer_count);
+
+ if (defer_count >= defer_max)
goto nodefer;
- spin_lock_bh(&sd->defer_lock);
- /* Send an IPI every time queue reaches half capacity. */
- kick = sd->defer_count == (defer_max >> 1);
- /* Paired with the READ_ONCE() few lines above */
- WRITE_ONCE(sd->defer_count, sd->defer_count + 1);
+ llist_add(&skb->ll_node, &sdn->defer_list);
- skb->next = sd->defer_list;
- /* Paired with READ_ONCE() in skb_defer_free_flush() */
- WRITE_ONCE(sd->defer_list, skb);
- spin_unlock_bh(&sd->defer_lock);
+ /* Send an IPI every time queue reaches half capacity. */
+ kick = (defer_count - 1) == (defer_max >> 1);
/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
* if we are unlucky enough (this seems very unlikely).
*/
if (unlikely(kick))
- kick_defer_list_purge(sd, cpu);
+ kick_defer_list_purge(cpu);
}
static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
@@ -7095,7 +7280,6 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
* @skb: The buffer to add pages to
* @iter: Iterator representing the pages to be added
* @maxsize: Maximum amount of pages to be added
- * @gfp: Allocation flags
*
* This is a common helper function for supporting MSG_SPLICE_PAGES. It
* extracts pages from an iterator and adds them to the socket buffer if
@@ -7106,7 +7290,7 @@ static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
* insufficient space in the buffer to transfer anything.
*/
ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
- ssize_t maxsize, gfp_t gfp)
+ ssize_t maxsize)
{
size_t frag_limit = READ_ONCE(net_hotdata.sysctl_max_skb_frags);
struct page *pages[8], **ppages = pages;
@@ -7204,3 +7388,32 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
return false;
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);
+
+void get_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_get_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+ get_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(get_netmem);
+
+void put_netmem(netmem_ref netmem)
+{
+ struct net_iov *niov;
+
+ if (netmem_is_net_iov(netmem)) {
+ niov = netmem_to_net_iov(netmem);
+ if (net_is_devmem_iov(niov))
+ net_devmem_put_net_iov(netmem_to_net_iov(netmem));
+ return;
+ }
+
+ put_page(netmem_to_page(netmem));
+}
+EXPORT_SYMBOL(put_netmem);