diff options
Diffstat (limited to 'net/core/page_pool.c')
-rw-r--r-- | net/core/page_pool.c | 729 |
1 files changed, 495 insertions, 234 deletions
diff --git a/net/core/page_pool.c b/net/core/page_pool.c index dd364d738c00..4011eb305cee 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -5,12 +5,16 @@ * Copyright (C) 2016 Red Hat, Inc. */ +#include <linux/error-injection.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/device.h> +#include <net/netdev_lock.h> +#include <net/netdev_rx_queue.h> #include <net/page_pool/helpers.h> +#include <net/page_pool/memory_provider.h> #include <net/xdp.h> #include <linux/dma-direction.h> @@ -23,8 +27,13 @@ #include <trace/events/page_pool.h> +#include "dev.h" +#include "mp_dmabuf_devmem.h" +#include "netmem_priv.h" #include "page_pool_priv.h" +DEFINE_STATIC_KEY_FALSE(page_pool_mem_providers); + #define DEFER_TIME (msecs_to_jiffies(1000)) #define DEFER_WARN_INTERVAL (60 * HZ) @@ -123,9 +132,9 @@ int page_pool_ethtool_stats_get_count(void) } EXPORT_SYMBOL(page_pool_ethtool_stats_get_count); -u64 *page_pool_ethtool_stats_get(u64 *data, void *stats) +u64 *page_pool_ethtool_stats_get(u64 *data, const void *stats) { - struct page_pool_stats *pool_stats = stats; + const struct page_pool_stats *pool_stats = stats; *data++ = pool_stats->alloc_stats.fast; *data++ = pool_stats->alloc_stats.slow; @@ -172,19 +181,33 @@ static void page_pool_producer_unlock(struct page_pool *pool, spin_unlock_bh(&pool->ring.producer_lock); } +static void page_pool_struct_check(void) +{ + CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_users); + CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_page); + CACHELINE_ASSERT_GROUP_MEMBER(struct page_pool, frag, frag_offset); + CACHELINE_ASSERT_GROUP_SIZE(struct page_pool, frag, + PAGE_POOL_FRAG_GROUP_ALIGN); +} + static int page_pool_init(struct page_pool *pool, const struct page_pool_params *params, int cpuid) { unsigned int ring_qsize = 1024; /* Default */ + struct netdev_rx_queue *rxq; + int err; + + page_pool_struct_check(); memcpy(&pool->p, ¶ms->fast, sizeof(pool->p)); memcpy(&pool->slow, ¶ms->slow, sizeof(pool->slow)); pool->cpuid = cpuid; + pool->dma_sync_for_cpu = true; /* Validate only known flags were used */ - if (pool->p.flags & ~(PP_FLAG_ALL)) + if (pool->slow.flags & ~PP_FLAG_ALL) return -EINVAL; if (pool->p.pool_size) @@ -198,22 +221,26 @@ static int page_pool_init(struct page_pool *pool, * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, * which is the XDP_TX use-case. */ - if (pool->p.flags & PP_FLAG_DMA_MAP) { + if (pool->slow.flags & PP_FLAG_DMA_MAP) { if ((pool->p.dma_dir != DMA_FROM_DEVICE) && (pool->p.dma_dir != DMA_BIDIRECTIONAL)) return -EINVAL; + + pool->dma_map = true; } - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { + if (pool->slow.flags & PP_FLAG_DMA_SYNC_DEV) { /* In order to request DMA-sync-for-device the page * needs to be mapped */ - if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + if (!(pool->slow.flags & PP_FLAG_DMA_MAP)) return -EINVAL; if (!pool->p.max_len) return -EINVAL; + pool->dma_sync = true; + /* pool->p.offset has to be set according to the address * offset used by the DMA engine to start copying rx data */ @@ -222,7 +249,7 @@ static int page_pool_init(struct page_pool *pool, pool->has_init_callback = !!pool->slow.init_callback; #ifdef CONFIG_PAGE_POOL_STATS - if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) { + if (!(pool->slow.flags & PP_FLAG_SYSTEM_POOL)) { pool->recycle_stats = alloc_percpu(struct page_pool_recycle_stats); if (!pool->recycle_stats) return -ENOMEM; @@ -232,12 +259,13 @@ static int page_pool_init(struct page_pool *pool, * (also percpu) page pool instance. */ pool->recycle_stats = &pp_system_recycle_stats; + pool->system = true; } #endif if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { #ifdef CONFIG_PAGE_POOL_STATS - if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) + if (!pool->system) free_percpu(pool->recycle_stats); #endif return -ENOMEM; @@ -248,21 +276,53 @@ static int page_pool_init(struct page_pool *pool, /* Driver calling page_pool_create() also call page_pool_destroy() */ refcount_set(&pool->user_cnt, 1); - if (pool->p.flags & PP_FLAG_DMA_MAP) - get_device(pool->p.dev); + xa_init_flags(&pool->dma_mapped, XA_FLAGS_ALLOC1); + + if (pool->slow.flags & PP_FLAG_ALLOW_UNREADABLE_NETMEM) { + netdev_assert_locked(pool->slow.netdev); + rxq = __netif_get_rx_queue(pool->slow.netdev, + pool->slow.queue_idx); + pool->mp_priv = rxq->mp_params.mp_priv; + pool->mp_ops = rxq->mp_params.mp_ops; + } + + if (pool->mp_ops) { + if (!pool->dma_map || !pool->dma_sync) + return -EOPNOTSUPP; + + if (WARN_ON(!is_kernel_rodata((unsigned long)pool->mp_ops))) { + err = -EFAULT; + goto free_ptr_ring; + } + + err = pool->mp_ops->init(pool); + if (err) { + pr_warn("%s() mem-provider init failed %d\n", __func__, + err); + goto free_ptr_ring; + } + + static_branch_inc(&page_pool_mem_providers); + } return 0; + +free_ptr_ring: + ptr_ring_cleanup(&pool->ring, NULL); +#ifdef CONFIG_PAGE_POOL_STATS + if (!pool->system) + free_percpu(pool->recycle_stats); +#endif + return err; } static void page_pool_uninit(struct page_pool *pool) { ptr_ring_cleanup(&pool->ring, NULL); - - if (pool->p.flags & PP_FLAG_DMA_MAP) - put_device(pool->p.dev); + xa_destroy(&pool->dma_mapped); #ifdef CONFIG_PAGE_POOL_STATS - if (!(pool->p.flags & PP_FLAG_SYSTEM_POOL)) + if (!pool->system) free_percpu(pool->recycle_stats); #endif } @@ -311,19 +371,18 @@ struct page_pool *page_pool_create(const struct page_pool_params *params) } EXPORT_SYMBOL(page_pool_create); -static void page_pool_return_page(struct page_pool *pool, struct page *page); +static void page_pool_return_page(struct page_pool *pool, netmem_ref netmem); -noinline -static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) +static noinline netmem_ref page_pool_refill_alloc_cache(struct page_pool *pool) { struct ptr_ring *r = &pool->ring; - struct page *page; + netmem_ref netmem; int pref_nid; /* preferred NUMA node */ /* Quicker fallback, avoid locks when ring is empty */ if (__ptr_ring_empty(r)) { alloc_stat_inc(pool, empty); - return NULL; + return 0; } /* Softirq guarantee CPU and thus NUMA node is stable. This, @@ -338,118 +397,127 @@ static struct page *page_pool_refill_alloc_cache(struct page_pool *pool) /* Refill alloc array, but only if NUMA match */ do { - page = __ptr_ring_consume(r); - if (unlikely(!page)) + netmem = (__force netmem_ref)__ptr_ring_consume(r); + if (unlikely(!netmem)) break; - if (likely(page_to_nid(page) == pref_nid)) { - pool->alloc.cache[pool->alloc.count++] = page; + if (likely(netmem_is_pref_nid(netmem, pref_nid))) { + pool->alloc.cache[pool->alloc.count++] = netmem; } else { /* NUMA mismatch; * (1) release 1 page to page-allocator and * (2) break out to fallthrough to alloc_pages_node. * This limit stress on page buddy alloactor. */ - page_pool_return_page(pool, page); + page_pool_return_page(pool, netmem); alloc_stat_inc(pool, waive); - page = NULL; + netmem = 0; break; } } while (pool->alloc.count < PP_ALLOC_CACHE_REFILL); /* Return last page */ if (likely(pool->alloc.count > 0)) { - page = pool->alloc.cache[--pool->alloc.count]; + netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, refill); } - return page; + return netmem; } /* fast path */ -static struct page *__page_pool_get_cached(struct page_pool *pool) +static netmem_ref __page_pool_get_cached(struct page_pool *pool) { - struct page *page; + netmem_ref netmem; /* Caller MUST guarantee safe non-concurrent access, e.g. softirq */ if (likely(pool->alloc.count)) { /* Fast-path */ - page = pool->alloc.cache[--pool->alloc.count]; + netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, fast); } else { - page = page_pool_refill_alloc_cache(pool); + netmem = page_pool_refill_alloc_cache(pool); } - return page; + return netmem; } -static void page_pool_dma_sync_for_device(struct page_pool *pool, - struct page *page, - unsigned int dma_sync_size) +static void __page_pool_dma_sync_for_device(const struct page_pool *pool, + netmem_ref netmem, + u32 dma_sync_size) { - dma_addr_t dma_addr = page_pool_get_dma_addr(page); +#if defined(CONFIG_HAS_DMA) && defined(CONFIG_DMA_NEED_SYNC) + dma_addr_t dma_addr = page_pool_get_dma_addr_netmem(netmem); dma_sync_size = min(dma_sync_size, pool->p.max_len); - dma_sync_single_range_for_device(pool->p.dev, dma_addr, - pool->p.offset, dma_sync_size, - pool->p.dma_dir); + __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset, + dma_sync_size, pool->p.dma_dir); +#endif } -static bool page_pool_dma_map(struct page_pool *pool, struct page *page) +static __always_inline void +page_pool_dma_sync_for_device(const struct page_pool *pool, + netmem_ref netmem, + u32 dma_sync_size) +{ + if (pool->dma_sync && dma_dev_need_sync(pool->p.dev)) { + rcu_read_lock(); + /* re-check under rcu_read_lock() to sync with page_pool_scrub() */ + if (pool->dma_sync) + __page_pool_dma_sync_for_device(pool, netmem, + dma_sync_size); + rcu_read_unlock(); + } +} + +static bool page_pool_dma_map(struct page_pool *pool, netmem_ref netmem, gfp_t gfp) { dma_addr_t dma; + int err; + u32 id; /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit * into page private data (i.e 32bit cpu with 64bit DMA caps) * This mapping is kept for lifetime of page, until leaving pool. */ - dma = dma_map_page_attrs(pool->p.dev, page, 0, - (PAGE_SIZE << pool->p.order), - pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | - DMA_ATTR_WEAK_ORDERING); + dma = dma_map_page_attrs(pool->p.dev, netmem_to_page(netmem), 0, + (PAGE_SIZE << pool->p.order), pool->p.dma_dir, + DMA_ATTR_SKIP_CPU_SYNC | + DMA_ATTR_WEAK_ORDERING); if (dma_mapping_error(pool->p.dev, dma)) return false; - if (page_pool_set_dma_addr(page, dma)) + if (page_pool_set_dma_addr_netmem(netmem, dma)) { + WARN_ONCE(1, "unexpected DMA address, please report to netdev@"); goto unmap_failed; + } - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) - page_pool_dma_sync_for_device(pool, page, pool->p.max_len); + if (in_softirq()) + err = xa_alloc(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + else + err = xa_alloc_bh(&pool->dma_mapped, &id, netmem_to_page(netmem), + PP_DMA_INDEX_LIMIT, gfp); + if (err) { + WARN_ONCE(err != -ENOMEM, "couldn't track DMA mapping, please report to netdev@"); + goto unset_failed; + } + + netmem_set_dma_index(netmem, id); + page_pool_dma_sync_for_device(pool, netmem, pool->p.max_len); return true; +unset_failed: + page_pool_set_dma_addr_netmem(netmem, 0); unmap_failed: - WARN_ON_ONCE("unexpected DMA address, please report to netdev@"); dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); return false; } -static void page_pool_set_pp_info(struct page_pool *pool, - struct page *page) -{ - page->pp = pool; - page->pp_magic |= PP_SIGNATURE; - - /* Ensuring all pages have been split into one fragment initially: - * page_pool_set_pp_info() is only called once for every page when it - * is allocated from the page allocator and page_pool_fragment_page() - * is dirtying the same cache line as the page->pp_magic above, so - * the overhead is negligible. - */ - page_pool_fragment_page(page, 1); - if (pool->has_init_callback) - pool->slow.init_callback(page, pool->slow.init_arg); -} - -static void page_pool_clear_pp_info(struct page *page) -{ - page->pp_magic = 0; - page->pp = NULL; -} - static struct page *__page_pool_alloc_page_order(struct page_pool *pool, gfp_t gfp) { @@ -460,94 +528,102 @@ static struct page *__page_pool_alloc_page_order(struct page_pool *pool, if (unlikely(!page)) return NULL; - if ((pool->p.flags & PP_FLAG_DMA_MAP) && - unlikely(!page_pool_dma_map(pool, page))) { + if (pool->dma_map && unlikely(!page_pool_dma_map(pool, page_to_netmem(page), gfp))) { put_page(page); return NULL; } alloc_stat_inc(pool, slow_high_order); - page_pool_set_pp_info(pool, page); + page_pool_set_pp_info(pool, page_to_netmem(page)); /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; - trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt); + trace_page_pool_state_hold(pool, page_to_netmem(page), + pool->pages_state_hold_cnt); return page; } /* slow path */ -noinline -static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool, - gfp_t gfp) +static noinline netmem_ref __page_pool_alloc_pages_slow(struct page_pool *pool, + gfp_t gfp) { const int bulk = PP_ALLOC_CACHE_REFILL; - unsigned int pp_flags = pool->p.flags; unsigned int pp_order = pool->p.order; - struct page *page; + bool dma_map = pool->dma_map; + netmem_ref netmem; int i, nr_pages; /* Don't support bulk alloc for high-order pages */ if (unlikely(pp_order)) - return __page_pool_alloc_page_order(pool, gfp); + return page_to_netmem(__page_pool_alloc_page_order(pool, gfp)); /* Unnecessary as alloc cache is empty, but guarantees zero count */ if (unlikely(pool->alloc.count > 0)) return pool->alloc.cache[--pool->alloc.count]; - /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk_array */ + /* Mark empty alloc.cache slots "empty" for alloc_pages_bulk */ memset(&pool->alloc.cache, 0, sizeof(void *) * bulk); - nr_pages = alloc_pages_bulk_array_node(gfp, pool->p.nid, bulk, - pool->alloc.cache); + nr_pages = alloc_pages_bulk_node(gfp, pool->p.nid, bulk, + (struct page **)pool->alloc.cache); if (unlikely(!nr_pages)) - return NULL; + return 0; /* Pages have been filled into alloc.cache array, but count is zero and * page element have not been (possibly) DMA mapped. */ for (i = 0; i < nr_pages; i++) { - page = pool->alloc.cache[i]; - if ((pp_flags & PP_FLAG_DMA_MAP) && - unlikely(!page_pool_dma_map(pool, page))) { - put_page(page); + netmem = pool->alloc.cache[i]; + if (dma_map && unlikely(!page_pool_dma_map(pool, netmem, gfp))) { + put_page(netmem_to_page(netmem)); continue; } - page_pool_set_pp_info(pool, page); - pool->alloc.cache[pool->alloc.count++] = page; + page_pool_set_pp_info(pool, netmem); + pool->alloc.cache[pool->alloc.count++] = netmem; /* Track how many pages are held 'in-flight' */ pool->pages_state_hold_cnt++; - trace_page_pool_state_hold(pool, page, + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); } /* Return last page */ if (likely(pool->alloc.count > 0)) { - page = pool->alloc.cache[--pool->alloc.count]; + netmem = pool->alloc.cache[--pool->alloc.count]; alloc_stat_inc(pool, slow); } else { - page = NULL; + netmem = 0; } /* When page just alloc'ed is should/must have refcnt 1. */ - return page; + return netmem; } /* For using page_pool replace: alloc_pages() API calls, but provide * synchronization guarantee for allocation side. */ -struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +netmem_ref page_pool_alloc_netmems(struct page_pool *pool, gfp_t gfp) { - struct page *page; + netmem_ref netmem; /* Fast-path: Get a page from cache */ - page = __page_pool_get_cached(pool); - if (page) - return page; + netmem = __page_pool_get_cached(pool); + if (netmem) + return netmem; /* Slow-path: cache empty, do real allocation */ - page = __page_pool_alloc_pages_slow(pool, gfp); - return page; + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + netmem = pool->mp_ops->alloc_netmems(pool, gfp); + else + netmem = __page_pool_alloc_pages_slow(pool, gfp); + return netmem; +} +EXPORT_SYMBOL(page_pool_alloc_netmems); +ALLOW_ERROR_INJECTION(page_pool_alloc_netmems, NULL); + +struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp) +{ + return netmem_to_page(page_pool_alloc_netmems(pool, gfp)); } EXPORT_SYMBOL(page_pool_alloc_pages); @@ -575,24 +651,60 @@ s32 page_pool_inflight(const struct page_pool *pool, bool strict) return inflight; } -static __always_inline -void __page_pool_release_page_dma(struct page_pool *pool, struct page *page) +void page_pool_set_pp_info(struct page_pool *pool, netmem_ref netmem) +{ + netmem_set_pp(netmem, pool); + netmem_or_pp_magic(netmem, PP_SIGNATURE); + + /* Ensuring all pages have been split into one fragment initially: + * page_pool_set_pp_info() is only called once for every page when it + * is allocated from the page allocator and page_pool_fragment_page() + * is dirtying the same cache line as the page->pp_magic above, so + * the overhead is negligible. + */ + page_pool_fragment_netmem(netmem, 1); + if (pool->has_init_callback) + pool->slow.init_callback(netmem, pool->slow.init_arg); +} + +void page_pool_clear_pp_info(netmem_ref netmem) +{ + netmem_clear_pp_magic(netmem); + netmem_set_pp(netmem, NULL); +} + +static __always_inline void __page_pool_release_page_dma(struct page_pool *pool, + netmem_ref netmem) { + struct page *old, *page = netmem_to_page(netmem); + unsigned long id; dma_addr_t dma; - if (!(pool->p.flags & PP_FLAG_DMA_MAP)) + if (!pool->dma_map) /* Always account for inflight pages, even if we didn't * map them */ return; - dma = page_pool_get_dma_addr(page); + id = netmem_get_dma_index(netmem); + if (!id) + return; + + if (in_softirq()) + old = xa_cmpxchg(&pool->dma_mapped, id, page, NULL, 0); + else + old = xa_cmpxchg_bh(&pool->dma_mapped, id, page, NULL, 0); + if (old != page) + return; + + dma = page_pool_get_dma_addr_netmem(netmem); /* When page is unmapped, it cannot be returned to our pool */ dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING); - page_pool_set_dma_addr(page, 0); + page_pool_set_dma_addr_netmem(netmem, 0); + netmem_set_dma_index(netmem, 0); } /* Disconnects a page (from a page_pool). API users can have a need @@ -600,35 +712,41 @@ void __page_pool_release_page_dma(struct page_pool *pool, struct page *page) * a regular page (that will eventually be returned to the normal * page-allocator via put_page). */ -void page_pool_return_page(struct page_pool *pool, struct page *page) +void page_pool_return_page(struct page_pool *pool, netmem_ref netmem) { int count; + bool put; - __page_pool_release_page_dma(pool, page); - - page_pool_clear_pp_info(page); + put = true; + if (static_branch_unlikely(&page_pool_mem_providers) && pool->mp_ops) + put = pool->mp_ops->release_netmem(pool, netmem); + else + __page_pool_release_page_dma(pool, netmem); /* This may be the last page returned, releasing the pool, so * it is not safe to reference pool afterwards. */ count = atomic_inc_return_relaxed(&pool->pages_state_release_cnt); - trace_page_pool_state_release(pool, page, count); + trace_page_pool_state_release(pool, netmem, count); - put_page(page); + if (put) { + page_pool_clear_pp_info(netmem); + put_page(netmem_to_page(netmem)); + } /* An optimization would be to call __free_pages(page, pool->p.order) * knowing page is not part of page-cache (thus avoiding a * __page_cache_release() call). */ } -static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) +static bool page_pool_recycle_in_ring(struct page_pool *pool, netmem_ref netmem) { int ret; /* BH protection not needed if current is softirq */ if (in_softirq()) - ret = ptr_ring_produce(&pool->ring, page); + ret = ptr_ring_produce(&pool->ring, (__force void *)netmem); else - ret = ptr_ring_produce_bh(&pool->ring, page); + ret = ptr_ring_produce_bh(&pool->ring, (__force void *)netmem); if (!ret) { recycle_stat_inc(pool, ring); @@ -643,7 +761,7 @@ static bool page_pool_recycle_in_ring(struct page_pool *pool, struct page *page) * * Caller must provide appropriate safe context. */ -static bool page_pool_recycle_in_cache(struct page *page, +static bool page_pool_recycle_in_cache(netmem_ref netmem, struct page_pool *pool) { if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE)) { @@ -652,24 +770,26 @@ static bool page_pool_recycle_in_cache(struct page *page, } /* Caller MUST have verified/know (page_ref_count(page) == 1) */ - pool->alloc.cache[pool->alloc.count++] = page; + pool->alloc.cache[pool->alloc.count++] = netmem; recycle_stat_inc(pool, cached); return true; } -static bool __page_pool_page_can_be_recycled(const struct page *page) +static bool __page_pool_page_can_be_recycled(netmem_ref netmem) { - return page_ref_count(page) == 1 && !page_is_pfmemalloc(page); + return netmem_is_net_iov(netmem) || + (page_ref_count(netmem_to_page(netmem)) == 1 && + !page_is_pfmemalloc(netmem_to_page(netmem))); } /* If the page refcnt == 1, this will try to recycle the page. - * if PP_FLAG_DMA_SYNC_DEV is set, we'll try to sync the DMA area for + * If pool->dma_sync is set, we'll try to sync the DMA area for * the configured size min(dma_sync_size, pool->max_len). * If the page refcnt != 1, then the page will be returned to memory * subsystem. */ -static __always_inline struct page * -__page_pool_put_page(struct page_pool *pool, struct page *page, +static __always_inline netmem_ref +__page_pool_put_page(struct page_pool *pool, netmem_ref netmem, unsigned int dma_sync_size, bool allow_direct) { lockdep_assert_no_hardirq(); @@ -683,20 +803,18 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * page is NOT reusable when allocated when system is under * some pressure. (page_is_pfmemalloc) */ - if (likely(__page_pool_page_can_be_recycled(page))) { + if (likely(__page_pool_page_can_be_recycled(netmem))) { /* Read barrier done in page_ref_count / READ_ONCE */ - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) - page_pool_dma_sync_for_device(pool, page, - dma_sync_size); + page_pool_dma_sync_for_device(pool, netmem, dma_sync_size); - if (allow_direct && in_softirq() && - page_pool_recycle_in_cache(page, pool)) - return NULL; + if (allow_direct && page_pool_recycle_in_cache(netmem, pool)) + return 0; /* Page found as candidate for recycling */ - return page; + return netmem; } + /* Fallback/non-XDP mode: API user have elevated refcnt. * * Many drivers split up the page into fragments, and some @@ -711,174 +829,257 @@ __page_pool_put_page(struct page_pool *pool, struct page *page, * will be invoking put_page. */ recycle_stat_inc(pool, released_refcnt); - page_pool_return_page(pool, page); + page_pool_return_page(pool, netmem); - return NULL; + return 0; } -void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, - unsigned int dma_sync_size, bool allow_direct) +static bool page_pool_napi_local(const struct page_pool *pool) { - page = __page_pool_put_page(pool, page, dma_sync_size, allow_direct); - if (page && !page_pool_recycle_in_ring(pool, page)) { + const struct napi_struct *napi; + u32 cpuid; + + /* On PREEMPT_RT the softirq can be preempted by the consumer */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + return false; + + if (unlikely(!in_softirq())) + return false; + + /* Allow direct recycle if we have reasons to believe that we are + * in the same context as the consumer would run, so there's + * no possible race. + * __page_pool_put_page() makes sure we're not in hardirq context + * and interrupts are enabled prior to accessing the cache. + */ + cpuid = smp_processor_id(); + if (READ_ONCE(pool->cpuid) == cpuid) + return true; + + napi = READ_ONCE(pool->p.napi); + + return napi && READ_ONCE(napi->list_owner) == cpuid; +} + +void page_pool_put_unrefed_netmem(struct page_pool *pool, netmem_ref netmem, + unsigned int dma_sync_size, bool allow_direct) +{ + if (!allow_direct) + allow_direct = page_pool_napi_local(pool); + + netmem = __page_pool_put_page(pool, netmem, dma_sync_size, + allow_direct); + if (netmem && !page_pool_recycle_in_ring(pool, netmem)) { /* Cache full, fallback to free pages */ recycle_stat_inc(pool, ring_full); - page_pool_return_page(pool, page); + page_pool_return_page(pool, netmem); } } +EXPORT_SYMBOL(page_pool_put_unrefed_netmem); + +void page_pool_put_unrefed_page(struct page_pool *pool, struct page *page, + unsigned int dma_sync_size, bool allow_direct) +{ + page_pool_put_unrefed_netmem(pool, page_to_netmem(page), dma_sync_size, + allow_direct); +} EXPORT_SYMBOL(page_pool_put_unrefed_page); -/** - * page_pool_put_page_bulk() - release references on multiple pages - * @pool: pool from which pages were allocated - * @data: array holding page pointers - * @count: number of pages in @data - * - * Tries to refill a number of pages into the ptr_ring cache holding ptr_ring - * producer lock. If the ptr_ring is full, page_pool_put_page_bulk() - * will release leftover pages to the page allocator. - * page_pool_put_page_bulk() is suitable to be run inside the driver NAPI tx - * completion loop for the XDP_REDIRECT use case. - * - * Please note the caller must not use data area after running - * page_pool_put_page_bulk(), as this function overwrites it. - */ -void page_pool_put_page_bulk(struct page_pool *pool, void **data, - int count) +static void page_pool_recycle_ring_bulk(struct page_pool *pool, + netmem_ref *bulk, + u32 bulk_len) { - int i, bulk_len = 0; bool in_softirq; + u32 i; - for (i = 0; i < count; i++) { - struct page *page = virt_to_head_page(data[i]); - - /* It is not the last user for the page frag case */ - if (!page_pool_is_last_ref(page)) - continue; - - page = __page_pool_put_page(pool, page, -1, false); - /* Approved for bulk recycling in ptr_ring cache */ - if (page) - data[bulk_len++] = page; - } - - if (unlikely(!bulk_len)) - return; - - /* Bulk producer into ptr_ring page_pool cache */ + /* Bulk produce into ptr_ring page_pool cache */ in_softirq = page_pool_producer_lock(pool); + for (i = 0; i < bulk_len; i++) { - if (__ptr_ring_produce(&pool->ring, data[i])) { + if (__ptr_ring_produce(&pool->ring, (__force void *)bulk[i])) { /* ring full */ recycle_stat_inc(pool, ring_full); break; } } - recycle_stat_add(pool, ring, i); + page_pool_producer_unlock(pool, in_softirq); + recycle_stat_add(pool, ring, i); - /* Hopefully all pages was return into ptr_ring */ + /* Hopefully all pages were returned into ptr_ring */ if (likely(i == bulk_len)) return; - /* ptr_ring cache full, free remaining pages outside producer lock - * since put_page() with refcnt == 1 can be an expensive operation + /* + * ptr_ring cache is full, free remaining pages outside producer lock + * since put_page() with refcnt == 1 can be an expensive operation. */ for (; i < bulk_len; i++) - page_pool_return_page(pool, data[i]); + page_pool_return_page(pool, bulk[i]); } -EXPORT_SYMBOL(page_pool_put_page_bulk); -static struct page *page_pool_drain_frag(struct page_pool *pool, - struct page *page) +/** + * page_pool_put_netmem_bulk() - release references on multiple netmems + * @data: array holding netmem references + * @count: number of entries in @data + * + * Tries to refill a number of netmems into the ptr_ring cache holding ptr_ring + * producer lock. If the ptr_ring is full, page_pool_put_netmem_bulk() + * will release leftover netmems to the memory provider. + * page_pool_put_netmem_bulk() is suitable to be run inside the driver NAPI tx + * completion loop for the XDP_REDIRECT use case. + * + * Please note the caller must not use data area after running + * page_pool_put_netmem_bulk(), as this function overwrites it. + */ +void page_pool_put_netmem_bulk(netmem_ref *data, u32 count) +{ + u32 bulk_len = 0; + + for (u32 i = 0; i < count; i++) { + netmem_ref netmem = netmem_compound_head(data[i]); + + if (page_pool_unref_and_test(netmem)) + data[bulk_len++] = netmem; + } + + count = bulk_len; + while (count) { + netmem_ref bulk[XDP_BULK_QUEUE_SIZE]; + struct page_pool *pool = NULL; + bool allow_direct; + u32 foreign = 0; + + bulk_len = 0; + + for (u32 i = 0; i < count; i++) { + struct page_pool *netmem_pp; + netmem_ref netmem = data[i]; + + netmem_pp = netmem_get_pp(netmem); + if (unlikely(!pool)) { + pool = netmem_pp; + allow_direct = page_pool_napi_local(pool); + } else if (netmem_pp != pool) { + /* + * If the netmem belongs to a different + * page_pool, save it for another round. + */ + data[foreign++] = netmem; + continue; + } + + netmem = __page_pool_put_page(pool, netmem, -1, + allow_direct); + /* Approved for bulk recycling in ptr_ring cache */ + if (netmem) + bulk[bulk_len++] = netmem; + } + + if (bulk_len) + page_pool_recycle_ring_bulk(pool, bulk, bulk_len); + + count = foreign; + } +} +EXPORT_SYMBOL(page_pool_put_netmem_bulk); + +static netmem_ref page_pool_drain_frag(struct page_pool *pool, + netmem_ref netmem) { long drain_count = BIAS_MAX - pool->frag_users; /* Some user is still using the page frag */ - if (likely(page_pool_unref_page(page, drain_count))) - return NULL; - - if (__page_pool_page_can_be_recycled(page)) { - if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) - page_pool_dma_sync_for_device(pool, page, -1); + if (likely(page_pool_unref_netmem(netmem, drain_count))) + return 0; - return page; + if (__page_pool_page_can_be_recycled(netmem)) { + page_pool_dma_sync_for_device(pool, netmem, -1); + return netmem; } - page_pool_return_page(pool, page); - return NULL; + page_pool_return_page(pool, netmem); + return 0; } static void page_pool_free_frag(struct page_pool *pool) { long drain_count = BIAS_MAX - pool->frag_users; - struct page *page = pool->frag_page; + netmem_ref netmem = pool->frag_page; - pool->frag_page = NULL; + pool->frag_page = 0; - if (!page || page_pool_unref_page(page, drain_count)) + if (!netmem || page_pool_unref_netmem(netmem, drain_count)) return; - page_pool_return_page(pool, page); + page_pool_return_page(pool, netmem); } -struct page *page_pool_alloc_frag(struct page_pool *pool, - unsigned int *offset, - unsigned int size, gfp_t gfp) +netmem_ref page_pool_alloc_frag_netmem(struct page_pool *pool, + unsigned int *offset, unsigned int size, + gfp_t gfp) { unsigned int max_size = PAGE_SIZE << pool->p.order; - struct page *page = pool->frag_page; + netmem_ref netmem = pool->frag_page; if (WARN_ON(size > max_size)) - return NULL; + return 0; size = ALIGN(size, dma_get_cache_alignment()); *offset = pool->frag_offset; - if (page && *offset + size > max_size) { - page = page_pool_drain_frag(pool, page); - if (page) { + if (netmem && *offset + size > max_size) { + netmem = page_pool_drain_frag(pool, netmem); + if (netmem) { + recycle_stat_inc(pool, cached); alloc_stat_inc(pool, fast); goto frag_reset; } } - if (!page) { - page = page_pool_alloc_pages(pool, gfp); - if (unlikely(!page)) { - pool->frag_page = NULL; - return NULL; + if (!netmem) { + netmem = page_pool_alloc_netmems(pool, gfp); + if (unlikely(!netmem)) { + pool->frag_page = 0; + return 0; } - pool->frag_page = page; + pool->frag_page = netmem; frag_reset: pool->frag_users = 1; *offset = 0; pool->frag_offset = size; - page_pool_fragment_page(page, BIAS_MAX); - return page; + page_pool_fragment_netmem(netmem, BIAS_MAX); + return netmem; } pool->frag_users++; pool->frag_offset = *offset + size; - alloc_stat_inc(pool, fast); - return page; + return netmem; +} +EXPORT_SYMBOL(page_pool_alloc_frag_netmem); + +struct page *page_pool_alloc_frag(struct page_pool *pool, unsigned int *offset, + unsigned int size, gfp_t gfp) +{ + return netmem_to_page(page_pool_alloc_frag_netmem(pool, offset, size, + gfp)); } EXPORT_SYMBOL(page_pool_alloc_frag); static void page_pool_empty_ring(struct page_pool *pool) { - struct page *page; + netmem_ref netmem; /* Empty recycle ring */ - while ((page = ptr_ring_consume_bh(&pool->ring))) { + while ((netmem = (__force netmem_ref)ptr_ring_consume_bh(&pool->ring))) { /* Verify the refcnt invariant of cached pages */ - if (!(page_ref_count(page) == 1)) + if (!(netmem_ref_count(netmem) == 1)) pr_crit("%s() page_pool refcnt %d violation\n", - __func__, page_ref_count(page)); + __func__, netmem_ref_count(netmem)); - page_pool_return_page(pool, page); + page_pool_return_page(pool, netmem); } } @@ -889,12 +1090,18 @@ static void __page_pool_destroy(struct page_pool *pool) page_pool_unlist(pool); page_pool_uninit(pool); + + if (pool->mp_ops) { + pool->mp_ops->destroy(pool); + static_branch_dec(&page_pool_mem_providers); + } + kfree(pool); } static void page_pool_empty_alloc_cache_once(struct page_pool *pool) { - struct page *page; + netmem_ref netmem; if (pool->destroy_cnt) return; @@ -904,15 +1111,36 @@ static void page_pool_empty_alloc_cache_once(struct page_pool *pool) * call concurrently. */ while (pool->alloc.count) { - page = pool->alloc.cache[--pool->alloc.count]; - page_pool_return_page(pool, page); + netmem = pool->alloc.cache[--pool->alloc.count]; + page_pool_return_page(pool, netmem); } } static void page_pool_scrub(struct page_pool *pool) { + unsigned long id; + void *ptr; + page_pool_empty_alloc_cache_once(pool); - pool->destroy_cnt++; + if (!pool->destroy_cnt++ && pool->dma_map) { + if (pool->dma_sync) { + /* Disable page_pool_dma_sync_for_device() */ + pool->dma_sync = false; + + /* Make sure all concurrent returns that may see the old + * value of dma_sync (and thus perform a sync) have + * finished before doing the unmapping below. Skip the + * wait if the device doesn't actually need syncing, or + * if there are no outstanding mapped pages. + */ + if (dma_dev_need_sync(pool->p.dev) && + !xa_empty(&pool->dma_mapped)) + synchronize_net(); + } + + xa_for_each(&pool->dma_mapped, id, ptr) + __page_pool_release_page_dma(pool, page_to_netmem(ptr)); + } /* No more consumers should exist, but producers could still * be in-flight. @@ -940,7 +1168,13 @@ static void page_pool_release_retry(struct work_struct *wq) int inflight; inflight = page_pool_release(pool); - if (!inflight) + /* In rare cases, a driver bug may cause inflight to go negative. + * Don't reschedule release if inflight is 0 or negative. + * - If 0, the page_pool has been destroyed + * - if negative, we will never recover + * in both cases no reschedule is necessary. + */ + if (inflight <= 0) return; /* Periodic warning for page pools the user can't see */ @@ -959,31 +1193,30 @@ static void page_pool_release_retry(struct work_struct *wq) } void page_pool_use_xdp_mem(struct page_pool *pool, void (*disconnect)(void *), - struct xdp_mem_info *mem) + const struct xdp_mem_info *mem) { refcount_inc(&pool->user_cnt); pool->disconnect = disconnect; pool->xdp_mem_id = mem->id; } -static void page_pool_disable_direct_recycling(struct page_pool *pool) +void page_pool_disable_direct_recycling(struct page_pool *pool) { /* Disable direct recycling based on pool->cpuid. - * Paired with READ_ONCE() in napi_pp_put_page(). + * Paired with READ_ONCE() in page_pool_napi_local(). */ WRITE_ONCE(pool->cpuid, -1); if (!pool->p.napi) return; - /* To avoid races with recycling and additional barriers make sure - * pool and NAPI are unlinked when NAPI is disabled. - */ - WARN_ON(!test_bit(NAPI_STATE_SCHED, &pool->p.napi->state) || - READ_ONCE(pool->p.napi->list_owner) != -1); + napi_assert_will_not_race(pool->p.napi); + mutex_lock(&page_pools_lock); WRITE_ONCE(pool->p.napi, NULL); + mutex_unlock(&page_pools_lock); } +EXPORT_SYMBOL(page_pool_disable_direct_recycling); void page_pool_destroy(struct page_pool *pool) { @@ -1011,15 +1244,43 @@ EXPORT_SYMBOL(page_pool_destroy); /* Caller must provide appropriate safe context, e.g. NAPI. */ void page_pool_update_nid(struct page_pool *pool, int new_nid) { - struct page *page; + netmem_ref netmem; trace_page_pool_update_nid(pool, new_nid); pool->p.nid = new_nid; /* Flush pool alloc cache, as refill will check NUMA node */ while (pool->alloc.count) { - page = pool->alloc.cache[--pool->alloc.count]; - page_pool_return_page(pool, page); + netmem = pool->alloc.cache[--pool->alloc.count]; + page_pool_return_page(pool, netmem); } } EXPORT_SYMBOL(page_pool_update_nid); + +bool net_mp_niov_set_dma_addr(struct net_iov *niov, dma_addr_t addr) +{ + return page_pool_set_dma_addr_netmem(net_iov_to_netmem(niov), addr); +} + +/* Associate a niov with a page pool. Should follow with a matching + * net_mp_niov_clear_page_pool() + */ +void net_mp_niov_set_page_pool(struct page_pool *pool, struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_set_pp_info(pool, netmem); + + pool->pages_state_hold_cnt++; + trace_page_pool_state_hold(pool, netmem, pool->pages_state_hold_cnt); +} + +/* Disassociate a niov from a page pool. Should only be used in the + * ->release_netmem() path. + */ +void net_mp_niov_clear_page_pool(struct net_iov *niov) +{ + netmem_ref netmem = net_iov_to_netmem(niov); + + page_pool_clear_pp_info(netmem); +} |