From c78c66d1ddfdbd2353f3fcfeba0268524537b096 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Date: Tue, 26 Jul 2016 15:26:02 -0700
Subject: radix-tree: implement radix_tree_maybe_preload_order()

The new helper is similar to radix_tree_maybe_preload(), but tries to
preload the number of nodes required to insert (1 << order) contiguous,
naturally aligned elements.  This is required to push huge pages into
the page cache.

Link: http://lkml.kernel.org/r/1466021202-61880-24-git-send-email-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/radix-tree.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 79 insertions(+), 5 deletions(-)

(limited to 'lib/radix-tree.c')

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 8b7d8459bb9d..61b8fb529cef 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -38,6 +38,9 @@
 #include 			/* in_interrupt() */

+/* Number of nodes in fully populated tree of given height */
+static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1] __read_mostly;
+
 /*
  * Radix tree node cache.
  */
@@ -342,7 +345,7 @@ radix_tree_node_free(struct radix_tree_node *node)
  * To make use of this facility, the radix tree must be initialised without
  * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE().
  */
-static int __radix_tree_preload(gfp_t gfp_mask)
+static int __radix_tree_preload(gfp_t gfp_mask, int nr)
 {
 	struct radix_tree_preload *rtp;
 	struct radix_tree_node *node;
@@ -350,14 +353,14 @@ static int __radix_tree_preload(gfp_t gfp_mask)

 	preempt_disable();
 	rtp = this_cpu_ptr(&radix_tree_preloads);
-	while (rtp->nr < RADIX_TREE_PRELOAD_SIZE) {
+	while (rtp->nr < nr) {
 		preempt_enable();
 		node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
 		if (node == NULL)
 			goto out;
 		preempt_disable();
 		rtp = this_cpu_ptr(&radix_tree_preloads);
-		if (rtp->nr < RADIX_TREE_PRELOAD_SIZE) {
+		if (rtp->nr < nr) {
 			node->private_data = rtp->nodes;
 			rtp->nodes = node;
 			rtp->nr++;
@@ -383,7 +386,7 @@ int radix_tree_preload(gfp_t gfp_mask)
 {
 	/* Warn on non-sensical use... */
 	WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask));
-	return __radix_tree_preload(gfp_mask);
+	return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
 }
 EXPORT_SYMBOL(radix_tree_preload);

@@ -395,13 +398,58 @@ EXPORT_SYMBOL(radix_tree_preload);
 int radix_tree_maybe_preload(gfp_t gfp_mask)
 {
 	if (gfpflags_allow_blocking(gfp_mask))
-		return __radix_tree_preload(gfp_mask);
+		return __radix_tree_preload(gfp_mask, RADIX_TREE_PRELOAD_SIZE);
 	/* Preloading doesn't help anything with this gfp mask, skip it */
 	preempt_disable();
 	return 0;
 }
 EXPORT_SYMBOL(radix_tree_maybe_preload);

+/*
+ * The same as function above, but preload number of nodes required to insert
+ * (1 << order) continuous naturally-aligned elements.
+ */
+int radix_tree_maybe_preload_order(gfp_t gfp_mask, int order)
+{
+	unsigned long nr_subtrees;
+	int nr_nodes, subtree_height;
+
+	/* Preloading doesn't help anything with this gfp mask, skip it */
+	if (!gfpflags_allow_blocking(gfp_mask)) {
+		preempt_disable();
+		return 0;
+	}
+
+	/*
+	 * Calculate number and height of fully populated subtrees it takes to
+	 * store (1 << order) elements.
+	 */
+	nr_subtrees = 1 << order;
+	for (subtree_height = 0; nr_subtrees > RADIX_TREE_MAP_SIZE;
+			subtree_height++)
+		nr_subtrees >>= RADIX_TREE_MAP_SHIFT;
+
+	/*
+	 * The worst case is zero height tree with a single item at index 0 and
+	 * then inserting items starting at ULONG_MAX - (1 << order).
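The node budget computed by radix_tree_maybe_preload_order() is easy to sanity-check outside the kernel. The standalone program below re-derives it, mirroring __maxindex(), radix_tree_init_maxnodes() and the body of the new helper. It is a sketch, not kernel code: the constants assume a 64-bit build with RADIX_TREE_MAP_SHIFT == 6 (the CONFIG_BASE_FULL value), which makes RADIX_TREE_MAX_PATH work out to 11.

/*
 * Userspace re-derivation of the worst-case preload budget computed by
 * radix_tree_maybe_preload_order().  Assumes a 64-bit kernel with
 * RADIX_TREE_MAP_SHIFT == 6 (CONFIG_BASE_FULL); illustrative only.
 */
#include <stdio.h>

#define BITS_PER_LONG		64
#define RADIX_TREE_MAP_SHIFT	6
#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_INDEX_BITS	(8 /* CHAR_BIT */ * sizeof(unsigned long))
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define RADIX_TREE_MAX_PATH	DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
					     RADIX_TREE_MAP_SHIFT)

static unsigned long height_to_maxnodes[RADIX_TREE_MAX_PATH + 1];

/* Highest index a tree of the given height can hold (cf. __maxindex()). */
static unsigned long maxindex(unsigned int height)
{
	unsigned int width = height * RADIX_TREE_MAP_SHIFT;
	int shift = (int)RADIX_TREE_INDEX_BITS - (int)width;

	if (shift < 0)
		return ~0UL;
	if (shift >= BITS_PER_LONG)
		return 0UL;
	return ~0UL >> shift;
}

/* Nodes in a fully populated tree per height (cf. radix_tree_init_maxnodes()). */
static void init_maxnodes(void)
{
	unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1];
	unsigned int i, j;

	for (i = 0; i < RADIX_TREE_MAX_PATH + 1; i++)
		height_to_maxindex[i] = maxindex(i);
	for (i = 0; i < RADIX_TREE_MAX_PATH + 1; i++)
		for (j = i; j > 0; j--)
			height_to_maxnodes[i] += height_to_maxindex[j - 1] + 1;
}

/* Worst-case nodes needed for (1 << order) aligned, contiguous entries. */
static int nodes_for_order(int order)
{
	unsigned long nr_subtrees = 1UL << order;
	int nr_nodes, subtree_height = 0;

	while (nr_subtrees > RADIX_TREE_MAP_SIZE) {
		nr_subtrees >>= RADIX_TREE_MAP_SHIFT;
		subtree_height++;
	}

	nr_nodes = RADIX_TREE_MAX_PATH;			  /* branch to index 0   */
	nr_nodes += RADIX_TREE_MAX_PATH - subtree_height; /* branch to the range */
	nr_nodes--;					  /* root is shared      */
	nr_nodes += nr_subtrees * height_to_maxnodes[subtree_height];
	return nr_nodes;
}

int main(void)
{
	init_maxnodes();
	/* order 9 = one 2MB huge page in 4kB base pages (512 slots) */
	printf("order %d -> %d nodes preloaded\n", 9, nodes_for_order(9));
	return 0;
}

Under these assumptions an order-9 insertion decomposes into 8 height-1 subtrees plus the two root-to-leaf branches, so the budget is 11 + 10 - 1 + 8 = 28 nodes, all of which are held on the per-cpu preload list with preemption disabled until the caller finishes.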
+	 *
+	 * This requires RADIX_TREE_MAX_PATH nodes to build branch from root to
+	 * 0-index item.
+	 */
+	nr_nodes = RADIX_TREE_MAX_PATH;
+
+	/* Plus branch to fully populated subtrees. */
+	nr_nodes += RADIX_TREE_MAX_PATH - subtree_height;
+
+	/* Root node is shared. */
+	nr_nodes--;
+
+	/* Plus nodes required to build subtrees. */
+	nr_nodes += nr_subtrees * height_to_maxnodes[subtree_height];
+
+	return __radix_tree_preload(gfp_mask, nr_nodes);
+}
+
 /*
  * The maximum index which can be stored in a radix tree
  */
@@ -1571,6 +1619,31 @@ radix_tree_node_ctor(void *arg)
 	INIT_LIST_HEAD(&node->private_list);
 }

+static __init unsigned long __maxindex(unsigned int height)
+{
+	unsigned int width = height * RADIX_TREE_MAP_SHIFT;
+	int shift = RADIX_TREE_INDEX_BITS - width;
+
+	if (shift < 0)
+		return ~0UL;
+	if (shift >= BITS_PER_LONG)
+		return 0UL;
+	return ~0UL >> shift;
+}
+
+static __init void radix_tree_init_maxnodes(void)
+{
+	unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH + 1];
+	unsigned int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
+		height_to_maxindex[i] = __maxindex(i);
+	for (i = 0; i < ARRAY_SIZE(height_to_maxnodes); i++) {
+		for (j = i; j > 0; j--)
+			height_to_maxnodes[i] += height_to_maxindex[j - 1] + 1;
+	}
+}
+
 static int radix_tree_callback(struct notifier_block *nfb,
 			       unsigned long action, void *hcpu)
 {
@@ -1597,5 +1670,6 @@ void __init radix_tree_init(void)
 			sizeof(struct radix_tree_node), 0,
 			SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
 			radix_tree_node_ctor);
+	radix_tree_init_maxnodes();
 	hotcpu_notifier(radix_tree_callback, 0);
 }
-- 
cgit

From 05eb6e7263185a6bb0de9501ccf2addc52429414 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov@virtuozzo.com>
Date: Tue, 2 Aug 2016 14:03:01 -0700
Subject: radix-tree: account nodes to memcg only if explicitly requested

Radix trees may be used not only for storing page cache pages, so
unconditionally accounting radix tree nodes to the current memory cgroup
is bad: if a radix tree node is used for storing data shared among
different cgroups, we risk pinning dead memory cgroups forever.

So let's only account radix tree nodes if it was explicitly requested by
passing __GFP_ACCOUNT to INIT_RADIX_TREE.  Currently, we only want to
account page cache entries, so mark mapping->page_tree accordingly.

Fixes: 58e698af4c63 ("radix-tree: account radix_tree_node to memory cgroup")
Link: http://lkml.kernel.org/r/1470057188-7864-1-git-send-email-vdavydov@virtuozzo.com
Signed-off-by: Vladimir Davydov <vdavydov@virtuozzo.com>
Acked-by: Johannes Weiner
Acked-by: Michal Hocko
Cc: stable@vger.kernel.org [4.6+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/radix-tree.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'lib/radix-tree.c')
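Note that this cgit view is limited to lib/radix-tree.c, so the opt-in half of the commit is not shown. Per the commit message, only the page cache requests accounting, by initialising mapping->page_tree with __GFP_ACCOUNT. Below is a minimal sketch of the two resulting modes; example_init_page_tree() and example_shared_tree are illustrative names, not the commit's actual hunks:

#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/radix-tree.h>

/* Opted in: nodes of this tree are charged to the allocating task's
 * memcg (this mirrors what the commit does for the page cache). */
static void example_init_page_tree(struct address_space *mapping)
{
	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC | __GFP_ACCOUNT);
}

/* The new default: no __GFP_ACCOUNT, so nodes stay unaccounted and the
 * tree is safe for data shared across cgroups. */
static RADIX_TREE(example_shared_tree, GFP_KERNEL);

Because the root's gfp mask is consulted at node-allocation time, radix_tree_node_alloc() picks the decision up transparently. The per-cpu preload pool, however, is shared by all cgroups, which is why the last hunk in the diff below strips __GFP_ACCOUNT inside __radix_tree_preload().

diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 61b8fb529cef..1b7bf7314141 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -277,10 +277,11 @@ radix_tree_node_alloc(struct radix_tree_root *root)

 		/*
 		 * Even if the caller has preloaded, try to allocate from the
-		 * cache first for the new node to get accounted.
+		 * cache first for the new node to get accounted to the memory
+		 * cgroup.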
 		 */
 		ret = kmem_cache_alloc(radix_tree_node_cachep,
-				       gfp_mask | __GFP_ACCOUNT | __GFP_NOWARN);
+				       gfp_mask | __GFP_NOWARN);
 		if (ret)
 			goto out;

@@ -303,8 +304,7 @@ radix_tree_node_alloc(struct radix_tree_root *root)
 		kmemleak_update_trace(ret);
 		goto out;
 	}
-	ret = kmem_cache_alloc(radix_tree_node_cachep,
-			       gfp_mask | __GFP_ACCOUNT);
+	ret = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask);
 out:
 	BUG_ON(radix_tree_is_internal_node(ret));
 	return ret;
@@ -351,6 +351,12 @@ static int __radix_tree_preload(gfp_t gfp_mask, int nr)
 	struct radix_tree_node *node;
 	int ret = -ENOMEM;

+	/*
+	 * Nodes preloaded by one cgroup can be used by another cgroup, so
+	 * they should never be accounted to any particular memory cgroup.
+	 */
+	gfp_mask &= ~__GFP_ACCOUNT;
+
 	preempt_disable();
 	rtp = this_cpu_ptr(&radix_tree_preloads);
 	while (rtp->nr < nr) {
-- 
cgit
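Taken together, the two patches are meant to be consumed from an insertion path roughly as follows. This is a hypothetical sketch of the preload/insert/preload_end pairing: add_range_entry(), example_tree_lock, and the choice to store the same entry in every slot are illustrative glue, not API introduced by these commits.

#include <linux/radix-tree.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_tree_lock);	/* hypothetical tree lock */

/*
 * Hypothetical caller of the new helper: reserve enough nodes for
 * (1 << order) contiguous, naturally aligned slots, then insert with
 * the lock held, drawing nodes from the per-cpu preload pool.
 */
static int add_range_entry(struct radix_tree_root *root, unsigned long index,
			   void *entry, int order, gfp_t gfp)
{
	unsigned long i;
	int err;

	/*
	 * May sleep if @gfp allows blocking.  On success it returns with
	 * preemption disabled; on failure preemption is left enabled, so
	 * we must not call radix_tree_preload_end() in that case.
	 */
	err = radix_tree_maybe_preload_order(gfp, order);
	if (err)
		return err;

	spin_lock(&example_tree_lock);
	for (i = 0; i < (1UL << order); i++) {
		err = radix_tree_insert(root, index + i, entry);
		if (err)
			break;	/* a real caller would unwind earlier slots */
	}
	spin_unlock(&example_tree_lock);

	radix_tree_preload_end();	/* re-enables preemption */
	return err;
}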