diff options
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 34 |
1 files changed, 18 insertions, 16 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7a96c6edeaef..edb846452791 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -14,9 +14,11 @@ #include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/compiler.h> +#include <linux/cpumask.h> #include <linux/cpuset.h> #include <linux/mutex.h> #include <linux/memblock.h> +#include <linux/minmax.h> #include <linux/sysfs.h> #include <linux/slab.h> #include <linux/sched/mm.h> @@ -3605,31 +3607,31 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h) .numa_aware = true }; + unsigned int num_allocation_threads = max(num_online_cpus() / 4, 1); + job.thread_fn = hugetlb_pages_alloc_boot_node; job.start = 0; job.size = h->max_huge_pages; /* - * job.max_threads is twice the num_node_state(N_MEMORY), + * job.max_threads is 25% of the available cpu threads by default. * - * Tests below indicate that a multiplier of 2 significantly improves - * performance, and although larger values also provide improvements, - * the gains are marginal. + * On large servers with terabytes of memory, huge page allocation + * can consume a considerable amount of time. * - * Therefore, choosing 2 as the multiplier strikes a good balance between - * enhancing parallel processing capabilities and maintaining efficient - * resource management. + * Tests below show how long it takes to allocate 1 TiB of memory with + * 2MiB huge pages. Using more threads can significantly improve allocation time. 
* - * +------------+-------+-------+-------+-------+-------+ - * | multiplier | 1 | 2 | 3 | 4 | 5 | - * +------------+-------+-------+-------+-------+-------+ - * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms | - * | 2T 4node | 979ms | 679ms | 543ms | 489ms | 481ms | - * | 50G 2node | 71ms | 44ms | 37ms | 30ms | 31ms | - * +------------+-------+-------+-------+-------+-------+ + * +-----------------------+-------+-------+-------+-------+-------+ + * | threads | 8 | 16 | 32 | 64 | 128 | + * +-----------------------+-------+-------+-------+-------+-------+ + * | skylake 144 cpus | 44s | 22s | 16s | 19s | 20s | + * | cascade lake 192 cpus | 39s | 20s | 11s | 10s | 9s | + * +-----------------------+-------+-------+-------+-------+-------+ */ - job.max_threads = num_node_state(N_MEMORY) * 2; - job.min_chunk = h->max_huge_pages / num_node_state(N_MEMORY) / 2; + + job.max_threads = num_allocation_threads; + job.min_chunk = h->max_huge_pages / num_allocation_threads; padata_do_multithreaded(&job); return h->nr_huge_pages; |