From 1e5d8e1e47afde23e3249aed25d7d124feff5c1c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 16 Feb 2020 12:01:04 -0800 Subject: x86/mm: Introduce CONFIG_NUMA_KEEP_MEMINFO Currently x86 numa_meminfo is marked __initdata in the CONFIG_MEMORY_HOTPLUG=n case. In support of a new facility to allow drivers to map reserved memory to a 'target_node' (phys_to_target_node()), add support for removing the __initdata designation for those users. Both memory hotplug and phys_to_target_node() users select CONFIG_NUMA_KEEP_MEMINFO to tell the arch to maintain its physical address to NUMA mapping infrastructure post init. Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Cc: Andrew Morton Cc: David Hildenbrand Cc: Michal Hocko Reviewed-by: Ingo Molnar Signed-off-by: Dan Williams Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/158188326422.894464.15742054998046628934.stgit@dwillia2-desk3.amr.corp.intel.com --- arch/x86/mm/numa.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 99f7a68738f0..2450b21cc28a 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -25,11 +25,7 @@ nodemask_t numa_nodes_parsed __initdata; struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -static struct numa_meminfo numa_meminfo -#ifndef CONFIG_MEMORY_HOTPLUG -__initdata -#endif -; +static struct numa_meminfo numa_meminfo __initdata_or_meminfo; static int numa_distance_cnt; static u8 *numa_distance; -- cgit From 5d30f92e7631286b8617777c5400c8eadcae50a1 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 16 Feb 2020 12:01:09 -0800 Subject: x86/NUMA: Provide a range-to-target_node lookup facility The DEV_DAX_KMEM facility is a generic mechanism to allow device-dax instances, fronting performance-differentiated-memory like pmem, to be added to the System RAM pool. The NUMA node for that hot-added memory is derived from the device-dax instance's 'target_node' attribute. Recall that the 'target_node' is the ACPI-PXM-to-node translation for memory when it comes online whereas the 'numa_node' attribute of the device represents the closest online cpu node. Presently useful target_node information from the ACPI SRAT is discarded with the expectation that "Reserved" memory will never be onlined. Now, DEV_DAX_KMEM violates that assumption, there is a need to retain the translation. Move, rather than discard, numa_memblk data to a secondary array that memory_add_physaddr_to_target_node() may consider at a later point in time. Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Cc: Andrew Morton Cc: David Hildenbrand Cc: Michal Hocko Reported-by: kbuild test robot Reviewed-by: Ingo Molnar Signed-off-by: Dan Williams Reviewed-by: Thomas Gleixner Link: https://lore.kernel.org/r/158188326978.894464.217282995221175417.stgit@dwillia2-desk3.amr.corp.intel.com --- arch/x86/mm/numa.c | 61 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 51 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 2450b21cc28a..59ba008504dc 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -26,6 +26,7 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); static struct numa_meminfo numa_meminfo __initdata_or_meminfo; +static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo; static int numa_distance_cnt; static u8 *numa_distance; @@ -164,6 +165,19 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi) (mi->nr_blks - idx) * sizeof(mi->blk[0])); } +/** + * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another + * @dst: numa_meminfo to append block to + * @idx: Index of memblk to remove + * @src: numa_meminfo to remove memblk from + */ +static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx, + struct numa_meminfo *src) +{ + dst->blk[dst->nr_blks++] = src->blk[idx]; + numa_remove_memblk_from(idx, src); +} + /** * numa_add_memblk - Add one numa_memblk to numa_meminfo * @nid: NUMA node ID of the new memblk @@ -233,14 +247,19 @@ int __init numa_cleanup_meminfo(struct numa_meminfo *mi) for (i = 0; i < mi->nr_blks; i++) { struct numa_memblk *bi = &mi->blk[i]; - /* make sure all blocks are inside the limits */ + /* move / save reserved memory ranges */ + if (!memblock_overlaps_region(&memblock.memory, + bi->start, bi->end - bi->start)) { + numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi); + continue; + } + + /* make sure all non-reserved blocks are inside the limits */ bi->start = max(bi->start, low); bi->end = min(bi->end, high); - /* and there's no empty or non-exist block */ - if (bi->start >= bi->end || - !memblock_overlaps_region(&memblock.memory, - bi->start, bi->end - bi->start)) + /* and there's no empty block */ + if (bi->start >= bi->end) numa_remove_memblk_from(i--, mi); } @@ -877,16 +896,38 @@ EXPORT_SYMBOL(cpumask_of_node); #endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ -#ifdef CONFIG_MEMORY_HOTPLUG -int memory_add_physaddr_to_nid(u64 start) +#ifdef CONFIG_NUMA_KEEP_MEMINFO +static int meminfo_to_nid(struct numa_meminfo *mi, u64 start) { - struct numa_meminfo *mi = &numa_meminfo; - int nid = mi->blk[0].nid; int i; for (i = 0; i < mi->nr_blks; i++) if (mi->blk[i].start <= start && mi->blk[i].end > start) - nid = mi->blk[i].nid; + return mi->blk[i].nid; + return NUMA_NO_NODE; +} + +int phys_to_target_node(phys_addr_t start) +{ + int nid = meminfo_to_nid(&numa_meminfo, start); + + /* + * Prefer online nodes, but if reserved memory might be + * hot-added continue the search with reserved ranges. + */ + if (nid != NUMA_NO_NODE) + return nid; + + return meminfo_to_nid(&numa_reserved_meminfo, start); +} +EXPORT_SYMBOL_GPL(phys_to_target_node); + +int memory_add_physaddr_to_nid(u64 start) +{ + int nid = meminfo_to_nid(&numa_meminfo, start); + + if (nid == NUMA_NO_NODE) + nid = numa_meminfo.blk[0].nid; return nid; } EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -- cgit From 7b27a8622f802761d5c6abd6c37b22312a35343c Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Sun, 16 Feb 2020 12:01:16 -0800 Subject: libnvdimm/e820: Retrieve and populate correct 'target_node' info Use the new phys_to_target_node() and numa_map_to_online_node() helpers to retrieve the correct id for the 'numa_node' ("local" / online initiator node) and 'target_node' (offline target memory node) sysfs attributes. Below is an example from a 4 NUMA node system where all the memory on node2 is pmem / reserved. It should be noted that with the arrival of the ACPI HMAT table and EFI Specific Purpose Memory the kernel will start to see more platforms with reserved / performance differentiated memory in its own NUMA node. Hence all the stakeholders on the Cc for what is ostensibly a libnvdimm local patch. === Before === /* Notice no online memory on node2 at start */ # numactl --hardware available: 3 nodes (0-1,3) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 node 0 size: 3958 MB node 0 free: 3708 MB node 1 cpus: 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 node 1 size: 4027 MB node 1 free: 3871 MB node 3 cpus: node 3 size: 3994 MB node 3 free: 3971 MB node distances: node 0 1 3 0: 10 21 21 1: 21 10 21 3: 21 21 10 /* * Put the pmem namespace into devdax mode so it can be assigned to the * kmem driver */ # ndctl create-namespace -e namespace0.0 -m devdax -f { "dev":"namespace0.0", "mode":"devdax", "map":"dev", "size":"3.94 GiB (4.23 GB)", "uuid":"1650af9b-9ba3-4704-acd6-10178399d9a3", [..] } /* Online Persistent Memory as System RAM */ # daxctl reconfigure-device --mode=system-ram dax0.0 libdaxctl: memblock_in_dev: dax0.0: memory0: Unable to determine phys_index: Success libdaxctl: memblock_in_dev: dax0.0: memory0: Unable to determine phys_index: Success libdaxctl: memblock_in_dev: dax0.0: memory0: Unable to determine phys_index: Success libdaxctl: memblock_in_dev: dax0.0: memory0: Unable to determine phys_index: Success [ { "chardev":"dax0.0", "size":4225761280, "target_node":0, "mode":"system-ram" } ] reconfigured 1 device /* Note that the memory is onlined by default to the wrong node, node0 */ # numactl --hardware available: 3 nodes (0-1,3) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 node 0 size: 7926 MB node 0 free: 7655 MB node 1 cpus: 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 node 1 size: 4027 MB node 1 free: 3871 MB node 3 cpus: node 3 size: 3994 MB node 3 free: 3971 MB node distances: node 0 1 3 0: 10 21 21 1: 21 10 21 3: 21 21 10 === After === /* Notice that the "phys_index" error messages are gone */ # daxctl reconfigure-device --mode=system-ram dax0.0 [ { "chardev":"dax0.0", "size":4225761280, "target_node":2, "mode":"system-ram" } ] reconfigured 1 device /* Notice that node2 is now correctly populated */ # numactl --hardware available: 4 nodes (0-3) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 node 0 size: 3958 MB node 0 free: 3793 MB node 1 cpus: 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 node 1 size: 4027 MB node 1 free: 3851 MB node 2 cpus: node 2 size: 3968 MB node 2 free: 3968 MB node 3 cpus: node 3 size: 3994 MB node 3 free: 3908 MB node distances: node 0 1 2 3 0: 10 21 21 21 1: 21 10 21 21 2: 21 21 10 21 3: 21 21 21 10 Cc: Dave Hansen Cc: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Andrew Morton Cc: David Hildenbrand Cc: Michal Hocko Cc: Ira Weiny Cc: Vishal Verma Cc: Christoph Hellwig Reviewed-by: Ingo Molnar Signed-off-by: Dan Williams Link: https://lore.kernel.org/r/158188327614.894464.13122730362187722603.stgit@dwillia2-desk3.amr.corp.intel.com --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index beea77046f9b..d4e446daf457 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1664,6 +1664,7 @@ config X86_PMEM_LEGACY depends on PHYS_ADDR_T_64BIT depends on BLK_DEV select X86_PMEM_LEGACY_DEVICE + select NUMA_KEEP_MEMINFO if NUMA select LIBNVDIMM help Treat memory marked using the non-standard e820 type of 12 as used -- cgit