From 945302df3de156fc367d3b537cec76d4aea0b0d1 Mon Sep 17 00:00:00 2001
From: Jesse Taube
Date: Tue, 9 Jul 2024 13:39:37 -0400
Subject: RISC-V: Use Zkr to seed KASLR base address

Parse the device tree for Zkr in the isa string. If Zkr is present,
use it to seed the kernel base address.

On an ACPI system, as of this commit, there is no easy way to check if
Zkr is present. Blindly running the instruction isn't an option, as we
have to be able to trust the firmware.

Signed-off-by: Jesse Taube
Reviewed-by: Charlie Jenkins
Reviewed-by: Alexandre Ghiti
Tested-by: Zong Li
Reviewed-by: Conor Dooley
Link: https://lore.kernel.org/r/20240709173937.510084-5-jesse@rivosinc.com
Signed-off-by: Palmer Dabbelt
---
 arch/riscv/mm/init.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'arch/riscv/mm')

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index e3218d65f21d..5551403a85ae 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1042,6 +1042,7 @@ static void __init pt_ops_set_late(void)
 #ifdef CONFIG_RANDOMIZE_BASE
 extern bool __init __pi_set_nokaslr_from_cmdline(uintptr_t dtb_pa);
 extern u64 __init __pi_get_kaslr_seed(uintptr_t dtb_pa);
+extern u64 __init __pi_get_kaslr_seed_zkr(const uintptr_t dtb_pa);
 
 static int __init print_nokaslr(char *p)
 {
@@ -1062,10 +1063,12 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
 
 #ifdef CONFIG_RANDOMIZE_BASE
         if (!__pi_set_nokaslr_from_cmdline(dtb_pa)) {
-                u64 kaslr_seed = __pi_get_kaslr_seed(dtb_pa);
+                u64 kaslr_seed = __pi_get_kaslr_seed_zkr(dtb_pa);
                 u32 kernel_size = (uintptr_t)(&_end) - (uintptr_t)(&_start);
                 u32 nr_pos;
 
+                if (kaslr_seed == 0)
+                        kaslr_seed = __pi_get_kaslr_seed(dtb_pa);
                 /*
                  * Compute the number of positions available: we are limited
                  * by the early page table that only has one PUD and we must
--
cgit
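
For context, Zkr exposes entropy through the "seed" CSR (0x015): each read
returns a 2-bit status in bits 31:30 and, on success (ES16), 16 entropy bits
in bits 15:0. The helper below is a minimal, hypothetical sketch of how a
64-bit KASLR seed can be accumulated from that CSR. The field encodings come
from the Zkr specification, not from this patch; the real
__pi_get_kaslr_seed_zkr() lives in the kernel's early position-independent
code, and returning 0 meshes with the fallback check in setup_vm() above.

    #define SEED_OPST_MASK    0xc0000000UL
    #define SEED_OPST_ES16    0x80000000UL  /* 16 valid entropy bits */
    #define SEED_OPST_DEAD    0xc0000000UL  /* unrecoverable source failure */
    #define SEED_ENTROPY_MASK 0xffffUL

    static u64 zkr_accumulate_seed(void)
    {
            u64 seed = 0;
            int bits = 0;

            while (bits < 64) {
                    unsigned long v;

                    /* the seed CSR must be accessed with a read-write instruction */
                    asm volatile("csrrw %0, 0x015, x0" : "=r" (v) : : "memory");

                    if ((v & SEED_OPST_MASK) == SEED_OPST_DEAD)
                            return 0;       /* let the caller fall back to the DT seed */
                    if ((v & SEED_OPST_MASK) != SEED_OPST_ES16)
                            continue;       /* BIST or WAIT status: just retry */

                    seed = (seed << 16) | (v & SEED_ENTROPY_MASK);
                    bits += 16;
            }

            return seed;
    }
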
From 7c1e5b9690b0e14acead4ff98d8a6c40f2dff54b Mon Sep 17 00:00:00 2001
From: Charlie Jenkins
Date: Tue, 3 Sep 2024 15:52:34 -0700
Subject: riscv: Disable preemption while handling PR_RISCV_CTX_SW_FENCEI_OFF

The icache will be flushed in switch_to() if force_icache_flush is
true, or in flush_icache_deferred() if icache_stale_mask is set.
Between setting force_icache_flush to false and calculating the new
icache_stale_mask, preemption needs to be disabled. There are two
reasons for this:

1. If CPU migration happens between setting force_icache_flush to
   false and setting icache_stale_mask, an icache flush will not be
   emitted.

2. smp_processor_id() is used in set_icache_stale_mask() to mark the
   current CPU as not needing another flush, since a flush will have
   happened either by userspace or by the kernel when performing the
   migration. smp_processor_id() is currently called twice with
   preemption enabled, which causes a race condition: it allows
   icache_stale_mask to be populated with inconsistent CPU ids.

Resolve these two issues by setting the icache_stale_mask before
setting force_icache_flush to false, and by using get_cpu()/put_cpu()
to obtain the CPU id with preemption disabled.

Signed-off-by: Charlie Jenkins
Fixes: 6b9391b581fd ("riscv: Include riscv_set_icache_flush_ctx prctl")
Link: https://lore.kernel.org/r/20240903-fix_fencei_optimization-v2-1-8025f20171fc@rivosinc.com
Signed-off-by: Palmer Dabbelt
---
 arch/riscv/mm/cacheflush.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/riscv/mm')

diff --git a/arch/riscv/mm/cacheflush.c b/arch/riscv/mm/cacheflush.c
index a03c994eed3b..b81672729887 100644
--- a/arch/riscv/mm/cacheflush.c
+++ b/arch/riscv/mm/cacheflush.c
@@ -158,6 +158,7 @@ void __init riscv_init_cbo_blocksizes(void)
 #ifdef CONFIG_SMP
 static void set_icache_stale_mask(void)
 {
+        int cpu = get_cpu();
         cpumask_t *mask;
         bool stale_cpu;
 
@@ -168,10 +169,11 @@ static void set_icache_stale_mask(void)
          * concurrently on different harts.
          */
         mask = &current->mm->context.icache_stale_mask;
-        stale_cpu = cpumask_test_cpu(smp_processor_id(), mask);
+        stale_cpu = cpumask_test_cpu(cpu, mask);
 
         cpumask_setall(mask);
-        cpumask_assign_cpu(smp_processor_id(), mask, stale_cpu);
+        cpumask_assign_cpu(cpu, mask, stale_cpu);
+        put_cpu();
 }
 #endif
 
@@ -239,14 +241,12 @@ int riscv_set_icache_flush_ctx(unsigned long ctx, unsigned long scope)
         case PR_RISCV_CTX_SW_FENCEI_OFF:
                 switch (scope) {
                 case PR_RISCV_SCOPE_PER_PROCESS:
-                        current->mm->context.force_icache_flush = false;
-
                         set_icache_stale_mask();
+                        current->mm->context.force_icache_flush = false;
                         break;
                 case PR_RISCV_SCOPE_PER_THREAD:
-                        current->thread.force_icache_flush = false;
-
                         set_icache_stale_mask();
+                        current->thread.force_icache_flush = false;
                         break;
                 default:
                         return -EINVAL;
--
cgit
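
As an aside, the bug fixed here is the classic stale-processor-id race. A
stand-alone illustration (not kernel source) of why one pinned CPU id must
replace the two smp_processor_id() calls:

    /* Racy: the two smp_processor_id() calls can observe different CPUs
     * if the task migrates in between, so the bit that is tested and the
     * bit that is re-assigned may belong to different CPUs.
     */
    static void stale_mask_racy(cpumask_t *mask)
    {
            bool stale = cpumask_test_cpu(smp_processor_id(), mask);

            cpumask_setall(mask);   /* preemption (and migration) possible here */
            cpumask_assign_cpu(smp_processor_id(), mask, stale);
    }

    /* Fixed: get_cpu() disables preemption, so "cpu" stays accurate until
     * put_cpu() re-enables it.
     */
    static void stale_mask_fixed(cpumask_t *mask)
    {
            int cpu = get_cpu();
            bool stale = cpumask_test_cpu(cpu, mask);

            cpumask_setall(mask);
            cpumask_assign_cpu(cpu, mask, stale);
            put_cpu();
    }
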
From 5cf089672119808c2f5b7035c91adcc0cc7287e1 Mon Sep 17 00:00:00 2001
From: Nam Cao
Date: Fri, 7 Jun 2024 22:22:08 +0200
Subject: riscv: replace misleading va_kernel_pa_offset on XIP kernel

On XIP kernels, the name "va_kernel_pa_offset" is misleading: unlike on
"normal" kernels, it is not the virtual-to-physical address offset of
the kernel mapping. It is the offset of the kernel mapping's first
virtual address to the first physical address in DRAM, which is not
meaningful because the kernel's first physical address is not in DRAM.

For an XIP kernel, there are two different offsets because the
read-only part of the kernel resides in ROM while the rest is in RAM.
The offset to ROM is kept in kernel_map.va_kernel_xip_pa_offset, while
the offset to RAM is not stored anywhere: it is calculated on the fly.

Remove the confusing "va_kernel_pa_offset" and add
"va_kernel_xip_data_pa_offset" as its replacement. This new variable is
the offset of the virtual mapping of the kernel's data portion from the
corresponding physical addresses. With the introduction of this new
variable, also rename va_kernel_xip_pa_offset ->
va_kernel_xip_text_pa_offset to make it clear that this one is about
the .text section.

Signed-off-by: Nam Cao
Reviewed-by: Alexandre Ghiti
Link: https://lore.kernel.org/r/84e5d005c1386d88d7b2531e0b6707ec5352ee54.1717789719.git.namcao@linutronix.de
Signed-off-by: Palmer Dabbelt
---
 arch/riscv/mm/init.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/riscv/mm')

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index eb0649a61b4c..c2d63808d7ab 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1098,11 +1098,14 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
         kernel_map.phys_addr = (uintptr_t)CONFIG_PHYS_RAM_BASE;
         kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_start);
 
-        kernel_map.va_kernel_xip_pa_offset = kernel_map.virt_addr - kernel_map.xiprom;
+        kernel_map.va_kernel_xip_text_pa_offset = kernel_map.virt_addr - kernel_map.xiprom;
+        kernel_map.va_kernel_xip_data_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr
+                                                  + (uintptr_t)&_sdata - (uintptr_t)&_start;
 #else
         kernel_map.page_offset = _AC(CONFIG_PAGE_OFFSET, UL);
         kernel_map.phys_addr = (uintptr_t)(&_start);
         kernel_map.size = (uintptr_t)(&_end) - kernel_map.phys_addr;
+        kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr;
 #endif
 
 #if defined(CONFIG_64BIT) && !defined(CONFIG_XIP_KERNEL)
@@ -1124,7 +1127,6 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
          */
         kernel_map.va_pa_offset = IS_ENABLED(CONFIG_64BIT) ?
                                   0UL : PAGE_OFFSET - kernel_map.phys_addr;
-        kernel_map.va_kernel_pa_offset = kernel_map.virt_addr - kernel_map.phys_addr;
 
         /*
          * The default maximal physical memory size is KERN_VIRT_SIZE for 32-bit
--
cgit
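
To make the two offsets concrete, here is a hypothetical translation helper
(upstream expresses this inside the kernel_mapping_va_to_pa() macro in
asm/page.h; the function form below is only for illustration):

    /* .text executes in place from ROM; everything from _sdata onwards is
     * copied to RAM at boot, so the two regions need distinct VA->PA offsets.
     */
    static uintptr_t xip_kernel_va_to_pa(uintptr_t va)
    {
            uintptr_t data_va = kernel_map.virt_addr +
                                ((uintptr_t)&_sdata - (uintptr_t)&_start);

            if (va < data_va)       /* read-only portion, backed by ROM */
                    return va - kernel_map.va_kernel_xip_text_pa_offset;

            /* writable portion, backed by RAM */
            return va - kernel_map.va_kernel_xip_data_pa_offset;
    }
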
From a7cfb999433ad3a1aa7ca86ecdaf3e061ab7076a Mon Sep 17 00:00:00 2001
From: Nam Cao
Date: Fri, 7 Jun 2024 22:22:12 +0200
Subject: riscv: drop the use of XIP_OFFSET in create_kernel_page_table()

XIP_OFFSET is the hard-coded offset of the writable data section within
the kernel. By hard-coding this value, the read-only section of the
kernel (which is placed before the writable data section) is restricted
in size.

In preparation for removing this hard-coded value entirely, stop using
XIP_OFFSET in create_kernel_page_table(). Use _sdata and _start instead
to compute the same offset.

Signed-off-by: Nam Cao
Reviewed-by: Alexandre Ghiti
Link: https://lore.kernel.org/r/4ea3f222a7eb9f91c04b155ff2e4d3ef19158acc.1717789719.git.namcao@linutronix.de
Signed-off-by: Palmer Dabbelt
---
 arch/riscv/mm/init.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/riscv/mm')

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index c2d63808d7ab..31b23f84494f 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -917,7 +917,7 @@ static void __init relocate_kernel(void)
 static void __init create_kernel_page_table(pgd_t *pgdir,
                                             __always_unused bool early)
 {
-        uintptr_t va, end_va;
+        uintptr_t va, start_va, end_va;
 
         /* Map the flash resident part */
         end_va = kernel_map.virt_addr + kernel_map.xiprom_sz;
@@ -927,10 +927,11 @@ static void __init create_kernel_page_table(pgd_t *pgdir,
                                    PMD_SIZE, PAGE_KERNEL_EXEC);
 
         /* Map the data in RAM */
+        start_va = kernel_map.virt_addr + (uintptr_t)&_sdata - (uintptr_t)&_start;
         end_va = kernel_map.virt_addr + kernel_map.size;
-        for (va = kernel_map.virt_addr + XIP_OFFSET; va < end_va; va += PMD_SIZE)
+        for (va = start_va; va < end_va; va += PMD_SIZE)
                 create_pgd_mapping(pgdir, va,
-                                   kernel_map.phys_addr + (va - (kernel_map.virt_addr + XIP_OFFSET)),
+                                   kernel_map.phys_addr + (va - start_va),
                                    PMD_SIZE, PAGE_KERNEL);
 }
 #else
--
cgit

From d6a1928134a1c626ff369129d7a80951b2949a48 Mon Sep 17 00:00:00 2001
From: Stuart Menefy
Date: Mon, 24 Jun 2024 13:17:23 +0100
Subject: riscv: Remove redundant restriction on memory size

The original reason for reserving the top 4GiB of the direct map
(space for modules/BPF/kernel) hasn't applied since the address map
was reworked for KASAN.

Signed-off-by: Stuart Menefy
Reviewed-by: Alexandre Ghiti
Link: https://lore.kernel.org/r/20240624121723.2186279-1-stuart.menefy@codasip.com
Signed-off-by: Palmer Dabbelt
---
 arch/riscv/mm/init.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'arch/riscv/mm')

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index f43139db82a9..0e56725377a4 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1132,13 +1132,7 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa)
         kernel_map.va_pa_offset = IS_ENABLED(CONFIG_64BIT) ?
                                   0UL : PAGE_OFFSET - kernel_map.phys_addr;
 
-        /*
-         * The default maximal physical memory size is KERN_VIRT_SIZE for 32-bit
-         * kernel, whereas for 64-bit kernel, the end of the virtual address
-         * space is occupied by the modules/BPF/kernel mappings which reduces
-         * the available size of the linear mapping.
-         */
-        memory_limit = KERN_VIRT_SIZE - (IS_ENABLED(CONFIG_64BIT) ? SZ_4G : 0);
+        memory_limit = KERN_VIRT_SIZE;
 
         /* Sanity check alignment and size */
         BUG_ON((PAGE_OFFSET % PGDIR_SIZE) != 0);
--
cgit
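
The consumer of memory_limit is outside this filtered diff; assuming the
usual flow in upstream setup_bootmem(), the value simply caps memblock, so
dropping the SZ_4G carve-out directly grows the RAM the linear map may cover:

    static void __init apply_memory_limit(void)
    {
            /* trim any RAM the linear mapping cannot cover */
            memblock_enforce_memory_limit(memory_limit);
    }
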
From 503638e0babf364061bc50fca5103b00a56cc50a Mon Sep 17 00:00:00 2001
From: Alexandre Ghiti
Date: Wed, 17 Jul 2024 08:01:24 +0200
Subject: riscv: Stop emitting preventive sfence.vma for new vmalloc mappings

In 6.5, we removed the vmalloc fault path because that can't work (see
[1] [2]). Then, in order to make sure that new page table entries were
seen by the page table walker, we had to preventively emit a sfence.vma
on all harts [3], but this solution is very costly since it relies on
IPIs. And even then, we could end up in a loop of vmalloc faults if a
vmalloc allocation is done in the IPI path (for example if it is
traced, see [4]), which could result in a kernel stack overflow.

Those preventive sfence.vma needed to be emitted because:

- if the uarch caches invalid entries, the new mapping may not be
  observed by the page table walker and an invalidation may be needed.
- if the uarch does not cache invalid entries, a reordered access could
  "miss" the new mapping and trap: in that case, we would actually only
  need to retry the access, no sfence.vma is required.

So this patch removes those preventive sfence.vma and actually handles
the possible (and unlikely) exceptions. And since the kernel stack
mappings lie in the vmalloc area, this handling must be done very early
when the trap is taken, at the very beginning of handle_exception: this
also rules out vmalloc allocations in the fault path.

Link: https://lore.kernel.org/linux-riscv/20230531093817.665799-1-bjorn@kernel.org/ [1]
Link: https://lore.kernel.org/linux-riscv/20230801090927.2018653-1-dylan@andestech.com [2]
Link: https://lore.kernel.org/linux-riscv/20230725132246.817726-1-alexghiti@rivosinc.com/ [3]
Link: https://lore.kernel.org/lkml/20200508144043.13893-1-joro@8bytes.org/ [4]
Signed-off-by: Alexandre Ghiti
Reviewed-by: Yunhui Cui
Link: https://lore.kernel.org/r/20240717060125.139416-4-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt
---
 arch/riscv/mm/init.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/riscv/mm')

diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index e3218d65f21d..337dab7d5fd7 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -36,6 +36,8 @@
 
 #include "../kernel/head.h"
 
+u64 new_vmalloc[NR_CPUS / sizeof(u64) + 1];
+
 struct kernel_mapping kernel_map __ro_after_init;
 EXPORT_SYMBOL(kernel_map);
 #ifdef CONFIG_XIP_KERNEL
--
cgit
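
The new_vmalloc array added above is a per-hart bitmap. Upstream consumes it
in assembly at the very top of handle_exception, before the kernel stack is
touched; the C rendition below is only a sketch of that logic:

    /* One bit per hart: set for every hart when a new kernel (vmalloc)
     * mapping is created, consumed by the next trap taken on that hart.
     */
    static bool new_vmalloc_check(int cpu)
    {
            u64 *word = &new_vmalloc[cpu / 64];
            u64 bit = 1ULL << (cpu % 64);

            if (!(READ_ONCE(*word) & bit))
                    return false;                      /* nothing pending */

            atomic64_andnot(bit, (atomic64_t *)word);  /* clear our bit */
            local_flush_tlb_all();                     /* sfence.vma */
            return true;                               /* retry the access */
    }
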
From 7a21b2e370dab780ddb3aa80f2a4c8ff97bddccc Mon Sep 17 00:00:00 2001
From: Alexandre Ghiti
Date: Wed, 17 Jul 2024 08:01:25 +0200
Subject: riscv: Stop emitting preventive sfence.vma for new userspace mappings with Svvptc

The preventive sfence.vma were emitted because new mappings must be
made visible to the page table walker, but Svvptc guarantees that this
will happen within a bounded timeframe. So there is no need to emit a
sfence.vma on uarchs that implement this extension: we will instead
take gratuitous (but very unlikely) page faults, similarly to x86 and
arm64.

This drastically reduces the number of sfence.vma emitted:

* Ubuntu boot to login:
  Before: ~630k sfence.vma
  After:  ~200k sfence.vma

* ltp - mmapstress01
  Before: ~45k
  After:  ~6.3k

* lmbench - lat_pagefault
  Before: ~665k
  After:  832 (!)

* lmbench - lat_mmap
  Before: ~546k
  After:  718 (!)

Signed-off-by: Alexandre Ghiti
Link: https://lore.kernel.org/r/20240717060125.139416-5-alexghiti@rivosinc.com
Signed-off-by: Palmer Dabbelt
---
 arch/riscv/mm/pgtable.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'arch/riscv/mm')

diff --git a/arch/riscv/mm/pgtable.c b/arch/riscv/mm/pgtable.c
index 533ec9055fa0..4ae67324f992 100644
--- a/arch/riscv/mm/pgtable.c
+++ b/arch/riscv/mm/pgtable.c
@@ -9,6 +9,9 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
                           unsigned long address, pte_t *ptep,
                           pte_t entry, int dirty)
 {
+        asm goto(ALTERNATIVE("nop", "j %l[svvptc]", 0, RISCV_ISA_EXT_SVVPTC, 1)
+                 : : : : svvptc);
+
         if (!pte_same(ptep_get(ptep), entry))
                 __set_pte_at(vma->vm_mm, ptep, entry);
         /*
@@ -16,6 +19,16 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
          * the case that the PTE changed and the spurious fault case.
          */
         return true;
+
+svvptc:
+        if (!pte_same(ptep_get(ptep), entry)) {
+                __set_pte_at(vma->vm_mm, ptep, entry);
+                /* Here only not svadu is impacted */
+                flush_tlb_page(vma, address);
+                return true;
+        }
+
+        return false;
 }
 
 int ptep_test_and_clear_young(struct vm_area_struct *vma,
--
cgit
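
The asm goto construct above is the alternatives idiom that makes the Svvptc
path free on legacy hardware: at patching time, the nop is replaced by a jump
to the svvptc label on CPUs that advertise RISCV_ISA_EXT_SVVPTC. The same
idiom in isolation (illustrative sketch, not kernel source):

    static bool svvptc_enabled(void)
    {
            asm goto(ALTERNATIVE("nop", "j %l[svvptc]", 0, RISCV_ISA_EXT_SVVPTC, 1)
                     : : : : svvptc);

            return false;   /* nop kept: preventive fences still needed */

    svvptc:
            return true;    /* jump patched in: bounded-time visibility */
    }
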