Diffstat (limited to 'arch')
-rw-r--r-- | arch/arm/Kconfig                   |   1
-rw-r--r-- | arch/arm/kernel/bios32.c           |   5
-rw-r--r-- | arch/arm/kernel/entry-ftrace.S     |  18
-rw-r--r-- | arch/arm/mm/cache-l2x0.c           |   7
-rw-r--r-- | arch/arm/mm/fault.c                |   3
-rw-r--r-- | arch/arm64/include/asm/ftrace.h    |   1
-rw-r--r-- | arch/arm64/include/asm/mmu.h       |   7
-rw-r--r-- | arch/arm64/kernel/cpufeature.c     | 108
-rw-r--r-- | arch/arm64/kernel/mte.c            |   2
-rw-r--r-- | arch/arm64/kernel/probes/kprobes.c |  12
-rw-r--r-- | arch/arm64/kvm/Kconfig             |   2
-rw-r--r-- | arch/arm64/kvm/arm.c               |   3
-rw-r--r-- | arch/arm64/mm/mmu.c                |  98
-rw-r--r-- | arch/loongarch/kvm/Kconfig         |   2
-rw-r--r-- | arch/loongarch/kvm/vcpu.c          |   3
-rw-r--r-- | arch/riscv/kvm/Kconfig             |   2
-rw-r--r-- | arch/riscv/kvm/vcpu.c              |   3
-rw-r--r-- | arch/x86/hyperv/irqdomain.c        | 111
-rw-r--r-- | arch/x86/hyperv/ivm.c              | 211
-rw-r--r-- | arch/x86/kernel/acpi/cstate.c      |   2
-rw-r--r-- | arch/x86/kernel/cpu/mshyperv.c     |  11
-rw-r--r-- | arch/x86/kvm/Kconfig               |   2
-rw-r--r-- | arch/x86/kvm/vmx/vmx.c             |   1
-rw-r--r-- | arch/x86/kvm/x86.c                 |   3
24 files changed, 443 insertions, 175 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2a124c92e4f6..2e3f93b690f4 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -108,6 +108,7 @@ config ARM
 	select HAVE_GUP_FAST if ARM_LPAE
 	select HAVE_FUNCTION_ERROR_INJECTION
 	select HAVE_FUNCTION_GRAPH_TRACER
+	select HAVE_FUNCTION_GRAPH_FREGS
 	select HAVE_FUNCTION_TRACER if !XIP_KERNEL
 	select HAVE_GCC_PLUGINS
 	select HAVE_HW_BREAKPOINT if PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7)
diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c
index d334c7fb672b..b5793e8fbdc1 100644
--- a/arch/arm/kernel/bios32.c
+++ b/arch/arm/kernel/bios32.c
@@ -10,6 +10,7 @@
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/slab.h>
+#include <linux/string_choices.h>
 #include <linux/init.h>
 #include <linux/io.h>

@@ -337,8 +338,8 @@ void pcibios_fixup_bus(struct pci_bus *bus)
 	/*
 	 * Report what we did for this bus
 	 */
-	pr_info("PCI: bus%d: Fast back to back transfers %sabled\n",
-		bus->number, (features & PCI_COMMAND_FAST_BACK) ? "en" : "dis");
+	pr_info("PCI: bus%d: Fast back to back transfers %s\n",
+		bus->number, str_enabled_disabled(features & PCI_COMMAND_FAST_BACK));
 }
 EXPORT_SYMBOL(pcibios_fixup_bus);

diff --git a/arch/arm/kernel/entry-ftrace.S b/arch/arm/kernel/entry-ftrace.S
index bc598e3d8dd2..e24ee559af81 100644
--- a/arch/arm/kernel/entry-ftrace.S
+++ b/arch/arm/kernel/entry-ftrace.S
@@ -257,11 +257,21 @@ ENDPROC(ftrace_graph_regs_caller)

 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 ENTRY(return_to_handler)
-	stmdb	sp!, {r0-r3}
-	add	r0, sp, #16		@ sp at exit of instrumented routine
+	mov	ip, sp			@ sp at exit of instrumented routine
+	sub	sp, #PT_REGS_SIZE
+	str	r0, [sp, #S_R0]
+	str	r1, [sp, #S_R1]
+	str	r2, [sp, #S_R2]
+	str	r3, [sp, #S_R3]
+	str	ip, [sp, #S_FP]
+	mov	r0, sp
 	bl	ftrace_return_to_handler
-	mov	lr, r0			@ r0 has real ret addr
-	ldmia	sp!, {r0-r3}
+	mov	lr, r0			@ r0 has real ret addr
+	ldr	r3, [sp, #S_R3]
+	ldr	r2, [sp, #S_R2]
+	ldr	r1, [sp, #S_R1]
+	ldr	r0, [sp, #S_R0]
+	add	sp, sp, #PT_REGS_SIZE	@ restore stack pointer
 	ret	lr
 ENDPROC(return_to_handler)
 #endif
diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
index 43d91bfd2360..470867160076 100644
--- a/arch/arm/mm/cache-l2x0.c
+++ b/arch/arm/mm/cache-l2x0.c
@@ -13,6 +13,7 @@
 #include <linux/io.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/string_choices.h>

 #include <asm/cacheflush.h>
 #include <asm/cp15.h>
@@ -667,9 +668,9 @@ static void __init l2c310_enable(void __iomem *base, unsigned num_lock)
 		u32 power_ctrl;

 		power_ctrl = readl_relaxed(base + L310_POWER_CTRL);
-		pr_info("L2C-310 dynamic clock gating %sabled, standby mode %sabled\n",
-			power_ctrl & L310_DYNAMIC_CLK_GATING_EN ? "en" : "dis",
-			power_ctrl & L310_STNDBY_MODE_EN ? "en" : "dis");
+		pr_info("L2C-310 dynamic clock gating %s, standby mode %s\n",
+			str_enabled_disabled(power_ctrl & L310_DYNAMIC_CLK_GATING_EN),
+			str_enabled_disabled(power_ctrl & L310_STNDBY_MODE_EN));
 	}

 	if (aux & L310_AUX_CTRL_FULL_LINE_ZERO)
diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index 46169fe42c61..2bc828a1940c 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -135,8 +135,7 @@ static void die_kernel_fault(const char *msg, struct mm_struct *mm,
 	bust_spinlocks(1);
 	pr_alert("8<--- cut here ---\n");
 	pr_alert("Unable to handle kernel %s at virtual address %08lx when %s\n",
-		 msg, addr, fsr & FSR_LNX_PF ? "execute" :
-		 fsr & FSR_WRITE ? "write" : "read");
+		 msg, addr, fsr & FSR_LNX_PF ? "execute" : str_write_read(fsr & FSR_WRITE));
 	show_pte(KERN_ALERT, mm, addr);
 	die("Oops", regs, fsr);

diff --git a/arch/arm64/include/asm/ftrace.h b/arch/arm64/include/asm/ftrace.h
index bfe3ce9df197..ba7cf7fec5e9 100644
--- a/arch/arm64/include/asm/ftrace.h
+++ b/arch/arm64/include/asm/ftrace.h
@@ -153,6 +153,7 @@ ftrace_partial_regs(const struct ftrace_regs *fregs, struct pt_regs *regs)
 	regs->pc = afregs->pc;
 	regs->regs[29] = afregs->fp;
 	regs->regs[30] = afregs->lr;
+	regs->pstate = PSR_MODE_EL1h;
 	return regs;
 }

diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index ff6fd0bbd7d2..78a4dbf75e60 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -79,7 +79,6 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
 extern void mark_linear_text_alias_ro(void);
 extern int split_kernel_leaf_mapping(unsigned long start, unsigned long end);
-extern void init_idmap_kpti_bbml2_flag(void);
 extern void linear_map_maybe_split_to_ptes(void);

 /*
@@ -107,5 +106,11 @@ static inline bool kaslr_requires_kpti(void)
 	return true;
 }

+#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+void kpti_install_ng_mappings(void);
+#else
+static inline void kpti_install_ng_mappings(void) {}
+#endif
+
 #endif	/* !__ASSEMBLY__ */
 #endif
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 3917ad897801..5ed401ff79e3 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -1941,104 +1941,6 @@ static bool has_pmuv3(const struct arm64_cpu_capabilities *entry, int scope)
 }
 #endif

-#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
-#define KPTI_NG_TEMP_VA		(-(1UL << PMD_SHIFT))
-
-extern
-void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
-			     phys_addr_t size, pgprot_t prot,
-			     phys_addr_t (*pgtable_alloc)(enum pgtable_type), int flags);
-
-static phys_addr_t __initdata kpti_ng_temp_alloc;
-
-static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_type type)
-{
-	kpti_ng_temp_alloc -= PAGE_SIZE;
-	return kpti_ng_temp_alloc;
-}
-
-static int __init __kpti_install_ng_mappings(void *__unused)
-{
-	typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long);
-	extern kpti_remap_fn idmap_kpti_install_ng_mappings;
-	kpti_remap_fn *remap_fn;
-
-	int cpu = smp_processor_id();
-	int levels = CONFIG_PGTABLE_LEVELS;
-	int order = order_base_2(levels);
-	u64 kpti_ng_temp_pgd_pa = 0;
-	pgd_t *kpti_ng_temp_pgd;
-	u64 alloc = 0;
-
-	if (levels == 5 && !pgtable_l5_enabled())
-		levels = 4;
-	else if (levels == 4 && !pgtable_l4_enabled())
-		levels = 3;
-
-	remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings);
-
-	if (!cpu) {
-		alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
-		kpti_ng_temp_pgd = (pgd_t *)(alloc + (levels - 1) * PAGE_SIZE);
-		kpti_ng_temp_alloc = kpti_ng_temp_pgd_pa = __pa(kpti_ng_temp_pgd);
-
-		//
-		// Create a minimal page table hierarchy that permits us to map
-		// the swapper page tables temporarily as we traverse them.
-		//
-		// The physical pages are laid out as follows:
-		//
-		// +--------+-/-------+-/-------+-/-------+-\\\--------+
-		// :  PTE[] : | PMD[] : | PUD[] : | P4D[] : ||| PGD[]  :
-		// +--------+-\-------+-\-------+-\-------+-///--------+
-		//      ^
-		// The first page is mapped into this hierarchy at a PMD_SHIFT
-		// aligned virtual address, so that we can manipulate the PTE
-		// level entries while the mapping is active. The first entry
-		// covers the PTE[] page itself, the remaining entries are free
-		// to be used as a ad-hoc fixmap.
-		//
-		create_kpti_ng_temp_pgd(kpti_ng_temp_pgd, __pa(alloc),
-					KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL,
-					kpti_ng_pgd_alloc, 0);
-	}
-
-	cpu_install_idmap();
-	remap_fn(cpu, num_online_cpus(), kpti_ng_temp_pgd_pa, KPTI_NG_TEMP_VA);
-	cpu_uninstall_idmap();
-
-	if (!cpu) {
-		free_pages(alloc, order);
-		arm64_use_ng_mappings = true;
-	}
-
-	return 0;
-}
-
-static void __init kpti_install_ng_mappings(void)
-{
-	/* Check whether KPTI is going to be used */
-	if (!arm64_kernel_unmapped_at_el0())
-		return;
-
-	/*
-	 * We don't need to rewrite the page-tables if either we've done
-	 * it already or we have KASLR enabled and therefore have not
-	 * created any global mappings at all.
-	 */
-	if (arm64_use_ng_mappings)
-		return;
-
-	init_idmap_kpti_bbml2_flag();
-	stop_machine(__kpti_install_ng_mappings, NULL, cpu_online_mask);
-}
-
-#else
-static inline void kpti_install_ng_mappings(void)
-{
-}
-#endif	/* CONFIG_UNMAP_KERNEL_AT_EL0 */
-
 static void cpu_enable_kpti(struct arm64_cpu_capabilities const *cap)
 {
 	if (__this_cpu_read(this_cpu_vector) == vectors) {
@@ -2419,17 +2321,21 @@ static void bti_enable(const struct arm64_cpu_capabilities *__unused)
 #ifdef CONFIG_ARM64_MTE
 static void cpu_enable_mte(struct arm64_cpu_capabilities const *cap)
 {
+	static bool cleared_zero_page = false;
+
 	sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_ATA | SCTLR_EL1_ATA0);

 	mte_cpu_setup();

 	/*
 	 * Clear the tags in the zero page. This needs to be done via the
-	 * linear map which has the Tagged attribute.
+	 * linear map which has the Tagged attribute. Since this page is
+	 * always mapped as pte_special(), set_pte_at() will not attempt to
+	 * clear the tags or set PG_mte_tagged.
 	 */
-	if (try_page_mte_tagging(ZERO_PAGE(0))) {
+	if (!cleared_zero_page) {
+		cleared_zero_page = true;
 		mte_clear_page_tags(lm_alias(empty_zero_page));
-		set_page_mte_tagged(ZERO_PAGE(0));
 	}

 	kasan_init_hw_tags_cpu();
diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c
index 54a52dc5c1ae..43f7a2f39403 100644
--- a/arch/arm64/kernel/mte.c
+++ b/arch/arm64/kernel/mte.c
@@ -478,7 +478,7 @@ static int __access_remote_tags(struct mm_struct *mm, unsigned long addr,
 		if (folio_test_hugetlb(folio))
 			WARN_ON_ONCE(!folio_test_hugetlb_mte_tagged(folio));
 		else
-			WARN_ON_ONCE(!page_mte_tagged(page));
+			WARN_ON_ONCE(!page_mte_tagged(page) && !is_zero_page(page));

 		/* limit access to the end of the page */
 		offset = offset_in_page(addr);
diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c
index 0c5d408afd95..8ab6104a4883 100644
--- a/arch/arm64/kernel/probes/kprobes.c
+++ b/arch/arm64/kernel/probes/kprobes.c
@@ -10,6 +10,7 @@

 #define pr_fmt(fmt) "kprobes: " fmt

+#include <linux/execmem.h>
 #include <linux/extable.h>
 #include <linux/kasan.h>
 #include <linux/kernel.h>
@@ -41,6 +42,17 @@ DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
 static void __kprobes
 post_kprobe_handler(struct kprobe *, struct kprobe_ctlblk *, struct pt_regs *);

+void *alloc_insn_page(void)
+{
+	void *addr;
+
+	addr = execmem_alloc(EXECMEM_KPROBES, PAGE_SIZE);
+	if (!addr)
+		return NULL;
+	set_memory_rox((unsigned long)addr, 1);
+	return addr;
+}
+
 static void __kprobes arch_prepare_ss_slot(struct kprobe *p)
 {
 	kprobe_opcode_t *addr = p->ainsn.xol_insn;
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index bff62e75d681..4f803fd1c99a 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -25,7 +25,7 @@ menuconfig KVM
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
 	select KVM_MMIO
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
-	select KVM_XFER_TO_GUEST_WORK
+	select VIRT_XFER_TO_GUEST_WORK
 	select KVM_VFIO
 	select HAVE_KVM_DIRTY_RING_ACQ_REL
 	select NEED_KVM_DIRTY_RING_WITH_BITMAP
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index fa79744290f3..f21d1b7f20f8 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -6,7 +6,6 @@

 #include <linux/bug.h>
 #include <linux/cpu_pm.h>
-#include <linux/entry-kvm.h>
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kvm_host.h>
@@ -1183,7 +1182,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 		/*
 		 * Check conditions before entering the guest
 		 */
-		ret = xfer_to_guest_mode_handle_work(vcpu);
+		ret = kvm_xfer_to_guest_mode_handle_work(vcpu);
 		if (!ret)
 			ret = 1;

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index b3d8c3de4149..b8d37eb037fc 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -470,14 +470,6 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
 	mutex_unlock(&fixmap_lock);
 }

-#ifdef CONFIG_UNMAP_KERNEL_AT_EL0
-extern __alias(__create_pgd_mapping_locked)
-void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
-			     phys_addr_t size, pgprot_t prot,
-			     phys_addr_t (*pgtable_alloc)(enum pgtable_type),
-			     int flags);
-#endif
-
 #define INVALID_PHYS_ADDR	(-1ULL)

 static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
@@ -823,7 +815,7 @@ static bool linear_map_requires_bbml2 __initdata;

 u32 idmap_kpti_bbml2_flag;

-void __init init_idmap_kpti_bbml2_flag(void)
+static void __init init_idmap_kpti_bbml2_flag(void)
 {
 	WRITE_ONCE(idmap_kpti_bbml2_flag, 1);
 	/* Must be visible to other CPUs before stop_machine() is called. */
@@ -1135,7 +1127,93 @@ static void __init declare_vma(struct vm_struct *vma,
 }

 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
-static pgprot_t kernel_exec_prot(void)
+#define KPTI_NG_TEMP_VA		(-(1UL << PMD_SHIFT))
+
+static phys_addr_t kpti_ng_temp_alloc __initdata;
+
+static phys_addr_t __init kpti_ng_pgd_alloc(enum pgtable_type type)
+{
+	kpti_ng_temp_alloc -= PAGE_SIZE;
+	return kpti_ng_temp_alloc;
+}
+
+static int __init __kpti_install_ng_mappings(void *__unused)
+{
+	typedef void (kpti_remap_fn)(int, int, phys_addr_t, unsigned long);
+	extern kpti_remap_fn idmap_kpti_install_ng_mappings;
+	kpti_remap_fn *remap_fn;
+
+	int cpu = smp_processor_id();
+	int levels = CONFIG_PGTABLE_LEVELS;
+	int order = order_base_2(levels);
+	u64 kpti_ng_temp_pgd_pa = 0;
+	pgd_t *kpti_ng_temp_pgd;
+	u64 alloc = 0;
+
+	if (levels == 5 && !pgtable_l5_enabled())
+		levels = 4;
+	else if (levels == 4 && !pgtable_l4_enabled())
+		levels = 3;
+
+	remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings);
+
+	if (!cpu) {
+		alloc = __get_free_pages(GFP_ATOMIC | __GFP_ZERO, order);
+		kpti_ng_temp_pgd = (pgd_t *)(alloc + (levels - 1) * PAGE_SIZE);
+		kpti_ng_temp_alloc = kpti_ng_temp_pgd_pa = __pa(kpti_ng_temp_pgd);
+
+		//
+		// Create a minimal page table hierarchy that permits us to map
+		// the swapper page tables temporarily as we traverse them.
+		//
+		// The physical pages are laid out as follows:
+		//
+		// +--------+-/-------+-/-------+-/-------+-\\\--------+
+		// :  PTE[] : | PMD[] : | PUD[] : | P4D[] : ||| PGD[]  :
+		// +--------+-\-------+-\-------+-\-------+-///--------+
+		//      ^
+		// The first page is mapped into this hierarchy at a PMD_SHIFT
+		// aligned virtual address, so that we can manipulate the PTE
+		// level entries while the mapping is active. The first entry
+		// covers the PTE[] page itself, the remaining entries are free
+		// to be used as a ad-hoc fixmap.
+		//
+		__create_pgd_mapping_locked(kpti_ng_temp_pgd, __pa(alloc),
+					    KPTI_NG_TEMP_VA, PAGE_SIZE, PAGE_KERNEL,
+					    kpti_ng_pgd_alloc, 0);
+	}
+
+	cpu_install_idmap();
+	remap_fn(cpu, num_online_cpus(), kpti_ng_temp_pgd_pa, KPTI_NG_TEMP_VA);
+	cpu_uninstall_idmap();
+
+	if (!cpu) {
+		free_pages(alloc, order);
+		arm64_use_ng_mappings = true;
+	}
+
+	return 0;
+}
+
+void __init kpti_install_ng_mappings(void)
+{
+	/* Check whether KPTI is going to be used */
+	if (!arm64_kernel_unmapped_at_el0())
+		return;
+
+	/*
+	 * We don't need to rewrite the page-tables if either we've done
+	 * it already or we have KASLR enabled and therefore have not
+	 * created any global mappings at all.
+	 */
+	if (arm64_use_ng_mappings)
+		return;
+
+	init_idmap_kpti_bbml2_flag();
+	stop_machine(__kpti_install_ng_mappings, NULL, cpu_online_mask);
+}
+
+static pgprot_t __init kernel_exec_prot(void)
 {
 	return rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC;
 }
diff --git a/arch/loongarch/kvm/Kconfig b/arch/loongarch/kvm/Kconfig
index 40eea6da7c25..ae64bbdf83a7 100644
--- a/arch/loongarch/kvm/Kconfig
+++ b/arch/loongarch/kvm/Kconfig
@@ -31,7 +31,7 @@ config KVM
 	select KVM_GENERIC_HARDWARE_ENABLING
 	select KVM_GENERIC_MMU_NOTIFIER
 	select KVM_MMIO
-	select KVM_XFER_TO_GUEST_WORK
+	select VIRT_XFER_TO_GUEST_WORK
 	select SCHED_INFO
 	select GUEST_PERF_EVENTS if PERF_EVENTS
 	help
diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c
index 9c802f7103c6..30e3b089a596 100644
--- a/arch/loongarch/kvm/vcpu.c
+++ b/arch/loongarch/kvm/vcpu.c
@@ -4,7 +4,6 @@
  */

 #include <linux/kvm_host.h>
-#include <linux/entry-kvm.h>
 #include <asm/fpu.h>
 #include <asm/lbt.h>
 #include <asm/loongarch.h>
@@ -251,7 +250,7 @@ static int kvm_enter_guest_check(struct kvm_vcpu *vcpu)
 	/*
 	 * Check conditions before entering the guest
 	 */
-	ret = xfer_to_guest_mode_handle_work(vcpu);
+	ret = kvm_xfer_to_guest_mode_handle_work(vcpu);
 	if (ret < 0)
 		return ret;

diff --git a/arch/riscv/kvm/Kconfig b/arch/riscv/kvm/Kconfig
index 5a62091b0809..c50328212917 100644
--- a/arch/riscv/kvm/Kconfig
+++ b/arch/riscv/kvm/Kconfig
@@ -30,7 +30,7 @@ config KVM
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
 	select KVM_GENERIC_HARDWARE_ENABLING
 	select KVM_MMIO
-	select KVM_XFER_TO_GUEST_WORK
+	select VIRT_XFER_TO_GUEST_WORK
 	select KVM_GENERIC_MMU_NOTIFIER
 	select SCHED_INFO
 	select GUEST_PERF_EVENTS if PERF_EVENTS
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index 47bcf190ccc5..bccb919ca615 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -7,7 +7,6 @@
  */

 #include <linux/bitops.h>
-#include <linux/entry-kvm.h>
 #include <linux/errno.h>
 #include <linux/err.h>
 #include <linux/kdebug.h>
@@ -911,7 +910,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
 	run->exit_reason = KVM_EXIT_UNKNOWN;
 	while (ret > 0) {
 		/* Check conditions before entering the guest */
-		ret = xfer_to_guest_mode_handle_work(vcpu);
+		ret = kvm_xfer_to_guest_mode_handle_work(vcpu);
 		if (ret)
 			continue;
 		ret = 1;
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 090f5ac9f492..c3ba12b1bc07 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -11,6 +11,7 @@
 #include <linux/pci.h>
 #include <linux/irq.h>
 #include <linux/export.h>
+#include <linux/irqchip/irq-msi-lib.h>
 #include <asm/mshyperv.h>

 static int hv_map_interrupt(union hv_device_id device_id, bool level,
@@ -289,59 +290,99 @@ static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
 	(void)hv_unmap_msi_interrupt(dev, &old_entry);
 }

-static void hv_msi_free_irq(struct irq_domain *domain,
-			    struct msi_domain_info *info, unsigned int virq)
-{
-	struct irq_data *irqd = irq_get_irq_data(virq);
-	struct msi_desc *desc;
-
-	if (!irqd)
-		return;
-
-	desc = irq_data_get_msi_desc(irqd);
-	if (!desc || !desc->irq || WARN_ON_ONCE(!dev_is_pci(desc->dev)))
-		return;
-
-	hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd);
-}
-
 /*
  * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
  * which implement the MSI or MSI-X Capability Structure.
  */
 static struct irq_chip hv_pci_msi_controller = {
 	.name			= "HV-PCI-MSI",
-	.irq_unmask		= pci_msi_unmask_irq,
-	.irq_mask		= pci_msi_mask_irq,
 	.irq_ack		= irq_chip_ack_parent,
-	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.irq_compose_msi_msg	= hv_irq_compose_msi_msg,
-	.irq_set_affinity	= msi_domain_set_affinity,
-	.flags			= IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED,
+	.irq_set_affinity	= irq_chip_set_affinity_parent,
 };

-static struct msi_domain_ops pci_msi_domain_ops = {
-	.msi_free	= hv_msi_free_irq,
-	.msi_prepare	= pci_msi_prepare,
+static bool hv_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+				 struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+	struct irq_chip *chip = info->chip;
+
+	if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
+		return false;
+
+	chip->flags |= IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED;
+
+	info->ops->msi_prepare = pci_msi_prepare;
+
+	return true;
+}
+
+#define HV_MSI_FLAGS_SUPPORTED	(MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX)
+#define HV_MSI_FLAGS_REQUIRED	(MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS)
+
+static struct msi_parent_ops hv_msi_parent_ops = {
+	.supported_flags	= HV_MSI_FLAGS_SUPPORTED,
+	.required_flags		= HV_MSI_FLAGS_REQUIRED,
+	.bus_select_token	= DOMAIN_BUS_NEXUS,
+	.bus_select_mask	= MATCH_PCI_MSI,
+	.chip_flags		= MSI_CHIP_FLAG_SET_ACK,
+	.prefix			= "HV-",
+	.init_dev_msi_info	= hv_init_dev_msi_info,
 };

-static struct msi_domain_info hv_pci_msi_domain_info = {
-	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
-			  MSI_FLAG_PCI_MSIX,
-	.ops		= &pci_msi_domain_ops,
-	.chip		= &hv_pci_msi_controller,
-	.handler	= handle_edge_irq,
-	.handler_name	= "edge",
+static int hv_msi_domain_alloc(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs,
+			       void *arg)
+{
+	/*
+	 * TODO: The allocation bits of hv_irq_compose_msi_msg(), i.e. everything except
+	 * entry_to_msi_msg() should be in here.
+	 */
+
+	int ret;
+
+	ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, arg);
+	if (ret)
+		return ret;
+
+	for (int i = 0; i < nr_irqs; ++i) {
+		irq_domain_set_info(d, virq + i, 0, &hv_pci_msi_controller, NULL,
+				    handle_edge_irq, NULL, "edge");
+	}
+	return 0;
+}
+
+static void hv_msi_domain_free(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs)
+{
+	for (int i = 0; i < nr_irqs; ++i) {
+		struct irq_data *irqd = irq_domain_get_irq_data(d, virq);
+		struct msi_desc *desc;
+
+		desc = irq_data_get_msi_desc(irqd);
+		if (!desc || !desc->irq || WARN_ON_ONCE(!dev_is_pci(desc->dev)))
+			continue;
+
+		hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd);
+	}
+	irq_domain_free_irqs_top(d, virq, nr_irqs);
+}
+
+static const struct irq_domain_ops hv_msi_domain_ops = {
+	.select	= msi_lib_irq_domain_select,
+	.alloc	= hv_msi_domain_alloc,
+	.free	= hv_msi_domain_free,
 };

 struct irq_domain * __init hv_create_pci_msi_domain(void)
 {
 	struct irq_domain *d = NULL;
-	struct fwnode_handle *fn;

-	fn = irq_domain_alloc_named_fwnode("HV-PCI-MSI");
-	if (fn)
-		d = pci_msi_create_irq_domain(fn, &hv_pci_msi_domain_info, x86_vector_domain);
+	struct irq_domain_info info = {
+		.fwnode	= irq_domain_alloc_named_fwnode("HV-PCI-MSI"),
+		.ops	= &hv_msi_domain_ops,
+		.parent	= x86_vector_domain,
+	};
+
+	if (info.fwnode)
+		d = msi_create_parent_irq_domain(&info, &hv_msi_parent_ops);

 	/* No point in going further if we can't get an irq domain */
 	BUG_ON(!d);
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index ade6c665c97e..a4615b889f3e 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -463,6 +463,195 @@ void hv_ivm_msr_read(u64 msr, u64 *value)
 }

 /*
+ * Keep track of the PFN regions which were shared with the host. The access
+ * must be revoked upon kexec/kdump (see hv_ivm_clear_host_access()).
+ */
+struct hv_enc_pfn_region {
+	struct list_head list;
+	u64 pfn;
+	int count;
+};
+
+static LIST_HEAD(hv_list_enc);
+static DEFINE_RAW_SPINLOCK(hv_list_enc_lock);
+
+static int hv_list_enc_add(const u64 *pfn_list, int count)
+{
+	struct hv_enc_pfn_region *ent;
+	unsigned long flags;
+	u64 pfn;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		pfn = pfn_list[i];
+
+		raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+		/* Check if the PFN already exists in some region first */
+		list_for_each_entry(ent, &hv_list_enc, list) {
+			if ((ent->pfn <= pfn) && (ent->pfn + ent->count - 1 >= pfn))
+				/* Nothing to do - pfn is already in the list */
+				goto unlock_done;
+		}
+
+		/*
+		 * Check if the PFN is adjacent to an existing region. Growing
+		 * a region can make it adjacent to another one but merging is
+		 * not (yet) implemented for simplicity. A PFN cannot be added
+		 * to two regions to keep the logic in hv_list_enc_remove()
+		 * correct.
+		 */
+		list_for_each_entry(ent, &hv_list_enc, list) {
+			if (ent->pfn + ent->count == pfn) {
+				/* Grow existing region up */
+				ent->count++;
+				goto unlock_done;
+			} else if (pfn + 1 == ent->pfn) {
+				/* Grow existing region down */
+				ent->pfn--;
+				ent->count++;
+				goto unlock_done;
+			}
+		}
+		raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+
+		/* No adjacent region found -- create a new one */
+		ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL);
+		if (!ent)
+			return -ENOMEM;
+
+		ent->pfn = pfn;
+		ent->count = 1;
+
+		raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+		list_add(&ent->list, &hv_list_enc);
+
+unlock_done:
+		raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+	}
+
+	return 0;
+}
+
+static int hv_list_enc_remove(const u64 *pfn_list, int count)
+{
+	struct hv_enc_pfn_region *ent, *t;
+	struct hv_enc_pfn_region new_region;
+	unsigned long flags;
+	u64 pfn;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		pfn = pfn_list[i];
+
+		raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+		list_for_each_entry_safe(ent, t, &hv_list_enc, list) {
+			if (pfn == ent->pfn + ent->count - 1) {
+				/* Removing tail pfn */
+				ent->count--;
+				if (!ent->count) {
+					list_del(&ent->list);
+					kfree(ent);
+				}
+				goto unlock_done;
+			} else if (pfn == ent->pfn) {
+				/* Removing head pfn */
+				ent->count--;
+				ent->pfn++;
+				if (!ent->count) {
+					list_del(&ent->list);
+					kfree(ent);
+				}
+				goto unlock_done;
+			} else if (pfn > ent->pfn && pfn < ent->pfn + ent->count - 1) {
+				/*
+				 * Removing a pfn in the middle. Cut off the tail
+				 * of the existing region and create a template for
+				 * the new one.
+				 */
+				new_region.pfn = pfn + 1;
+				new_region.count = ent->count - (pfn - ent->pfn + 1);
+				ent->count = pfn - ent->pfn;
+				goto unlock_split;
+			}
+
+		}
+unlock_done:
+		raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+		continue;
+
+unlock_split:
+		raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+
+		ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL);
+		if (!ent)
+			return -ENOMEM;
+
+		ent->pfn = new_region.pfn;
+		ent->count = new_region.count;
+
+		raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+		list_add(&ent->list, &hv_list_enc);
+		raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+	}
+
+	return 0;
+}
+
+/* Stop new private<->shared conversions */
+static void hv_vtom_kexec_begin(void)
+{
+	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+		return;
+
+	/*
+	 * Crash kernel reaches here with interrupts disabled: can't wait for
+	 * conversions to finish.
+	 *
+	 * If race happened, just report and proceed.
+	 */
+	if (!set_memory_enc_stop_conversion())
+		pr_warn("Failed to stop shared<->private conversions\n");
+}
+
+static void hv_vtom_kexec_finish(void)
+{
+	struct hv_gpa_range_for_visibility *input;
+	struct hv_enc_pfn_region *ent;
+	unsigned long flags;
+	u64 hv_status;
+	int cur, i;
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+	if (unlikely(!input))
+		goto out;
+
+	list_for_each_entry(ent, &hv_list_enc, list) {
+		for (i = 0, cur = 0; i < ent->count; i++) {
+			input->gpa_page_list[cur] = ent->pfn + i;
+			cur++;
+
+			if (cur == HV_MAX_MODIFY_GPA_REP_COUNT || i == ent->count - 1) {
+				input->partition_id = HV_PARTITION_ID_SELF;
+				input->host_visibility = VMBUS_PAGE_NOT_VISIBLE;
+				input->reserved0 = 0;
+				input->reserved1 = 0;
+				hv_status = hv_do_rep_hypercall(
+					HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY,
+					cur, 0, input, NULL);
+				WARN_ON_ONCE(!hv_result_success(hv_status));
+				cur = 0;
+			}
+		}
+
+	}
+
+out:
+	local_irq_restore(flags);
+}
+
+/*
  * hv_mark_gpa_visibility - Set pages visible to host via hvcall.
  *
  * In Isolation VM, all guest memory is encrypted from host and guest
@@ -475,6 +664,7 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
 	struct hv_gpa_range_for_visibility *input;
 	u64 hv_status;
 	unsigned long flags;
+	int ret;

 	/* no-op if partition isolation is not enabled */
 	if (!hv_is_isolation_supported())
@@ -486,6 +676,13 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
 		return -EINVAL;
 	}

+	if (visibility == VMBUS_PAGE_NOT_VISIBLE)
+		ret = hv_list_enc_remove(pfn, count);
+	else
+		ret = hv_list_enc_add(pfn, count);
+	if (ret)
+		return ret;
+
 	local_irq_save(flags);
 	input = *this_cpu_ptr(hyperv_pcpu_input_arg);

@@ -506,8 +703,18 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],

 	if (hv_result_success(hv_status))
 		return 0;
+
+	if (visibility == VMBUS_PAGE_NOT_VISIBLE)
+		ret = hv_list_enc_add(pfn, count);
 	else
-		return -EFAULT;
+		ret = hv_list_enc_remove(pfn, count);
+	/*
+	 * There's no good way to recover from -ENOMEM here, the accounting is
+	 * wrong either way.
+	 */
+	WARN_ON_ONCE(ret);
+
+	return -EFAULT;
 }

 /*
@@ -669,6 +876,8 @@ void __init hv_vtom_init(void)
 	x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required;
 	x86_platform.guest.enc_status_change_prepare = hv_vtom_clear_present;
 	x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility;
+	x86_platform.guest.enc_kexec_begin = hv_vtom_kexec_begin;
+	x86_platform.guest.enc_kexec_finish = hv_vtom_kexec_finish;

 	/* Set WB as the default cache mode. */
 	guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 8698d66563ed..0281703da5e2 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -89,7 +89,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
 		 */
 		flags->bm_control = 0;
 	}
-	if (c->x86_vendor == X86_VENDOR_AMD && c->x86 >= 0x17) {
+	if (cpu_feature_enabled(X86_FEATURE_ZEN)) {
 		/*
 		 * For all AMD Zen or newer CPUs that support C3, caches
 		 * should not be flushed by software while entering C3
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index c78f860419d6..25773af116bc 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -565,6 +565,11 @@ static void __init ms_hyperv_init_platform(void)
 		machine_ops.crash_shutdown = hv_machine_crash_shutdown;
 #endif
 #endif
+	/*
+	 * HV_ACCESS_TSC_INVARIANT is always zero for the root partition. Root
+	 * partition doesn't need to write to synthetic MSR to enable invariant
+	 * TSC feature. It sees what the hardware provides.
+	 */
 	if (ms_hyperv.features & HV_ACCESS_TSC_INVARIANT) {
 		/*
 		 * Writing to synthetic MSR 0x40000118 updates/changes the
@@ -636,8 +641,12 @@ static void __init ms_hyperv_init_platform(void)
 	 * TSC should be marked as unstable only after Hyper-V
 	 * clocksource has been initialized. This ensures that the
 	 * stability of the sched_clock is not altered.
+	 *
+	 * HV_ACCESS_TSC_INVARIANT is always zero for the root partition. No
+	 * need to check for it.
 	 */
-	if (!(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
+	if (!hv_root_partition() &&
+	    !(ms_hyperv.features & HV_ACCESS_TSC_INVARIANT))
 		mark_tsc_unstable("running on Hyper-V");

 	hardlockup_detector_disable();
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 4e43923656d0..67d4f23bab66 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -40,7 +40,7 @@ config KVM_X86
 	select HAVE_KVM_MSI
 	select HAVE_KVM_CPU_RELAX_INTERCEPT
 	select HAVE_KVM_NO_POLL
-	select KVM_XFER_TO_GUEST_WORK
+	select VIRT_XFER_TO_GUEST_WORK
 	select KVM_GENERIC_DIRTYLOG_READ_PROTECT
 	select KVM_VFIO
 	select HAVE_KVM_PM_NOTIFIER if PM
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 546272a5d34d..d7b258af63ea 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -28,7 +28,6 @@
 #include <linux/slab.h>
 #include <linux/tboot.h>
 #include <linux/trace_events.h>
-#include <linux/entry-kvm.h>

 #include <asm/apic.h>
 #include <asm/asm.h>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4b8138bd4857..42ecd093bb4c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -59,7 +59,6 @@
 #include <linux/sched/stat.h>
 #include <linux/sched/isolation.h>
 #include <linux/mem_encrypt.h>
-#include <linux/entry-kvm.h>
 #include <linux/suspend.h>
 #include <linux/smp.h>

@@ -11635,7 +11634,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)

 		if (__xfer_to_guest_mode_work_pending()) {
 			kvm_vcpu_srcu_read_unlock(vcpu);
-			r = xfer_to_guest_mode_handle_work(vcpu);
+			r = kvm_xfer_to_guest_mode_handle_work(vcpu);
 			kvm_vcpu_srcu_read_lock(vcpu);
 			if (r)
 				return r;